You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2023/06/15 01:23:27 UTC
[arrow-nanoarrow] branch main updated: feat(python): Python schema, array, and array view skeleton (#117)
This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2f05e99 feat(python): Python schema, array, and array view skeleton (#117)
2f05e99 is described below
commit 2f05e99d18638cdd07675642f9a7dd2a73211066
Author: Dewey Dunnington <de...@dunnington.ca>
AuthorDate: Wed Jun 14 22:23:21 2023 -0300
feat(python): Python schema, array, and array view skeleton (#117)
This PR is an attempt to add minimum usable Python bindings to the
nanoarrow C library. That minimum scope is essentially just the ability
to extract field values from
`ArrowSchema`/`ArrowArray`/`ArrowArrayStream` objects in a way that will
not crash Python. This PR also includes bindings for nanoarrow's
`ArrowSchemaView` (so that the parameters of parameterized types can be
extracted) and `ArrowArrayView` (so that buffer types/sizes can be
exported using the Python buffer protocol).
I've updated [the
README](https://github.com/paleolimbot/arrow-nanoarrow/tree/python-tidbits/python#readme)
to showcase the extent of the bindings as implemented in this PR;
several basic examples are also provided below.
Example schema usage:
```python
import nanoarrow as na
import pyarrow as pa
schema = na.schema(pa.decimal128(10, 3))
print(schema.format)
#> d:10,3
print(schema.view().decimal_precision)
#> 10
print(schema.view().decimal_scale)
#> 3
```
Example Array usage:
```python
array = na.array(pa.array(["one", "two", "three", None]))
print(array.length)
#> 4
print(array.null_count)
#> 1
import numpy as np
view = array.view()
[np.array(buffer) for buffer in view.buffers]
#> [array([7], dtype=uint8),
#> array([ 0, 3, 6, 11, 11], dtype=int32),
#> array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
#> dtype='|S1')]
```
Example ArrayStream usage:
```python
pa_array_child = pa.array([1, 2, 3], pa.int32())
pa_array = pa.record_batch([pa_array_child], names=["some_column"])
reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])
array_stream = na.array_stream(reader)
print(array_stream.get_schema())
#> struct<some_column: int32>
for array in array_stream:
print(array.length)
#> 3
print(array_stream.get_next() is None)
#> True
```
---------
Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
---
.github/workflows/python.yaml | 30 +-
python/{src/nanoarrow/__init__.py => .coveragerc} | 6 +-
python/.gitignore | 7 +-
python/{src/nanoarrow/__init__.py => MANIFEST.in} | 7 +-
python/README.ipynb | 392 ++++++++++
python/README.md | 192 ++++-
python/bootstrap.py | 199 +++++
python/{src => }/nanoarrow/__init__.py | 5 +-
python/nanoarrow/_lib.pyx | 903 ++++++++++++++++++++++
python/nanoarrow/lib.py | 69 ++
python/pyproject.toml | 8 +-
python/setup.py | 42 +-
python/src/nanoarrow/_lib.pyx | 86 ---
python/src/nanoarrow/nanoarrow_c.pxd | 127 ---
python/tests/test_nanoarrow.py | 302 +++++++-
src/nanoarrow/nanoarrow_types.h | 6 +
16 files changed, 2104 insertions(+), 277 deletions(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index 7d93578..4b599f7 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -40,7 +40,7 @@ jobs:
steps:
- uses: actions/checkout@v3
-
+
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@@ -57,3 +57,31 @@ jobs:
- name: Run tests
run: |
pytest python/tests -v -s
+
+ - name: Run doctests
+ if: success() && matrix.python-version == '3.10'
+ run: |
+ # Needs editable install to run --doctest-cython
+ pip install pytest-cython
+ pip install -e python
+ pytest python --doctest-cython
+
+ - name: Coverage
+ if: success() && matrix.python-version == '3.10'
+ run: |
+ pip uninstall --yes nanoarrow
+ pip install pytest-cov Cython
+ pushd python
+
+ # Build with Cython + gcc coverage options
+ NANOARROW_PYTHON_COVERAGE=1 python setup.py build_ext --inplace
+
+ # Run tests + coverage.py (generates .coverage + coverage.xml files)
+ python -m pytest --cov ./nanoarrow
+ python -m coverage xml
+
+ - name: Upload coverage to codecov
+ if: success() && matrix.python-version == '3.10'
+ uses: codecov/codecov-action@v2
+ with:
+ files: 'python/coverage.xml'
diff --git a/python/src/nanoarrow/__init__.py b/python/.coveragerc
similarity index 91%
copy from python/src/nanoarrow/__init__.py
copy to python/.coveragerc
index 1586e60..1fb6a24 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/.coveragerc
@@ -15,6 +15,6 @@
# specific language governing permissions and limitations
# under the License.
-from ._lib import ( # noqa: F401
- as_numpy_array,
-)
+# .coveragerc to control coverage.py
+[run]
+plugins = Cython.Coverage
diff --git a/python/.gitignore b/python/.gitignore
index fcf8363..b372452 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -16,9 +16,10 @@
# specific language governing permissions and limitations
# under the License.
-src/nanoarrow/nanoarrow.c
-src/nanoarrow/nanoarrow.h
-src/nanoarrow/*.cpp
+nanoarrow/nanoarrow.c
+nanoarrow/nanoarrow.h
+nanoarrow/nanoarrow_c.pxd
+nanoarrow/*.c
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/python/src/nanoarrow/__init__.py b/python/MANIFEST.in
similarity index 87%
copy from python/src/nanoarrow/__init__.py
copy to python/MANIFEST.in
index 1586e60..93ed2fd 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/MANIFEST.in
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-from ._lib import ( # noqa: F401
- as_numpy_array,
-)
+exclude bootstrap.py
+include nanoarrow/nanoarrow.c
+include nanoarrow/nanoarrow.h
+include nanoarrow/nanoarrow_c.pxd
diff --git a/python/README.ipynb b/python/README.ipynb
new file mode 100644
index 0000000..d89d4c4
--- /dev/null
+++ b/python/README.ipynb
@@ -0,0 +1,392 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "<!---\n",
+ " Licensed to the Apache Software Foundation (ASF) under one\n",
+ " or more contributor license agreements. See the NOTICE file\n",
+ " distributed with this work for additional information\n",
+ " regarding copyright ownership. The ASF licenses this file\n",
+ " to you under the Apache License, Version 2.0 (the\n",
+ " \"License\"); you may not use this file except in compliance\n",
+ " with the License. You may obtain a copy of the License at\n",
+ "\n",
+ " http://www.apache.org/licenses/LICENSE-2.0\n",
+ "\n",
+ " Unless required by applicable law or agreed to in writing,\n",
+ " software distributed under the License is distributed on an\n",
+ " \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n",
+ " KIND, either express or implied. See the License for the\n",
+ " specific language governing permissions and limitations\n",
+ " under the License.\n",
+ "-->\n",
+ "\n",
+ "<!-- Render with jupyter nbconvert --to markdown README.ipynb -->\n",
+ "\n",
+ "# nanoarrow for Python\n",
+ "\n",
+ "The nanoarrow Python package provides bindings to the nanoarrow C library. Like\n",
+ "the nanoarrow C library, it provides tools to facilitate the use of the\n",
+ "[Arrow C Data](https://arrow.apache.org/docs/format/CDataInterface.html) \n",
+ "and [Arrow C Stream](https://arrow.apache.org/docs/format/CStreamInterface.html) \n",
+ "interfaces.\n",
+ "\n",
+ "## Installation\n",
+ "\n",
+ "Python bindings for nanoarrow are not yet available on PyPI. You can install via\n",
+ "URL (requires a C compiler):\n",
+ "\n",
+ "```bash\n",
+ "python -m pip install \"https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python\"\n",
+ "```\n",
+ "\n",
+ "If you can import the namespace, you're good to go!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import nanoarrow as na"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Example\n",
+ "\n",
+ "The Arrow C Data and Arrow C Stream interfaces are comprised of three structures: the `ArrowSchema` which represents a data type of an array, the `ArrowArray` which represents the values of an array, and an `ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common `ArrowSchema`. All three can be wrapped by Python objects using the nanoarrow Python package.\n",
+ "\n",
+ "### Schemas\n",
+ "\n",
+ "Use `nanoarrow.schema()` to convert a data type-like object to an `ArrowSchema`. This is currently only implemented for pyarrow objects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pyarrow as pa\n",
+ "schema = na.schema(pa.decimal128(10, 3))"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can extract the fields of a `Schema` object one at a time or parse it into a view to extract deserialized parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "d:10,3\n",
+ "10\n",
+ "3\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(schema.format)\n",
+ "print(schema.view().decimal_precision)\n",
+ "print(schema.view().decimal_scale)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `nanoarrow.schema()` helper is currently only implemented for pyarrow objects. If your data type has an `_export_to_c()`-like function, you can get the address of a freshly-allocated `ArrowSchema` as well:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'int32'"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "schema = na.Schema.allocate()\n",
+ "pa.int32()._export_to_c(schema._addr())\n",
+ "schema.view().type"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `Schema` object cleans up after itself: when the object is deleted, the underlying `ArrowSchema` is released."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Arrays\n",
+ "\n",
+ "You can use `nanoarrow.array()` to convert an array-like object to a `nanoarrow.Array`, optionally attaching a `Schema` that can be used to interpret its contents. This is currently only implemented for pyarrow objects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "array = na.array(pa.array([\"one\", \"two\", \"three\", None]))"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Like the `Schema`, you can inspect an `Array` by extracting fields individually:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "4\n",
+ "1\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(array.length)\n",
+ "print(array.null_count)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "...and parse the `Array`/`Schema` combination into a view whose contents are more readily accessible."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[array([7], dtype=uint8),\n",
+ " array([ 0, 3, 6, 11, 11], dtype=int32),\n",
+ " array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],\n",
+ " dtype='|S1')]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "view = array.view()\n",
+ "[np.array(buffer) for buffer in view.buffers]"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Like the `Schema`, you can allocate an empty one and access its address with `_addr()` to pass to other array-exporting functions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "array = na.Array.allocate(na.Schema.allocate())\n",
+ "pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr())\n",
+ "array.length"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Array streams\n",
+ "\n",
+ "You can use `nanoarrow.array_stream()` to convert an object representing a sequence of `Array`s with a common `Schema` to a `nanoarrow.ArrayStream`. This is currently only implemented for pyarrow objects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pa_array_child = pa.array([1, 2, 3], pa.int32())\n",
+ "pa_array = pa.record_batch([pa_array_child], names=[\"some_column\"])\n",
+ "reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])\n",
+ "array_stream = na.array_stream(reader)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can pull the next array from the stream using `.get_next()` or use it like an iterator. The `.get_next()` method will return `None` when there are no more arrays in the stream."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "struct<some_column: int32>\n",
+ "3\n",
+ "True\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(array_stream.get_schema())\n",
+ "\n",
+ "for array in array_stream:\n",
+ " print(array.length)\n",
+ "\n",
+ "print(array_stream.get_next() is None)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also get the address of a freshly-allocated stream to pass to a suitable exporting function:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "struct<some_column: int32>"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "array_stream = na.ArrayStream.allocate()\n",
+ "reader._export_to_c(array_stream._addr())\n",
+ "array_stream.get_schema()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Development\n",
+ "\n",
+ "Python bindings for nanoarrow are managed with [setuptools](https://setuptools.pypa.io/en/latest/index.html).\n",
+ "This means you can build the project using:\n",
+ "\n",
+ "```shell\n",
+ "git clone https://github.com/apache/arrow-nanoarrow.git\n",
+ "cd arrow-nanoarrow/python\n",
+ "pip install -e .\n",
+ "```\n",
+ "\n",
+ "Tests use [pytest](https://docs.pytest.org/):\n",
+ "\n",
+ "```shell\n",
+ "# Install dependencies\n",
+ "pip install -e .[test]\n",
+ "\n",
+ "# Run tests\n",
+ "pytest -vvx\n",
+ "```"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/README.md b/python/README.md
index 701896b..db898d2 100644
--- a/python/README.md
+++ b/python/README.md
@@ -17,28 +17,196 @@
under the License.
-->
+<!-- Render with jupyter nbconvert --to markdown README.ipynb -->
+
# nanoarrow for Python
-Python bindings for nanoarrow.
-## Building
+The nanoarrow Python package provides bindings to the nanoarrow C library. Like
+the nanoarrow C library, it provides tools to facilitate the use of the
+[Arrow C Data](https://arrow.apache.org/docs/format/CDataInterface.html)
+and [Arrow C Stream](https://arrow.apache.org/docs/format/CStreamInterface.html)
+interfaces.
+
+## Installation
+
+Python bindings for nanoarrow are not yet available on PyPI. You can install via
+URL (requires a C compiler):
+
+```bash
+python -m pip install "https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python"
+```
+
+If you can import the namespace, you're good to go!
+
+
+```python
+import nanoarrow as na
+```
+
+## Example
+
+The Arrow C Data and Arrow C Stream interfaces are comprised of three structures: the `ArrowSchema` which represents a data type of an array, the `ArrowArray` which represents the values of an array, and an `ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common `ArrowSchema`. All three can be wrapped by Python objects using the nanoarrow Python package.
+
+### Schemas
+
+Use `nanoarrow.schema()` to convert a data type-like object to an `ArrowSchema`. This is currently only implemented for pyarrow objects.
+
+
+```python
+import pyarrow as pa
+schema = na.schema(pa.decimal128(10, 3))
+```
+
+You can extract the fields of a `Schema` object one at a time or parse it into a view to extract deserialized parameters.
+
+
+```python
+print(schema.format)
+print(schema.view().decimal_precision)
+print(schema.view().decimal_scale)
+```
+
+ d:10,3
+ 10
+ 3
+
+
+The `nanoarrow.schema()` helper is currently only implemented for pyarrow objects. If your data type has an `_export_to_c()`-like function, you can get the address of a freshly-allocated `ArrowSchema` as well:
+
+
+```python
+schema = na.Schema.allocate()
+pa.int32()._export_to_c(schema._addr())
+schema.view().type
+```
+
+
+
+
+ 'int32'
+
+
+
+The `Schema` object cleans up after itself: when the object is deleted, the underlying `ArrowSchema` is released.
+
+### Arrays
+
+You can use `nanoarrow.array()` to convert an array-like object to a `nanoarrow.Array`, optionally attaching a `Schema` that can be used to interpret its contents. This is currently only implemented for pyarrow objects.
+
+
+```python
+array = na.array(pa.array(["one", "two", "three", None]))
+```
+
+Like the `Schema`, you can inspect an `Array` by extracting fields individually:
+
+
+```python
+print(array.length)
+print(array.null_count)
+```
+
+ 4
+ 1
+
+
+...and parse the `Array`/`Schema` combination into a view whose contents are more readily accessible.
-Python libraries are managed with [setuptools][setuptools]. In general, that
-means all projects can be built as follows:
+
+```python
+import numpy as np
+view = array.view()
+[np.array(buffer) for buffer in view.buffers]
+```
+
+
+
+
+ [array([7], dtype=uint8),
+ array([ 0, 3, 6, 11, 11], dtype=int32),
+ array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
+ dtype='|S1')]
+
+
+
+Like the `Schema`, you can allocate an empty one and access its address with `_addr()` to pass to other array-exporting functions.
+
+
+```python
+array = na.Array.allocate(na.Schema.allocate())
+pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr())
+array.length
+```
+
+
+
+
+ 3
+
+
+
+### Array streams
+
+You can use `nanoarrow.array_stream()` to convert an object representing a sequence of `Array`s with a common `Schema` to a `nanoarrow.ArrayStream`. This is currently only implemented for pyarrow objects.
+
+
+```python
+pa_array_child = pa.array([1, 2, 3], pa.int32())
+pa_array = pa.record_batch([pa_array_child], names=["some_column"])
+reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])
+array_stream = na.array_stream(reader)
+```
+
+You can pull the next array from the stream using `.get_next()` or use it like an iterator. The `.get_next()` method will return `None` when there are no more arrays in the stream.
+
+
+```python
+print(array_stream.get_schema())
+
+for array in array_stream:
+ print(array.length)
+
+print(array_stream.get_next() is None)
+```
+
+ struct<some_column: int32>
+ 3
+ True
+
+
+You can also get the address of a freshly-allocated stream to pass to a suitable exporting function:
+
+
+```python
+array_stream = na.ArrayStream.allocate()
+reader._export_to_c(array_stream._addr())
+array_stream.get_schema()
+```
+
+
+
+
+ struct<some_column: int32>
+
+
+
+## Development
+
+Python bindings for nanoarrow are managed with [setuptools](https://setuptools.pypa.io/en/latest/index.html).
+This means you can build the project using:
```shell
-$ cd python
-$ pip install -e .
+git clone https://github.com/apache/arrow-nanoarrow.git
+cd arrow-nanoarrow/python
+pip install -e .
```
-Tests use [pytest][pytest]:
+Tests use [pytest](https://docs.pytest.org/):
```shell
# Install dependencies
-$ pip install -e .[test]
+pip install -e .[test]
# Run tests
-$ pytest -vvx
+pytest -vvx
```
-
-[pytest]: https://docs.pytest.org/
-[setuptools]: https://setuptools.pypa.io/en/latest/index.html
\ No newline at end of file
diff --git a/python/bootstrap.py b/python/bootstrap.py
new file mode 100644
index 0000000..39b4fd9
--- /dev/null
+++ b/python/bootstrap.py
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+import os
+import shutil
+
+# Generate the nanoarrow_c.pxd file used by the Cython extension
+class NanoarrowPxdGenerator:
+ # Regex-based translator from a C header to a Cython .pxd file: it picks
+ # out struct/union/enum definitions and function declarations whose names
+ # start with "Arrow" and writes them inside a `cdef extern from` block.
+
+ def __init__(self):
+ self._define_regexes()
+
+ # Read the C header at file_in, remove // comments, convert every matched
+ # type and function declaration, and write the result to file_out. The
+ # output is opened in binary mode and every piece is encoded as UTF-8.
+ def generate_nanoarrow_pxd(self, file_in, file_out):
+ file_in_name = os.path.basename(file_in)
+
+ # Read the nanoarrow.h header
+ content = None
+ with open(file_in, 'r') as input:
+ content = input.read()
+
+ # Strip comments
+ content = self.re_comment.sub('', content)
+
+ # Find types and function definitions
+ types = self._find_types(content)
+ func_defs = self._find_func_defs(content)
+
+ # Make corresponding cython definitions
+ types_cython = [self._type_to_cython(t, ' ') for t in types]
+ func_defs_cython = [self._func_def_to_cython(d, ' ') for d in func_defs]
+
+ # Unindent the header
+ header = self.re_newline_plus_indent.sub('\n', self._pxd_header())
+
+ # Write nanoarrow_c.pxd
+ with open(file_out, 'wb') as output:
+ output.write(header.encode('UTF-8'))
+
+ # Only the base name of the header is used in the extern declaration
+ output.write(f'\ncdef extern from "{file_in_name}" nogil:\n'.encode("UTF-8"))
+
+ # A few things we add in manually
+ output.write(b'\n')
+ output.write(b' ctypedef int ArrowErrorCode\n')
+ output.write(b' cdef int NANOARROW_OK\n')
+ output.write(b'\n')
+
+ for type in types_cython:
+ output.write(type.encode('UTF-8'))
+ output.write(b'\n\n')
+
+ for func_def in func_defs_cython:
+ output.write(func_def.encode('UTF-8'))
+ output.write(b'\n')
+
+ # Compile every pattern once and store it on the instance.
+ def _define_regexes(self):
+ # Matches // line comments only; /* ... */ comments are left in place
+ self.re_comment = re.compile(r'\s*//[^\n]*')
+ # A struct/union/enum whose tag starts with "Arrow"; the captured body
+ # stops at the first "}", so it cannot itself contain a closing brace
+ self.re_type = re.compile(r'(?P<type>struct|union|enum) (?P<name>Arrow[^ ]+) {(?P<body>[^}]*)}')
+ # A declaration that starts on a new line and ends in ");". The name must
+ # be "Arrow" followed by ASCII letters only (no digits or underscores)
+ self.re_func_def = re.compile(r'\n(static inline )?(?P<const>const )?(struct|enum )?(?P<return_type>[A-Za-z0-9_*]+) (?P<name>Arrow[A-Za-z]+)\((?P<args>[^\)]*)\);')
+ # "struct ArrowFoo" / "union ArrowFoo" / "enum ArrowFoo" used as a type
+ # reference; group 2 is the bare name
+ self.re_tagged_type = re.compile(r'(?P<type>struct|union|enum) (?P<name>Arrow[A-Za-z]+)')
+ self.re_struct_delim = re.compile(r';\s*')
+ self.re_enum_delim = re.compile(r',\s*')
+ # Not referenced by any method of this class; _func_def_to_cython uses
+ # an inline re.sub(r'\s+', ...) instead
+ self.re_whitespace = re.compile(r'\s+')
+ self.re_newline_plus_indent = re.compile(r'\n +')
+
+ # Same substitution that generate_nanoarrow_pxd performs inline; this
+ # helper is not called from within the class.
+ def _strip_comments(self, content):
+ return self.re_comment.sub('', content)
+
+ # Return one dict per re_type match with the keys type, name and body.
+ def _find_types(self, content):
+ return [m.groupdict() for m in self.re_type.finditer(content)]
+
+ # Return one dict per re_func_def match with the keys const, return_type,
+ # name and args (the unnamed groups are not included by groupdict()).
+ def _find_func_defs(self, content):
+ return [m.groupdict() for m in self.re_func_def.finditer(content)]
+
+ # Render a dict from _find_types as a Cython "<type> <name>:" block with
+ # one member per line, each prefixed by indent.
+ def _type_to_cython(self, t, indent=''):
+ type = t['type']
+ name = t['name']
+ # \2 is the name group: "struct ArrowFoo x" in the body becomes "ArrowFoo x"
+ body = self.re_tagged_type.sub(r'\2', t['body'].strip())
+ # Enum items are comma-separated; struct and union members end in ";".
+ # Empty fragments left over from the split are dropped.
+ if type == 'enum':
+ items = [item for item in self.re_enum_delim.split(body) if item]
+ else:
+ items = [item for item in self.re_struct_delim.split(body) if item]
+
+ # The leading '' makes the joined text start with a newline + indent
+ cython_body = f'\n{indent} '.join([''] + items)
+ return f'{indent}{type} {name}:{cython_body}'
+
+ # Render a dict from _find_func_defs as a one-line Cython declaration.
+ # A struct/enum keyword before the return type is matched by an unnamed
+ # group of re_func_def and therefore does not appear in the output.
+ def _func_def_to_cython(self, d, indent=''):
+ return_type = d['return_type'].strip()
+ if d['const']:
+ return_type = 'const ' + return_type
+ name = d['name']
+ # Collapse line breaks and repeated spaces inside the argument list
+ args = re.sub(r'\s+', ' ', d['args'].strip())
+ args = self.re_tagged_type.sub(r'\2', args)
+
+ # Cython doesn't do (void)
+ if args == 'void':
+ args = ''
+
+ return f'{indent}{return_type} {name}({args})'
+
+ # License header, language_level directive and the libc.stdint cimport for
+ # the generated file. The text is indented along with this source;
+ # generate_nanoarrow_pxd removes that with re_newline_plus_indent.
+ def _pxd_header(self):
+ return """
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+
+ # cython: language_level = 3
+
+ from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t
+ """
+
+# Runs cmake -DNANOARROW_BUNDLE=ON if cmake exists or copies nanoarrow.c/h
+# from ../dist if it does not. Running cmake is safer because it will sync
+# any changes from nanoarrow C library sources in the checkout but is not
+# strictly necessary for things like installing from GitHub.
+def copy_or_generate_nanoarrow_c():
+ this_wd = os.getcwd()
+ this_dir = os.path.abspath(os.path.dirname(__file__))
+ source_dir = os.path.dirname(this_dir)
+
+ maybe_nanoarrow_h = os.path.join(this_dir, 'nanoarrow/nanoarrow.h')
+ maybe_nanoarrow_c = os.path.join(this_dir, 'nanoarrow/nanoarrow.c')
+ for f in (maybe_nanoarrow_c, maybe_nanoarrow_h):
+ if os.path.exists(f):
+ os.unlink(f)
+
+ is_cmake_dir = 'CMakeLists.txt' in os.listdir(source_dir)
+ is_in_nanoarrow_repo = 'nanoarrow.h' in os.listdir(os.path.join(source_dir, 'src', 'nanoarrow'))
+ has_cmake = os.system('cmake --version') == 0
+ build_dir = os.path.join(this_dir, '_cmake')
+
+ if has_cmake and is_cmake_dir and is_in_nanoarrow_repo:
+ try:
+ os.mkdir(build_dir)
+ os.chdir(build_dir)
+ os.system(f'cmake ../.. -DNANOARROW_BUNDLE=ON -DNANOARROW_NAMESPACE=PythonPkg')
+ os.system(f'cmake --install . --prefix=../nanoarrow')
+ finally:
+ if os.path.exists(build_dir):
+ # Can fail on Windows with permission issues
+ try:
+ shutil.rmtree(build_dir)
+ except Exception as e:
+ print(f'Failed to remove _cmake temp directory: {str(e)}')
+ os.chdir(this_wd)
+
+ elif is_in_nanoarrow_repo:
+ shutil.copyfile()
+ else:
+ raise ValueError('Attempt to build source distribution outside the nanoarrow repo')
+
+ if not os.path.exists(os.path.join(this_dir, 'nanoarrow/nanoarrow.h')):
+ raise ValueError('Attempt to vendor nanoarrow.c/h failed')
+
+ maybe_nanoarrow_hpp = os.path.join(this_dir, 'nanoarrow/nanoarrow.hpp')
+ if os.path.exists(maybe_nanoarrow_hpp):
+ os.unlink(maybe_nanoarrow_hpp)
+
+# Runs the pxd generator with some information about the file name
+def generate_nanoarrow_pxd():
+ this_dir = os.path.abspath(os.path.dirname(__file__))
+ maybe_nanoarrow_h = os.path.join(this_dir, 'nanoarrow/nanoarrow.h')
+ maybe_nanoarrow_pxd = os.path.join(this_dir, 'nanoarrow/nanoarrow_c.pxd')
+
+ NanoarrowPxdGenerator().generate_nanoarrow_pxd(
+ maybe_nanoarrow_h,
+ maybe_nanoarrow_pxd
+ )
+
+if __name__ == '__main__':
+ # Order matters: the .pxd generator reads nanoarrow/nanoarrow.h, which
+ # the first call deletes and re-creates.
+ copy_or_generate_nanoarrow_c()
+ generate_nanoarrow_pxd()
diff --git a/python/src/nanoarrow/__init__.py b/python/nanoarrow/__init__.py
similarity index 87%
rename from python/src/nanoarrow/__init__.py
rename to python/nanoarrow/__init__.py
index 1586e60..bb43726 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/nanoarrow/__init__.py
@@ -15,6 +15,5 @@
# specific language governing permissions and limitations
# under the License.
-from ._lib import ( # noqa: F401
- as_numpy_array,
-)
+from ._lib import c_version, Schema, Array, ArrayView, ArrayStream
+from .lib import schema, array, array_stream
diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx
new file mode 100644
index 0000000..b5210e3
--- /dev/null
+++ b/python/nanoarrow/_lib.pyx
@@ -0,0 +1,903 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: language_level = 3
+# cython: linetrace=True
+
+"""Low-level nanoarrow Python bindings
+
+This Cython extension provides low-level Python wrappers around the
+Arrow C Data and Arrow C Stream interface structs. In general, there
+is one wrapper per C struct and pointer validity is managed by keeping
+strong references to Python objects. These wrappers are intended to
+be literal and stay close to the structure definitions.
+"""
+
+from libc.stdint cimport uintptr_t, int64_t
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+from cpython.bytes cimport PyBytes_FromStringAndSize
+from cpython cimport Py_buffer
+from nanoarrow_c cimport *
+
+def c_version():
+    """Get the version of the nanoarrow C library as a Python string
+    """
+    cdef const char* version = ArrowNanoarrowVersion()
+    return version.decode("UTF-8")
+
+
+cdef class SchemaHolder:
+    """Owner of the memory backing a single ArrowSchema
+
+    The struct is embedded in this object and starts out with a NULL
+    release callback. If a release callback has been populated by the
+    time this object is deallocated, that callback is invoked.
+    """
+    cdef ArrowSchema c_schema
+
+    def __cinit__(self):
+        self.c_schema.release = NULL
+
+    def __dealloc__(self):
+        # Nothing to do when the schema was never populated or was
+        # already released
+        if self.c_schema.release == NULL:
+            return
+        self.c_schema.release(&self.c_schema)
+
+    def _addr(self):
+        cdef ArrowSchema* ptr = &self.c_schema
+        return <uintptr_t>ptr
+
+
+cdef class ArrayHolder:
+    """Owner of the memory backing a single ArrowArray
+
+    The struct is embedded in this object and starts out with a NULL
+    release callback. If a release callback has been populated by the
+    time this object is deallocated, that callback is invoked.
+    """
+    cdef ArrowArray c_array
+
+    def __cinit__(self):
+        self.c_array.release = NULL
+
+    def __dealloc__(self):
+        # Nothing to do when the array was never populated or was
+        # already released
+        if self.c_array.release == NULL:
+            return
+        self.c_array.release(&self.c_array)
+
+    def _addr(self):
+        cdef ArrowArray* ptr = &self.c_array
+        return <uintptr_t>ptr
+
+cdef class ArrayStreamHolder:
+    """Owner of the memory backing a single ArrowArrayStream
+
+    The struct is embedded in this object and starts out with a NULL
+    release callback. If a release callback has been populated by the
+    time this object is deallocated, that callback is invoked.
+    """
+    cdef ArrowArrayStream c_array_stream
+
+    def __cinit__(self):
+        self.c_array_stream.release = NULL
+
+    def __dealloc__(self):
+        # Nothing to do when the stream was never populated or was
+        # already released
+        if self.c_array_stream.release == NULL:
+            return
+        self.c_array_stream.release(&self.c_array_stream)
+
+    def _addr(self):
+        cdef ArrowArrayStream* ptr = &self.c_array_stream
+        return <uintptr_t>ptr
+
+
+cdef class ArrayViewHolder:
+    """Owner of the memory backing a single ArrowArrayView
+
+    The struct is embedded in this object. It is initialized as an
+    uninitialized-type view on construction and ArrowArrayViewReset()
+    is called on it when this object is deallocated.
+    """
+    cdef ArrowArrayView c_array_view
+
+    def __cinit__(self):
+        ArrowArrayViewInitFromType(&self.c_array_view, NANOARROW_TYPE_UNINITIALIZED)
+
+    def __dealloc__(self):
+        ArrowArrayViewReset(&self.c_array_view)
+
+    def _addr(self):
+        cdef ArrowArrayView* ptr = &self.c_array_view
+        return <uintptr_t>ptr
+
+
+class NanoarrowException(RuntimeError):
+    """Error raised when a nanoarrow C call or stream callback fails
+
+    The failing call is described by three components, all of which are
+    stored on the exception: `what` (a label for the call that failed),
+    `code` (the error code that was returned) and `message` (optional
+    extra detail; the empty string when there is none). The RuntimeError
+    message is formatted from these components.
+    """
+
+    def __init__(self, what, code, message=""):
+        self.what = what
+        self.code = code
+        self.message = message
+
+        # The detail is only appended when there is some
+        text = f"{self.what} failed ({self.code})"
+        if self.message != "":
+            text = f"{text}: {self.message}"
+
+        super().__init__(text)
+
+
+cdef class Error:
+    """Holder for an ArrowError struct
+
+    An ArrowError can be passed (by address) to nanoarrow functions that
+    may report a detailed message on failure. This class embeds one such
+    struct, starts it out with an empty message, and provides helpers that
+    raise a NanoarrowException.
+    """
+    cdef ArrowError c_error
+
+    def __cinit__(self):
+        # Start with an empty (zero-length) message
+        self.c_error.message[0] = 0
+
+    def raise_message(self, what, code):
+        """Raise a NanoarrowException carrying the message held by this object
+        """
+        detail = self.c_error.message.decode("UTF-8")
+        raise NanoarrowException(what, code, detail)
+
+    @staticmethod
+    def raise_error(what, code):
+        """Raise a NanoarrowException that has no detail message
+        """
+        raise NanoarrowException(what, code, "")
+
+
+cdef class Schema:
+    """ArrowSchema wrapper
+
+    This class provides a user-facing interface to access the fields of
+    an ArrowSchema as defined in the Arrow C Data interface. These objects
+    are usually created using `nanoarrow.schema()`. This Python wrapper
+    allows access to schema fields but does not automatically deserialize
+    their content: use `.view()` to validate and deserialize the content
+    into a more easily inspectable object.
+
+    Field accessors raise RuntimeError if the underlying pointer is NULL
+    or the schema has been released.
+
+    Examples
+    --------
+
+    >>> import pyarrow as pa
+    >>> import nanoarrow as na
+    >>> schema = na.schema(pa.int32())
+    >>> schema.is_valid()
+    True
+    >>> schema.format
+    'i'
+    >>> schema.name
+    ''
+    >>> schema_view = schema.view()
+    >>> schema_view.type
+    'int32'
+    """
+    # Strong reference to the Python object that keeps _ptr valid
+    cdef object _base
+    cdef ArrowSchema* _ptr
+
+    @staticmethod
+    def allocate():
+        """Create a Schema backed by a freshly allocated SchemaHolder
+        """
+        base = SchemaHolder()
+        return Schema(base, base._addr())
+
+    def __cinit__(self, object base, uintptr_t addr):
+        # Store the object itself (a trailing comma here would wrap it
+        # in a 1-tuple)
+        self._base = base
+        self._ptr = <ArrowSchema*>addr
+
+    def _addr(self):
+        return <uintptr_t>self._ptr
+
+    def is_valid(self):
+        """True if the pointer is non-NULL and the schema is not released
+        """
+        return self._ptr != NULL and self._ptr.release != NULL
+
+    def _assert_valid(self):
+        if self._ptr == NULL:
+            raise RuntimeError("schema is NULL")
+        if self._ptr.release == NULL:
+            raise RuntimeError("schema is released")
+
+    def __repr__(self):
+        # The first call only computes the required length; the second
+        # call writes into a buffer with room for the terminator.
+        cdef int64_t n_chars = ArrowSchemaToString(self._ptr, NULL, 0, True)
+        cdef char* out = <char*>PyMem_Malloc(n_chars + 1)
+        if not out:
+            raise MemoryError()
+
+        # Free the buffer even if decoding raises
+        try:
+            ArrowSchemaToString(self._ptr, out, n_chars + 1, True)
+            return out.decode("UTF-8")
+        finally:
+            PyMem_Free(out)
+
+    @property
+    def format(self):
+        """The format string, or None if the C field is NULL
+        """
+        self._assert_valid()
+        if self._ptr.format != NULL:
+            return self._ptr.format.decode("UTF-8")
+        else:
+            return None
+
+    @property
+    def name(self):
+        """The field name, or None if the C field is NULL
+        """
+        self._assert_valid()
+        if self._ptr.name != NULL:
+            return self._ptr.name.decode("UTF-8")
+        else:
+            return None
+
+    @property
+    def flags(self):
+        """The integer flags field
+        """
+        # Checked like every other accessor so that a NULL or released
+        # schema raises instead of dereferencing an unusable pointer
+        self._assert_valid()
+        return self._ptr.flags
+
+    @property
+    def metadata(self):
+        """A SchemaMetadata wrapper, or None if the C field is NULL
+        """
+        self._assert_valid()
+        if self._ptr.metadata != NULL:
+            return SchemaMetadata(self, <uintptr_t>self._ptr.metadata)
+        else:
+            return None
+
+    @property
+    def children(self):
+        """A lazily-resolved sequence of child Schema objects
+        """
+        self._assert_valid()
+        return SchemaChildren(self)
+
+    @property
+    def dictionary(self):
+        """The dictionary Schema, or None if the C field is NULL
+        """
+        self._assert_valid()
+        if self._ptr.dictionary != NULL:
+            return Schema(self, <uintptr_t>self._ptr.dictionary)
+        else:
+            return None
+
+    def view(self):
+        """Validate this schema and return a SchemaView of its content
+
+        Raises NanoarrowException if ArrowSchemaViewInit() fails.
+        """
+        self._assert_valid()
+        cdef SchemaView schema_view = SchemaView()
+        cdef Error error = Error()
+        cdef int result = ArrowSchemaViewInit(&schema_view._schema_view, self._ptr, &error.c_error)
+        if result != NANOARROW_OK:
+            error.raise_message("ArrowSchemaViewInit()", result)
+
+        return schema_view
+
+
+cdef class SchemaView:
+    """ArrowSchemaView wrapper
+
+    The ArrowSchemaView is a nanoarrow C library structure that facilitates
+    access to the deserialized content of an ArrowSchema (e.g., parameter
+    values for parameterized types). This wrapper extends that facility to
+    Python. Properties that only apply to some types return None for
+    every other type.
+
+    Examples
+    --------
+
+    >>> import pyarrow as pa
+    >>> import nanoarrow as na
+    >>> schema = na.schema(pa.decimal128(10, 3))
+    >>> schema_view = schema.view()
+    >>> schema_view.type
+    'decimal128'
+    >>> schema_view.decimal_bitwidth
+    128
+    >>> schema_view.decimal_precision
+    10
+    >>> schema_view.decimal_scale
+    3
+    """
+    cdef ArrowSchemaView _schema_view
+
+    # Types for which fixed_size is reported
+    _fixed_size_types = (
+        NANOARROW_TYPE_FIXED_SIZE_LIST,
+        NANOARROW_TYPE_FIXED_SIZE_BINARY
+    )
+
+    # Types for which the decimal_* properties are reported
+    _decimal_types = (
+        NANOARROW_TYPE_DECIMAL128,
+        NANOARROW_TYPE_DECIMAL256
+    )
+
+    # Types for which time_unit is reported
+    _time_unit_types = (
+        NANOARROW_TYPE_TIME32,
+        NANOARROW_TYPE_TIME64,
+        NANOARROW_TYPE_DURATION,
+        NANOARROW_TYPE_TIMESTAMP
+    )
+
+    # Types for which union_type_ids is reported
+    _union_types = (
+        NANOARROW_TYPE_DENSE_UNION,
+        NANOARROW_TYPE_SPARSE_UNION
+    )
+
+    def __cinit__(self):
+        self._schema_view.type = NANOARROW_TYPE_UNINITIALIZED
+        self._schema_view.storage_type = NANOARROW_TYPE_UNINITIALIZED
+
+    @property
+    def type(self):
+        cdef const char* type_str = ArrowTypeString(self._schema_view.type)
+        if type_str == NULL:
+            return None
+        return type_str.decode('UTF-8')
+
+    @property
+    def storage_type(self):
+        cdef const char* type_str = ArrowTypeString(self._schema_view.storage_type)
+        if type_str == NULL:
+            return None
+        return type_str.decode('UTF-8')
+
+    @property
+    def fixed_size(self):
+        if self._schema_view.type not in SchemaView._fixed_size_types:
+            return None
+        return self._schema_view.fixed_size
+
+    @property
+    def decimal_bitwidth(self):
+        if self._schema_view.type not in SchemaView._decimal_types:
+            return None
+        return self._schema_view.decimal_bitwidth
+
+    @property
+    def decimal_precision(self):
+        if self._schema_view.type not in SchemaView._decimal_types:
+            return None
+        return self._schema_view.decimal_precision
+
+    @property
+    def decimal_scale(self):
+        if self._schema_view.type not in SchemaView._decimal_types:
+            return None
+        return self._schema_view.decimal_scale
+
+    @property
+    def time_unit(self):
+        if self._schema_view.type not in SchemaView._time_unit_types:
+            return None
+        return ArrowTimeUnitString(self._schema_view.time_unit).decode('UTF-8')
+
+    @property
+    def timezone(self):
+        if self._schema_view.type != NANOARROW_TYPE_TIMESTAMP:
+            return None
+        return self._schema_view.timezone.decode('UTF_8')
+
+    @property
+    def union_type_ids(self):
+        if self._schema_view.type not in SchemaView._union_types:
+            return None
+
+        # The ids are stored as a comma-separated string; they are parsed
+        # lazily by the returned generator
+        type_ids_str = self._schema_view.union_type_ids.decode('UTF-8').split(',')
+        return (int(type_id) for type_id in type_ids_str)
+
+    @property
+    def extension_name(self):
+        if self._schema_view.extension_name.data == NULL:
+            return None
+
+        name_bytes = PyBytes_FromStringAndSize(
+            self._schema_view.extension_name.data,
+            self._schema_view.extension_name.size_bytes
+        )
+        return name_bytes.decode('UTF-8')
+
+    @property
+    def extension_metadata(self):
+        # Gated on the presence of an extension name (not on the
+        # metadata pointer itself)
+        if self._schema_view.extension_name.data == NULL:
+            return None
+
+        return PyBytes_FromStringAndSize(
+            self._schema_view.extension_metadata.data,
+            self._schema_view.extension_metadata.size_bytes
+        )
+
+cdef class Array:
+    """ArrowArray wrapper
+
+    This class provides a user-facing interface to access the fields of
+    an ArrowArray as defined in the Arrow C Data interface, holding an
+    optional reference to a Schema that can be used to safely deserialize
+    the content. These objects are usually created using `nanoarrow.array()`.
+    This Python wrapper allows access to array fields but does not
+    automatically deserialize their content: use `.view()` to validate and
+    deserialize the content into a more easily inspectable object.
+
+    Field accessors raise RuntimeError if the underlying pointer is NULL
+    or the array has been released.
+
+    Examples
+    --------
+
+    >>> import pyarrow as pa
+    >>> import numpy as np
+    >>> import nanoarrow as na
+    >>> array = na.array(pa.array(["one", "two", "three", None]))
+    >>> array.length
+    4
+    >>> array.null_count
+    1
+    >>> array_view = array.view()
+    """
+    # Strong reference to the Python object that keeps _ptr valid
+    cdef object _base
+    cdef ArrowArray* _ptr
+    cdef Schema _schema
+
+    @staticmethod
+    def allocate(Schema schema):
+        """Create an Array backed by a freshly allocated ArrayHolder
+        """
+        base = ArrayHolder()
+        return Array(base, base._addr(), schema)
+
+    def __cinit__(self, object base, uintptr_t addr, Schema schema):
+        # Store the object itself (a trailing comma here would wrap it
+        # in a 1-tuple)
+        self._base = base
+        self._ptr = <ArrowArray*>addr
+        self._schema = schema
+
+    def _addr(self):
+        return <uintptr_t>self._ptr
+
+    def is_valid(self):
+        """True if the pointer is non-NULL and the array is not released
+        """
+        return self._ptr != NULL and self._ptr.release != NULL
+
+    def _assert_valid(self):
+        if self._ptr == NULL:
+            raise RuntimeError("Array is NULL")
+        if self._ptr.release == NULL:
+            raise RuntimeError("Array is released")
+
+    @property
+    def schema(self):
+        return self._schema
+
+    @property
+    def length(self):
+        self._assert_valid()
+        return self._ptr.length
+
+    @property
+    def offset(self):
+        self._assert_valid()
+        return self._ptr.offset
+
+    @property
+    def null_count(self):
+        self._assert_valid()
+        return self._ptr.null_count
+
+    @property
+    def buffers(self):
+        """The buffer addresses as a tuple of integers
+        """
+        self._assert_valid()
+        return tuple(<uintptr_t>self._ptr.buffers[i] for i in range(self._ptr.n_buffers))
+
+    @property
+    def children(self):
+        """A lazily-resolved sequence of child Array objects
+        """
+        # ArrayChildren reads n_children from the pointer on construction
+        self._assert_valid()
+        return ArrayChildren(self)
+
+    @property
+    def dictionary(self):
+        """The dictionary Array, or None if the C field is NULL
+        """
+        self._assert_valid()
+        if self._ptr.dictionary != NULL:
+            return Array(self, <uintptr_t>self._ptr.dictionary, self._schema.dictionary)
+        else:
+            return None
+
+    def view(self):
+        """Validate this array against its schema and return an ArrayView
+
+        Raises RuntimeError if this array or its schema is not usable and
+        NanoarrowException if initializing the view fails.
+        """
+        self._assert_valid()
+        # The schema reference is optional, but it is required here
+        if self._schema is None:
+            raise RuntimeError("Array has no schema")
+        self._schema._assert_valid()
+
+        cdef ArrayViewHolder holder = ArrayViewHolder()
+
+        cdef Error error = Error()
+        cdef int result = ArrowArrayViewInitFromSchema(&holder.c_array_view,
+                                                       self._schema._ptr, &error.c_error)
+        if result != NANOARROW_OK:
+            error.raise_message("ArrowArrayViewInitFromSchema()", result)
+
+        result = ArrowArrayViewSetArray(&holder.c_array_view, self._ptr, &error.c_error)
+        if result != NANOARROW_OK:
+            error.raise_message("ArrowArrayViewSetArray()", result)
+
+        # The view keeps this Array alive as its base buffer
+        return ArrayView(holder, holder._addr(), self._schema, self)
+
+
+cdef class ArrayView:
+    """ArrowArrayView wrapper
+
+    The ArrowArrayView is a nanoarrow C library structure that provides
+    structured access to buffer addresses, buffer sizes, and buffer
+    data types. The buffer data is usually propagated from an ArrowArray
+    but can also be propagated from other types of objects (e.g., serialized
+    IPC). The offset and length of this view are independent of its parent
+    (i.e., this object can also represent a slice of its parent).
+
+    Examples
+    --------
+
+    >>> import pyarrow as pa
+    >>> import numpy as np
+    >>> import nanoarrow as na
+    >>> array_view = na.array(pa.array(["one", "two", "three", None])).view()
+    >>> np.array(array_view.buffers[1])
+    array([ 0,  3,  6, 11, 11], dtype=int32)
+    >>> np.array(array_view.buffers[2])
+    array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
+          dtype='|S1')
+    """
+    # Strong reference to the Python object that keeps _ptr valid
+    cdef object _base
+    cdef ArrowArrayView* _ptr
+    cdef Schema _schema
+    # Strong reference to the object supplied as the source of the buffers
+    cdef object _base_buffer
+
+    def __cinit__(self, object base, uintptr_t addr, Schema schema, object base_buffer):
+        self._base = base
+        self._ptr = <ArrowArrayView*>addr
+        self._schema = schema
+        self._base_buffer = base_buffer
+
+    @property
+    def schema(self):
+        return self._schema
+
+    @property
+    def length(self):
+        return self._ptr.length
+
+    @property
+    def offset(self):
+        return self._ptr.offset
+
+    @property
+    def null_count(self):
+        return self._ptr.null_count
+
+    @property
+    def children(self):
+        return ArrayViewChildren(self)
+
+    @property
+    def buffers(self):
+        return ArrayViewBuffers(self)
+
+    @property
+    def dictionary(self):
+        """The dictionary ArrayView, or None if the C field is NULL
+        """
+        if self._ptr.dictionary != NULL:
+            return ArrayView(
+                self,
+                <uintptr_t>self._ptr.dictionary,
+                self._schema.dictionary,
+                None
+            )
+
+        return None
+
+
+cdef class SchemaChildren:
+    """Sequence-like accessor for the children of a Schema
+
+    Child wrappers are created on each access; the number of children is
+    read from the parent once, when this object is constructed.
+    """
+    cdef Schema _parent
+    cdef int64_t _length
+
+    def __cinit__(self, Schema parent):
+        self._parent = parent
+        self._length = parent._ptr.n_children
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, k):
+        index = int(k)
+        if not (0 <= index < self._length):
+            raise IndexError(f"{index} out of range [0, {self._length})")
+
+        # The parent is passed as the base so it outlives the child wrapper
+        return Schema(self._parent, self._child_addr(index))
+
+    cdef _child_addr(self, int64_t i):
+        return <uintptr_t>self._parent._ptr.children[i]
+
+
+cdef class SchemaMetadata:
+    """Accessor for the key/value pairs of a Schema.metadata string
+
+    The metadata is parsed on demand: the reader is re-initialized every
+    time the length is requested or an iteration starts.
+    """
+
+    # Strong reference to the object that owns the metadata memory
+    cdef object _parent
+    cdef const char* _metadata
+    cdef ArrowMetadataReader _reader
+
+    def __cinit__(self, object parent, uintptr_t ptr):
+        self._parent = parent
+        self._metadata = <const char*>ptr
+
+    def _init_reader(self):
+        cdef int result = ArrowMetadataReaderInit(&self._reader, self._metadata)
+        if result == NANOARROW_OK:
+            return
+
+        Error.raise_error("ArrowMetadataReaderInit()", result)
+
+    def __len__(self):
+        self._init_reader()
+        return self._reader.remaining_keys
+
+    def __iter__(self):
+        """Yield (key, value) pairs: keys as str, values as bytes
+        """
+        cdef ArrowStringView key
+        cdef ArrowStringView value
+        self._init_reader()
+        while self._reader.remaining_keys > 0:
+            ArrowMetadataReaderRead(&self._reader, &key, &value)
+            key_bytes = PyBytes_FromStringAndSize(key.data, key.size_bytes)
+            value_bytes = PyBytes_FromStringAndSize(value.data, value.size_bytes)
+            yield key_bytes.decode('UTF-8'), value_bytes
+
+
+cdef class ArrayChildren:
+    """Sequence-like accessor for the children of an Array
+
+    Child wrappers are created on each access; the number of children is
+    read from the parent once, when this object is constructed.
+    """
+    cdef Array _parent
+    cdef int64_t _length
+
+    def __cinit__(self, Array parent):
+        self._parent = parent
+        self._length = parent._ptr.n_children
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, k):
+        index = int(k)
+        if not (0 <= index < self._length):
+            raise IndexError(f"{index} out of range [0, {self._length})")
+
+        # The child is paired with the child schema at the same position
+        child_schema = self._parent.schema.children[index]
+        return Array(self._parent, self._child_addr(index), child_schema)
+
+    cdef _child_addr(self, int64_t i):
+        return <uintptr_t>self._parent._ptr.children[i]
+
+
+cdef class ArrayViewChildren:
+    """Sequence-like accessor for the children of an ArrayView
+
+    Child wrappers are created on each access; the number of children is
+    read from the parent once, when this object is constructed.
+    """
+    cdef ArrayView _parent
+    cdef int64_t _length
+
+    def __cinit__(self, ArrayView parent):
+        self._parent = parent
+        self._length = parent._ptr.n_children
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, k):
+        index = int(k)
+        if not (0 <= index < self._length):
+            raise IndexError(f"{index} out of range [0, {self._length})")
+
+        # The child is paired with the child schema at the same position
+        # and is created without a base buffer
+        child_schema = self._parent._schema.children[index]
+        return ArrayView(self._parent, self._child_addr(index), child_schema, None)
+
+    cdef _child_addr(self, int64_t i):
+        return <uintptr_t>self._parent._ptr.children[i]
+
+
+cdef class BufferView:
+    """Wrapper for Array buffer content
+
+    This object is a Python wrapper around a buffer held by an Array.
+    It implements the Python buffer protocol and is best accessed through
+    another implementor (e.g., `np.array(array_view.buffers[1])`). Note that
+    this buffer content does not apply any parent offset.
+
+    The buffer is exported as read-only and one-dimensional.
+    """
+    # Strong reference to the Python object that keeps _ptr valid
+    cdef object _base
+    cdef ArrowBufferView* _ptr
+    cdef ArrowBufferType _buffer_type
+    cdef ArrowType _buffer_data_type
+    cdef Py_ssize_t _element_size_bits
+    # Number of items and item size in bytes; stored as members because
+    # the exported Py_buffer points at them
+    cdef Py_ssize_t _shape
+    cdef Py_ssize_t _strides
+
+    def __cinit__(self, object base, uintptr_t addr,
+                 ArrowBufferType buffer_type, ArrowType buffer_data_type,
+                 Py_ssize_t element_size_bits):
+        self._base = base
+        self._ptr = <ArrowBufferView*>addr
+        self._buffer_type = buffer_type
+        self._buffer_data_type = buffer_data_type
+        self._element_size_bits = element_size_bits
+        self._strides = self._item_size()
+        self._shape = self._ptr.size_bytes // self._strides
+
+    cdef Py_ssize_t _item_size(self):
+        """Item size in bytes used when exporting this buffer
+        """
+        # Boolean, string and binary buffers are exported byte-by-byte
+        if self._buffer_data_type == NANOARROW_TYPE_BOOL:
+            return 1
+        elif self._buffer_data_type == NANOARROW_TYPE_STRING:
+            return 1
+        elif self._buffer_data_type == NANOARROW_TYPE_BINARY:
+            return 1
+        else:
+            return self._element_size_bits // 8
+
+    cdef const char* _get_format(self):
+        """Buffer protocol format string for the buffer data type
+        """
+        if self._buffer_data_type == NANOARROW_TYPE_INT8:
+            return "b"
+        elif self._buffer_data_type == NANOARROW_TYPE_UINT8:
+            return "B"
+        elif self._buffer_data_type == NANOARROW_TYPE_INT16:
+            return "h"
+        elif self._buffer_data_type == NANOARROW_TYPE_UINT16:
+            return "H"
+        elif self._buffer_data_type == NANOARROW_TYPE_INT32:
+            return "i"
+        elif self._buffer_data_type == NANOARROW_TYPE_UINT32:
+            return "I"
+        elif self._buffer_data_type == NANOARROW_TYPE_INT64:
+            # "q" (long long) rather than "l" (long): a C long is only
+            # 4 bytes on some platforms (e.g., Windows), which would not
+            # match the 8-byte item size
+            return "q"
+        elif self._buffer_data_type == NANOARROW_TYPE_UINT64:
+            return "Q"
+        elif self._buffer_data_type == NANOARROW_TYPE_FLOAT:
+            return "f"
+        elif self._buffer_data_type == NANOARROW_TYPE_DOUBLE:
+            return "d"
+        elif self._buffer_data_type == NANOARROW_TYPE_STRING:
+            return "c"
+        else:
+            # Everything else is exported as unsigned bytes
+            return "B"
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        buffer.buf = <void*>self._ptr.data.data
+        buffer.format = self._get_format()
+        buffer.internal = NULL
+        buffer.itemsize = self._strides
+        buffer.len = self._ptr.size_bytes
+        buffer.ndim = 1
+        buffer.obj = self
+        buffer.readonly = 1
+        buffer.shape = &self._shape
+        buffer.strides = &self._strides
+        buffer.suboffsets = NULL
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        # Nothing was allocated in __getbuffer__
+        pass
+
+
+cdef class ArrayViewBuffers:
+    """Sequence-like accessor for the buffers of an ArrayView
+
+    The length is the number of leading layout slots (out of three) whose
+    buffer type is not NANOARROW_BUFFER_TYPE_NONE. BufferView wrappers are
+    created on each access.
+    """
+    cdef ArrayView _array_view
+    cdef int64_t _length
+
+    def __cinit__(self, ArrayView array_view):
+        self._array_view = array_view
+
+        # Count slots up to (not including) the first one of type NONE
+        self._length = 0
+        for i in range(3):
+            if self._array_view._ptr.layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE:
+                break
+            self._length = i + 1
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, k):
+        k = int(k)
+        if not (0 <= k < self._length):
+            raise IndexError(f"{k} out of range [0, {self._length})")
+
+        cdef ArrowBufferView* buffer_view = &(self._array_view._ptr.buffer_views[k])
+
+        # A buffer without data is reported as None
+        if buffer_view.data.data != NULL:
+            return BufferView(
+                self._array_view,
+                <uintptr_t>buffer_view,
+                self._array_view._ptr.layout.buffer_type[k],
+                self._array_view._ptr.layout.buffer_data_type[k],
+                self._array_view._ptr.layout.element_size_bits[k]
+            )
+
+        return None
+
+
+cdef class ArrayStream:
+    """ArrowArrayStream wrapper
+
+    This class provides a user-facing interface to access the fields of
+    an ArrowArrayStream as defined in the Arrow C Stream interface.
+    These objects are usually created using `nanoarrow.array_stream()`.
+
+    Examples
+    --------
+
+    >>> import pyarrow as pa
+    >>> import nanoarrow as na
+    >>> pa_column = pa.array([1, 2, 3], pa.int32())
+    >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
+    >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema, [pa_batch])
+    >>> array_stream = na.array_stream(pa_reader)
+    >>> array_stream.get_schema()
+    struct<col1: int32>
+    >>> array_stream.get_next().length
+    3
+    >>> array_stream.get_next() is None
+    Traceback (most recent call last):
+      ...
+    StopIteration
+    """
+    # Strong reference to the Python object that keeps _ptr valid
+    cdef object _base
+    cdef ArrowArrayStream* _ptr
+    # Schema attached to every Array returned by get_next()
+    cdef object _cached_schema
+
+    def __cinit__(self, object base, uintptr_t addr):
+        self._base = base
+        self._ptr = <ArrowArrayStream*>addr
+        self._cached_schema = None
+
+    def _addr(self):
+        return <uintptr_t>self._ptr
+
+    def is_valid(self):
+        """True if the pointer is non-NULL and the stream is not released
+        """
+        return self._ptr != NULL and self._ptr.release != NULL
+
+    def _assert_valid(self):
+        if self._ptr == NULL:
+            raise RuntimeError("array stream pointer is NULL")
+        if self._ptr.release == NULL:
+            raise RuntimeError("array stream is released")
+
+    def _get_schema(self, Schema schema):
+        """Populate `schema` via the stream's get_schema() callback
+
+        On success the populated schema also becomes the cached schema.
+        Raises NanoarrowException if the callback reports an error.
+        """
+        self._assert_valid()
+        cdef int code = self._ptr.get_schema(self._ptr, schema._ptr)
+        cdef const char* message = NULL
+        if code != NANOARROW_OK:
+            message = self._ptr.get_last_error(self._ptr)
+            if message != NULL:
+                raise NanoarrowException(
+                    "ArrowArrayStream::get_schema()",
+                    code,
+                    message.decode("UTF-8")
+                )
+            else:
+                raise NanoarrowException("ArrowArrayStream::get_schema()", code)
+
+        self._cached_schema = schema
+
+    def get_schema(self):
+        """Get the schema associated with this stream
+        """
+        out = Schema.allocate()
+        self._get_schema(out)
+        return out
+
+    def get_next(self):
+        """Get the next Array from this stream
+
+        Raises StopIteration when there are no more arrays in this stream
+        and NanoarrowException if the stream reports an error.
+        """
+        self._assert_valid()
+
+        # We return a reference to the same Python object for each
+        # Array that is returned. This is independent of get_schema(),
+        # which is guaranteed to call the C object's callback and
+        # faithfully pass on the returned value. The cache is only
+        # assigned (by _get_schema()) once the schema was successfully
+        # populated, so a failed attempt is retried on the next call.
+        if self._cached_schema is None:
+            self._get_schema(Schema.allocate())
+
+        cdef Array array = Array.allocate(self._cached_schema)
+        cdef int code = self._ptr.get_next(self._ptr, array._ptr)
+        cdef const char* message = NULL
+        if code != NANOARROW_OK:
+            message = self._ptr.get_last_error(self._ptr)
+            if message != NULL:
+                raise NanoarrowException(
+                    "ArrowArrayStream::get_next()",
+                    code,
+                    message.decode("UTF-8")
+                )
+            else:
+                raise NanoarrowException("ArrowArrayStream::get_next()", code)
+
+        # An array that was left unpopulated marks the end of the stream
+        if not array.is_valid():
+            raise StopIteration()
+        else:
+            return array
+
+    def __iter__(self):
+        """Iterate over the remaining arrays in this stream
+        """
+        # get_next() signals the end of the stream with StopIteration.
+        # That exception must not escape from a generator body (PEP 479
+        # turns it into a RuntimeError), so it is translated into a
+        # normal return here.
+        while True:
+            try:
+                array = self.get_next()
+            except StopIteration:
+                return
+            yield array
+
+    @staticmethod
+    def allocate():
+        """Create an ArrayStream backed by a freshly allocated ArrayStreamHolder
+        """
+        base = ArrayStreamHolder()
+        return ArrayStream(base, base._addr())
diff --git a/python/nanoarrow/lib.py b/python/nanoarrow/lib.py
new file mode 100644
index 0000000..a3c27e7
--- /dev/null
+++ b/python/nanoarrow/lib.py
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from ._lib import Schema, Array, ArrayStream
+
+
+def schema(obj):
+    """Convert an object to a nanoarrow Schema
+
+    A Schema is returned as-is. Any other object must provide an
+    `_export_to_c()` method, which is called with the address of a newly
+    allocated Schema; otherwise a TypeError is raised.
+    """
+    if isinstance(obj, Schema):
+        return obj
+
+    if not hasattr(obj, "_export_to_c"):
+        raise TypeError(
+            f"Can't convert object of type {type(obj).__name__} to nanoarrow.Schema"
+        )
+
+    # Not particularly safe because _export_to_c() could be exporting an
+    # array, schema, or array_stream. The ideal
+    # solution here would be something like __arrow_c_schema__()
+    result = Schema.allocate()
+    obj._export_to_c(result._addr())
+    return result
+
+
+def array(obj):
+    """Convert an object to a nanoarrow Array
+
+    An Array is returned as-is. Any other object must provide an
+    `_export_to_c()` method, which is called with the addresses of a newly
+    allocated Array and of its newly allocated Schema; otherwise a
+    TypeError is raised.
+    """
+    if isinstance(obj, Array):
+        return obj
+
+    if not hasattr(obj, "_export_to_c"):
+        raise TypeError(
+            f"Can't convert object of type {type(obj).__name__} to nanoarrow.Array"
+        )
+
+    # Somewhat safe because calling _export_to_c() with two arguments will
+    # not fail with a crash (but will fail with a confusing error). The ideal
+    # solution here would be something like __arrow_c_array__()
+    result = Array.allocate(Schema.allocate())
+    obj._export_to_c(result._addr(), result.schema._addr())
+    return result
+
+
+def array_stream(obj):
+    """Convert an object to a nanoarrow ArrayStream
+
+    An ArrayStream is returned as-is. Any other object must provide an
+    `_export_to_c()` method, which is called with the address of a newly
+    allocated ArrayStream; otherwise a TypeError is raised.
+    """
+    # Pass through objects that already are an ArrayStream (checking for
+    # Schema here would hand a Schema back to a caller expecting a stream)
+    if isinstance(obj, ArrayStream):
+        return obj
+
+    # Not particularly safe because _export_to_c() could be exporting an
+    # array, schema, or array_stream. The ideal
+    # solution here would be something like __arrow_c_array_stream__()
+    if hasattr(obj, "_export_to_c"):
+        out = ArrayStream.allocate()
+        obj._export_to_c(out._addr())
+        return out
+    else:
+        raise TypeError(
+            f"Can't convert object of type {type(obj).__name__} to nanoarrow.ArrowArrayStream"
+        )
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 1cc2c17..743cebe 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -19,14 +19,13 @@
[project]
name = "nanoarrow"
version = "1.0.0-alpha0"
-description = ""
+description = "Python bindings to the nanoarrow C library"
authors = [{name = "Apache Arrow Developers", email = "dev@arrow.apache.org"}]
license = {text = "Apache-2.0"}
requires-python = ">=3.8"
-dependencies = ["numpy"]
[project.optional-dependencies]
-test = ["pyarrow", "pytest"]
+test = ["pyarrow", "pytest", "numpy"]
[project.urls]
homepage = "https://arrow.apache.org"
@@ -36,7 +35,6 @@ repository = "https://github.com/apache/arrow-nanoarrow"
requires = [
"setuptools >= 61.0.0",
"setuptools-scm",
- "Cython",
- "oldest-supported-numpy",
+ "Cython"
]
build-backend = "setuptools.build_meta"
diff --git a/python/setup.py b/python/setup.py
index f6f7efb..4222cd8 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -17,33 +17,43 @@
# specific language governing permissions and limitations
# under the License.
-import shutil
-from pathlib import Path
-
+import os
+import sys
+import subprocess
from setuptools import Extension, setup
-import numpy as np
+# Run bootstrap.py to run cmake generating a fresh bundle based on this
+# checkout or copy from ../dist if the caller doesn't have cmake available.
+# Note that bootstrap.py won't exist if building from sdist.
+this_dir = os.path.dirname(__file__)
+bootstrap_py = os.path.join(this_dir, "bootstrap.py")
+if os.path.exists(bootstrap_py):
+ subprocess.run([sys.executable, bootstrap_py])
-# setuptools gets confused by relative paths that extend above the project root
-target = Path(__file__).parent / "src" / "nanoarrow"
-shutil.copy(
- Path(__file__).parent / "../dist/nanoarrow.c", target / "nanoarrow.c"
-)
-shutil.copy(
- Path(__file__).parent / "../dist/nanoarrow.h", target / "nanoarrow.h"
-)
+# Set some extra flags for compiling with coverage support
+if os.getenv("NANOARROW_PYTHON_COVERAGE") == "1":
+ coverage_compile_args = ["--coverage"]
+ coverage_link_args = ["--coverage"]
+ coverage_define_macros = [("CYTHON_TRACE", 1)]
+else:
+ coverage_compile_args = []
+ coverage_link_args = []
+ coverage_define_macros = []
setup(
ext_modules=[
Extension(
name="nanoarrow._lib",
- include_dirs=[np.get_include(), "src/nanoarrow"],
- language="c++",
+ include_dirs=["nanoarrow"],
+ language="c",
sources=[
- "src/nanoarrow/_lib.pyx",
- "src/nanoarrow/nanoarrow.c",
+ "nanoarrow/_lib.pyx",
+ "nanoarrow/nanoarrow.c",
],
+ extra_compile_args=coverage_compile_args,
+ extra_link_args=coverage_link_args,
+ define_macros=coverage_define_macros,
)
]
)
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
deleted file mode 100644
index a6b4da1..0000000
--- a/python/src/nanoarrow/_lib.pyx
+++ /dev/null
@@ -1,86 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: language_level = 3
-
-"""Low-level nanoarrow Python bindings."""
-
-from libc.stdint cimport uint8_t, uintptr_t
-
-from nanoarrow_c cimport *
-
-import numpy as np
-cimport numpy as cnp
-
-cnp.import_array()
-
-
-cdef dict _numpy_type_map = {
- NANOARROW_TYPE_UINT8: cnp.NPY_UINT8,
- NANOARROW_TYPE_INT8: cnp.NPY_INT8,
- NANOARROW_TYPE_UINT16: cnp.NPY_UINT16,
- NANOARROW_TYPE_INT16: cnp.NPY_INT16,
- NANOARROW_TYPE_UINT32: cnp.NPY_UINT32,
- NANOARROW_TYPE_INT32: cnp.NPY_INT32,
- NANOARROW_TYPE_UINT64: cnp.NPY_UINT64,
- NANOARROW_TYPE_INT64: cnp.NPY_INT64,
- NANOARROW_TYPE_HALF_FLOAT: cnp.NPY_FLOAT16,
- NANOARROW_TYPE_FLOAT: cnp.NPY_FLOAT32,
- NANOARROW_TYPE_DOUBLE: cnp.NPY_FLOAT64,
-}
-
-
-def as_numpy_array(arr):
- cdef ArrowSchema schema
- cdef ArrowArray array
- cdef ArrowArrayView array_view
- cdef ArrowError error
-
- arr._export_to_c(<uintptr_t> &array, <uintptr_t> &schema)
- ArrowArrayViewInitFromSchema(&array_view, &schema, &error)
-
- # primitive arrays have DATA as the second buffer
- if array_view.layout.buffer_type[1] != NANOARROW_BUFFER_TYPE_DATA:
- raise TypeError("Cannot convert a non-primitive array")
-
- # disallow nulls for this method
- if array.null_count > 0:
- raise ValueError("Cannot convert array with nulls")
- elif array.null_count < 0:
- # not yet computed
- if array_view.layout.buffer_type[0] == NANOARROW_BUFFER_TYPE_VALIDITY:
- if array.buffers[0] != NULL:
- null_count = ArrowBitCountSet(
- <const uint8_t *>array.buffers[0], array.offset, array.length
- )
- if null_count > 0:
- raise ValueError("Cannot convert array with nulls")
-
- cdef int type_num
- if array_view.storage_type in _numpy_type_map:
- type_num = _numpy_type_map[array_view.storage_type]
- else:
- raise NotImplementedError(array_view.storage_type)
-
- cdef cnp.npy_intp dims[1]
- dims[0] = array.length
- cdef cnp.ndarray result = cnp.PyArray_New(
- np.ndarray, 1, dims, type_num, NULL, <void *> array.buffers[1], -1, 0, <object>NULL
- )
- # TODO set base
-
- return result
diff --git a/python/src/nanoarrow/nanoarrow_c.pxd b/python/src/nanoarrow/nanoarrow_c.pxd
deleted file mode 100644
index 440f449..0000000
--- a/python/src/nanoarrow/nanoarrow_c.pxd
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: language_level = 3
-
-from libc.stdint cimport int64_t, int8_t, uint8_t
-
-
-cdef extern from "nanoarrow.h":
- struct ArrowSchema:
- const char* format
- int64_t n_children
- void (*release)(ArrowSchema*)
-
- struct ArrowArray:
- int64_t length
- int64_t null_count
- int64_t offset
- const void** buffers
- void (*release)(ArrowArray*)
-
- struct ArrowArrayStream:
- int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out)
-
- ctypedef int ArrowErrorCode
-
- enum ArrowType:
- NANOARROW_TYPE_UNINITIALIZED = 0
- NANOARROW_TYPE_NA = 1
- NANOARROW_TYPE_BOOL
- NANOARROW_TYPE_UINT8
- NANOARROW_TYPE_INT8
- NANOARROW_TYPE_UINT16
- NANOARROW_TYPE_INT16
- NANOARROW_TYPE_UINT32
- NANOARROW_TYPE_INT32
- NANOARROW_TYPE_UINT64
- NANOARROW_TYPE_INT64
- NANOARROW_TYPE_HALF_FLOAT
- NANOARROW_TYPE_FLOAT
- NANOARROW_TYPE_DOUBLE
- NANOARROW_TYPE_STRING
- NANOARROW_TYPE_BINARY
- NANOARROW_TYPE_FIXED_SIZE_BINARY
- NANOARROW_TYPE_DATE32
- NANOARROW_TYPE_DATE64
- NANOARROW_TYPE_TIMESTAMP
- NANOARROW_TYPE_TIME32
- NANOARROW_TYPE_TIME64
- NANOARROW_TYPE_INTERVAL_MONTHS
- NANOARROW_TYPE_INTERVAL_DAY_TIME
- NANOARROW_TYPE_DECIMAL128
- NANOARROW_TYPE_DECIMAL256
- NANOARROW_TYPE_LIST
- NANOARROW_TYPE_STRUCT
- NANOARROW_TYPE_SPARSE_UNION
- NANOARROW_TYPE_DENSE_UNION
- NANOARROW_TYPE_DICTIONARY
- NANOARROW_TYPE_MAP
- NANOARROW_TYPE_EXTENSION
- NANOARROW_TYPE_FIXED_SIZE_LIST
- NANOARROW_TYPE_DURATION
- NANOARROW_TYPE_LARGE_STRING
- NANOARROW_TYPE_LARGE_BINARY
- NANOARROW_TYPE_LARGE_LIST
- NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
-
- enum ArrowBufferType:
- NANOARROW_BUFFER_TYPE_NONE
- NANOARROW_BUFFER_TYPE_VALIDITY
- NANOARROW_BUFFER_TYPE_TYPE_ID
- NANOARROW_BUFFER_TYPE_UNION_OFFSET
- NANOARROW_BUFFER_TYPE_DATA_OFFSET
- NANOARROW_BUFFER_TYPE_DATA
-
- struct ArrowError:
- pass
-
- const char* ArrowErrorMessage(ArrowError* error)
-
- struct ArrowLayout:
- ArrowBufferType buffer_type[3]
- int64_t element_size_bits[3]
- int64_t child_size_elements
-
- cdef union buffer_data:
- const void* data
- const int8_t* as_int8
- const uint8_t* as_uint8
-
- struct ArrowBufferView:
- buffer_data data
- int64_t size_bytes
-
- struct ArrowBuffer:
- uint8_t* data
- int64_t size_bytes
-
- struct ArrowBitmap:
- ArrowBuffer buffer
- int64_t size_bits
-
- struct ArrowArrayView:
- ArrowArray* array
- ArrowType storage_type
- ArrowLayout layout
- ArrowBufferView buffer_views[3]
- int64_t n_children
- ArrowArrayView** children
-
- ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error)
- ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error)
- int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to)
diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py
index fd76534..3162274 100644
--- a/python/tests/test_nanoarrow.py
+++ b/python/tests/test_nanoarrow.py
@@ -1,27 +1,293 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+import re
import numpy as np
import pyarrow as pa
+import pytest
-import nanoarrow
+import nanoarrow as na
-import pytest
+def test_c_version():
+ # The version string reported by na.c_version() must look like
+ # MAJOR.MINOR.PATCH, optionally followed by a "-SNAPSHOT" suffix.
+ re_version = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$")
+ assert re_version.match(na.c_version()) is not None
+
+
+def test_schema_helper():
+ # An object that is already an na.Schema is returned as-is (same identity).
+ schema = na.Schema.allocate()
+ assert na.schema(schema) is schema
+
+ # A pyarrow data type is converted to an na.Schema.
+ schema = na.schema(pa.null())
+ assert isinstance(schema, na.Schema)
+
+ # Input that cannot be converted (None) is rejected with TypeError.
+ with pytest.raises(TypeError):
+ na.schema(None)
+
+
+def test_array_helper():
+ array = na.Array.allocate(na.Schema.allocate())
+ assert na.array(array) is array
+
+ array = na.array(pa.array([], pa.null()))
+ assert isinstance(array, na.Array)
+
+ with pytest.raises(TypeError):
+ na.schema(None)
+
+
+def test_schema_basic():
+ # A freshly allocated schema is not valid and its repr says so.
+ schema = na.Schema.allocate()
+ assert schema.is_valid() is False
+ assert repr(schema) == "[invalid: schema is released]"
+
+ # A pyarrow schema with a single int32 field, wrapped as an na.Schema.
+ schema = na.schema(pa.schema([pa.field("some_name", pa.int32())]))
+
+ # Top-level fields: struct format string, no flags, no metadata, one child.
+ assert schema.format == "+s"
+ assert schema.flags == 0
+ assert schema.metadata is None
+ assert len(schema.children) == 1
+ assert schema.children[0].format == "i"
+ assert schema.children[0].name == "some_name"
+ assert repr(schema.children[0]) == "int32"
+ assert schema.dictionary is None
+
+ # Indexing past the last child raises IndexError.
+ with pytest.raises(IndexError):
+ schema.children[1]
+
+
+def test_schema_dictionary():
+ # For a dictionary type with int32 indices and utf8 values, the schema's own
+ # format is "i" and schema.dictionary's format is "u".
+ schema = na.schema(pa.dictionary(pa.int32(), pa.utf8()))
+ assert schema.format == "i"
+ assert schema.dictionary.format == "u"
+
+
+def test_schema_metadata():
+ # Attach two key/value pairs to a pyarrow field and wrap it as a schema.
+ meta = {"key1": "value1", "key2": "value2"}
+ schema = na.schema(pa.field("", pa.int32(), metadata=meta))
+
+ assert len(schema.metadata) == 2
+
+ # Iterating schema.metadata yields (key, value) pairs; the keys compare equal
+ # to str and the values compare equal to bytes.
+ meta2 = {k: v for k, v in schema.metadata}
+ assert list(meta2.keys()) == ["key1", "key2"]
+ assert list(meta2.values()) == [b"value1", b"value2"]
+
+
+def test_schema_view():
+ # Requesting a view of a freshly allocated schema raises RuntimeError.
+ schema = na.Schema.allocate()
+ with pytest.raises(RuntimeError):
+ schema.view()
+
+ # For a plain int32 schema, type and storage type are both "int32".
+ schema = na.schema(pa.int32())
+ view = schema.view()
+ assert view.type == "int32"
+ assert view.storage_type == "int32"
+
+ # None of the type-specific parameters apply to int32, so each is None.
+ assert view.fixed_size is None
+ assert view.decimal_bitwidth is None
+ assert view.decimal_scale is None
+ assert view.time_unit is None
+ assert view.timezone is None
+ assert view.union_type_ids is None
+ assert view.extension_name is None
+ assert view.extension_metadata is None
+
+
+def test_schema_view_extra_params():
+ schema = na.schema(pa.binary(12))
+ view = schema.view()
+ assert view.fixed_size == 12
+
+ schema = na.schema(pa.list_(pa.int32(), 12))
+ assert view.fixed_size == 12
+
+ schema = na.schema(pa.decimal128(10, 3))
+ view = schema.view()
+ assert view.decimal_bitwidth == 128
+ assert view.decimal_precision == 10
+ assert view.decimal_scale == 3
+
+ schema = na.schema(pa.decimal256(10, 3))
+ view = schema.view()
+ assert view.decimal_bitwidth == 256
+ assert view.decimal_precision == 10
+ assert view.decimal_scale == 3
+
+ schema = na.schema(pa.duration("us"))
+ view = schema.view()
+ assert view.time_unit == "us"
+
+ schema = na.schema(pa.timestamp("us", tz="America/Halifax"))
+ view = schema.view()
+ assert view.type == "timestamp"
+ assert view.storage_type == "int64"
+ assert view.time_unit == "us"
+ assert view.timezone == "America/Halifax"
+
+ meta = {
+ "ARROW:extension:name": "some_name",
+ "ARROW:extension:metadata": "some_metadata",
+ }
+ schema = na.schema(pa.field("", pa.int32(), metadata=meta))
+ view = schema.view()
+ assert view.extension_name == "some_name"
+ assert view.extension_metadata == b"some_metadata"
+
+
+def test_array():
+ # Wrap a three-element int32 pyarrow array (no nulls) and check the basic
+ # fields exposed by the na.Array wrapper.
+ array = na.array(pa.array([1, 2, 3], pa.int32()))
+ assert array.is_valid() is True
+ assert array.length == 3
+ assert array.offset == 0
+ assert array.null_count == 0
+ assert len(array.buffers) == 2
+ # NOTE(review): presumably array.buffers holds raw buffer addresses, so 0
+ # here would mean the validity buffer is a NULL pointer -- confirm.
+ assert array.buffers[0] == 0
+ assert len(array.children) == 0
+ assert array.dictionary is None
+
+ # Indexing a child that does not exist raises IndexError.
+ with pytest.raises(IndexError):
+ array.children[1]
+
+
+def test_array_view():
+ array = na.array(pa.array([1, 2, 3], pa.int32()))
+ view = array.view()
+
+ # The view exposes the very same schema object as the array.
+ assert view.schema is array.schema
+
+ # Buffer 1 can be wrapped in a memoryview; copying it out gives 12 bytes
+ # (three 4-byte int32 values).
+ data_buffer = memoryview(view.buffers[1])
+ data_buffer_copy = bytes(data_buffer)
+ assert len(data_buffer_copy) == 12
+
+ # The expected byte pattern depends on the byte order of the host.
+ if sys.byteorder == "little":
+ assert data_buffer_copy == b"\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00"
+ else:
+ assert data_buffer_copy == b"\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03"
+
+ # Indexing a child that does not exist raises IndexError.
+ with pytest.raises(IndexError):
+ view.children[1]
+
+
+def test_array_view_recursive():
+ # A record batch with a single int32 column is used as a struct-like array.
+ pa_array_child = pa.array([1, 2, 3], pa.int32())
+ pa_array = pa.record_batch([pa_array_child], names=["some_column"])
+
+ array = na.array(pa_array)
+
+ # The parent has struct format ("+s"), the batch length, and one child.
+ assert array.schema.format == "+s"
+ assert array.length == 3
+ assert len(array.children) == 1
+
+ # The child array's schema has the same address as the parent schema's child.
+ assert array.children[0].schema.format == "i"
+ assert array.children[0].length == 3
+ assert array.children[0].schema._addr() == array.schema.children[0]._addr()
+
+ # The parent view has one buffer and one child, and shares the schema address.
+ view = array.view()
+ assert len(view.buffers) == 1
+ assert len(view.children) == 1
+ assert view.schema._addr() == array.schema._addr()
+
+ # The child view has two buffers and the same schema address as both the
+ # parent schema's child and the child array's schema.
+ assert len(view.children[0].buffers) == 2
+ assert view.children[0].schema._addr() == array.schema.children[0]._addr()
+ assert view.children[0].schema._addr() == array.children[0].schema._addr()
+
+
+def test_array_view_dictionary():
+ # Dictionary-encoded array with int32 indices and utf8 values.
+ pa_array = pa.array(["a", "b", "b"], pa.dictionary(pa.int32(), pa.utf8()))
+ array = na.array(pa_array)
+
+ # The array's own schema has format "i"; its dictionary's schema has "u".
+ assert array.schema.format == "i"
+ assert array.dictionary.schema.format == "u"
+
+ # The array's view has two buffers; the dictionary's view has three.
+ view = array.view()
+ assert len(view.buffers) == 2
+ assert len(view.dictionary.buffers) == 3
+
+
+def test_buffers_data():
+ # Pairs of (pyarrow type, numpy scalar instance passed as the dtype argument).
+ data_types = [
+ (pa.uint8(), np.uint8()),
+ (pa.int8(), np.int8()),
+ (pa.uint16(), np.uint16()),
+ (pa.int16(), np.int16()),
+ (pa.uint32(), np.uint32()),
+ (pa.int32(), np.int32()),
+ (pa.uint64(), np.uint64()),
+ (pa.int64(), np.int64()),
+ (pa.float32(), np.float32()),
+ (pa.float64(), np.float64()),
+ ]
+
+ # For every type, converting buffer 1 of the view with np.array() must give
+ # the same values as a numpy array built directly with the paired type.
+ for pa_type, np_type in data_types:
+ view = na.array(pa.array([0, 1, 2], pa_type)).view()
+ np.testing.assert_array_equal(
+ np.array(view.buffers[1]), np.array([0, 1, 2], np_type)
+ )
+
+
+def test_buffers_string():
+ view = na.array(pa.array(["a", "bc", "def"])).view()
+
+ # Buffer 0 is None for this input, which contains no nulls.
+ assert view.buffers[0] is None
+ # Buffer 1 holds the int32 offsets delimiting "a", "bc" and "def".
+ np.testing.assert_array_equal(
+ np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32())
+ )
+ # Buffer 2 holds the concatenated characters, compared as one-byte strings.
+ np.testing.assert_array_equal(
+ np.array(view.buffers[2]), np.array(list("abcdef"), dtype="|S1")
+ )
+
+
+def test_buffers_binary():
+ view = na.array(pa.array([b"a", b"bc", b"def"])).view()
+
+ # Buffer 0 is None for this input, which contains no nulls.
+ assert view.buffers[0] is None
+ # Buffer 1 holds the int32 offsets delimiting b"a", b"bc" and b"def".
+ np.testing.assert_array_equal(
+ np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32())
+ )
+ # Buffer 2 holds the concatenated bytes, compared as integer byte values.
+ np.testing.assert_array_equal(np.array(view.buffers[2]), np.array(list(b"abcdef")))
+
+
+def test_array_stream():
+ # A freshly allocated stream is not valid; both accessors raise RuntimeError.
+ array_stream = na.ArrayStream.allocate()
+ assert array_stream.is_valid() is False
+ with pytest.raises(RuntimeError):
+ array_stream.get_schema()
+ with pytest.raises(RuntimeError):
+ array_stream.get_next()
+
+ # Build a one-batch pyarrow RecordBatchReader and wrap it as an array stream.
+ pa_array_child = pa.array([1, 2, 3], pa.int32())
+ pa_array = pa.record_batch([pa_array_child], names=["some_column"])
+ reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])
+ array_stream = na.array_stream(reader)
-def test_as_numpy_array():
-
- arr = pa.array([1, 2, 3])
- result = nanoarrow.as_numpy_array(arr)
- expected = arr.to_numpy()
- np.testing.assert_array_equal(result, expected)
+ # The wrapped stream is valid and yields the single batch; a further
+ # get_next() call raises StopIteration.
+ assert array_stream.is_valid() is True
+ array = array_stream.get_next()
+ assert array.schema.children[0].name == "some_column"
+ with pytest.raises(StopIteration):
+ array_stream.get_next()
- arr = pa.array([1, 2, 3], pa.uint8())
- result = nanoarrow.as_numpy_array(arr)
- expected = arr.to_numpy()
- np.testing.assert_array_equal(result, expected)
- arr = pa.array([1, 2, None])
- with pytest.raises(ValueError, match="Cannot convert array with nulls"):
- nanoarrow.as_numpy_array(arr)
+def test_array_stream_iter():
+ # Build a one-batch pyarrow RecordBatchReader and wrap it as an array stream.
+ pa_array_child = pa.array([1, 2, 3], pa.int32())
+ pa_array = pa.record_batch([pa_array_child], names=["some_column"])
+ reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])
+ array_stream = na.array_stream(reader)
- arr = pa.array([[1], [2, 3]])
- with pytest.raises(TypeError, match="Cannot convert a non-primitive array"):
- nanoarrow.as_numpy_array(arr)
+ # Iterating the stream with list() collects exactly the one batch.
+ arrays = list(array_stream)
+ assert len(arrays) == 1
+ assert arrays[0].schema.children[0].name == "some_column"
diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h
index 9fb3cc1..2408a52 100644
--- a/src/nanoarrow/nanoarrow_types.h
+++ b/src/nanoarrow/nanoarrow_types.h
@@ -301,6 +301,8 @@ enum ArrowType {
/// \ingroup nanoarrow-utils
///
/// Returns NULL for invalid values for type
+static inline const char* ArrowTypeString(enum ArrowType type);
+
static inline const char* ArrowTypeString(enum ArrowType type) {
switch (type) {
case NANOARROW_TYPE_NA:
@@ -419,6 +421,8 @@ enum ArrowValidationLevel {
/// \ingroup nanoarrow-utils
///
/// Returns NULL for invalid values for time_unit
+static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit);
+
static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) {
switch (time_unit) {
case NANOARROW_TIME_UNIT_SECOND:
@@ -461,6 +465,8 @@ struct ArrowStringView {
/// \brief Return a view of a const C string
/// \ingroup nanoarrow-utils
+static inline struct ArrowStringView ArrowCharView(const char* value);
+
static inline struct ArrowStringView ArrowCharView(const char* value) {
struct ArrowStringView out;