You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/05/13 19:44:55 UTC
[4/4] arrow git commit: ARROW-819: Public Cython and C++ API in the
style of lxml, arrow::py::import_pyarrow method
ARROW-819: Public Cython and C++ API in the style of lxml, arrow::py::import_pyarrow method
I have been looking at LXML's approach to creating both a public Cython API and C++ API
https://github.com/lxml/lxml
While this may seem like a somewhat radical reorganization of the code, putting all of the main symbols in a single Cython extension makes generating a C++ API for them significantly simpler. By using `.pxi` files we can break the codebase into pieces as small as we like (as long as there are no circular dependencies). As a convenient side effect, the build times are shorter.
Author: Wes McKinney <we...@twosigma.com>
Closes #680 from wesm/ARROW-819 and squashes the following commits:
9e6ee246 [Wes McKinney] Fix up optional extensions
cff757de [Wes McKinney] Expose pyarrow C API in arrow/python/pyarrow.h
b39d19cd [Wes McKinney] Fix test suite. Move _config into lib
ff1b5e51 [Wes McKinney] Rename things a bit
d4a83912 [Wes McKinney] Reorganize Cython code in the style of lxml so make declaring a public C API easier
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/9e875a68
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/9e875a68
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/9e875a68
Branch: refs/heads/master
Commit: 9e875a6843b7bd155f7e10d011f5e8d25a47494c
Parents: 95ee96b
Author: Wes McKinney <we...@twosigma.com>
Authored: Sat May 13 15:44:43 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sat May 13 15:44:43 2017 -0400
----------------------------------------------------------------------
cpp/CMakeLists.txt | 4 +-
cpp/src/arrow/python/CMakeLists.txt | 2 +
cpp/src/arrow/python/pyarrow.cc | 75 ++
cpp/src/arrow/python/pyarrow.h | 55 +
cpp/src/arrow/python/pyarrow_api.h | 143 +++
python/CMakeLists.txt | 7 +-
python/pyarrow/__init__.pxd | 34 +
python/pyarrow/__init__.py | 112 +-
python/pyarrow/_array.pxd | 247 -----
python/pyarrow/_array.pyx | 1631 -----------------------------
python/pyarrow/_error.pxd | 20 -
python/pyarrow/_error.pyx | 70 --
python/pyarrow/_io.pxd | 50 -
python/pyarrow/_io.pyx | 1274 ----------------------
python/pyarrow/_jemalloc.pyx | 2 +-
python/pyarrow/_memory.pxd | 30 -
python/pyarrow/_memory.pyx | 58 -
python/pyarrow/_parquet.pyx | 22 +-
python/pyarrow/_table.pxd | 62 --
python/pyarrow/_table.pyx | 926 ----------------
python/pyarrow/array.pxi | 1549 +++++++++++++++++++++++++++
python/pyarrow/error.pxi | 70 ++
python/pyarrow/feather.py | 6 +-
python/pyarrow/filesystem.py | 14 +-
python/pyarrow/formatting.py | 4 +-
python/pyarrow/includes/libarrow.pxd | 61 +-
python/pyarrow/includes/pyarrow.pxd | 75 --
python/pyarrow/io.pxi | 1253 ++++++++++++++++++++++
python/pyarrow/ipc.py | 10 +-
python/pyarrow/lib.pxd | 337 ++++++
python/pyarrow/lib.pyx | 88 ++
python/pyarrow/memory.pxi | 55 +
python/pyarrow/parquet.py | 13 +-
python/pyarrow/public-api.pxi | 107 ++
python/pyarrow/table.pxi | 884 ++++++++++++++++
python/pyarrow/tests/test_feather.py | 2 +-
python/setup.py | 9 +-
37 files changed, 4809 insertions(+), 4552 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2146379..6b2ceec 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -852,7 +852,8 @@ if (UNIX)
)
FOREACH(item ${LINT_FILES})
- IF(NOT (item MATCHES "_generated.h"))
+ IF(NOT ((item MATCHES "_generated.h") OR
+ (item MATCHES "pyarrow_api.h")))
LIST(APPEND FILTERED_LINT_FILES ${item})
ENDIF()
ENDFOREACH(item ${LINT_FILES})
@@ -878,6 +879,7 @@ if (${CLANG_FORMAT_FOUND})
`find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h |
sed -e '/_generated/g' |
sed -e '/windows_compatibility.h/g' |
+ sed -e '/pyarrow_api.h/g' |
sed -e '/config.h/g' | # python/config.h
sed -e '/platform.h/g'` # python/platform.h
)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/cpp/src/arrow/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index c5cbc50..3085229 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -50,6 +50,7 @@ set(ARROW_PYTHON_SRCS
io.cc
numpy_convert.cc
pandas_convert.cc
+ pyarrow.cc
)
set(ARROW_PYTHON_SHARED_LINK_LIBS
@@ -90,6 +91,7 @@ install(FILES
numpy_interop.h
pandas_convert.h
platform.h
+ pyarrow.h
type_traits.h
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python")
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/cpp/src/arrow/python/pyarrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pyarrow.cc b/cpp/src/arrow/python/pyarrow.cc
new file mode 100644
index 0000000..56c0381
--- /dev/null
+++ b/cpp/src/arrow/python/pyarrow.cc
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/pyarrow.h"
+
+#include <memory>
+
+#include "arrow/array.h"
+#include "arrow/table.h"
+#include "arrow/tensor.h"
+#include "arrow/type.h"
+
+namespace {
+#include "arrow/python/pyarrow_api.h"
+}
+
+namespace arrow {
+namespace py {
+// Imports pyarrow.lib and binds the pyarrow_wrap_* pointers; 0 on success, -1 on error.
+int import_pyarrow() {
+ return ::import_pyarrow__lib();
+}
+// Each wrap_* below calls a pyarrow_wrap_* pointer that is null until import_pyarrow().
+PyObject* wrap_buffer(const std::shared_ptr<Buffer>& buffer) {
+ return ::pyarrow_wrap_buffer(buffer);
+}
+
+PyObject* wrap_data_type(const std::shared_ptr<DataType>& type) {
+ return ::pyarrow_wrap_data_type(type);
+}
+
+PyObject* wrap_field(const std::shared_ptr<Field>& field) {
+ return ::pyarrow_wrap_field(field);
+}
+
+PyObject* wrap_schema(const std::shared_ptr<Schema>& schema) {
+ return ::pyarrow_wrap_schema(schema);
+}
+
+PyObject* wrap_array(const std::shared_ptr<Array>& array) {
+ return ::pyarrow_wrap_array(array);
+}
+
+PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor) {
+ return ::pyarrow_wrap_tensor(tensor);
+}
+
+PyObject* wrap_column(const std::shared_ptr<Column>& column) {
+ return ::pyarrow_wrap_column(column);
+}
+
+PyObject* wrap_table(const std::shared_ptr<Table>& table) {
+ return ::pyarrow_wrap_table(table);
+}
+
+PyObject* wrap_record_batch(const std::shared_ptr<RecordBatch>& batch) {
+ return ::pyarrow_wrap_batch(batch);
+}
+
+} // namespace py
+} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/cpp/src/arrow/python/pyarrow.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h
new file mode 100644
index 0000000..7c618ce
--- /dev/null
+++ b/cpp/src/arrow/python/pyarrow.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_PYTHON_PYARROW_H
+#define ARROW_PYTHON_PYARROW_H
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class Buffer;
+class Column;
+class DataType;
+class Field;
+class RecordBatch;
+class Schema;
+class Table;
+class Tensor;
+
+namespace py {
+// Call import_pyarrow() before any wrap_*; they use pointers that it initializes.
+ARROW_EXPORT int import_pyarrow();
+ARROW_EXPORT PyObject* wrap_buffer(const std::shared_ptr<Buffer>& buffer);
+ARROW_EXPORT PyObject* wrap_data_type(const std::shared_ptr<DataType>& type);
+ARROW_EXPORT PyObject* wrap_field(const std::shared_ptr<Field>& field);
+ARROW_EXPORT PyObject* wrap_schema(const std::shared_ptr<Schema>& schema);
+ARROW_EXPORT PyObject* wrap_array(const std::shared_ptr<Array>& array);
+ARROW_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);
+ARROW_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
+ARROW_EXPORT PyObject* wrap_table(const std::shared_ptr<Table>& table);
+ARROW_EXPORT PyObject* wrap_record_batch(const std::shared_ptr<RecordBatch>& batch);
+
+} // namespace py
+} // namespace arrow
+
+#endif // ARROW_PYTHON_PYARROW_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/cpp/src/arrow/python/pyarrow_api.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pyarrow_api.h b/cpp/src/arrow/python/pyarrow_api.h
new file mode 100644
index 0000000..7b70844
--- /dev/null
+++ b/cpp/src/arrow/python/pyarrow_api.h
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// DO NOT EDIT THIS FILE. Update from pyarrow/lib_api.h after pyarrow build
+
+/* Generated by Cython 0.25.2 */
+
+#ifndef __PYX_HAVE_API__pyarrow__lib
+#define __PYX_HAVE_API__pyarrow__lib
+#include "Python.h"
+
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer> const &) = 0;
+#define pyarrow_wrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type)(std::shared_ptr< arrow::DataType> const &) = 0;
+#define pyarrow_wrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field)(std::shared_ptr< arrow::Field> const &) = 0;
+#define pyarrow_wrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema)(std::shared_ptr< arrow::Schema> const &) = 0;
+#define pyarrow_wrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array)(std::shared_ptr< arrow::Array> const &) = 0;
+#define pyarrow_wrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0;
+#define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column)(std::shared_ptr< arrow::Column> const &) = 0;
+#define pyarrow_wrap_column __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr< arrow::Table> const &) = 0;
+#define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr< arrow::RecordBatch> const &) = 0;
+#define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch
+#if !defined(__Pyx_PyIdentifier_FromString)
+#if PY_MAJOR_VERSION < 3
+ #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s)
+#else
+ #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s)
+#endif
+#endif
+
+#ifndef __PYX_HAVE_RT_ImportModule
+#define __PYX_HAVE_RT_ImportModule
+static PyObject *__Pyx_ImportModule(const char *name) {
+ PyObject *py_name = 0;
+ PyObject *py_module = 0;
+ py_name = __Pyx_PyIdentifier_FromString(name);
+ if (!py_name)
+ goto bad;
+ py_module = PyImport_Import(py_name);
+ Py_DECREF(py_name);
+ return py_module;
+bad:
+ Py_XDECREF(py_name);
+ return 0;
+}
+#endif
+
+#ifndef __PYX_HAVE_RT_ImportFunction
+#define __PYX_HAVE_RT_ImportFunction
+static int __Pyx_ImportFunction(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
+ PyObject *d = 0;
+ PyObject *cobj = 0;
+ union {
+ void (*fp)(void);
+ void *p;
+ } tmp;
+ d = PyObject_GetAttrString(module, (char *)"__pyx_capi__");
+ if (!d)
+ goto bad;
+ cobj = PyDict_GetItemString(d, funcname);
+ if (!cobj) {
+ PyErr_Format(PyExc_ImportError,
+ "%.200s does not export expected C function %.200s",
+ PyModule_GetName(module), funcname);
+ goto bad;
+ }
+#if PY_VERSION_HEX >= 0x02070000
+ if (!PyCapsule_IsValid(cobj, sig)) {
+ PyErr_Format(PyExc_TypeError,
+ "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
+ PyModule_GetName(module), funcname, sig, PyCapsule_GetName(cobj));
+ goto bad;
+ }
+ tmp.p = PyCapsule_GetPointer(cobj, sig);
+#else
+ {const char *desc, *s1, *s2;
+ desc = (const char *)PyCObject_GetDesc(cobj);
+ if (!desc)
+ goto bad;
+ s1 = desc; s2 = sig;
+ while (*s1 != '\0' && *s1 == *s2) { s1++; s2++; }
+ if (*s1 != *s2) {
+ PyErr_Format(PyExc_TypeError,
+ "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
+ PyModule_GetName(module), funcname, sig, desc);
+ goto bad;
+ }
+ tmp.p = PyCObject_AsVoidPtr(cobj);}
+#endif
+ *f = tmp.fp;
+ if (!(*f))
+ goto bad;
+ Py_DECREF(d);
+ return 0;
+bad:
+ Py_XDECREF(d);
+ return -1;
+}
+#endif
+
+
+static int import_pyarrow__lib(void) {
+ PyObject *module = 0;
+ module = __Pyx_ImportModule("pyarrow.lib");
+ if (!module) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type, "PyObject *(std::shared_ptr< arrow::DataType> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field, "PyObject *(std::shared_ptr< arrow::Field> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array, "PyObject *(std::shared_ptr< arrow::Array> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column, "PyObject *(std::shared_ptr< arrow::Column> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch> const &)") < 0) goto bad;
+ Py_DECREF(module); module = 0;
+ return 0;
+ bad:
+ Py_XDECREF(module);
+ return -1;
+}
+
+#endif /* !__PYX_HAVE_API__pyarrow__lib */
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index d828710..123dd5d 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -265,12 +265,7 @@ if (UNIX)
endif()
set(CYTHON_EXTENSIONS
- _array
- _config
- _error
- _io
- _memory
- _table
+ lib
)
set(LINK_LIBS
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/python/pyarrow/__init__.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd
new file mode 100644
index 0000000..4f43455
--- /dev/null
+++ b/python/pyarrow/__init__.pxd
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from libcpp.memory cimport shared_ptr
+from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType,
+ CField, CRecordBatch, CSchema,
+ CTable, CTensor)
+
+# wrap_batch keeps its Cython name; its C name is wrap_record_batch, as declared in pyarrow.h.
+cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
+ cdef int import_pyarrow() except -1
+ cdef object wrap_buffer(const shared_ptr[CBuffer]& buffer)
+ cdef object wrap_data_type(const shared_ptr[CDataType]& type)
+ cdef object wrap_field(const shared_ptr[CField]& field)
+ cdef object wrap_schema(const shared_ptr[CSchema]& schema)
+ cdef object wrap_array(const shared_ptr[CArray]& sp_array)
+ cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
+ cdef object wrap_column(const shared_ptr[CColumn]& ccolumn)
+ cdef object wrap_table(const shared_ptr[CTable]& ctable)
+ cdef object wrap_batch "arrow::py::wrap_record_batch"(const shared_ptr[CRecordBatch]& cbatch)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 4d8da9f..7d79811 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -25,53 +25,51 @@ except DistributionNotFound:
pass
-import pyarrow._config
-from pyarrow._config import cpu_count, set_cpu_count
+from pyarrow.lib import cpu_count, set_cpu_count
+from pyarrow.lib import (null, bool_,
+ int8, int16, int32, int64,
+ uint8, uint16, uint32, uint64,
+ time32, time64, timestamp, date32, date64,
+ float16, float32, float64,
+ binary, string, decimal,
+ list_, struct, dictionary, field,
+ DataType,
+ DecimalType,
+ DictionaryType,
+ FixedSizeBinaryType,
+ TimestampType,
+ Time32Type,
+ Time64Type,
+ Field,
+ Schema,
+ schema,
+ Array, Tensor,
+ array,
+ from_numpy_dtype,
+ NullArray,
+ NumericArray, IntegerArray, FloatingPointArray,
+ BooleanArray,
+ Int8Array, UInt8Array,
+ Int16Array, UInt16Array,
+ Int32Array, UInt32Array,
+ Int64Array, UInt64Array,
+ ListArray,
+ BinaryArray, StringArray,
+ FixedSizeBinaryArray,
+ DictionaryArray,
+ Date32Array, Date64Array,
+ TimestampArray, Time32Array, Time64Array,
+ DecimalArray,
+ ArrayValue, Scalar, NA, NAType,
+ BooleanValue,
+ Int8Value, Int16Value, Int32Value, Int64Value,
+ UInt8Value, UInt16Value, UInt32Value, UInt64Value,
+ FloatValue, DoubleValue, ListValue,
+ BinaryValue, StringValue, FixedSizeBinaryValue,
+ DecimalValue,
+ Date32Value, Date64Value, TimestampValue)
-from pyarrow._array import (null, bool_,
- int8, int16, int32, int64,
- uint8, uint16, uint32, uint64,
- time32, time64, timestamp, date32, date64,
- float16, float32, float64,
- binary, string, decimal,
- list_, struct, dictionary, field,
- DataType,
- DecimalType,
- DictionaryType,
- FixedSizeBinaryType,
- TimestampType,
- Time32Type,
- Time64Type,
- Field,
- Schema,
- schema,
- Array, Tensor,
- array,
- from_numpy_dtype,
- NullArray,
- NumericArray, IntegerArray, FloatingPointArray,
- BooleanArray,
- Int8Array, UInt8Array,
- Int16Array, UInt16Array,
- Int32Array, UInt32Array,
- Int64Array, UInt64Array,
- ListArray,
- BinaryArray, StringArray,
- FixedSizeBinaryArray,
- DictionaryArray,
- Date32Array, Date64Array,
- TimestampArray, Time32Array, Time64Array,
- DecimalArray,
- ArrayValue, Scalar, NA, NAType,
- BooleanValue,
- Int8Value, Int16Value, Int32Value, Int64Value,
- UInt8Value, UInt16Value, UInt32Value, UInt64Value,
- FloatValue, DoubleValue, ListValue,
- BinaryValue, StringValue, FixedSizeBinaryValue,
- DecimalValue,
- Date32Value, Date64Value, TimestampValue)
-
-from pyarrow._io import (HdfsFile, NativeFile, PythonFile,
+from pyarrow.lib import (HdfsFile, NativeFile, PythonFile,
Buffer, BufferReader, InMemoryOutputStream,
OSFile, MemoryMappedFile, memory_map,
frombuffer, read_tensor, write_tensor,
@@ -79,17 +77,17 @@ from pyarrow._io import (HdfsFile, NativeFile, PythonFile,
get_record_batch_size, get_tensor_size,
have_libhdfs, have_libhdfs3)
-from pyarrow._memory import (MemoryPool, total_allocated_bytes,
- set_memory_pool, default_memory_pool)
-from pyarrow._table import (ChunkedArray, Column, RecordBatch, Table,
- concat_tables)
-from pyarrow._error import (ArrowException,
- ArrowKeyError,
- ArrowInvalid,
- ArrowIOError,
- ArrowMemoryError,
- ArrowNotImplementedError,
- ArrowTypeError)
+from pyarrow.lib import (MemoryPool, total_allocated_bytes,
+ set_memory_pool, default_memory_pool)
+from pyarrow.lib import (ChunkedArray, Column, RecordBatch, Table,
+ concat_tables)
+from pyarrow.lib import (ArrowException,
+ ArrowKeyError,
+ ArrowInvalid,
+ ArrowIOError,
+ ArrowMemoryError,
+ ArrowNotImplementedError,
+ ArrowTypeError)
def jemalloc_memory_pool():
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/python/pyarrow/_array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd
deleted file mode 100644
index 464de31..0000000
--- a/python/pyarrow/_array.pxd
+++ /dev/null
@@ -1,247 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport *
-
-from cpython cimport PyObject
-
-cdef extern from "Python.h":
- int PySlice_Check(object)
-
-
-cdef class DataType:
- cdef:
- shared_ptr[CDataType] sp_type
- CDataType* type
-
- cdef void init(self, const shared_ptr[CDataType]& type)
-
-
-cdef class DictionaryType(DataType):
- cdef:
- const CDictionaryType* dict_type
-
-
-cdef class TimestampType(DataType):
- cdef:
- const CTimestampType* ts_type
-
-
-cdef class Time32Type(DataType):
- cdef:
- const CTime32Type* time_type
-
-
-cdef class Time64Type(DataType):
- cdef:
- const CTime64Type* time_type
-
-
-cdef class FixedSizeBinaryType(DataType):
- cdef:
- const CFixedSizeBinaryType* fixed_size_binary_type
-
-
-cdef class DecimalType(FixedSizeBinaryType):
- cdef:
- const CDecimalType* decimal_type
-
-
-cdef class Field:
- cdef:
- shared_ptr[CField] sp_field
- CField* field
-
- cdef readonly:
- DataType type
-
- cdef init(self, const shared_ptr[CField]& field)
-
-
-cdef class Schema:
- cdef:
- shared_ptr[CSchema] sp_schema
- CSchema* schema
-
- cdef init(self, const vector[shared_ptr[CField]]& fields)
- cdef init_schema(self, const shared_ptr[CSchema]& schema)
-
-
-cdef class Scalar:
- cdef readonly:
- DataType type
-
-
-cdef class NAType(Scalar):
- pass
-
-
-cdef class ArrayValue(Scalar):
- cdef:
- shared_ptr[CArray] sp_array
- int64_t index
-
- cdef void init(self, DataType type,
- const shared_ptr[CArray]& sp_array, int64_t index)
-
- cdef void _set_array(self, const shared_ptr[CArray]& sp_array)
-
-
-cdef class Int8Value(ArrayValue):
- pass
-
-
-cdef class Int64Value(ArrayValue):
- pass
-
-
-cdef class ListValue(ArrayValue):
- cdef readonly:
- DataType value_type
-
- cdef:
- CListArray* ap
-
- cdef getitem(self, int64_t i)
-
-
-cdef class StringValue(ArrayValue):
- pass
-
-
-cdef class FixedSizeBinaryValue(ArrayValue):
- pass
-
-
-cdef class Array:
- cdef:
- shared_ptr[CArray] sp_array
- CArray* ap
-
- cdef readonly:
- DataType type
-
- cdef init(self, const shared_ptr[CArray]& sp_array)
- cdef getitem(self, int64_t i)
-
-
-cdef class Tensor:
- cdef:
- shared_ptr[CTensor] sp_tensor
- CTensor* tp
-
- cdef readonly:
- DataType type
-
- cdef init(self, const shared_ptr[CTensor]& sp_tensor)
-
-
-cdef class NullArray(Array):
- pass
-
-
-cdef class BooleanArray(Array):
- pass
-
-
-cdef class NumericArray(Array):
- pass
-
-
-cdef class IntegerArray(NumericArray):
- pass
-
-
-cdef class FloatingPointArray(NumericArray):
- pass
-
-
-cdef class Int8Array(IntegerArray):
- pass
-
-
-cdef class UInt8Array(IntegerArray):
- pass
-
-
-cdef class Int16Array(IntegerArray):
- pass
-
-
-cdef class UInt16Array(IntegerArray):
- pass
-
-
-cdef class Int32Array(IntegerArray):
- pass
-
-
-cdef class UInt32Array(IntegerArray):
- pass
-
-
-cdef class Int64Array(IntegerArray):
- pass
-
-
-cdef class UInt64Array(IntegerArray):
- pass
-
-
-cdef class FloatArray(FloatingPointArray):
- pass
-
-
-cdef class DoubleArray(FloatingPointArray):
- pass
-
-
-cdef class FixedSizeBinaryArray(Array):
- pass
-
-
-cdef class DecimalArray(FixedSizeBinaryArray):
- pass
-
-
-cdef class ListArray(Array):
- pass
-
-
-cdef class StringArray(Array):
- pass
-
-
-cdef class BinaryArray(Array):
- pass
-
-
-cdef class DictionaryArray(Array):
- cdef:
- object _indices, _dictionary
-
-
-cdef wrap_array_output(PyObject* output)
-cdef DataType box_data_type(const shared_ptr[CDataType]& type)
-cdef Field box_field(const shared_ptr[CField]& field)
-cdef Schema box_schema(const shared_ptr[CSchema]& schema)
-cdef object box_array(const shared_ptr[CArray]& sp_array)
-cdef object box_tensor(const shared_ptr[CTensor]& sp_tensor)
-cdef object box_scalar(DataType type,
- const shared_ptr[CArray]& sp_array,
- int64_t index)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/python/pyarrow/_array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx
deleted file mode 100644
index f01cff6..0000000
--- a/python/pyarrow/_array.pyx
+++ /dev/null
@@ -1,1631 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: profile=False
-# distutils: language = c++
-# cython: embedsignature = True
-
-from cython.operator cimport dereference as deref
-from pyarrow.includes.libarrow cimport *
-from pyarrow.includes.common cimport PyObject_to_object
-cimport pyarrow.includes.pyarrow as pyarrow
-from pyarrow._error cimport check_status
-from pyarrow._memory cimport MemoryPool, maybe_unbox_memory_pool
-cimport cpython as cp
-
-
-import datetime
-import decimal as _pydecimal
-import numpy as np
-import six
-import pyarrow._config
-from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical
-
-
-cdef _pandas():
- import pandas as pd
- return pd
-
-
-# These are imprecise because the type (in pandas 0.x) depends on the presence
-# of nulls
-_pandas_type_map = {
- _Type_NA: np.float64, # NaNs
- _Type_BOOL: np.bool_,
- _Type_INT8: np.int8,
- _Type_INT16: np.int16,
- _Type_INT32: np.int32,
- _Type_INT64: np.int64,
- _Type_UINT8: np.uint8,
- _Type_UINT16: np.uint16,
- _Type_UINT32: np.uint32,
- _Type_UINT64: np.uint64,
- _Type_HALF_FLOAT: np.float16,
- _Type_FLOAT: np.float32,
- _Type_DOUBLE: np.float64,
- _Type_DATE32: np.dtype('datetime64[ns]'),
- _Type_DATE64: np.dtype('datetime64[ns]'),
- _Type_TIMESTAMP: np.dtype('datetime64[ns]'),
- _Type_BINARY: np.object_,
- _Type_FIXED_SIZE_BINARY: np.object_,
- _Type_STRING: np.object_,
- _Type_LIST: np.object_
-}
-
-cdef class DataType:
-
- def __cinit__(self):
- pass
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- self.sp_type = type
- self.type = type.get()
-
- def __str__(self):
- return frombytes(self.type.ToString())
-
- def __repr__(self):
- return '{0.__class__.__name__}({0})'.format(self)
-
- def __richcmp__(DataType self, DataType other, int op):
- if op == cp.Py_EQ:
- return self.type.Equals(deref(other.type))
- elif op == cp.Py_NE:
- return not self.type.Equals(deref(other.type))
- else:
- raise TypeError('Invalid comparison')
-
- def to_pandas_dtype(self):
- """
- Return the NumPy dtype that would be used for storing this
- """
- cdef Type type_id = self.type.id()
- if type_id in _pandas_type_map:
- return _pandas_type_map[type_id]
- else:
- raise NotImplementedError(str(self))
-
-
-cdef class DictionaryType(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.dict_type = <const CDictionaryType*> type.get()
-
-
-cdef class TimestampType(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.ts_type = <const CTimestampType*> type.get()
-
- property unit:
-
- def __get__(self):
- return timeunit_to_string(self.ts_type.unit())
-
- property tz:
-
- def __get__(self):
- if self.ts_type.timezone().size() > 0:
- return frombytes(self.ts_type.timezone())
- else:
- return None
-
-
-cdef class Time32Type(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.time_type = <const CTime32Type*> type.get()
-
- property unit:
-
- def __get__(self):
- return timeunit_to_string(self.time_type.unit())
-
-
-cdef class Time64Type(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.time_type = <const CTime64Type*> type.get()
-
- property unit:
-
- def __get__(self):
- return timeunit_to_string(self.time_type.unit())
-
-
-cdef class FixedSizeBinaryType(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.fixed_size_binary_type = (
- <const CFixedSizeBinaryType*> type.get())
-
- property byte_width:
-
- def __get__(self):
- return self.fixed_size_binary_type.byte_width()
-
-
-cdef class DecimalType(FixedSizeBinaryType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.decimal_type = <const CDecimalType*> type.get()
-
-
-cdef class Field:
- """
- Represents a named field, with a data type, nullability, and optional
- metadata
-
- Notes
- -----
- Do not use this class's constructor directly; use pyarrow.field
- """
- def __cinit__(self):
- pass
-
- cdef init(self, const shared_ptr[CField]& field):
- self.sp_field = field
- self.field = field.get()
- self.type = box_data_type(field.get().type())
-
- def equals(self, Field other):
- """
- Test if this field is equal to the other
- """
- return self.field.Equals(deref(other.field))
-
- def __str__(self):
- self._check_null()
- return 'pyarrow.Field<{0}>'.format(frombytes(self.field.ToString()))
-
- def __repr__(self):
- return self.__str__()
-
- property nullable:
-
- def __get__(self):
- self._check_null()
- return self.field.nullable()
-
- property name:
-
- def __get__(self):
- self._check_null()
- return frombytes(self.field.name())
-
- property metadata:
-
- def __get__(self):
- self._check_null()
- return box_metadata(self.field.metadata().get())
-
- def _check_null(self):
- if self.field == NULL:
- raise ReferenceError(
- 'Field not initialized (references NULL pointer)')
-
- def add_metadata(self, dict metadata):
- """
- Add metadata as dict of string keys and values to Field
-
- Parameters
- ----------
- metadata : dict
- Keys and values must be string-like / coercible to bytes
-
- Returns
- -------
- field : pyarrow.Field
- """
- cdef shared_ptr[CKeyValueMetadata] c_meta
- convert_metadata(metadata, &c_meta)
-
- cdef shared_ptr[CField] new_field
- with nogil:
- check_status(self.field.AddMetadata(c_meta, &new_field))
-
- return box_field(new_field)
-
- def remove_metadata(self):
- """
- Create new field without metadata, if any
-
- Returns
- -------
- field : pyarrow.Field
- """
- cdef shared_ptr[CField] new_field
- with nogil:
- new_field = self.field.RemoveMetadata()
- return box_field(new_field)
-
-
-cdef class Schema:
-
- def __cinit__(self):
- pass
-
- def __len__(self):
- return self.schema.num_fields()
-
- def __getitem__(self, i):
- if i < 0 or i >= len(self):
- raise IndexError("{0} is out of bounds".format(i))
-
- cdef Field result = Field()
- result.init(self.schema.field(i))
- result.type = box_data_type(result.field.type())
-
- return result
-
- cdef init(self, const vector[shared_ptr[CField]]& fields):
- self.schema = new CSchema(fields)
- self.sp_schema.reset(self.schema)
-
- cdef init_schema(self, const shared_ptr[CSchema]& schema):
- self.schema = schema.get()
- self.sp_schema = schema
-
- property names:
-
- def __get__(self):
- cdef int i
- result = []
- for i in range(self.schema.num_fields()):
- name = frombytes(self.schema.field(i).get().name())
- result.append(name)
- return result
-
- property metadata:
-
- def __get__(self):
- return box_metadata(self.schema.metadata().get())
-
- def equals(self, other):
- """
- Test if this schema is equal to the other
- """
- cdef Schema _other
- _other = other
-
- return self.sp_schema.get().Equals(deref(_other.schema))
-
- def field_by_name(self, name):
- """
- Access a field by its name rather than the column index.
-
- Parameters
- ----------
- name: str
-
- Returns
- -------
- field: pyarrow.Field
- """
- return box_field(self.schema.GetFieldByName(tobytes(name)))
-
- def add_metadata(self, dict metadata):
- """
- Add metadata as dict of string keys and values to Schema
-
- Parameters
- ----------
- metadata : dict
- Keys and values must be string-like / coercible to bytes
-
- Returns
- -------
- schema : pyarrow.Schema
- """
- cdef shared_ptr[CKeyValueMetadata] c_meta
- convert_metadata(metadata, &c_meta)
-
- cdef shared_ptr[CSchema] new_schema
- with nogil:
- check_status(self.schema.AddMetadata(c_meta, &new_schema))
-
- return box_schema(new_schema)
-
- def remove_metadata(self):
- """
- Create new schema without metadata, if any
-
- Returns
- -------
- schema : pyarrow.Schema
- """
- cdef shared_ptr[CSchema] new_schema
- with nogil:
- new_schema = self.schema.RemoveMetadata()
- return box_schema(new_schema)
-
- def __str__(self):
- return frombytes(self.schema.ToString())
-
- def __repr__(self):
- return self.__str__()
-
-
-cdef box_metadata(const CKeyValueMetadata* metadata):
- cdef unordered_map[c_string, c_string] result
- if metadata != NULL:
- metadata.ToUnorderedMap(&result)
- return result
- else:
- return None
-
-
-cdef dict _type_cache = {}
-
-
-cdef DataType primitive_type(Type type):
- if type in _type_cache:
- return _type_cache[type]
-
- cdef DataType out = DataType()
- out.init(pyarrow.GetPrimitiveType(type))
-
- _type_cache[type] = out
- return out
-
-#------------------------------------------------------------
-# Type factory functions
-
-cdef int convert_metadata(dict metadata,
- shared_ptr[CKeyValueMetadata]* out) except -1:
- cdef:
- shared_ptr[CKeyValueMetadata] meta = (
- make_shared[CKeyValueMetadata]())
- c_string key, value
-
- for py_key, py_value in metadata.items():
- key = tobytes(py_key)
- value = tobytes(py_value)
- meta.get().Append(key, value)
- out[0] = meta
- return 0
-
-
-def field(name, DataType type, bint nullable=True, dict metadata=None):
- """
- Create a pyarrow.Field instance
-
- Parameters
- ----------
- name : string or bytes
- type : pyarrow.DataType
- nullable : boolean, default True
- metadata : dict, default None
- Keys and values must be coercible to bytes
-
- Returns
- -------
- field : pyarrow.Field
- """
- cdef:
- shared_ptr[CKeyValueMetadata] c_meta
- Field result = Field()
-
- if metadata is not None:
- convert_metadata(metadata, &c_meta)
-
- result.sp_field.reset(new CField(tobytes(name), type.sp_type,
- nullable, c_meta))
- result.field = result.sp_field.get()
- result.type = type
- return result
-
-
-cdef set PRIMITIVE_TYPES = set([
- _Type_NA, _Type_BOOL,
- _Type_UINT8, _Type_INT8,
- _Type_UINT16, _Type_INT16,
- _Type_UINT32, _Type_INT32,
- _Type_UINT64, _Type_INT64,
- _Type_TIMESTAMP, _Type_DATE32,
- _Type_DATE64,
- _Type_HALF_FLOAT,
- _Type_FLOAT,
- _Type_DOUBLE])
-
-
-def null():
- return primitive_type(_Type_NA)
-
-
-def bool_():
- return primitive_type(_Type_BOOL)
-
-
-def uint8():
- return primitive_type(_Type_UINT8)
-
-
-def int8():
- return primitive_type(_Type_INT8)
-
-
-def uint16():
- return primitive_type(_Type_UINT16)
-
-
-def int16():
- return primitive_type(_Type_INT16)
-
-
-def uint32():
- return primitive_type(_Type_UINT32)
-
-
-def int32():
- return primitive_type(_Type_INT32)
-
-
-def uint64():
- return primitive_type(_Type_UINT64)
-
-
-def int64():
- return primitive_type(_Type_INT64)
-
-
-cdef dict _timestamp_type_cache = {}
-cdef dict _time_type_cache = {}
-
-
-cdef timeunit_to_string(TimeUnit unit):
- if unit == TimeUnit_SECOND:
- return 's'
- elif unit == TimeUnit_MILLI:
- return 'ms'
- elif unit == TimeUnit_MICRO:
- return 'us'
- elif unit == TimeUnit_NANO:
- return 'ns'
-
-
-def timestamp(unit_str, tz=None):
- cdef:
- TimeUnit unit
- c_string c_timezone
-
- if unit_str == "s":
- unit = TimeUnit_SECOND
- elif unit_str == 'ms':
- unit = TimeUnit_MILLI
- elif unit_str == 'us':
- unit = TimeUnit_MICRO
- elif unit_str == 'ns':
- unit = TimeUnit_NANO
- else:
- raise ValueError('Invalid TimeUnit string')
-
- cdef TimestampType out = TimestampType()
-
- if tz is None:
- out.init(ctimestamp(unit))
- if unit in _timestamp_type_cache:
- return _timestamp_type_cache[unit]
- _timestamp_type_cache[unit] = out
- else:
- if not isinstance(tz, six.string_types):
- tz = tz.zone
-
- c_timezone = tobytes(tz)
- out.init(ctimestamp(unit, c_timezone))
-
- return out
-
-
-def time32(unit_str):
- cdef:
- TimeUnit unit
- c_string c_timezone
-
- if unit_str == "s":
- unit = TimeUnit_SECOND
- elif unit_str == 'ms':
- unit = TimeUnit_MILLI
- else:
- raise ValueError('Invalid TimeUnit for time32: {}'.format(unit_str))
-
- cdef Time32Type out
- if unit in _time_type_cache:
- return _time_type_cache[unit]
- else:
- out = Time32Type()
- out.init(ctime32(unit))
- _time_type_cache[unit] = out
- return out
-
-
-def time64(unit_str):
- cdef:
- TimeUnit unit
- c_string c_timezone
-
- if unit_str == "us":
- unit = TimeUnit_MICRO
- elif unit_str == 'ns':
- unit = TimeUnit_NANO
- else:
- raise ValueError('Invalid TimeUnit for time64: {}'.format(unit_str))
-
- cdef Time64Type out
- if unit in _time_type_cache:
- return _time_type_cache[unit]
- else:
- out = Time64Type()
- out.init(ctime64(unit))
- _time_type_cache[unit] = out
- return out
-
-
-def date32():
- return primitive_type(_Type_DATE32)
-
-
-def date64():
- return primitive_type(_Type_DATE64)
-
-
-def float16():
- return primitive_type(_Type_HALF_FLOAT)
-
-
-def float32():
- return primitive_type(_Type_FLOAT)
-
-
-def float64():
- return primitive_type(_Type_DOUBLE)
-
-
-cpdef DataType decimal(int precision, int scale=0):
- cdef shared_ptr[CDataType] decimal_type
- decimal_type.reset(new CDecimalType(precision, scale))
- return box_data_type(decimal_type)
-
-
-def string():
- """
- UTF8 string
- """
- return primitive_type(_Type_STRING)
-
-
-def binary(int length=-1):
- """Binary (PyBytes-like) type
-
- Parameters
- ----------
- length : int, optional, default -1
- If length == -1 then return a variable length binary type. If length is
- greater than or equal to 0 then return a fixed size binary type of
- width `length`.
- """
- if length == -1:
- return primitive_type(_Type_BINARY)
-
- cdef shared_ptr[CDataType] fixed_size_binary_type
- fixed_size_binary_type.reset(new CFixedSizeBinaryType(length))
- return box_data_type(fixed_size_binary_type)
-
-
-def list_(DataType value_type):
- cdef DataType out = DataType()
- cdef shared_ptr[CDataType] list_type
- list_type.reset(new CListType(value_type.sp_type))
- out.init(list_type)
- return out
-
-
-def dictionary(DataType index_type, Array dictionary):
- """
- Dictionary (categorical, or simply encoded) type
- """
- cdef DictionaryType out = DictionaryType()
- cdef shared_ptr[CDataType] dict_type
- dict_type.reset(new CDictionaryType(index_type.sp_type,
- dictionary.sp_array))
- out.init(dict_type)
- return out
-
-
-def struct(fields):
- """
-
- """
- cdef:
- DataType out = DataType()
- Field field
- vector[shared_ptr[CField]] c_fields
- cdef shared_ptr[CDataType] struct_type
-
- for field in fields:
- c_fields.push_back(field.sp_field)
-
- struct_type.reset(new CStructType(c_fields))
- out.init(struct_type)
- return out
-
-
-def schema(fields):
- """
- Construct pyarrow.Schema from collection of fields
-
- Parameters
- ----------
- field : list or iterable
-
- Returns
- -------
- schema : pyarrow.Schema
- """
- cdef:
- Schema result
- Field field
- vector[shared_ptr[CField]] c_fields
-
- for i, field in enumerate(fields):
- c_fields.push_back(field.sp_field)
-
- result = Schema()
- result.init(c_fields)
- return result
-
-
-cdef DataType box_data_type(const shared_ptr[CDataType]& type):
- cdef:
- DataType out
-
- if type.get() == NULL:
- return None
-
- if type.get().id() == _Type_DICTIONARY:
- out = DictionaryType()
- elif type.get().id() == _Type_TIMESTAMP:
- out = TimestampType()
- elif type.get().id() == _Type_FIXED_SIZE_BINARY:
- out = FixedSizeBinaryType()
- elif type.get().id() == _Type_DECIMAL:
- out = DecimalType()
- else:
- out = DataType()
-
- out.init(type)
- return out
-
-cdef Field box_field(const shared_ptr[CField]& field):
- if field.get() == NULL:
- return None
- cdef Field out = Field()
- out.init(field)
- return out
-
-cdef Schema box_schema(const shared_ptr[CSchema]& type):
- cdef Schema out = Schema()
- out.init_schema(type)
- return out
-
-
-def from_numpy_dtype(object dtype):
- """
- Convert NumPy dtype to pyarrow.DataType
- """
- cdef shared_ptr[CDataType] c_type
- with nogil:
- check_status(pyarrow.NumPyDtypeToArrow(dtype, &c_type))
-
- return box_data_type(c_type)
-
-
-NA = None
-
-
-cdef class NAType(Scalar):
-
- def __cinit__(self):
- global NA
- if NA is not None:
- raise Exception('Cannot create multiple NAType instances')
-
- self.type = null()
-
- def __repr__(self):
- return 'NA'
-
- def as_py(self):
- return None
-
-
-NA = NAType()
-
-
-cdef class ArrayValue(Scalar):
-
- cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
- int64_t index):
- self.type = type
- self.index = index
- self._set_array(sp_array)
-
- cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
- self.sp_array = sp_array
-
- def __repr__(self):
- if hasattr(self, 'as_py'):
- return repr(self.as_py())
- else:
- return super(Scalar, self).__repr__()
-
-
-cdef class BooleanValue(ArrayValue):
-
- def as_py(self):
- cdef CBooleanArray* ap = <CBooleanArray*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Int8Value(ArrayValue):
-
- def as_py(self):
- cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class UInt8Value(ArrayValue):
-
- def as_py(self):
- cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Int16Value(ArrayValue):
-
- def as_py(self):
- cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class UInt16Value(ArrayValue):
-
- def as_py(self):
- cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Int32Value(ArrayValue):
-
- def as_py(self):
- cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class UInt32Value(ArrayValue):
-
- def as_py(self):
- cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Int64Value(ArrayValue):
-
- def as_py(self):
- cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class UInt64Value(ArrayValue):
-
- def as_py(self):
- cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Date32Value(ArrayValue):
-
- def as_py(self):
- cdef CDate32Array* ap = <CDate32Array*> self.sp_array.get()
-
- # Shift to seconds since epoch
- return datetime.datetime.utcfromtimestamp(
- int(ap.Value(self.index)) * 86400).date()
-
-
-cdef class Date64Value(ArrayValue):
-
- def as_py(self):
- cdef CDate64Array* ap = <CDate64Array*> self.sp_array.get()
- return datetime.datetime.utcfromtimestamp(
- ap.Value(self.index) / 1000).date()
-
-
-cdef class TimestampValue(ArrayValue):
-
- def as_py(self):
- cdef:
- CTimestampArray* ap = <CTimestampArray*> self.sp_array.get()
- CTimestampType* dtype = <CTimestampType*>ap.type().get()
- int64_t val = ap.Value(self.index)
-
- timezone = None
- tzinfo = None
- if dtype.timezone().size() > 0:
- timezone = frombytes(dtype.timezone())
- import pytz
- tzinfo = pytz.timezone(timezone)
-
- try:
- pd = _pandas()
- if dtype.unit() == TimeUnit_SECOND:
- val = val * 1000000000
- elif dtype.unit() == TimeUnit_MILLI:
- val = val * 1000000
- elif dtype.unit() == TimeUnit_MICRO:
- val = val * 1000
- return pd.Timestamp(val, tz=tzinfo)
- except ImportError:
- if dtype.unit() == TimeUnit_SECOND:
- result = datetime.datetime.utcfromtimestamp(val)
- elif dtype.unit() == TimeUnit_MILLI:
- result = datetime.datetime.utcfromtimestamp(float(val) / 1000)
- elif dtype.unit() == TimeUnit_MICRO:
- result = datetime.datetime.utcfromtimestamp(
- float(val) / 1000000)
- else:
- # TimeUnit_NANO
- raise NotImplementedError("Cannot convert nanosecond "
- "timestamps without pandas")
- if timezone is not None:
- result = result.replace(tzinfo=tzinfo)
- return result
-
-
-cdef class FloatValue(ArrayValue):
-
- def as_py(self):
- cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class DoubleValue(ArrayValue):
-
- def as_py(self):
- cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class DecimalValue(ArrayValue):
-
- def as_py(self):
- cdef:
- CDecimalArray* ap = <CDecimalArray*> self.sp_array.get()
- c_string s = ap.FormatValue(self.index)
- return _pydecimal.Decimal(s.decode('utf8'))
-
-
-cdef class StringValue(ArrayValue):
-
- def as_py(self):
- cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
- return ap.GetString(self.index).decode('utf-8')
-
-
-cdef class BinaryValue(ArrayValue):
-
- def as_py(self):
- cdef:
- const uint8_t* ptr
- int32_t length
- CBinaryArray* ap = <CBinaryArray*> self.sp_array.get()
-
- ptr = ap.GetValue(self.index, &length)
- return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length)
-
-
-cdef class ListValue(ArrayValue):
-
- def __len__(self):
- return self.ap.value_length(self.index)
-
- def __getitem__(self, i):
- return self.getitem(i)
-
- def __iter__(self):
- for i in range(len(self)):
- yield self.getitem(i)
- raise StopIteration
-
- cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
- self.sp_array = sp_array
- self.ap = <CListArray*> sp_array.get()
- self.value_type = box_data_type(self.ap.value_type())
-
- cdef getitem(self, int64_t i):
- cdef int64_t j = self.ap.value_offset(self.index) + i
- return box_scalar(self.value_type, self.ap.values(), j)
-
- def as_py(self):
- cdef:
- int64_t j
- list result = []
-
- for j in range(len(self)):
- result.append(self.getitem(j).as_py())
-
- return result
-
-
-cdef class FixedSizeBinaryValue(ArrayValue):
-
- def as_py(self):
- cdef:
- CFixedSizeBinaryArray* ap
- CFixedSizeBinaryType* ap_type
- int32_t length
- const char* data
- ap = <CFixedSizeBinaryArray*> self.sp_array.get()
- ap_type = <CFixedSizeBinaryType*> ap.type().get()
- length = ap_type.byte_width()
- data = <const char*> ap.GetValue(self.index)
- return cp.PyBytes_FromStringAndSize(data, length)
-
-
-
-cdef dict _scalar_classes = {
- _Type_BOOL: BooleanValue,
- _Type_UINT8: Int8Value,
- _Type_UINT16: Int16Value,
- _Type_UINT32: Int32Value,
- _Type_UINT64: Int64Value,
- _Type_INT8: Int8Value,
- _Type_INT16: Int16Value,
- _Type_INT32: Int32Value,
- _Type_INT64: Int64Value,
- _Type_DATE32: Date32Value,
- _Type_DATE64: Date64Value,
- _Type_TIMESTAMP: TimestampValue,
- _Type_FLOAT: FloatValue,
- _Type_DOUBLE: DoubleValue,
- _Type_LIST: ListValue,
- _Type_BINARY: BinaryValue,
- _Type_STRING: StringValue,
- _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
- _Type_DECIMAL: DecimalValue,
-}
-
-cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array,
- int64_t index):
- cdef ArrayValue val
- if type.type.id() == _Type_NA:
- return NA
- elif sp_array.get().IsNull(index):
- return NA
- else:
- val = _scalar_classes[type.type.id()]()
- val.init(type, sp_array, index)
- return val
-
-
-cdef maybe_coerce_datetime64(values, dtype, DataType type,
- timestamps_to_ms=False):
-
- from pyarrow.compat import DatetimeTZDtype
-
- if values.dtype.type != np.datetime64:
- return values, type
-
- coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]'
-
- if coerce_ms:
- values = values.astype('datetime64[ms]')
-
- if isinstance(dtype, DatetimeTZDtype):
- tz = dtype.tz
- unit = 'ms' if coerce_ms else dtype.unit
- type = timestamp(unit, tz)
- elif type is None:
- # Trust the NumPy dtype
- type = from_numpy_dtype(values.dtype)
-
- return values, type
-
-
-
-def array(object sequence, DataType type=None, MemoryPool memory_pool=None):
- """
- Create pyarrow.Array instance from a Python sequence
-
- Parameters
- ----------
- sequence : sequence-like object of Python objects
- type : pyarrow.DataType, optional
- If not passed, will be inferred from the data
- memory_pool : pyarrow.MemoryPool, optional
- If not passed, will allocate memory from the currently-set default
- memory pool
-
- Returns
- -------
- array : pyarrow.Array
- """
- cdef:
- shared_ptr[CArray] sp_array
- CMemoryPool* pool
-
- pool = maybe_unbox_memory_pool(memory_pool)
- if type is None:
- check_status(pyarrow.ConvertPySequence(sequence, pool, &sp_array))
- else:
- check_status(
- pyarrow.ConvertPySequence(
- sequence, pool, &sp_array, type.sp_type
- )
- )
-
- return box_array(sp_array)
-
-
-
-cdef class Array:
-
- cdef init(self, const shared_ptr[CArray]& sp_array):
- self.sp_array = sp_array
- self.ap = sp_array.get()
- self.type = box_data_type(self.sp_array.get().type())
-
- @staticmethod
- def from_pandas(obj, mask=None, DataType type=None,
- timestamps_to_ms=False,
- MemoryPool memory_pool=None):
- """
- Convert pandas.Series to an Arrow Array.
-
- Parameters
- ----------
- series : pandas.Series or numpy.ndarray
-
- mask : pandas.Series or numpy.ndarray, optional
- boolean mask if the object is valid or null
-
- type : pyarrow.DataType
- Explicit type to attempt to coerce to
-
- timestamps_to_ms : bool, optional
- Convert datetime columns to ms resolution. This is needed for
- compatibility with other functionality like Parquet I/O which
- only supports milliseconds.
-
- memory_pool: MemoryPool, optional
- Specific memory pool to use to allocate the resulting Arrow array.
-
- Notes
- -----
- Localized timestamps will currently be returned as UTC (pandas's native
- representation). Timezone-naive data will be implicitly interpreted as
- UTC.
-
- Examples
- --------
-
- >>> import pandas as pd
- >>> import pyarrow as pa
- >>> pa.Array.from_pandas(pd.Series([1, 2]))
- <pyarrow.array.Int64Array object at 0x7f674e4c0e10>
- [
- 1,
- 2
- ]
-
- >>> import numpy as np
- >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1],
- ... dtype=bool))
- <pyarrow.array.Int64Array object at 0x7f9019e11208>
- [
- 1,
- NA
- ]
-
- Returns
- -------
- pyarrow.array.Array
- """
- cdef:
- shared_ptr[CArray] out
- shared_ptr[CDataType] c_type
- CMemoryPool* pool
-
- if mask is not None:
- mask = get_series_values(mask)
-
- values = get_series_values(obj)
- pool = maybe_unbox_memory_pool(memory_pool)
-
- if isinstance(values, Categorical):
- return DictionaryArray.from_arrays(
- values.codes, values.categories.values,
- mask=mask, memory_pool=memory_pool)
- elif values.dtype == object:
- # Object dtype undergoes a different conversion path as more type
- # inference may be needed
- if type is not None:
- c_type = type.sp_type
- with nogil:
- check_status(pyarrow.PandasObjectsToArrow(
- pool, values, mask, c_type, &out))
- else:
- values, type = maybe_coerce_datetime64(
- values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms)
-
- if type is None:
- check_status(pyarrow.NumPyDtypeToArrow(values.dtype, &c_type))
- else:
- c_type = type.sp_type
-
- with nogil:
- check_status(pyarrow.PandasToArrow(
- pool, values, mask, c_type, &out))
-
- return box_array(out)
-
- property null_count:
-
- def __get__(self):
- return self.sp_array.get().null_count()
-
- def __iter__(self):
- for i in range(len(self)):
- yield self.getitem(i)
- raise StopIteration
-
- def __repr__(self):
- from pyarrow.formatting import array_format
- type_format = object.__repr__(self)
- values = array_format(self, window=10)
- return '{0}\n{1}'.format(type_format, values)
-
- def equals(Array self, Array other):
- return self.ap.Equals(deref(other.ap))
-
- def __len__(self):
- if self.sp_array.get():
- return self.sp_array.get().length()
- else:
- return 0
-
- def isnull(self):
- raise NotImplemented
-
- def __getitem__(self, key):
- cdef:
- Py_ssize_t n = len(self)
-
- if PySlice_Check(key):
- start = key.start or 0
- while start < 0:
- start += n
-
- stop = key.stop if key.stop is not None else n
- while stop < 0:
- stop += n
-
- step = key.step or 1
- if step != 1:
- raise IndexError('only slices with step 1 supported')
- else:
- return self.slice(start, stop - start)
-
- while key < 0:
- key += len(self)
-
- return self.getitem(key)
-
- cdef getitem(self, int64_t i):
- return box_scalar(self.type, self.sp_array, i)
-
- def slice(self, offset=0, length=None):
- """
- Compute zero-copy slice of this array
-
- Parameters
- ----------
- offset : int, default 0
- Offset from start of array to slice
- length : int, default None
- Length of slice (default is until end of Array starting from
- offset)
-
- Returns
- -------
- sliced : RecordBatch
- """
- cdef:
- shared_ptr[CArray] result
-
- if offset < 0:
- raise IndexError('Offset must be non-negative')
-
- if length is None:
- result = self.ap.Slice(offset)
- else:
- result = self.ap.Slice(offset, length)
-
- return box_array(result)
-
- def to_pandas(self):
- """
- Convert to an array object suitable for use in pandas
-
- See also
- --------
- Column.to_pandas
- Table.to_pandas
- RecordBatch.to_pandas
- """
- cdef:
- PyObject* out
-
- with nogil:
- check_status(
- pyarrow.ConvertArrayToPandas(self.sp_array, self, &out))
- return wrap_array_output(out)
-
- def to_pylist(self):
- """
- Convert to an list of native Python objects.
- """
- return [x.as_py() for x in self]
-
-
-cdef class Tensor:
-
- cdef init(self, const shared_ptr[CTensor]& sp_tensor):
- self.sp_tensor = sp_tensor
- self.tp = sp_tensor.get()
- self.type = box_data_type(self.tp.type())
-
- def __repr__(self):
- return """<pyarrow.Tensor>
-type: {0}
-shape: {1}
-strides: {2}""".format(self.type, self.shape, self.strides)
-
- @staticmethod
- def from_numpy(obj):
- cdef shared_ptr[CTensor] ctensor
- check_status(pyarrow.NdarrayToTensor(default_memory_pool(),
- obj, &ctensor))
- return box_tensor(ctensor)
-
- def to_numpy(self):
- """
- Convert arrow::Tensor to numpy.ndarray with zero copy
- """
- cdef:
- PyObject* out
-
- check_status(pyarrow.TensorToNdarray(deref(self.tp), self, &out))
- return PyObject_to_object(out)
-
- def equals(self, Tensor other):
- """
- Return true if the tensors contains exactly equal data
- """
- return self.tp.Equals(deref(other.tp))
-
- property is_mutable:
-
- def __get__(self):
- return self.tp.is_mutable()
-
- property is_contiguous:
-
- def __get__(self):
- return self.tp.is_contiguous()
-
- property ndim:
-
- def __get__(self):
- return self.tp.ndim()
-
- property size:
-
- def __get__(self):
- return self.tp.size()
-
- property shape:
-
- def __get__(self):
- cdef size_t i
- py_shape = []
- for i in range(self.tp.shape().size()):
- py_shape.append(self.tp.shape()[i])
- return py_shape
-
- property strides:
-
- def __get__(self):
- cdef size_t i
- py_strides = []
- for i in range(self.tp.strides().size()):
- py_strides.append(self.tp.strides()[i])
- return py_strides
-
-
-
-cdef wrap_array_output(PyObject* output):
- cdef object obj = PyObject_to_object(output)
-
- if isinstance(obj, dict):
- return Categorical(obj['indices'],
- categories=obj['dictionary'],
- fastpath=True)
- else:
- return obj
-
-
-cdef class NullArray(Array):
- pass
-
-
-cdef class BooleanArray(Array):
- pass
-
-
-cdef class NumericArray(Array):
- pass
-
-
-cdef class IntegerArray(NumericArray):
- pass
-
-
-cdef class FloatingPointArray(NumericArray):
- pass
-
-
-cdef class Int8Array(IntegerArray):
- pass
-
-
-cdef class UInt8Array(IntegerArray):
- pass
-
-
-cdef class Int16Array(IntegerArray):
- pass
-
-
-cdef class UInt16Array(IntegerArray):
- pass
-
-
-cdef class Int32Array(IntegerArray):
- pass
-
-
-cdef class UInt32Array(IntegerArray):
- pass
-
-
-cdef class Int64Array(IntegerArray):
- pass
-
-
-cdef class UInt64Array(IntegerArray):
- pass
-
-
-cdef class Date32Array(NumericArray):
- pass
-
-
-cdef class Date64Array(NumericArray):
- pass
-
-
-cdef class TimestampArray(NumericArray):
- pass
-
-
-cdef class Time32Array(NumericArray):
- pass
-
-
-cdef class Time64Array(NumericArray):
- pass
-
-
-cdef class FloatArray(FloatingPointArray):
- pass
-
-
-cdef class DoubleArray(FloatingPointArray):
- pass
-
-
-cdef class FixedSizeBinaryArray(Array):
- pass
-
-
-cdef class DecimalArray(FixedSizeBinaryArray):
- pass
-
-
-cdef class ListArray(Array):
- pass
-
-
-cdef class StringArray(Array):
- pass
-
-
-cdef class BinaryArray(Array):
- pass
-
-
-cdef class DictionaryArray(Array):
-
- cdef getitem(self, int64_t i):
- cdef Array dictionary = self.dictionary
- index = self.indices[i]
- if index is NA:
- return index
- else:
- return box_scalar(dictionary.type, dictionary.sp_array,
- index.as_py())
-
- property dictionary:
-
- def __get__(self):
- cdef CDictionaryArray* darr = <CDictionaryArray*>(self.ap)
-
- if self._dictionary is None:
- self._dictionary = box_array(darr.dictionary())
-
- return self._dictionary
-
- property indices:
-
- def __get__(self):
- cdef CDictionaryArray* darr = <CDictionaryArray*>(self.ap)
-
- if self._indices is None:
- self._indices = box_array(darr.indices())
-
- return self._indices
-
- @staticmethod
- def from_arrays(indices, dictionary, mask=None,
- MemoryPool memory_pool=None):
- """
- Construct Arrow DictionaryArray from array of indices (must be
- non-negative integers) and corresponding array of dictionary values
-
- Parameters
- ----------
- indices : ndarray or pandas.Series, integer type
- dictionary : ndarray or pandas.Series
- mask : ndarray or pandas.Series, boolean type
- True values indicate that indices are actually null
-
- Returns
- -------
- dict_array : DictionaryArray
- """
- cdef:
- Array arrow_indices, arrow_dictionary
- DictionaryArray result
- shared_ptr[CDataType] c_type
- shared_ptr[CArray] c_result
-
- if isinstance(indices, Array):
- if mask is not None:
- raise NotImplementedError(
- "mask not implemented with Arrow array inputs yet")
- arrow_indices = indices
- else:
- if mask is None:
- mask = indices == -1
- else:
- mask = mask | (indices == -1)
- arrow_indices = Array.from_pandas(indices, mask=mask,
- memory_pool=memory_pool)
-
- if isinstance(dictionary, Array):
- arrow_dictionary = dictionary
- else:
- arrow_dictionary = Array.from_pandas(dictionary,
- memory_pool=memory_pool)
-
- if not isinstance(arrow_indices, IntegerArray):
- raise ValueError('Indices must be integer type')
-
- c_type.reset(new CDictionaryType(arrow_indices.type.sp_type,
- arrow_dictionary.sp_array))
- c_result.reset(new CDictionaryArray(c_type, arrow_indices.sp_array))
-
- result = DictionaryArray()
- result.init(c_result)
- return result
-
-
-cdef dict _array_classes = {
- _Type_NA: NullArray,
- _Type_BOOL: BooleanArray,
- _Type_UINT8: UInt8Array,
- _Type_UINT16: UInt16Array,
- _Type_UINT32: UInt32Array,
- _Type_UINT64: UInt64Array,
- _Type_INT8: Int8Array,
- _Type_INT16: Int16Array,
- _Type_INT32: Int32Array,
- _Type_INT64: Int64Array,
- _Type_DATE32: Date32Array,
- _Type_DATE64: Date64Array,
- _Type_TIMESTAMP: TimestampArray,
- _Type_TIME32: Time32Array,
- _Type_TIME64: Time64Array,
- _Type_FLOAT: FloatArray,
- _Type_DOUBLE: DoubleArray,
- _Type_LIST: ListArray,
- _Type_BINARY: BinaryArray,
- _Type_STRING: StringArray,
- _Type_DICTIONARY: DictionaryArray,
- _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
- _Type_DECIMAL: DecimalArray,
-}
-
-cdef object box_array(const shared_ptr[CArray]& sp_array):
- if sp_array.get() == NULL:
- raise ValueError('Array was NULL')
-
- cdef CDataType* data_type = sp_array.get().type().get()
-
- if data_type == NULL:
- raise ValueError('Array data type was NULL')
-
- cdef Array arr = _array_classes[data_type.id()]()
- arr.init(sp_array)
- return arr
-
-
-cdef object box_tensor(const shared_ptr[CTensor]& sp_tensor):
- if sp_tensor.get() == NULL:
- raise ValueError('Tensor was NULL')
-
- cdef Tensor tensor = Tensor()
- tensor.init(sp_tensor)
- return tensor
-
-
-cdef object get_series_values(object obj):
- if isinstance(obj, PandasSeries):
- result = obj.values
- elif isinstance(obj, np.ndarray):
- result = obj
- else:
- result = PandasSeries(obj).values
-
- return result
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/python/pyarrow/_error.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_error.pxd b/python/pyarrow/_error.pxd
deleted file mode 100644
index 4fb46c2..0000000
--- a/python/pyarrow/_error.pxd
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from pyarrow.includes.libarrow cimport CStatus
-
-cdef int check_status(const CStatus& status) nogil except -1
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/python/pyarrow/_error.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_error.pyx b/python/pyarrow/_error.pyx
deleted file mode 100644
index 259aeb0..0000000
--- a/python/pyarrow/_error.pyx
+++ /dev/null
@@ -1,70 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from pyarrow.includes.libarrow cimport CStatus
-from pyarrow.includes.common cimport c_string
-from pyarrow.compat import frombytes
-
-
-class ArrowException(Exception):
- pass
-
-
-class ArrowInvalid(ValueError, ArrowException):
- pass
-
-
-class ArrowMemoryError(MemoryError, ArrowException):
- pass
-
-
-class ArrowIOError(IOError, ArrowException):
- pass
-
-
-class ArrowKeyError(KeyError, ArrowException):
- pass
-
-
-class ArrowTypeError(TypeError, ArrowException):
- pass
-
-
-class ArrowNotImplementedError(NotImplementedError, ArrowException):
- pass
-
-
-cdef int check_status(const CStatus& status) nogil except -1:
- if status.ok():
- return 0
-
- with gil:
- message = frombytes(status.ToString())
- if status.IsInvalid():
- raise ArrowInvalid(message)
- elif status.IsIOError():
- raise ArrowIOError(message)
- elif status.IsOutOfMemory():
- raise ArrowMemoryError(message)
- elif status.IsKeyError():
- raise ArrowKeyError(message)
- elif status.IsNotImplemented():
- raise ArrowNotImplementedError(message)
- elif status.IsTypeError():
- raise ArrowTypeError(message)
- else:
- raise ArrowException(message)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9e875a68/python/pyarrow/_io.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_io.pxd b/python/pyarrow/_io.pxd
deleted file mode 100644
index 0c37a09..0000000
--- a/python/pyarrow/_io.pxd
+++ /dev/null
@@ -1,50 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# distutils: language = c++
-
-from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport *
-
-
-cdef class Buffer:
- cdef:
- shared_ptr[CBuffer] buffer
- Py_ssize_t shape[1]
- Py_ssize_t strides[1]
-
- cdef init(self, const shared_ptr[CBuffer]& buffer)
-
-
-cdef class NativeFile:
- cdef:
- shared_ptr[RandomAccessFile] rd_file
- shared_ptr[OutputStream] wr_file
- bint is_readable
- bint is_writeable
- bint is_open
- bint own_file
-
- # By implementing these "virtual" functions (all functions in Cython
- # extension classes are technically virtual in the C++ sense) we can expose
- # the arrow::io abstract file interfaces to other components throughout the
- # suite of Arrow C++ libraries
- cdef read_handle(self, shared_ptr[RandomAccessFile]* file)
- cdef write_handle(self, shared_ptr[OutputStream]* file)
-
-cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader)
-cdef get_writer(object source, shared_ptr[OutputStream]* writer)