You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/03/26 15:45:44 UTC
[2/5] arrow git commit: ARROW-341: [Python] Move pyarrow's C++ code
to the main C++ source tree, install libarrow_python and headers
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
deleted file mode 100644
index a7386ce..0000000
--- a/python/src/pyarrow/adapters/pandas.cc
+++ /dev/null
@@ -1,1936 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Functions for pandas conversion via NumPy
-
-#include <Python.h>
-
-#include "pyarrow/adapters/pandas.h"
-#include "pyarrow/numpy_interop.h"
-
-#include <algorithm>
-#include <atomic>
-#include <cmath>
-#include <cstdint>
-#include <memory>
-#include <mutex>
-#include <sstream>
-#include <string>
-#include <thread>
-#include <unordered_map>
-
-#include "arrow/array.h"
-#include "arrow/column.h"
-#include "arrow/loader.h"
-#include "arrow/status.h"
-#include "arrow/table.h"
-#include "arrow/type_fwd.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit-util.h"
-#include "arrow/util/macros.h"
-
-#include "pyarrow/adapters/builtin.h"
-#include "pyarrow/common.h"
-#include "pyarrow/config.h"
-#include "pyarrow/type_traits.h"
-#include "pyarrow/util/datetime.h"
-
-namespace arrow {
-namespace py {
-
-// ----------------------------------------------------------------------
-// Utility code
-
// Normalize NumPy integer type codes. Both LONGLONG and INT64 type codes can
// be observed in the wild; where they alias (64-bit long long), fold the
// U/LONGLONG codes onto U/INT64 so downstream switches see a single code.
int cast_npy_type_compat(int type_num) {
#if (NPY_INT64 == NPY_LONGLONG) && (NPY_SIZEOF_LONGLONG == 8)
  switch (type_num) {
    case NPY_LONGLONG:
      type_num = NPY_INT64;
      break;
    case NPY_ULONGLONG:
      type_num = NPY_UINT64;
      break;
    default:
      break;
  }
#endif
  return type_num;
}
-
-static inline bool PyObject_is_null(const PyObject* obj) {
- return obj == Py_None || obj == numpy_nan;
-}
-
-static inline bool PyObject_is_string(const PyObject* obj) {
-#if PY_MAJOR_VERSION >= 3
- return PyUnicode_Check(obj) || PyBytes_Check(obj);
-#else
- return PyString_Check(obj) || PyUnicode_Check(obj);
-#endif
-}
-
-template <int TYPE>
-static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
- typedef npy_traits<TYPE> traits;
- typedef typename traits::value_type T;
-
- int64_t null_count = 0;
- const T* values = reinterpret_cast<const T*>(data);
-
- // TODO(wesm): striding
- for (int i = 0; i < length; ++i) {
- if (traits::isnull(values[i])) {
- ++null_count;
- } else {
- BitUtil::SetBit(bitmap, i);
- }
- }
-
- return null_count;
-}
-
-// Returns null count
-static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
- int64_t null_count = 0;
- const uint8_t* mask_values = static_cast<const uint8_t*>(PyArray_DATA(mask));
- // TODO(wesm): strided null mask
- for (int i = 0; i < length; ++i) {
- if (mask_values[i]) {
- ++null_count;
- } else {
- BitUtil::SetBit(bitmap, i);
- }
- }
- return null_count;
-}
-
-template <int TYPE>
-static int64_t ValuesToValidBytes(
- const void* data, int64_t length, uint8_t* valid_bytes) {
- typedef npy_traits<TYPE> traits;
- typedef typename traits::value_type T;
-
- int64_t null_count = 0;
- const T* values = reinterpret_cast<const T*>(data);
-
- // TODO(wesm): striding
- for (int i = 0; i < length; ++i) {
- valid_bytes[i] = not traits::isnull(values[i]);
- if (traits::isnull(values[i])) null_count++;
- }
-
- return null_count;
-}
-
-Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
- if (PyArray_NDIM(numpy_array) != 1) {
- return Status::Invalid("only handle 1-dimensional arrays");
- }
-
- if (PyArray_DESCR(numpy_array)->type_num != np_type) {
- return Status::Invalid("can only handle exact conversions");
- }
-
- npy_intp* astrides = PyArray_STRIDES(numpy_array);
- if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) {
- return Status::Invalid("No support for strided arrays in lists yet");
- }
- return Status::OK();
-}
-
-Status AppendObjectStrings(StringBuilder& string_builder, PyObject** objects,
- int64_t objects_length, bool* have_bytes) {
- PyObject* obj;
-
- for (int64_t i = 0; i < objects_length; ++i) {
- obj = objects[i];
- if (PyUnicode_Check(obj)) {
- obj = PyUnicode_AsUTF8String(obj);
- if (obj == NULL) {
- PyErr_Clear();
- return Status::TypeError("failed converting unicode to UTF8");
- }
- const int64_t length = PyBytes_GET_SIZE(obj);
- Status s = string_builder.Append(PyBytes_AS_STRING(obj), length);
- Py_DECREF(obj);
- if (!s.ok()) { return s; }
- } else if (PyBytes_Check(obj)) {
- *have_bytes = true;
- const int64_t length = PyBytes_GET_SIZE(obj);
- RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
- } else {
- string_builder.AppendNull();
- }
- }
-
- return Status::OK();
-}
-
// Trait mapping an Arrow binary-like array class to the Python object type
// its values should be wrapped in when converting back to pandas.
template <typename T>
struct WrapBytes {};

template <>
struct WrapBytes<StringArray> {
  // String values become Python unicode objects; data must be valid UTF-8.
  static inline PyObject* Wrap(const uint8_t* data, int64_t length) {
    return PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(data), length);
  }
};

template <>
struct WrapBytes<BinaryArray> {
  // Binary values become Python bytes objects (no decoding).
  static inline PyObject* Wrap(const uint8_t* data, int64_t length) {
    return PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), length);
  }
};
-
-static inline bool ListTypeSupported(const Type::type type_id) {
- switch (type_id) {
- case Type::UINT8:
- case Type::INT8:
- case Type::UINT16:
- case Type::INT16:
- case Type::UINT32:
- case Type::INT32:
- case Type::INT64:
- case Type::UINT64:
- case Type::FLOAT:
- case Type::DOUBLE:
- case Type::STRING:
- case Type::TIMESTAMP:
- // The above types are all supported.
- return true;
- default:
- break;
- }
- return false;
-}
-
-// ----------------------------------------------------------------------
-// Conversion from NumPy-in-Pandas to Arrow
-
// Converts one NumPy array (optionally paired with a boolean null-mask
// array) from pandas into an Arrow Array. Primitive (non-object) dtypes are
// dispatched through the TypeVisitor interface; object-dtype arrays use the
// Convert* methods declared below.
class PandasConverter : public TypeVisitor {
 public:
  // ao: the NumPy ndarray holding the values; mo: optional null mask
  // (nullptr or Py_None means "no mask"); type: the target Arrow type
  // (may be null — Convert() rejects that case).
  PandasConverter(
      MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr<DataType>& type)
      : pool_(pool),
        type_(type),
        arr_(reinterpret_cast<PyArrayObject*>(ao)),
        mask_(nullptr) {
    if (mo != nullptr and mo != Py_None) { mask_ = reinterpret_cast<PyArrayObject*>(mo); }
    length_ = PyArray_SIZE(arr_);
  }

  // True when the first stride differs from the element size, i.e. the
  // buffer is not plainly contiguous.
  bool is_strided() const {
    npy_intp* astrides = PyArray_STRIDES(arr_);
    return astrides[0] != PyArray_DESCR(arr_)->elsize;
  }

  // Allocate a zero-filled null bitmap sized for length_ values and point
  // null_bitmap_data_ at its mutable storage.
  Status InitNullBitmap() {
    int null_bytes = BitUtil::BytesForBits(length_);

    null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
    RETURN_NOT_OK(null_bitmap_->Resize(null_bytes));

    null_bitmap_data_ = null_bitmap_->mutable_data();
    memset(null_bitmap_data_, 0, null_bytes);

    return Status::OK();
  }

  // ----------------------------------------------------------------------
  // Traditional visitor conversion for non-object arrays

  // Produce the value buffer for ArrowType (zero-copy where the NumPy and
  // Arrow representations match; see out-of-class specializations).
  template <typename ArrowType>
  Status ConvertData(std::shared_ptr<Buffer>* data);

  // Shared conversion path for primitive types: build the value buffer, then
  // derive the null bitmap either from the explicit mask or — for types with
  // a natural null sentinel (traits::supports_nulls) — from the values.
  template <typename ArrowType>
  Status VisitNative() {
    using traits = arrow_traits<ArrowType::type_id>;

    if (mask_ != nullptr || traits::supports_nulls) { RETURN_NOT_OK(InitNullBitmap()); }

    std::shared_ptr<Buffer> data;
    RETURN_NOT_OK(ConvertData<ArrowType>(&data));

    int64_t null_count = 0;
    if (mask_ != nullptr) {
      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
    } else if (traits::supports_nulls) {
      // TODO(wesm): this presumes the NumPy C type and arrow C type are the
      // same
      null_count = ValuesToBitmap<traits::npy_type>(
          PyArray_DATA(arr_), length_, null_bitmap_data_);
    }

    // Assemble the Array from the bitmap + data buffers via LoadArray.
    std::vector<FieldMetadata> fields(1);
    fields[0].length = length_;
    fields[0].null_count = null_count;
    fields[0].offset = 0;

    return LoadArray(type_, fields, {null_bitmap_, data}, &out_);
  }

// Generate one Visit override per supported primitive type, all funneling
// into VisitNative<TYPE>().
#define VISIT_NATIVE(TYPE) \
  Status Visit(const TYPE& type) override { return VisitNative<TYPE>(); }

  VISIT_NATIVE(BooleanType);
  VISIT_NATIVE(Int8Type);
  VISIT_NATIVE(Int16Type);
  VISIT_NATIVE(Int32Type);
  VISIT_NATIVE(Int64Type);
  VISIT_NATIVE(UInt8Type);
  VISIT_NATIVE(UInt16Type);
  VISIT_NATIVE(UInt32Type);
  VISIT_NATIVE(UInt64Type);
  VISIT_NATIVE(FloatType);
  VISIT_NATIVE(DoubleType);
  VISIT_NATIVE(TimestampType);

#undef VISIT_NATIVE

  // Entry point for non-object arrays: validate shape/contiguity, then visit
  // type_ to perform the conversion into out_.
  Status Convert(std::shared_ptr<Array>* out) {
    if (PyArray_NDIM(arr_) != 1) {
      return Status::Invalid("only handle 1-dimensional arrays");
    }
    // TODO(wesm): strided arrays
    if (is_strided()) { return Status::Invalid("no support for strided data yet"); }

    if (type_ == nullptr) { return Status::Invalid("Must pass data type"); }

    // Visit the type to perform conversion
    RETURN_NOT_OK(type_->Accept(this));

    *out = out_;
    return Status::OK();
  }

  // ----------------------------------------------------------------------
  // Conversion logic for various object dtype arrays

  // Convert an object array whose elements are NumPy arrays / lists of a
  // single primitive ITEM_TYPE into list<ArrowType>.
  template <int ITEM_TYPE, typename ArrowType>
  Status ConvertTypedLists(
      const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);

  Status ConvertObjectStrings(std::shared_ptr<Array>* out);
  Status ConvertBooleans(std::shared_ptr<Array>* out);
  Status ConvertDates(std::shared_ptr<Array>* out);
  Status ConvertLists(const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
  // Top-level object-dtype entry point: dispatches on type_ or infers the
  // type from the first non-null element.
  Status ConvertObjects(std::shared_ptr<Array>* out);

 protected:
  MemoryPool* pool_;
  std::shared_ptr<DataType> type_;
  PyArrayObject* arr_;
  PyArrayObject* mask_;  // optional; nullptr when no mask was supplied
  int64_t length_;       // number of elements in arr_

  // Used in visitor pattern
  std::shared_ptr<Array> out_;

  std::shared_ptr<ResizableBuffer> null_bitmap_;
  uint8_t* null_bitmap_data_;
};
-
-template <typename ArrowType>
-inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
- using traits = arrow_traits<ArrowType::type_id>;
-
- // Handle LONGLONG->INT64 and other fun things
- int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
-
- if (traits::npy_type != type_num_compat) {
- return Status::NotImplemented("NumPy type casts not yet implemented");
- }
-
- *data = std::make_shared<NumPyBuffer>(arr_);
- return Status::OK();
-}
-
-template <>
-inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
- int nbytes = BitUtil::BytesForBits(length_);
- auto buffer = std::make_shared<PoolBuffer>(pool_);
- RETURN_NOT_OK(buffer->Resize(nbytes));
-
- const uint8_t* values = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
-
- uint8_t* bitmap = buffer->mutable_data();
-
- memset(bitmap, 0, nbytes);
- for (int i = 0; i < length_; ++i) {
- if (values[i] > 0) { BitUtil::SetBit(bitmap, i); }
- }
-
- *data = buffer;
- return Status::OK();
-}
-
-Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
- PyAcquireGIL lock;
-
- PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- Date64Builder date_builder(pool_);
- RETURN_NOT_OK(date_builder.Resize(length_));
-
- Status s;
- PyObject* obj;
- for (int64_t i = 0; i < length_; ++i) {
- obj = objects[i];
- if (PyDate_CheckExact(obj)) {
- PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(obj);
- date_builder.Append(PyDate_to_ms(pydate));
- } else {
- date_builder.AppendNull();
- }
- }
- return date_builder.Finish(out);
-}
-
-Status PandasConverter::ConvertObjectStrings(std::shared_ptr<Array>* out) {
- PyAcquireGIL lock;
-
- // The output type at this point is inconclusive because there may be bytes
- // and unicode mixed in the object array
-
- PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- StringBuilder string_builder(pool_);
- RETURN_NOT_OK(string_builder.Resize(length_));
-
- Status s;
- bool have_bytes = false;
- RETURN_NOT_OK(AppendObjectStrings(string_builder, objects, length_, &have_bytes));
- RETURN_NOT_OK(string_builder.Finish(out));
-
- if (have_bytes) {
- const auto& arr = static_cast<const StringArray&>(*out->get());
- *out = std::make_shared<BinaryArray>(arr.length(), arr.value_offsets(), arr.data(),
- arr.null_bitmap(), arr.null_count());
- }
- return Status::OK();
-}
-
-Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
- PyAcquireGIL lock;
-
- PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
-
- int nbytes = BitUtil::BytesForBits(length_);
- auto data = std::make_shared<PoolBuffer>(pool_);
- RETURN_NOT_OK(data->Resize(nbytes));
- uint8_t* bitmap = data->mutable_data();
- memset(bitmap, 0, nbytes);
-
- int64_t null_count = 0;
- for (int64_t i = 0; i < length_; ++i) {
- if (objects[i] == Py_True) {
- BitUtil::SetBit(bitmap, i);
- BitUtil::SetBit(null_bitmap_data_, i);
- } else if (objects[i] != Py_False) {
- ++null_count;
- } else {
- BitUtil::SetBit(null_bitmap_data_, i);
- }
- }
-
- *out = std::make_shared<BooleanArray>(length_, data, null_bitmap_, null_count);
-
- return Status::OK();
-}
-
// Entry point for object-dtype arrays: dispatch on the requested type_ if
// one was given, otherwise infer the conversion from the first non-null
// element.
Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
  // Python object arrays are annoying, since we could have one of:
  //
  // * Strings
  // * Booleans with nulls
  // * Mixed type (not supported at the moment by arrow format)
  //
  // Additionally, nulls may be encoded either as np.nan or None. So we have to
  // do some type inference and conversion

  RETURN_NOT_OK(InitNullBitmap());

  // TODO: mask not supported here
  if (mask_ != nullptr) {
    return Status::NotImplemented("mask not supported in object conversions yet");
  }

  const PyObject** objects;
  {
    PyAcquireGIL lock;
    objects = reinterpret_cast<const PyObject**>(PyArray_DATA(arr_));
    // Required before the PyDate_* macros used below (and in ConvertDates).
    PyDateTime_IMPORT;
  }

  if (type_) {
    // Explicit target type: dispatch directly.
    switch (type_->type) {
      case Type::STRING:
        return ConvertObjectStrings(out);
      case Type::BOOL:
        return ConvertBooleans(out);
      case Type::DATE64:
        return ConvertDates(out);
      case Type::LIST: {
        const auto& list_field = static_cast<const ListType&>(*type_);
        return ConvertLists(list_field.value_field()->type, out);
      }
      default:
        return Status::TypeError("No known conversion to Arrow type");
    }
  } else {
    // No type given: infer from the first non-null element and convert the
    // entire array with the corresponding routine.
    for (int64_t i = 0; i < length_; ++i) {
      if (PyObject_is_null(objects[i])) {
        continue;
      } else if (PyObject_is_string(objects[i])) {
        return ConvertObjectStrings(out);
      } else if (PyBool_Check(objects[i])) {
        return ConvertBooleans(out);
      } else if (PyDate_CheckExact(objects[i])) {
        return ConvertDates(out);
      } else {
        return Status::TypeError("unhandled python type");
      }
    }
  }

  // Reached only when every element was null and no type was supplied.
  return Status::TypeError("Unable to infer type of object array, were all null");
}
-
-template <int ITEM_TYPE, typename ArrowType>
-inline Status PandasConverter::ConvertTypedLists(
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
- typedef npy_traits<ITEM_TYPE> traits;
- typedef typename traits::value_type T;
- typedef typename traits::BuilderClass BuilderT;
-
- PyAcquireGIL lock;
-
- auto value_builder = std::make_shared<BuilderT>(pool_, type);
- ListBuilder list_builder(pool_, value_builder);
- PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- for (int64_t i = 0; i < length_; ++i) {
- if (PyObject_is_null(objects[i])) {
- RETURN_NOT_OK(list_builder.AppendNull());
- } else if (PyArray_Check(objects[i])) {
- auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
- RETURN_NOT_OK(list_builder.Append(true));
-
- // TODO(uwe): Support more complex numpy array structures
- RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
-
- int64_t size = PyArray_DIM(numpy_array, 0);
- auto data = reinterpret_cast<const T*>(PyArray_DATA(numpy_array));
- if (traits::supports_nulls) {
- null_bitmap_->Resize(size, false);
- // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't
- // currently support this.
- // ValuesToBitmap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
- ValuesToValidBytes<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
- RETURN_NOT_OK(value_builder->Append(data, size, null_bitmap_->data()));
- } else {
- RETURN_NOT_OK(value_builder->Append(data, size));
- }
-
- } else if (PyList_Check(objects[i])) {
- int64_t size;
- std::shared_ptr<DataType> inferred_type;
- RETURN_NOT_OK(list_builder.Append(true));
- RETURN_NOT_OK(InferArrowType(objects[i], &size, &inferred_type));
- if (inferred_type->type != type->type) {
- std::stringstream ss;
- ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
- return Status::TypeError(ss.str());
- }
- RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder));
- } else {
- return Status::TypeError("Unsupported Python type for list items");
- }
- }
- return list_builder.Finish(out);
-}
-
// Specialization for list<string>: elements are object arrays or Python
// lists of str/bytes, appended through AppendObjectStrings.
template <>
inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
    const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
  PyAcquireGIL lock;
  // TODO: If there are bytes involved, convert to Binary representation
  bool have_bytes = false;

  auto value_builder = std::make_shared<StringBuilder>(pool_);
  ListBuilder list_builder(pool_, value_builder);
  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
  for (int64_t i = 0; i < length_; ++i) {
    if (PyObject_is_null(objects[i])) {
      RETURN_NOT_OK(list_builder.AppendNull());
    } else if (PyArray_Check(objects[i])) {
      auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
      RETURN_NOT_OK(list_builder.Append(true));

      // TODO(uwe): Support more complex numpy array structures
      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));

      int64_t size = PyArray_DIM(numpy_array, 0);
      auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
      RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes));
    } else if (PyList_Check(objects[i])) {
      int64_t size;
      std::shared_ptr<DataType> inferred_type;
      RETURN_NOT_OK(list_builder.Append(true));
      RETURN_NOT_OK(InferArrowType(objects[i], &size, &inferred_type));
      if (inferred_type->type != Type::STRING) {
        std::stringstream ss;
        ss << inferred_type->ToString() << " cannot be converted to STRING.";
        return Status::TypeError(ss.str());
      }
      RETURN_NOT_OK(AppendPySequence(objects[i], inferred_type, value_builder));
    } else {
      return Status::TypeError("Unsupported Python type for list items");
    }
  }
  return list_builder.Finish(out);
}
-
-#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType) \
- case Type::TYPE: { \
- return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type, out); \
- }
-
-Status PandasConverter::ConvertLists(
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
- switch (type->type) {
- LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
- LIST_CASE(INT8, NPY_INT8, Int8Type)
- LIST_CASE(UINT16, NPY_UINT16, UInt16Type)
- LIST_CASE(INT16, NPY_INT16, Int16Type)
- LIST_CASE(UINT32, NPY_UINT32, UInt32Type)
- LIST_CASE(INT32, NPY_INT32, Int32Type)
- LIST_CASE(UINT64, NPY_UINT64, UInt64Type)
- LIST_CASE(INT64, NPY_INT64, Int64Type)
- LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
- LIST_CASE(FLOAT, NPY_FLOAT, FloatType)
- LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
- LIST_CASE(STRING, NPY_OBJECT, StringType)
- default:
- return Status::TypeError("Unknown list item type");
- }
-
- return Status::TypeError("Unknown list type");
-}
-
-Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
- PandasConverter converter(pool, ao, mo, type);
- return converter.Convert(out);
-}
-
-Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
- PandasConverter converter(pool, ao, mo, type);
- return converter.ConvertObjects(out);
-}
-
-Status PandasDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) {
- PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);
-
- int type_num = cast_npy_type_compat(descr->type_num);
-
-#define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \
- case NPY_##NPY_NAME: \
- *out = FACTORY(); \
- break;
-
- switch (type_num) {
- TO_ARROW_TYPE_CASE(BOOL, boolean);
- TO_ARROW_TYPE_CASE(INT8, int8);
- TO_ARROW_TYPE_CASE(INT16, int16);
- TO_ARROW_TYPE_CASE(INT32, int32);
- TO_ARROW_TYPE_CASE(INT64, int64);
-#if (NPY_INT64 != NPY_LONGLONG)
- TO_ARROW_TYPE_CASE(LONGLONG, int64);
-#endif
- TO_ARROW_TYPE_CASE(UINT8, uint8);
- TO_ARROW_TYPE_CASE(UINT16, uint16);
- TO_ARROW_TYPE_CASE(UINT32, uint32);
- TO_ARROW_TYPE_CASE(UINT64, uint64);
-#if (NPY_UINT64 != NPY_ULONGLONG)
- TO_ARROW_CASE(ULONGLONG);
-#endif
- TO_ARROW_TYPE_CASE(FLOAT32, float32);
- TO_ARROW_TYPE_CASE(FLOAT64, float64);
- case NPY_DATETIME: {
- auto date_dtype =
- reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
- TimeUnit unit;
- switch (date_dtype->meta.base) {
- case NPY_FR_s:
- unit = TimeUnit::SECOND;
- break;
- case NPY_FR_ms:
- unit = TimeUnit::MILLI;
- break;
- case NPY_FR_us:
- unit = TimeUnit::MICRO;
- break;
- case NPY_FR_ns:
- unit = TimeUnit::NANO;
- break;
- default:
- return Status::NotImplemented("Unsupported datetime64 time unit");
- }
- *out = timestamp(unit);
- } break;
- default: {
- std::stringstream ss;
- ss << "Unsupported numpy type " << descr->type_num << std::endl;
- return Status::NotImplemented(ss.str());
- }
- }
-
-#undef TO_ARROW_TYPE_CASE
-
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// pandas 0.x DataFrame conversion internals
-
// Adjust the datetime64 unit metadata on a freshly created NumPy array so it
// matches the Arrow column type: the timestamp's unit, or days when the
// source type is DATE64.
inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
  if (type == NPY_DATETIME) {
    PyArray_Descr* descr = PyArray_DESCR(out);
    auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
    if (datatype->type == Type::TIMESTAMP) {
      auto timestamp_type = static_cast<TimestampType*>(datatype);

      // Copy the Arrow timestamp unit into the NumPy dtype metadata.
      switch (timestamp_type->unit) {
        case TimestampType::Unit::SECOND:
          date_dtype->meta.base = NPY_FR_s;
          break;
        case TimestampType::Unit::MILLI:
          date_dtype->meta.base = NPY_FR_ms;
          break;
        case TimestampType::Unit::MICRO:
          date_dtype->meta.base = NPY_FR_us;
          break;
        case TimestampType::Unit::NANO:
          date_dtype->meta.base = NPY_FR_ns;
          break;
      }
    } else {
      // datatype->type == Type::DATE64
      date_dtype->meta.base = NPY_FR_D;
    }
  }
}
-
-class PandasBlock {
- public:
- enum type {
- OBJECT,
- UINT8,
- INT8,
- UINT16,
- INT16,
- UINT32,
- INT32,
- UINT64,
- INT64,
- FLOAT,
- DOUBLE,
- BOOL,
- DATETIME,
- DATETIME_WITH_TZ,
- CATEGORICAL
- };
-
- PandasBlock(int64_t num_rows, int num_columns)
- : num_rows_(num_rows), num_columns_(num_columns) {}
- virtual ~PandasBlock() {}
-
- virtual Status Allocate() = 0;
- virtual Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
- int64_t rel_placement) = 0;
-
- PyObject* block_arr() const { return block_arr_.obj(); }
-
- virtual Status GetPyResult(PyObject** output) {
- PyObject* result = PyDict_New();
- RETURN_IF_PYERROR();
-
- PyDict_SetItemString(result, "block", block_arr_.obj());
- PyDict_SetItemString(result, "placement", placement_arr_.obj());
-
- *output = result;
-
- return Status::OK();
- }
-
- protected:
- Status AllocateNDArray(int npy_type, int ndim = 2) {
- PyAcquireGIL lock;
-
- PyObject* block_arr;
- if (ndim == 2) {
- npy_intp block_dims[2] = {num_columns_, num_rows_};
- block_arr = PyArray_SimpleNew(2, block_dims, npy_type);
- } else {
- npy_intp block_dims[1] = {num_rows_};
- block_arr = PyArray_SimpleNew(1, block_dims, npy_type);
- }
-
- if (block_arr == NULL) {
- // TODO(wesm): propagating Python exception
- return Status::OK();
- }
-
- npy_intp placement_dims[1] = {num_columns_};
- PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
- if (placement_arr == NULL) {
- // TODO(wesm): propagating Python exception
- return Status::OK();
- }
-
- block_arr_.reset(block_arr);
- placement_arr_.reset(placement_arr);
-
- block_data_ = reinterpret_cast<uint8_t*>(
- PyArray_DATA(reinterpret_cast<PyArrayObject*>(block_arr)));
-
- placement_data_ = reinterpret_cast<int64_t*>(
- PyArray_DATA(reinterpret_cast<PyArrayObject*>(placement_arr)));
-
- return Status::OK();
- }
-
- int64_t num_rows_;
- int num_columns_;
-
- OwnedRef block_arr_;
- uint8_t* block_data_;
-
- // ndarray<int32>
- OwnedRef placement_arr_;
- int64_t* placement_data_;
-
- DISALLOW_COPY_AND_ASSIGN(PandasBlock);
-};
-
-template <typename T>
-inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) {
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
- // Upcast to double, set NaN as appropriate
-
- for (int i = 0; i < arr->length(); ++i) {
- *out_values++ = prim_arr->IsNull(i) ? NAN : in_values[i];
- }
- }
-}
-
-template <typename T>
-inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) {
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
- memcpy(out_values, in_values, sizeof(T) * arr->length());
- out_values += arr->length();
- }
-}
-
-template <typename InType, typename OutType>
-inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) {
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
- for (int64_t i = 0; i < arr->length(); ++i) {
- *out_values = in_values[i];
- }
- }
-}
-
-static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_values) {
- PyAcquireGIL lock;
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto bool_arr = static_cast<BooleanArray*>(arr.get());
-
- for (int64_t i = 0; i < arr->length(); ++i) {
- if (bool_arr->IsNull(i)) {
- Py_INCREF(Py_None);
- *out_values++ = Py_None;
- } else if (bool_arr->Value(i)) {
- // True
- Py_INCREF(Py_True);
- *out_values++ = Py_True;
- } else {
- // False
- Py_INCREF(Py_False);
- *out_values++ = Py_False;
- }
- }
- }
- return Status::OK();
-}
-
-static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) {
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto bool_arr = static_cast<BooleanArray*>(arr.get());
- for (int64_t i = 0; i < arr->length(); ++i) {
- *out_values++ = static_cast<uint8_t>(bool_arr->Value(i));
- }
- }
-}
-
-template <typename ArrayType>
-inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) {
- PyAcquireGIL lock;
- for (int c = 0; c < data.num_chunks(); c++) {
- auto arr = static_cast<ArrayType*>(data.chunk(c).get());
-
- const uint8_t* data_ptr;
- int32_t length;
- const bool has_nulls = data.null_count() > 0;
- for (int64_t i = 0; i < arr->length(); ++i) {
- if (has_nulls && arr->IsNull(i)) {
- Py_INCREF(Py_None);
- *out_values = Py_None;
- } else {
- data_ptr = arr->GetValue(i, &length);
- *out_values = WrapBytes<ArrayType>::Wrap(data_ptr, length);
- if (*out_values == nullptr) {
- PyErr_Clear();
- std::stringstream ss;
- ss << "Wrapping "
- << std::string(reinterpret_cast<const char*>(data_ptr), length) << " failed";
- return Status::UnknownError(ss.str());
- }
- }
- ++out_values;
- }
- }
- return Status::OK();
-}
-
-template <typename ArrowType>
-inline Status ConvertListsLike(
- const std::shared_ptr<Column>& col, PyObject** out_values) {
- const ChunkedArray& data = *col->data().get();
- auto list_type = std::static_pointer_cast<ListType>(col->type());
-
- // Get column of underlying value arrays
- std::vector<std::shared_ptr<Array>> value_arrays;
- for (int c = 0; c < data.num_chunks(); c++) {
- auto arr = std::static_pointer_cast<ListArray>(data.chunk(c));
- value_arrays.emplace_back(arr->values());
- }
- auto flat_column = std::make_shared<Column>(list_type->value_field(), value_arrays);
- // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
- // Storing a reference to the whole Array would be to expensive.
- PyObject* numpy_array;
- RETURN_NOT_OK(ConvertColumnToPandas(flat_column, nullptr, &numpy_array));
-
- PyAcquireGIL lock;
-
- for (int c = 0; c < data.num_chunks(); c++) {
- auto arr = std::static_pointer_cast<ListArray>(data.chunk(c));
-
- const uint8_t* data_ptr;
- const bool has_nulls = data.null_count() > 0;
- for (int64_t i = 0; i < arr->length(); ++i) {
- if (has_nulls && arr->IsNull(i)) {
- Py_INCREF(Py_None);
- *out_values = Py_None;
- } else {
- PyObject* start = PyLong_FromLong(arr->value_offset(i));
- PyObject* end = PyLong_FromLong(arr->value_offset(i + 1));
- PyObject* slice = PySlice_New(start, end, NULL);
- *out_values = PyObject_GetItem(numpy_array, slice);
- Py_DECREF(start);
- Py_DECREF(end);
- Py_DECREF(slice);
- }
- ++out_values;
- }
- }
-
- Py_XDECREF(numpy_array);
- return Status::OK();
-}
-
-template <typename T>
-inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) {
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
-
- const uint8_t* valid_bits = arr->null_bitmap_data();
-
- if (arr->null_count() > 0) {
- for (int64_t i = 0; i < arr->length(); ++i) {
- *out_values++ = BitUtil::BitNotSet(valid_bits, i) ? na_value : in_values[i];
- }
- } else {
- memcpy(out_values, in_values, sizeof(T) * arr->length());
- out_values += arr->length();
- }
- }
-}
-
-template <typename InType, typename OutType>
-inline void ConvertNumericNullableCast(
- const ChunkedArray& data, OutType na_value, OutType* out_values) {
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
-
- for (int64_t i = 0; i < arr->length(); ++i) {
- *out_values++ = arr->IsNull(i) ? na_value : static_cast<OutType>(in_values[i]);
- }
- }
-}
-
// Convert a date column stored as milliseconds-since-epoch into whole days,
// writing na_value for null slots.
// NOTE(review): integer division truncates toward zero, so negative
// (pre-1970) timestamps that are not exact day multiples round one day late
// — confirm whether upstream ever produces pre-epoch dates here.
template <typename T>
inline void ConvertDates(const ChunkedArray& data, T na_value, T* out_values) {
  for (int c = 0; c < data.num_chunks(); c++) {
    const std::shared_ptr<Array> arr = data.chunk(c);
    auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
    auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());

    for (int64_t i = 0; i < arr->length(); ++i) {
      // There are 1000 * 60 * 60 * 24 = 86400000ms in a day
      *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / 86400000;
    }
  }
}
-
-template <typename InType, int SHIFT>
-inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) {
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
-
- for (int64_t i = 0; i < arr->length(); ++i) {
- *out_values++ = arr->IsNull(i) ? kPandasTimestampNull
- : (static_cast<int64_t>(in_values[i]) * SHIFT);
- }
- }
-}
-
-#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \
- case Type::ArrowEnum: \
- RETURN_NOT_OK((ConvertListsLike<ArrowType>(col, out_buffer))); \
- break;
-
-class ObjectBlock : public PandasBlock {
- public:
- using PandasBlock::PandasBlock;
- virtual ~ObjectBlock() {}
-
- Status Allocate() override { return AllocateNDArray(NPY_OBJECT); }
-
- Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
- int64_t rel_placement) override {
- Type::type type = col->type()->type;
-
- PyObject** out_buffer =
- reinterpret_cast<PyObject**>(block_data_) + rel_placement * num_rows_;
-
- const ChunkedArray& data = *col->data().get();
-
- if (type == Type::BOOL) {
- RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer));
- } else if (type == Type::BINARY) {
- RETURN_NOT_OK(ConvertBinaryLike<BinaryArray>(data, out_buffer));
- } else if (type == Type::STRING) {
- RETURN_NOT_OK(ConvertBinaryLike<StringArray>(data, out_buffer));
- } else if (type == Type::LIST) {
- auto list_type = std::static_pointer_cast<ListType>(col->type());
- switch (list_type->value_type()->type) {
- CONVERTLISTSLIKE_CASE(UInt8Type, UINT8)
- CONVERTLISTSLIKE_CASE(Int8Type, INT8)
- CONVERTLISTSLIKE_CASE(UInt16Type, UINT16)
- CONVERTLISTSLIKE_CASE(Int16Type, INT16)
- CONVERTLISTSLIKE_CASE(UInt32Type, UINT32)
- CONVERTLISTSLIKE_CASE(Int32Type, INT32)
- CONVERTLISTSLIKE_CASE(UInt64Type, UINT64)
- CONVERTLISTSLIKE_CASE(Int64Type, INT64)
- CONVERTLISTSLIKE_CASE(TimestampType, TIMESTAMP)
- CONVERTLISTSLIKE_CASE(FloatType, FLOAT)
- CONVERTLISTSLIKE_CASE(DoubleType, DOUBLE)
- CONVERTLISTSLIKE_CASE(StringType, STRING)
- default: {
- std::stringstream ss;
- ss << "Not implemented type for lists: " << list_type->value_type()->ToString();
- return Status::NotImplemented(ss.str());
- }
- }
- } else {
- std::stringstream ss;
- ss << "Unsupported type for object array output: " << col->type()->ToString();
- return Status::NotImplemented(ss.str());
- }
-
- placement_data_[rel_placement] = abs_placement;
- return Status::OK();
- }
-};
-
-template <int ARROW_TYPE, typename C_TYPE>
-class IntBlock : public PandasBlock {
- public:
- using PandasBlock::PandasBlock;
-
- Status Allocate() override {
- return AllocateNDArray(arrow_traits<ARROW_TYPE>::npy_type);
- }
-
- Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
- int64_t rel_placement) override {
- Type::type type = col->type()->type;
-
- C_TYPE* out_buffer =
- reinterpret_cast<C_TYPE*>(block_data_) + rel_placement * num_rows_;
-
- const ChunkedArray& data = *col->data().get();
-
- if (type != ARROW_TYPE) { return Status::NotImplemented(col->type()->ToString()); }
-
- ConvertIntegerNoNullsSameType<C_TYPE>(data, out_buffer);
- placement_data_[rel_placement] = abs_placement;
- return Status::OK();
- }
-};
-
-using UInt8Block = IntBlock<Type::UINT8, uint8_t>;
-using Int8Block = IntBlock<Type::INT8, int8_t>;
-using UInt16Block = IntBlock<Type::UINT16, uint16_t>;
-using Int16Block = IntBlock<Type::INT16, int16_t>;
-using UInt32Block = IntBlock<Type::UINT32, uint32_t>;
-using Int32Block = IntBlock<Type::INT32, int32_t>;
-using UInt64Block = IntBlock<Type::UINT64, uint64_t>;
-using Int64Block = IntBlock<Type::INT64, int64_t>;
-
-class Float32Block : public PandasBlock {
- public:
- using PandasBlock::PandasBlock;
-
- Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); }
-
- Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
- int64_t rel_placement) override {
- Type::type type = col->type()->type;
-
- if (type != Type::FLOAT) { return Status::NotImplemented(col->type()->ToString()); }
-
- float* out_buffer = reinterpret_cast<float*>(block_data_) + rel_placement * num_rows_;
-
- ConvertNumericNullable<float>(*col->data().get(), NAN, out_buffer);
- placement_data_[rel_placement] = abs_placement;
- return Status::OK();
- }
-};
-
-class Float64Block : public PandasBlock {
- public:
- using PandasBlock::PandasBlock;
-
- Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); }
-
- Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
- int64_t rel_placement) override {
- Type::type type = col->type()->type;
-
- double* out_buffer =
- reinterpret_cast<double*>(block_data_) + rel_placement * num_rows_;
-
- const ChunkedArray& data = *col->data().get();
-
-#define INTEGER_CASE(IN_TYPE) \
- ConvertIntegerWithNulls<IN_TYPE>(data, out_buffer); \
- break;
-
- switch (type) {
- case Type::UINT8:
- INTEGER_CASE(uint8_t);
- case Type::INT8:
- INTEGER_CASE(int8_t);
- case Type::UINT16:
- INTEGER_CASE(uint16_t);
- case Type::INT16:
- INTEGER_CASE(int16_t);
- case Type::UINT32:
- INTEGER_CASE(uint32_t);
- case Type::INT32:
- INTEGER_CASE(int32_t);
- case Type::UINT64:
- INTEGER_CASE(uint64_t);
- case Type::INT64:
- INTEGER_CASE(int64_t);
- case Type::FLOAT:
- ConvertNumericNullableCast<float, double>(data, NAN, out_buffer);
- break;
- case Type::DOUBLE:
- ConvertNumericNullable<double>(data, NAN, out_buffer);
- break;
- default:
- return Status::NotImplemented(col->type()->ToString());
- }
-
-#undef INTEGER_CASE
-
- placement_data_[rel_placement] = abs_placement;
- return Status::OK();
- }
-};
-
-class BoolBlock : public PandasBlock {
- public:
- using PandasBlock::PandasBlock;
-
- Status Allocate() override { return AllocateNDArray(NPY_BOOL); }
-
- Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
- int64_t rel_placement) override {
- Type::type type = col->type()->type;
-
- if (type != Type::BOOL) { return Status::NotImplemented(col->type()->ToString()); }
-
- uint8_t* out_buffer =
- reinterpret_cast<uint8_t*>(block_data_) + rel_placement * num_rows_;
-
- ConvertBooleanNoNulls(*col->data().get(), out_buffer);
- placement_data_[rel_placement] = abs_placement;
- return Status::OK();
- }
-};
-
-class DatetimeBlock : public PandasBlock {
- public:
- using PandasBlock::PandasBlock;
-
- Status AllocateDatetime(int ndim) {
- RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME, ndim));
-
- PyAcquireGIL lock;
- auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
- PyArray_DESCR(reinterpret_cast<PyArrayObject*>(block_arr_.obj()))->c_metadata);
- date_dtype->meta.base = NPY_FR_ns;
- return Status::OK();
- }
-
- Status Allocate() override { return AllocateDatetime(2); }
-
- Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
- int64_t rel_placement) override {
- Type::type type = col->type()->type;
-
- int64_t* out_buffer =
- reinterpret_cast<int64_t*>(block_data_) + rel_placement * num_rows_;
-
- const ChunkedArray& data = *col.get()->data();
-
- if (type == Type::DATE64) {
- // Date64Type is millisecond timestamp stored as int64_t
- // TODO(wesm): Do we want to make sure to zero out the milliseconds?
- ConvertDatetimeNanos<int64_t, 1000000L>(data, out_buffer);
- } else if (type == Type::TIMESTAMP) {
- auto ts_type = static_cast<TimestampType*>(col->type().get());
-
- if (ts_type->unit == TimeUnit::NANO) {
- ConvertNumericNullable<int64_t>(data, kPandasTimestampNull, out_buffer);
- } else if (ts_type->unit == TimeUnit::MICRO) {
- ConvertDatetimeNanos<int64_t, 1000L>(data, out_buffer);
- } else if (ts_type->unit == TimeUnit::MILLI) {
- ConvertDatetimeNanos<int64_t, 1000000L>(data, out_buffer);
- } else if (ts_type->unit == TimeUnit::SECOND) {
- ConvertDatetimeNanos<int64_t, 1000000000L>(data, out_buffer);
- } else {
- return Status::NotImplemented("Unsupported time unit");
- }
- } else {
- return Status::NotImplemented(col->type()->ToString());
- }
-
- placement_data_[rel_placement] = abs_placement;
- return Status::OK();
- }
-};
-
-class DatetimeTZBlock : public DatetimeBlock {
- public:
- DatetimeTZBlock(const std::string& timezone, int64_t num_rows)
- : DatetimeBlock(num_rows, 1), timezone_(timezone) {}
-
- // Like Categorical, the internal ndarray is 1-dimensional
- Status Allocate() override { return AllocateDatetime(1); }
-
- Status GetPyResult(PyObject** output) override {
- PyObject* result = PyDict_New();
- RETURN_IF_PYERROR();
-
- PyObject* py_tz = PyUnicode_FromStringAndSize(
- timezone_.c_str(), static_cast<Py_ssize_t>(timezone_.size()));
- RETURN_IF_PYERROR();
-
- PyDict_SetItemString(result, "block", block_arr_.obj());
- PyDict_SetItemString(result, "timezone", py_tz);
- PyDict_SetItemString(result, "placement", placement_arr_.obj());
-
- *output = result;
-
- return Status::OK();
- }
-
- private:
- std::string timezone_;
-};
-
-template <int ARROW_INDEX_TYPE>
-class CategoricalBlock : public PandasBlock {
- public:
- CategoricalBlock(int64_t num_rows) : PandasBlock(num_rows, 1) {}
-
- Status Allocate() override {
- constexpr int npy_type = arrow_traits<ARROW_INDEX_TYPE>::npy_type;
-
- if (!(npy_type == NPY_INT8 || npy_type == NPY_INT16 || npy_type == NPY_INT32 ||
- npy_type == NPY_INT64)) {
- return Status::Invalid("Category indices must be signed integers");
- }
- return AllocateNDArray(npy_type, 1);
- }
-
- Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
- int64_t rel_placement) override {
- using T = typename arrow_traits<ARROW_INDEX_TYPE>::T;
-
- T* out_values = reinterpret_cast<T*>(block_data_) + rel_placement * num_rows_;
-
- const ChunkedArray& data = *col->data().get();
-
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- const auto& dict_arr = static_cast<const DictionaryArray&>(*arr);
- const auto& indices = static_cast<const PrimitiveArray&>(*dict_arr.indices());
- auto in_values = reinterpret_cast<const T*>(indices.data()->data());
-
- // Null is -1 in CategoricalBlock
- for (int i = 0; i < arr->length(); ++i) {
- *out_values++ = indices.IsNull(i) ? -1 : in_values[i];
- }
- }
-
- placement_data_[rel_placement] = abs_placement;
-
- auto dict_type = static_cast<const DictionaryType*>(col->type().get());
-
- PyObject* dict;
- RETURN_NOT_OK(ConvertArrayToPandas(dict_type->dictionary(), nullptr, &dict));
- dictionary_.reset(dict);
-
- return Status::OK();
- }
-
- Status GetPyResult(PyObject** output) override {
- PyObject* result = PyDict_New();
- RETURN_IF_PYERROR();
-
- PyDict_SetItemString(result, "block", block_arr_.obj());
- PyDict_SetItemString(result, "dictionary", dictionary_.obj());
- PyDict_SetItemString(result, "placement", placement_arr_.obj());
-
- *output = result;
-
- return Status::OK();
- }
-
- protected:
- OwnedRef dictionary_;
-};
-
-Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns,
- std::shared_ptr<PandasBlock>* block) {
-#define BLOCK_CASE(NAME, TYPE) \
- case PandasBlock::NAME: \
- *block = std::make_shared<TYPE>(num_rows, num_columns); \
- break;
-
- switch (type) {
- BLOCK_CASE(OBJECT, ObjectBlock);
- BLOCK_CASE(UINT8, UInt8Block);
- BLOCK_CASE(INT8, Int8Block);
- BLOCK_CASE(UINT16, UInt16Block);
- BLOCK_CASE(INT16, Int16Block);
- BLOCK_CASE(UINT32, UInt32Block);
- BLOCK_CASE(INT32, Int32Block);
- BLOCK_CASE(UINT64, UInt64Block);
- BLOCK_CASE(INT64, Int64Block);
- BLOCK_CASE(FLOAT, Float32Block);
- BLOCK_CASE(DOUBLE, Float64Block);
- BLOCK_CASE(BOOL, BoolBlock);
- BLOCK_CASE(DATETIME, DatetimeBlock);
- default:
- return Status::NotImplemented("Unsupported block type");
- }
-
-#undef BLOCK_CASE
-
- return (*block)->Allocate();
-}
-
-static inline Status MakeCategoricalBlock(const std::shared_ptr<DataType>& type,
- int64_t num_rows, std::shared_ptr<PandasBlock>* block) {
- // All categoricals become a block with a single column
- auto dict_type = static_cast<const DictionaryType*>(type.get());
- switch (dict_type->index_type()->type) {
- case Type::INT8:
- *block = std::make_shared<CategoricalBlock<Type::INT8>>(num_rows);
- break;
- case Type::INT16:
- *block = std::make_shared<CategoricalBlock<Type::INT16>>(num_rows);
- break;
- case Type::INT32:
- *block = std::make_shared<CategoricalBlock<Type::INT32>>(num_rows);
- break;
- case Type::INT64:
- *block = std::make_shared<CategoricalBlock<Type::INT64>>(num_rows);
- break;
- default: {
- std::stringstream ss;
- ss << "Categorical index type not implemented: "
- << dict_type->index_type()->ToString();
- return Status::NotImplemented(ss.str());
- }
- }
- return (*block)->Allocate();
-}
-
-using BlockMap = std::unordered_map<int, std::shared_ptr<PandasBlock>>;
-
-// Construct the exact pandas 0.x "BlockManager" memory layout
-//
-// * For each column determine the correct output pandas type
-// * Allocate 2D blocks (ncols x nrows) for each distinct data type in output
-// * Allocate block placement arrays
-// * Write Arrow columns out into each slice of memory; populate block
-// * placement arrays as we go
-class DataFrameBlockCreator {
- public:
- DataFrameBlockCreator(const std::shared_ptr<Table>& table) : table_(table) {}
-
- Status Convert(int nthreads, PyObject** output) {
- column_types_.resize(table_->num_columns());
- column_block_placement_.resize(table_->num_columns());
- type_counts_.clear();
- blocks_.clear();
-
- RETURN_NOT_OK(CreateBlocks());
- RETURN_NOT_OK(WriteTableToBlocks(nthreads));
-
- return GetResultList(output);
- }
-
- Status CreateBlocks() {
- for (int i = 0; i < table_->num_columns(); ++i) {
- std::shared_ptr<Column> col = table_->column(i);
- PandasBlock::type output_type;
-
- Type::type column_type = col->type()->type;
- switch (column_type) {
- case Type::BOOL:
- output_type = col->null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL;
- break;
- case Type::UINT8:
- output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8;
- break;
- case Type::INT8:
- output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8;
- break;
- case Type::UINT16:
- output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16;
- break;
- case Type::INT16:
- output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16;
- break;
- case Type::UINT32:
- output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT32;
- break;
- case Type::INT32:
- output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32;
- break;
- case Type::INT64:
- output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64;
- break;
- case Type::UINT64:
- output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT64;
- break;
- case Type::FLOAT:
- output_type = PandasBlock::FLOAT;
- break;
- case Type::DOUBLE:
- output_type = PandasBlock::DOUBLE;
- break;
- case Type::STRING:
- case Type::BINARY:
- output_type = PandasBlock::OBJECT;
- break;
- case Type::DATE64:
- output_type = PandasBlock::DATETIME;
- break;
- case Type::TIMESTAMP: {
- const auto& ts_type = static_cast<const TimestampType&>(*col->type());
- if (ts_type.timezone != "") {
- output_type = PandasBlock::DATETIME_WITH_TZ;
- } else {
- output_type = PandasBlock::DATETIME;
- }
- } break;
- case Type::LIST: {
- auto list_type = std::static_pointer_cast<ListType>(col->type());
- if (!ListTypeSupported(list_type->value_type()->type)) {
- std::stringstream ss;
- ss << "Not implemented type for lists: "
- << list_type->value_type()->ToString();
- return Status::NotImplemented(ss.str());
- }
- output_type = PandasBlock::OBJECT;
- } break;
- case Type::DICTIONARY:
- output_type = PandasBlock::CATEGORICAL;
- break;
- default:
- return Status::NotImplemented(col->type()->ToString());
- }
-
- int block_placement = 0;
- std::shared_ptr<PandasBlock> block;
- if (output_type == PandasBlock::CATEGORICAL) {
- RETURN_NOT_OK(MakeCategoricalBlock(col->type(), table_->num_rows(), &block));
- categorical_blocks_[i] = block;
- } else if (output_type == PandasBlock::DATETIME_WITH_TZ) {
- const auto& ts_type = static_cast<const TimestampType&>(*col->type());
- block = std::make_shared<DatetimeTZBlock>(ts_type.timezone, table_->num_rows());
- RETURN_NOT_OK(block->Allocate());
- datetimetz_blocks_[i] = block;
- } else {
- auto it = type_counts_.find(output_type);
- if (it != type_counts_.end()) {
- block_placement = it->second;
- // Increment count
- it->second += 1;
- } else {
- // Add key to map
- type_counts_[output_type] = 1;
- }
- }
-
- column_types_[i] = output_type;
- column_block_placement_[i] = block_placement;
- }
-
- // Create normal non-categorical blocks
- for (const auto& it : type_counts_) {
- PandasBlock::type type = static_cast<PandasBlock::type>(it.first);
- std::shared_ptr<PandasBlock> block;
- RETURN_NOT_OK(MakeBlock(type, table_->num_rows(), it.second, &block));
- blocks_[type] = block;
- }
- return Status::OK();
- }
-
- Status WriteTableToBlocks(int nthreads) {
- auto WriteColumn = [this](int i) {
- std::shared_ptr<Column> col = this->table_->column(i);
- PandasBlock::type output_type = this->column_types_[i];
-
- int rel_placement = this->column_block_placement_[i];
-
- std::shared_ptr<PandasBlock> block;
- if (output_type == PandasBlock::CATEGORICAL) {
- auto it = this->categorical_blocks_.find(i);
- if (it == this->blocks_.end()) {
- return Status::KeyError("No categorical block allocated");
- }
- block = it->second;
- } else if (output_type == PandasBlock::DATETIME_WITH_TZ) {
- auto it = this->datetimetz_blocks_.find(i);
- if (it == this->datetimetz_blocks_.end()) {
- return Status::KeyError("No datetimetz block allocated");
- }
- block = it->second;
- } else {
- auto it = this->blocks_.find(output_type);
- if (it == this->blocks_.end()) { return Status::KeyError("No block allocated"); }
- block = it->second;
- }
- return block->Write(col, i, rel_placement);
- };
-
- nthreads = std::min<int>(nthreads, table_->num_columns());
-
- if (nthreads == 1) {
- for (int i = 0; i < table_->num_columns(); ++i) {
- RETURN_NOT_OK(WriteColumn(i));
- }
- } else {
- std::vector<std::thread> thread_pool;
- thread_pool.reserve(nthreads);
- std::atomic<int> task_counter(0);
-
- std::mutex error_mtx;
- bool error_occurred = false;
- Status error;
-
- for (int thread_id = 0; thread_id < nthreads; ++thread_id) {
- thread_pool.emplace_back(
- [this, &error, &error_occurred, &error_mtx, &task_counter, &WriteColumn]() {
- int column_num;
- while (!error_occurred) {
- column_num = task_counter.fetch_add(1);
- if (column_num >= this->table_->num_columns()) { break; }
- Status s = WriteColumn(column_num);
- if (!s.ok()) {
- std::lock_guard<std::mutex> lock(error_mtx);
- error_occurred = true;
- error = s;
- break;
- }
- }
- });
- }
- for (auto&& thread : thread_pool) {
- thread.join();
- }
-
- if (error_occurred) { return error; }
- }
- return Status::OK();
- }
-
- Status AppendBlocks(const BlockMap& blocks, PyObject* list) {
- for (const auto& it : blocks) {
- PyObject* item;
- RETURN_NOT_OK(it.second->GetPyResult(&item));
- if (PyList_Append(list, item) < 0) { RETURN_IF_PYERROR(); }
- }
- return Status::OK();
- }
-
- Status GetResultList(PyObject** out) {
- PyAcquireGIL lock;
-
- PyObject* result = PyList_New(0);
- RETURN_IF_PYERROR();
-
- RETURN_NOT_OK(AppendBlocks(blocks_, result));
- RETURN_NOT_OK(AppendBlocks(categorical_blocks_, result));
- RETURN_NOT_OK(AppendBlocks(datetimetz_blocks_, result));
-
- *out = result;
- return Status::OK();
- }
-
- private:
- std::shared_ptr<Table> table_;
-
- // column num -> block type id
- std::vector<PandasBlock::type> column_types_;
-
- // column num -> relative placement within internal block
- std::vector<int> column_block_placement_;
-
- // block type -> type count
- std::unordered_map<int, int> type_counts_;
-
- // block type -> block
- BlockMap blocks_;
-
- // column number -> categorical block
- BlockMap categorical_blocks_;
-
- // column number -> datetimetz block
- BlockMap datetimetz_blocks_;
-};
-
-class ArrowDeserializer {
- public:
- ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref)
- : col_(col), data_(*col->data().get()), py_ref_(py_ref) {}
-
- Status AllocateOutput(int type) {
- PyAcquireGIL lock;
-
- npy_intp dims[1] = {col_->length()};
- result_ = PyArray_SimpleNew(1, dims, type);
- arr_ = reinterpret_cast<PyArrayObject*>(result_);
-
- if (arr_ == NULL) {
- // Error occurred, trust that SimpleNew set the error state
- return Status::OK();
- }
-
- set_numpy_metadata(type, col_->type().get(), arr_);
-
- return Status::OK();
- }
-
- template <int TYPE>
- Status ConvertValuesZeroCopy(int npy_type, std::shared_ptr<Array> arr) {
- typedef typename arrow_traits<TYPE>::T T;
-
- auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
-
- // Zero-Copy. We can pass the data pointer directly to NumPy.
- void* data = const_cast<T*>(in_values);
-
- PyAcquireGIL lock;
-
- // Zero-Copy. We can pass the data pointer directly to NumPy.
- npy_intp dims[1] = {col_->length()};
- result_ = PyArray_SimpleNewFromData(1, dims, npy_type, data);
- arr_ = reinterpret_cast<PyArrayObject*>(result_);
-
- if (arr_ == NULL) {
- // Error occurred, trust that SimpleNew set the error state
- return Status::OK();
- }
-
- set_numpy_metadata(npy_type, col_->type().get(), arr_);
-
- if (PyArray_SetBaseObject(arr_, py_ref_) == -1) {
- // Error occurred, trust that SetBaseObject set the error state
- return Status::OK();
- } else {
- // PyArray_SetBaseObject steals our reference to py_ref_
- Py_INCREF(py_ref_);
- }
-
- // Arrow data is immutable.
- PyArray_CLEARFLAGS(arr_, NPY_ARRAY_WRITEABLE);
-
- return Status::OK();
- }
-
- // ----------------------------------------------------------------------
- // Allocate new array and deserialize. Can do a zero copy conversion for some
- // types
-
- Status Convert(PyObject** out) {
-#define CONVERT_CASE(TYPE) \
- case Type::TYPE: { \
- RETURN_NOT_OK(ConvertValues<Type::TYPE>()); \
- } break;
-
- switch (col_->type()->type) {
- CONVERT_CASE(BOOL);
- CONVERT_CASE(INT8);
- CONVERT_CASE(INT16);
- CONVERT_CASE(INT32);
- CONVERT_CASE(INT64);
- CONVERT_CASE(UINT8);
- CONVERT_CASE(UINT16);
- CONVERT_CASE(UINT32);
- CONVERT_CASE(UINT64);
- CONVERT_CASE(FLOAT);
- CONVERT_CASE(DOUBLE);
- CONVERT_CASE(BINARY);
- CONVERT_CASE(STRING);
- CONVERT_CASE(DATE64);
- CONVERT_CASE(TIMESTAMP);
- CONVERT_CASE(DICTIONARY);
- CONVERT_CASE(LIST);
- default: {
- std::stringstream ss;
- ss << "Arrow type reading not implemented for " << col_->type()->ToString();
- return Status::NotImplemented(ss.str());
- }
- }
-
-#undef CONVERT_CASE
-
- *out = result_;
- return Status::OK();
- }
-
- template <int TYPE>
- inline typename std::enable_if<
- (TYPE != Type::DATE64) & arrow_traits<TYPE>::is_numeric_nullable, Status>::type
- ConvertValues() {
- typedef typename arrow_traits<TYPE>::T T;
- int npy_type = arrow_traits<TYPE>::npy_type;
-
- if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
- return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
- }
-
- RETURN_NOT_OK(AllocateOutput(npy_type));
- auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
- ConvertNumericNullable<T>(data_, arrow_traits<TYPE>::na_value, out_values);
-
- return Status::OK();
- }
-
- template <int TYPE>
- inline typename std::enable_if<TYPE == Type::DATE64, Status>::type ConvertValues() {
- typedef typename arrow_traits<TYPE>::T T;
-
- RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
- auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
- ConvertDates<T>(data_, arrow_traits<TYPE>::na_value, out_values);
- return Status::OK();
- }
-
- // Integer specialization
- template <int TYPE>
- inline
- typename std::enable_if<arrow_traits<TYPE>::is_numeric_not_nullable, Status>::type
- ConvertValues() {
- typedef typename arrow_traits<TYPE>::T T;
- int npy_type = arrow_traits<TYPE>::npy_type;
-
- if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
- return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
- }
-
- if (data_.null_count() > 0) {
- RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
- auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_));
- ConvertIntegerWithNulls<T>(data_, out_values);
- } else {
- RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
- auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
- ConvertIntegerNoNullsSameType<T>(data_, out_values);
- }
-
- return Status::OK();
- }
-
- // Boolean specialization
- template <int TYPE>
- inline typename std::enable_if<arrow_traits<TYPE>::is_boolean, Status>::type
- ConvertValues() {
- if (data_.null_count() > 0) {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- RETURN_NOT_OK(ConvertBooleanWithNulls(data_, out_values));
- } else {
- RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
- auto out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(arr_));
- ConvertBooleanNoNulls(data_, out_values);
- }
- return Status::OK();
- }
-
- // UTF8 strings
- template <int TYPE>
- inline typename std::enable_if<TYPE == Type::STRING, Status>::type ConvertValues() {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertBinaryLike<StringArray>(data_, out_values);
- }
-
- template <int T2>
- inline typename std::enable_if<T2 == Type::BINARY, Status>::type ConvertValues() {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertBinaryLike<BinaryArray>(data_, out_values);
- }
-
-#define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \
- case Type::ArrowEnum: \
- return ConvertListsLike<ArrowType>(col_, out_values);
-
- template <int T2>
- inline typename std::enable_if<T2 == Type::LIST, Status>::type ConvertValues() {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- auto list_type = std::static_pointer_cast<ListType>(col_->type());
- switch (list_type->value_type()->type) {
- CONVERTVALUES_LISTSLIKE_CASE(UInt8Type, UINT8)
- CONVERTVALUES_LISTSLIKE_CASE(Int8Type, INT8)
- CONVERTVALUES_LISTSLIKE_CASE(UInt16Type, UINT16)
- CONVERTVALUES_LISTSLIKE_CASE(Int16Type, INT16)
- CONVERTVALUES_LISTSLIKE_CASE(UInt32Type, UINT32)
- CONVERTVALUES_LISTSLIKE_CASE(Int32Type, INT32)
- CONVERTVALUES_LISTSLIKE_CASE(UInt64Type, UINT64)
- CONVERTVALUES_LISTSLIKE_CASE(Int64Type, INT64)
- CONVERTVALUES_LISTSLIKE_CASE(TimestampType, TIMESTAMP)
- CONVERTVALUES_LISTSLIKE_CASE(FloatType, FLOAT)
- CONVERTVALUES_LISTSLIKE_CASE(DoubleType, DOUBLE)
- CONVERTVALUES_LISTSLIKE_CASE(StringType, STRING)
- default: {
- std::stringstream ss;
- ss << "Not implemented type for lists: " << list_type->value_type()->ToString();
- return Status::NotImplemented(ss.str());
- }
- }
- }
-
- template <int TYPE>
- inline typename std::enable_if<TYPE == Type::DICTIONARY, Status>::type ConvertValues() {
- std::shared_ptr<PandasBlock> block;
- RETURN_NOT_OK(MakeCategoricalBlock(col_->type(), col_->length(), &block));
- RETURN_NOT_OK(block->Write(col_, 0, 0));
-
- auto dict_type = static_cast<const DictionaryType*>(col_->type().get());
-
- PyAcquireGIL lock;
- result_ = PyDict_New();
- RETURN_IF_PYERROR();
-
- PyObject* dictionary;
- RETURN_NOT_OK(ConvertArrayToPandas(dict_type->dictionary(), nullptr, &dictionary));
-
- PyDict_SetItemString(result_, "indices", block->block_arr());
- PyDict_SetItemString(result_, "dictionary", dictionary);
-
- return Status::OK();
- }
-
- private:
- std::shared_ptr<Column> col_;
- const ChunkedArray& data_;
- PyObject* py_ref_;
- PyArrayObject* arr_;
- PyObject* result_;
-};
-
-Status ConvertArrayToPandas(
- const std::shared_ptr<Array>& arr, PyObject* py_ref, PyObject** out) {
- static std::string dummy_name = "dummy";
- auto field = std::make_shared<Field>(dummy_name, arr->type());
- auto col = std::make_shared<Column>(field, arr);
- return ConvertColumnToPandas(col, py_ref, out);
-}
-
-Status ConvertColumnToPandas(
- const std::shared_ptr<Column>& col, PyObject* py_ref, PyObject** out) {
- ArrowDeserializer converter(col, py_ref);
- return converter.Convert(out);
-}
-
-Status ConvertTableToPandas(
- const std::shared_ptr<Table>& table, int nthreads, PyObject** out) {
- DataFrameBlockCreator helper(table);
- return helper.Convert(nthreads, out);
-}
-
-} // namespace py
-} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/adapters/pandas.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
deleted file mode 100644
index 6862339..0000000
--- a/python/src/pyarrow/adapters/pandas.h
+++ /dev/null
@@ -1,79 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Functions for converting between pandas's NumPy-based data representation
-// and Arrow data structures
-
-#ifndef PYARROW_ADAPTERS_PANDAS_H
-#define PYARROW_ADAPTERS_PANDAS_H
-
-#include <Python.h>
-
-#include <memory>
-
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Array;
-class Column;
-class DataType;
-class MemoryPool;
-class Status;
-class Table;
-
-namespace py {
-
-ARROW_EXPORT
-Status ConvertArrayToPandas(
- const std::shared_ptr<Array>& arr, PyObject* py_ref, PyObject** out);
-
-ARROW_EXPORT
-Status ConvertColumnToPandas(
- const std::shared_ptr<Column>& col, PyObject* py_ref, PyObject** out);
-
-struct PandasOptions {
- bool strings_to_categorical;
-};
-
-// Convert a whole table as efficiently as possible to a pandas.DataFrame.
-//
-// The returned Python object is a list of tuples consisting of the exact 2D
-// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
-//
-// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
-ARROW_EXPORT
-Status ConvertTableToPandas(
- const std::shared_ptr<Table>& table, int nthreads, PyObject** out);
-
-ARROW_EXPORT
-Status PandasDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out);
-
-ARROW_EXPORT
-Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
-
-/// Convert dtype=object arrays. If target data type is not known, pass a type
-/// with nullptr
-ARROW_EXPORT
-Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
-
-} // namespace py
-} // namespace arrow
-
-#endif // PYARROW_ADAPTERS_PANDAS_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/api.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h
deleted file mode 100644
index f65cc09..0000000
--- a/python/src/pyarrow/api.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
#ifndef PYARROW_API_H
#define PYARROW_API_H

// Umbrella header: pulls in the public pyarrow C++ API in one include.
#include "pyarrow/helpers.h"

#include "pyarrow/adapters/builtin.h"
#include "pyarrow/adapters/pandas.h"

#endif  // PYARROW_API_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/common.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc
deleted file mode 100644
index 792aa47..0000000
--- a/python/src/pyarrow/common.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "pyarrow/common.h"
-
-#include <cstdlib>
-#include <mutex>
-#include <sstream>
-
-#include "arrow/memory_pool.h"
-#include "arrow/status.h"
-
-namespace arrow {
-namespace py {
-
-static std::mutex memory_pool_mutex;
-static MemoryPool* default_pyarrow_pool = nullptr;
-
-void set_default_memory_pool(MemoryPool* pool) {
- std::lock_guard<std::mutex> guard(memory_pool_mutex);
- default_pyarrow_pool = pool;
-}
-
-MemoryPool* get_memory_pool() {
- std::lock_guard<std::mutex> guard(memory_pool_mutex);
- if (default_pyarrow_pool) {
- return default_pyarrow_pool;
- } else {
- return default_memory_pool();
- }
-}
-
-// ----------------------------------------------------------------------
-// PyBuffer
-
-PyBuffer::PyBuffer(PyObject* obj)
- : Buffer(nullptr, 0) {
- if (PyObject_CheckBuffer(obj)) {
- obj_ = PyMemoryView_FromObject(obj);
- Py_buffer* buffer = PyMemoryView_GET_BUFFER(obj_);
- data_ = reinterpret_cast<const uint8_t*>(buffer->buf);
- size_ = buffer->len;
- capacity_ = buffer->len;
- is_mutable_ = false;
- Py_INCREF(obj_);
- }
-}
-
-PyBuffer::~PyBuffer() {
- PyAcquireGIL lock;
- Py_DECREF(obj_);
-}
-
-} // namespace py
-} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/common.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
deleted file mode 100644
index b4e4ea6..0000000
--- a/python/src/pyarrow/common.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PYARROW_COMMON_H
-#define PYARROW_COMMON_H
-
-#include "pyarrow/config.h"
-
-#include "arrow/buffer.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class MemoryPool;
-
-namespace py {
-
-class PyAcquireGIL {
- public:
- PyAcquireGIL() { state_ = PyGILState_Ensure(); }
-
- ~PyAcquireGIL() { PyGILState_Release(state_); }
-
- private:
- PyGILState_STATE state_;
- DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL);
-};
-
-#define PYARROW_IS_PY2 PY_MAJOR_VERSION <= 2
-
// Owning wrapper around a PyObject* reference: decrefs the held object (if
// any) on destruction, acquiring the GIL first.
//
// NOTE(review): copy construction/assignment are not disabled; copying an
// OwnedRef would lead to a double Py_XDECREF of the same object. Callers
// presumably never copy -- confirm before relying on that.
class OwnedRef {
 public:
  OwnedRef() : obj_(nullptr) {}

  // Steals the reference to `obj` (no Py_INCREF is performed).
  OwnedRef(PyObject* obj) : obj_(obj) {}

  ~OwnedRef() {
    PyAcquireGIL lock;
    Py_XDECREF(obj_);
  }

  // Drop the currently held reference (if any) and take ownership of `obj`.
  // NOTE(review): unlike the destructor, this does not acquire the GIL --
  // callers must already hold it.
  void reset(PyObject* obj) {
    if (obj_ != nullptr) { Py_XDECREF(obj_); }
    obj_ = obj;
  }

  // Relinquish ownership without decrefing; the caller becomes responsible
  // for releasing the reference.
  void release() { obj_ = nullptr; }

  // Borrowed access to the held object (may be nullptr).
  PyObject* obj() const { return obj_; }

 private:
  PyObject* obj_;
};
-
// Exposes a Python string-like object as a C string. Unicode objects are
// encoded to UTF-8 (the temporary bytes object is kept alive by tmp_obj);
// bytes objects are used as-is.
struct PyObjectStringify {
  // Keeps the UTF-8-encoded temporary alive for the lifetime of `bytes`.
  OwnedRef tmp_obj;
  // Borrowed pointer into the bytes object; valid only while this struct
  // (and, for non-unicode input, `obj` itself) is alive.
  // NOTE(review): PyBytes_AsString returns nullptr on failure (encoding
  // error or non-bytes input) -- callers should check before dereferencing.
  const char* bytes;

  PyObjectStringify(PyObject* obj) {
    PyObject* bytes_obj;
    if (PyUnicode_Check(obj)) {
      bytes_obj = PyUnicode_AsUTF8String(obj);
      tmp_obj.reset(bytes_obj);
    } else {
      bytes_obj = obj;
    }
    bytes = PyBytes_AsString(bytes_obj);
  }
};
-
// If a Python exception is pending, clear it and return it as an Arrow
// Status::UnknownError carrying the stringified exception value.
// TODO(wesm): We can just let errors pass through. To be explored later
#define RETURN_IF_PYERROR()                                              \
  if (PyErr_Occurred()) {                                                \
    PyObject *exc_type, *exc_value, *traceback;                          \
    PyErr_Fetch(&exc_type, &exc_value, &traceback);                      \
    /* PyErr_Fetch may yield an unnormalized exception with a NULL */    \
    /* value; the old code crashed stringifying it. Stringification */   \
    /* can also fail, returning a NULL char*. Guard both cases. */       \
    std::string message;                                                 \
    if (exc_value != nullptr) {                                          \
      PyObjectStringify stringified(exc_value);                          \
      if (stringified.bytes != nullptr) { message = stringified.bytes; } \
    }                                                                    \
    Py_XDECREF(exc_type);                                                \
    Py_XDECREF(exc_value);                                               \
    Py_XDECREF(traceback);                                               \
    PyErr_Clear();                                                       \
    return Status::UnknownError(message);                                \
  }
-
// Get and set the default PyArrow memory pool
-ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool);
-ARROW_EXPORT MemoryPool* get_memory_pool();
-
-class ARROW_EXPORT NumPyBuffer : public Buffer {
- public:
- NumPyBuffer(PyArrayObject* arr) : Buffer(nullptr, 0) {
- arr_ = arr;
- Py_INCREF(arr);
-
- data_ = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
- size_ = PyArray_SIZE(arr_) * PyArray_DESCR(arr_)->elsize;
- capacity_ = size_;
- }
-
- virtual ~NumPyBuffer() { Py_XDECREF(arr_); }
-
- private:
- PyArrayObject* arr_;
-};
-
// Read-only Arrow Buffer view over a Python buffer-protocol object.
class ARROW_EXPORT PyBuffer : public Buffer {
 public:
  /// Note that the GIL must be held when calling the PyBuffer constructor.
  ///
  /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports
  /// one-dimensional byte buffers.
  PyBuffer(PyObject* obj);
  ~PyBuffer();

 private:
  // Memoryview created from the source object; released (under the GIL) by
  // the destructor -- see common.cc for ownership details.
  PyObject* obj_;
};
-
-} // namespace py
-} // namespace arrow
-
-#endif // PYARROW_COMMON_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/config.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.cc b/python/src/pyarrow/config.cc
deleted file mode 100644
index 0be6d96..0000000
--- a/python/src/pyarrow/config.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <Python.h>
-
-#include "pyarrow/config.h"
-
-namespace arrow {
-namespace py {
-
-void pyarrow_init() {}
-
-PyObject* numpy_nan = nullptr;
-
-void pyarrow_set_numpy_nan(PyObject* obj) {
- Py_INCREF(obj);
- numpy_nan = obj;
-}
-
-} // namespace py
-} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/config.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h
deleted file mode 100644
index 87fc5c2..0000000
--- a/python/src/pyarrow/config.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
#ifndef PYARROW_CONFIG_H
#define PYARROW_CONFIG_H

#include <Python.h>

#include "arrow/util/visibility.h"

#include "pyarrow/numpy_interop.h"

// Python 3 removed the PyString APIs; map the check onto PyUnicode so shared
// code can use PyString_Check on both major versions.
// NOTE(review): this redefines a CPython API name for every translation unit
// including this header -- confirm no clash with code expecting Python 2
// string semantics.
#if PY_MAJOR_VERSION >= 3
#define PyString_Check PyUnicode_Check
#endif

namespace arrow {
namespace py {

// Global reference to numpy.nan, populated via pyarrow_set_numpy_nan().
ARROW_EXPORT
extern PyObject* numpy_nan;

// One-time initialization hook called at module import (see config.cc).
ARROW_EXPORT
void pyarrow_init();

// Store a strong reference to numpy.nan for use by the conversion code.
ARROW_EXPORT
void pyarrow_set_numpy_nan(PyObject* obj);

}  // namespace py
}  // namespace arrow

#endif  // PYARROW_CONFIG_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/do_import_numpy.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/do_import_numpy.h b/python/src/pyarrow/do_import_numpy.h
deleted file mode 100644
index bb4a382..0000000
--- a/python/src/pyarrow/do_import_numpy.h
+++ /dev/null
@@ -1,21 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
// Trick borrowed from dynd-python for initializing the NumPy array API

// Trigger the array import (inversion of NO_IMPORT_ARRAY)
// NOTE(review): per the NumPy C-API convention, exactly one translation unit
// should include this header before "pyarrow/numpy_interop.h" so that
// import_array() actually runs there -- verify against the build setup.
#define NUMPY_IMPORT_ARRAY
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
deleted file mode 100644
index 43edf8a..0000000
--- a/python/src/pyarrow/helpers.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "pyarrow/helpers.h"
-
-#include <arrow/api.h>
-
-namespace arrow {
-namespace py {
-
-#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
- case Type::NAME: \
- return FACTORY(); \
- break;
-
-std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
- switch (type) {
- case Type::NA:
- return null();
- GET_PRIMITIVE_TYPE(UINT8, uint8);
- GET_PRIMITIVE_TYPE(INT8, int8);
- GET_PRIMITIVE_TYPE(UINT16, uint16);
- GET_PRIMITIVE_TYPE(INT16, int16);
- GET_PRIMITIVE_TYPE(UINT32, uint32);
- GET_PRIMITIVE_TYPE(INT32, int32);
- GET_PRIMITIVE_TYPE(UINT64, uint64);
- GET_PRIMITIVE_TYPE(INT64, int64);
- GET_PRIMITIVE_TYPE(DATE32, date32);
- GET_PRIMITIVE_TYPE(DATE64, date64);
- GET_PRIMITIVE_TYPE(BOOL, boolean);
- GET_PRIMITIVE_TYPE(FLOAT, float32);
- GET_PRIMITIVE_TYPE(DOUBLE, float64);
- GET_PRIMITIVE_TYPE(BINARY, binary);
- GET_PRIMITIVE_TYPE(STRING, utf8);
- default:
- return nullptr;
- }
-}
-
-} // namespace py
-} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/python/src/pyarrow/helpers.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h
deleted file mode 100644
index 611e814..0000000
--- a/python/src/pyarrow/helpers.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
#ifndef PYARROW_HELPERS_H
#define PYARROW_HELPERS_H

#include <memory>

#include "arrow/type.h"
#include "arrow/util/visibility.h"

namespace arrow {
namespace py {

// Return the shared DataType instance for a primitive Arrow type id, or
// nullptr if the id does not name a primitive type (see helpers.cc).
ARROW_EXPORT
std::shared_ptr<DataType> GetPrimitiveType(Type::type type);

}  // namespace py
}  // namespace arrow

#endif  // PYARROW_HELPERS_H