You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/03 22:05:44 UTC
arrow git commit: ARROW-443: [Python] Support ingest of strided NumPy
arrays from pandas
Repository: arrow
Updated Branches:
refs/heads/master 7d1d4e751 -> f05b7c62c
ARROW-443: [Python] Support ingest of strided NumPy arrays from pandas
Author: Wes McKinney <we...@twosigma.com>
Closes #482 from wesm/ARROW-443 and squashes the following commits:
d9b36c0 [Wes McKinney] Run commented out test cases, fix issue
8f0ff38 [Wes McKinney] cpplint
88eef1a [Wes McKinney] Support strided mask argument in some object conversions
22d4489 [Wes McKinney] First cut at strided NumPy import
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/f05b7c62
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/f05b7c62
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/f05b7c62
Branch: refs/heads/master
Commit: f05b7c62cf6151a2a03292508628f8f1a8e7a1aa
Parents: 7d1d4e7
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Apr 3 18:05:39 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Apr 3 18:05:39 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/python/config.cc | 1 +
cpp/src/arrow/python/numpy-internal.h | 66 ++++++
cpp/src/arrow/python/pandas_convert.cc | 252 +++++++++++++++--------
python/pyarrow/array.pyx | 4 +-
python/pyarrow/tests/test_convert_pandas.py | 59 +++++-
5 files changed, 290 insertions(+), 92 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/cpp/src/arrow/python/config.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/config.cc b/cpp/src/arrow/python/config.cc
index 2abc4dd..c2a6916 100644
--- a/cpp/src/arrow/python/config.cc
+++ b/cpp/src/arrow/python/config.cc
@@ -16,6 +16,7 @@
// under the License.
#include <Python.h>
+#include <datetime.h>
#include "arrow/python/config.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/cpp/src/arrow/python/numpy-internal.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
new file mode 100644
index 0000000..fcc6a58
--- /dev/null
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Internal utilities for dealing with NumPy
+
+#ifndef ARROW_PYTHON_NUMPY_INTERNAL_H
+#define ARROW_PYTHON_NUMPY_INTERNAL_H
+
+#include <Python.h>
+
+#include <cstdint>
+
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_interop.h"
+
+namespace arrow {
+namespace py {
+
+/// \brief Indexing convenience for interacting with strided 1-dim ndarray objects
+///
+/// The indexer keeps a strong reference to the wrapped array: Init() and the
+/// copy operations increment the array's reference count and the destructor
+/// releases it.
+/// NOTE(review): Python reference counting normally requires the GIL to be
+/// held; confirm this for every construction/destruction site.
+///
+/// Element addresses are computed from the stride in *bytes*. Dividing the
+/// signed byte stride by the unsigned sizeof(T) would turn a negative stride
+/// (e.g. a reversed view) into a huge positive value, and would truncate
+/// strides that are not a multiple of sizeof(T).
+template <typename T>
+class Ndarray1DIndexer {
+ public:
+  typedef int64_t size_type;
+
+  /// Create an empty indexer; Init() must be called before size() or
+  /// operator[] are used
+  Ndarray1DIndexer() : arr_(nullptr), data_(nullptr), stride_(0) {}
+
+  explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { Init(arr); }
+
+  /// Copies share the same array and each hold their own reference, so that
+  /// every destructor call is balanced by exactly one INCREF
+  Ndarray1DIndexer(const Ndarray1DIndexer& other)
+      : arr_(other.arr_), data_(other.data_), stride_(other.stride_) {
+    Py_XINCREF(arr_);
+  }
+
+  Ndarray1DIndexer& operator=(const Ndarray1DIndexer& other) {
+    if (this != &other) {
+      // Acquire the new reference before releasing the old one
+      Py_XINCREF(other.arr_);
+      Py_XDECREF(arr_);
+      arr_ = other.arr_;
+      data_ = other.data_;
+      stride_ = other.stride_;
+    }
+    return *this;
+  }
+
+  /// Bind the indexer to a 1-dimensional array, replacing (and releasing) any
+  /// array it was previously bound to
+  void Init(PyArrayObject* arr) {
+    DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays";
+    // INCREF first so that re-initializing with the same array is safe
+    Py_INCREF(arr);
+    Py_XDECREF(arr_);
+    arr_ = arr;
+    data_ = reinterpret_cast<uint8_t*>(PyArray_DATA(arr));
+    // Keep the stride signed and in bytes
+    stride_ = static_cast<int64_t>(PyArray_STRIDES(arr)[0]);
+  }
+
+  ~Ndarray1DIndexer() { Py_XDECREF(arr_); }
+
+  /// Number of elements in the wrapped array
+  int64_t size() const { return PyArray_SIZE(arr_); }
+
+  /// Reference to the element at logical position `index` (no bounds check)
+  T& operator[](size_type index) {
+    return *reinterpret_cast<T*>(data_ + index * stride_);
+  }
+
+ private:
+  PyArrayObject* arr_;
+  uint8_t* data_;
+  int64_t stride_;  // distance between consecutive elements, in bytes
+};
+
+} // namespace py
+} // namespace arrow
+
+#endif // ARROW_PYTHON_NUMPY_INTERNAL_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 01019e5..9577892 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -47,6 +47,7 @@
#include "arrow/python/builtin_convert.h"
#include "arrow/python/common.h"
#include "arrow/python/config.h"
+#include "arrow/python/numpy-internal.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/type_traits.h"
#include "arrow/python/util/datetime.h"
@@ -70,15 +71,16 @@ static inline bool PyObject_is_string(const PyObject* obj) {
}
template <int TYPE>
-static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
+static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
typedef npy_traits<TYPE> traits;
typedef typename traits::value_type T;
int64_t null_count = 0;
- const T* values = reinterpret_cast<const T*>(data);
+
+ Ndarray1DIndexer<T> values(arr);
// TODO(wesm): striding
- for (int i = 0; i < length; ++i) {
+ for (int i = 0; i < values.size(); ++i) {
if (traits::isnull(values[i])) {
++null_count;
} else {
@@ -92,8 +94,8 @@ static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap)
// Returns null count
static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
int64_t null_count = 0;
- const uint8_t* mask_values = static_cast<const uint8_t*>(PyArray_DATA(mask));
- // TODO(wesm): strided null mask
+
+ Ndarray1DIndexer<uint8_t> mask_values(mask);
for (int i = 0; i < length; ++i) {
if (mask_values[i]) {
++null_count;
@@ -138,13 +140,24 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
return Status::OK();
}
-Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
- PyObject** objects, bool* have_bytes) {
+static Status AppendObjectStrings(
+ PyArrayObject* arr, PyArrayObject* mask, StringBuilder* builder, bool* have_bytes) {
PyObject* obj;
- for (int64_t i = 0; i < objects_length; ++i) {
+ Ndarray1DIndexer<PyObject*> objects(arr);
+ Ndarray1DIndexer<uint8_t> mask_values;
+
+ bool have_mask = false;
+ if (mask != nullptr) {
+ mask_values.Init(mask);
+ have_mask = true;
+ }
+
+ for (int64_t i = 0; i < objects.size(); ++i) {
obj = objects[i];
- if (PyUnicode_Check(obj)) {
+ if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+ RETURN_NOT_OK(builder->AppendNull());
+ } else if (PyUnicode_Check(obj)) {
obj = PyUnicode_AsUTF8String(obj);
if (obj == NULL) {
PyErr_Clear();
@@ -158,8 +171,6 @@ Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
*have_bytes = true;
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
- } else if (PyObject_is_null(obj)) {
- RETURN_NOT_OK(builder->AppendNull());
} else {
return InvalidConversion(obj, "string or bytes");
}
@@ -168,13 +179,24 @@ Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
return Status::OK();
}
-static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width,
- FixedSizeBinaryBuilder* builder, PyObject** objects) {
+static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mask,
+ int byte_width, FixedSizeBinaryBuilder* builder) {
PyObject* obj;
- for (int64_t i = 0; i < objects_length; ++i) {
+ Ndarray1DIndexer<PyObject*> objects(arr);
+ Ndarray1DIndexer<uint8_t> mask_values;
+
+ bool have_mask = false;
+ if (mask != nullptr) {
+ mask_values.Init(mask);
+ have_mask = true;
+ }
+
+ for (int64_t i = 0; i < objects.size(); ++i) {
obj = objects[i];
- if (PyUnicode_Check(obj)) {
+ if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+ RETURN_NOT_OK(builder->AppendNull());
+ } else if (PyUnicode_Check(obj)) {
obj = PyUnicode_AsUTF8String(obj);
if (obj == NULL) {
PyErr_Clear();
@@ -190,8 +212,6 @@ static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width
RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
RETURN_NOT_OK(
builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
- } else if (PyObject_is_null(obj)) {
- RETURN_NOT_OK(builder->AppendNull());
} else {
return InvalidConversion(obj, "string or bytes");
}
@@ -299,8 +319,7 @@ class PandasConverter : public TypeVisitor {
} else if (traits::supports_nulls) {
// TODO(wesm): this presumes the NumPy C type and arrow C type are the
// same
- null_count = ValuesToBitmap<traits::npy_type>(
- PyArray_DATA(arr_), length_, null_bitmap_data_);
+ null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
}
std::vector<FieldMetadata> fields(1);
@@ -329,36 +348,33 @@ class PandasConverter : public TypeVisitor {
#undef VISIT_NATIVE
- Status Convert(std::shared_ptr<Array>* out) {
+ Status Convert() {
if (PyArray_NDIM(arr_) != 1) {
return Status::Invalid("only handle 1-dimensional arrays");
}
- // TODO(wesm): strided arrays
- if (is_strided()) { return Status::Invalid("no support for strided data yet"); }
if (type_ == nullptr) { return Status::Invalid("Must pass data type"); }
// Visit the type to perform conversion
RETURN_NOT_OK(type_->Accept(this));
- *out = out_;
return Status::OK();
}
+ std::shared_ptr<Array> result() const { return out_; }
+
// ----------------------------------------------------------------------
// Conversion logic for various object dtype arrays
template <int ITEM_TYPE, typename ArrowType>
- Status ConvertTypedLists(
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
+ Status ConvertTypedLists(const std::shared_ptr<DataType>& type);
- Status ConvertObjectStrings(std::shared_ptr<Array>* out);
- Status ConvertObjectFixedWidthBytes(
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
- Status ConvertBooleans(std::shared_ptr<Array>* out);
- Status ConvertDates(std::shared_ptr<Array>* out);
- Status ConvertLists(const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
- Status ConvertObjects(std::shared_ptr<Array>* out);
+ Status ConvertObjectStrings();
+ Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
+ Status ConvertBooleans();
+ Status ConvertDates();
+ Status ConvertLists(const std::shared_ptr<DataType>& type);
+ Status ConvertObjects();
protected:
MemoryPool* pool_;
@@ -374,9 +390,31 @@ class PandasConverter : public TypeVisitor {
uint8_t* null_bitmap_data_;
};
+/// Gather `length` elements from `input_data` into the contiguous
+/// `output_data`, reading every `stride`-th element of the input. The stride
+/// is counted in elements of T, not in bytes.
+template <typename T>
+void CopyStrided(T* input_data, int64_t length, int64_t stride, T* output_data) {
+  // Passing input_data as non-const is a concession to PyObject*
+  for (int64_t out_pos = 0, in_pos = 0; out_pos < length; ++out_pos, in_pos += stride) {
+    output_data[out_pos] = input_data[in_pos];
+  }
+}
+
+/// Specialization for Python object pointers: besides copying every
+/// `stride`-th pointer into `output_data`, each non-null copied pointer has
+/// its reference count incremented.
+template <>
+void CopyStrided<PyObject*>(
+    PyObject** input_data, int64_t length, int64_t stride, PyObject** output_data) {
+  for (int64_t out_pos = 0, in_pos = 0; out_pos < length; ++out_pos, in_pos += stride) {
+    PyObject* item = input_data[in_pos];
+    output_data[out_pos] = item;
+    // Py_XINCREF does nothing for a null pointer
+    Py_XINCREF(item);
+  }
+}
+
template <typename ArrowType>
inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
using traits = arrow_traits<ArrowType::type_id>;
+ using T = typename traits::T;
// Handle LONGLONG->INT64 and other fun things
int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
@@ -385,7 +423,20 @@ inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
return Status::NotImplemented("NumPy type casts not yet implemented");
}
- *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
+ if (is_strided()) {
+ // Strided, must copy into new contiguous memory
+ const int64_t stride = PyArray_STRIDES(arr_)[0];
+ const int64_t stride_elements = stride / sizeof(T);
+
+ auto new_buffer = std::make_shared<PoolBuffer>(pool_);
+ RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_));
+ CopyStrided(reinterpret_cast<T*>(PyArray_DATA(arr_)), length_, stride_elements,
+ reinterpret_cast<T*>(new_buffer->mutable_data()));
+ *data = new_buffer;
+ } else {
+ // Can zero-copy
+ *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
+ }
return Status::OK();
}
@@ -395,7 +446,7 @@ inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>*
auto buffer = std::make_shared<PoolBuffer>(pool_);
RETURN_NOT_OK(buffer->Resize(nbytes));
- const uint8_t* values = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+ Ndarray1DIndexer<uint8_t> values(arr_);
uint8_t* bitmap = buffer->mutable_data();
@@ -434,13 +485,22 @@ Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
return Status::TypeError(ss.str());
}
-Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertDates() {
PyAcquireGIL lock;
- PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+ Ndarray1DIndexer<PyObject*> objects(arr_);
+
+ if (mask_ != nullptr) {
+ return Status::NotImplemented("mask not supported in object conversions yet");
+ }
+
Date64Builder date_builder(pool_);
RETURN_NOT_OK(date_builder.Resize(length_));
+ /// We have to run this in this compilation unit, since we cannot use the
+ /// datetime API otherwise
+ PyDateTime_IMPORT;
+
Status s;
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
@@ -454,50 +514,57 @@ Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
return InvalidConversion(obj, "date");
}
}
- return date_builder.Finish(out);
+ return date_builder.Finish(&out_);
}
-Status PandasConverter::ConvertObjectStrings(std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertObjectStrings() {
PyAcquireGIL lock;
// The output type at this point is inconclusive because there may be bytes
// and unicode mixed in the object array
- PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
StringBuilder builder(pool_);
RETURN_NOT_OK(builder.Resize(length_));
Status s;
bool have_bytes = false;
- RETURN_NOT_OK(AppendObjectStrings(length_, &builder, objects, &have_bytes));
- RETURN_NOT_OK(builder.Finish(out));
+ RETURN_NOT_OK(AppendObjectStrings(arr_, mask_, &builder, &have_bytes));
+ RETURN_NOT_OK(builder.Finish(&out_));
if (have_bytes) {
- const auto& arr = static_cast<const StringArray&>(*out->get());
- *out = std::make_shared<BinaryArray>(arr.length(), arr.value_offsets(), arr.data(),
+ const auto& arr = static_cast<const StringArray&>(*out_);
+ out_ = std::make_shared<BinaryArray>(arr.length(), arr.value_offsets(), arr.data(),
arr.null_bitmap(), arr.null_count());
}
return Status::OK();
}
Status PandasConverter::ConvertObjectFixedWidthBytes(
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+ const std::shared_ptr<DataType>& type) {
PyAcquireGIL lock;
- PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+ Ndarray1DIndexer<PyObject*> objects(arr_);
+
+ int32_t value_size = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
+
FixedSizeBinaryBuilder builder(pool_, type);
RETURN_NOT_OK(builder.Resize(length_));
- RETURN_NOT_OK(AppendObjectFixedWidthBytes(length_,
- std::dynamic_pointer_cast<FixedSizeBinaryType>(builder.type())->byte_width(),
- &builder, objects));
- RETURN_NOT_OK(builder.Finish(out));
+ RETURN_NOT_OK(AppendObjectFixedWidthBytes(arr_, mask_, value_size, &builder));
+ RETURN_NOT_OK(builder.Finish(&out_));
return Status::OK();
}
-Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertBooleans() {
PyAcquireGIL lock;
- PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+ Ndarray1DIndexer<PyObject*> objects(arr_);
+ Ndarray1DIndexer<uint8_t> mask_values;
+
+ bool have_mask = false;
+ if (mask_ != nullptr) {
+ mask_values.Init(mask_);
+ have_mask = true;
+ }
int64_t nbytes = BitUtil::BytesForBits(length_);
auto data = std::make_shared<PoolBuffer>(pool_);
@@ -509,24 +576,24 @@ Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
obj = objects[i];
- if (obj == Py_True) {
+ if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+ ++null_count;
+ } else if (obj == Py_True) {
BitUtil::SetBit(bitmap, i);
BitUtil::SetBit(null_bitmap_data_, i);
} else if (obj == Py_False) {
BitUtil::SetBit(null_bitmap_data_, i);
- } else if (PyObject_is_null(obj)) {
- ++null_count;
} else {
return InvalidConversion(obj, "bool");
}
}
- *out = std::make_shared<BooleanArray>(length_, data, null_bitmap_, null_count);
+ out_ = std::make_shared<BooleanArray>(length_, data, null_bitmap_, null_count);
return Status::OK();
}
-Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertObjects() {
// Python object arrays are annoying, since we could have one of:
//
// * Strings
@@ -538,31 +605,27 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
RETURN_NOT_OK(InitNullBitmap());
- // TODO: mask not supported here
- if (mask_ != nullptr) {
- return Status::NotImplemented("mask not supported in object conversions yet");
- }
+ Ndarray1DIndexer<PyObject*> objects;
- const PyObject** objects;
{
PyAcquireGIL lock;
- objects = reinterpret_cast<const PyObject**>(PyArray_DATA(arr_));
+ objects.Init(arr_);
PyDateTime_IMPORT;
}
if (type_) {
switch (type_->type) {
case Type::STRING:
- return ConvertObjectStrings(out);
+ return ConvertObjectStrings();
case Type::FIXED_SIZE_BINARY:
- return ConvertObjectFixedWidthBytes(type_, out);
+ return ConvertObjectFixedWidthBytes(type_);
case Type::BOOL:
- return ConvertBooleans(out);
+ return ConvertBooleans();
case Type::DATE64:
- return ConvertDates(out);
+ return ConvertDates();
case Type::LIST: {
const auto& list_field = static_cast<const ListType&>(*type_);
- return ConvertLists(list_field.value_field()->type, out);
+ return ConvertLists(list_field.value_field()->type);
}
default:
return Status::TypeError("No known conversion to Arrow type");
@@ -572,11 +635,11 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
if (PyObject_is_null(objects[i])) {
continue;
} else if (PyObject_is_string(objects[i])) {
- return ConvertObjectStrings(out);
+ return ConvertObjectStrings();
} else if (PyBool_Check(objects[i])) {
- return ConvertBooleans(out);
+ return ConvertBooleans();
} else if (PyDate_CheckExact(objects[i])) {
- return ConvertDates(out);
+ return ConvertDates();
} else {
return InvalidConversion(
const_cast<PyObject*>(objects[i]), "string, bool, or date");
@@ -588,14 +651,22 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
}
template <int ITEM_TYPE, typename ArrowType>
-inline Status PandasConverter::ConvertTypedLists(
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type) {
typedef npy_traits<ITEM_TYPE> traits;
typedef typename traits::value_type T;
typedef typename traits::BuilderClass BuilderT;
PyAcquireGIL lock;
+ // TODO: mask not supported here
+ if (mask_ != nullptr) {
+ return Status::NotImplemented("mask not supported in object conversions yet");
+ }
+
+ if (is_strided()) {
+ return Status::NotImplemented("strided arrays not implemented for lists");
+ }
+
auto value_builder = std::make_shared<BuilderT>(pool_, type);
ListBuilder list_builder(pool_, value_builder);
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
@@ -637,16 +708,25 @@ inline Status PandasConverter::ConvertTypedLists(
return Status::TypeError("Unsupported Python type for list items");
}
}
- return list_builder.Finish(out);
+ return list_builder.Finish(&out_);
}
template <>
inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+ const std::shared_ptr<DataType>& type) {
PyAcquireGIL lock;
// TODO: If there are bytes involved, convert to Binary representation
bool have_bytes = false;
+ // TODO: mask not supported here
+ if (mask_ != nullptr) {
+ return Status::NotImplemented("mask not supported in object conversions yet");
+ }
+
+ if (is_strided()) {
+ return Status::NotImplemented("strided arrays not implemented for lists");
+ }
+
auto value_builder = std::make_shared<StringBuilder>(pool_);
ListBuilder list_builder(pool_, value_builder);
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
@@ -660,9 +740,8 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
// TODO(uwe): Support more complex numpy array structures
RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
- int64_t size = static_cast<int64_t>(PyArray_DIM(numpy_array, 0));
- auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
- RETURN_NOT_OK(AppendObjectStrings(size, value_builder.get(), data, &have_bytes));
+ RETURN_NOT_OK(
+ AppendObjectStrings(numpy_array, nullptr, value_builder.get(), &have_bytes));
} else if (PyList_Check(objects[i])) {
int64_t size;
std::shared_ptr<DataType> inferred_type;
@@ -678,16 +757,15 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
return Status::TypeError("Unsupported Python type for list items");
}
}
- return list_builder.Finish(out);
+ return list_builder.Finish(&out_);
}
-#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType) \
- case Type::TYPE: { \
- return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type, out); \
+#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType) \
+ case Type::TYPE: { \
+ return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type); \
}
-Status PandasConverter::ConvertLists(
- const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type) {
switch (type->type) {
LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
LIST_CASE(INT8, NPY_INT8, Int8Type)
@@ -711,13 +789,17 @@ Status PandasConverter::ConvertLists(
Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
PandasConverter converter(pool, ao, mo, type);
- return converter.Convert(out);
+ RETURN_NOT_OK(converter.Convert());
+ *out = converter.result();
+ return Status::OK();
}
Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
PandasConverter converter(pool, ao, mo, type);
- return converter.ConvertObjects(out);
+ RETURN_NOT_OK(converter.ConvertObjects());
+ *out = converter.result();
+ return Status::OK();
}
// ----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index e7c456d..67785e3 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -82,8 +82,8 @@ cdef class Array:
@staticmethod
def from_numpy(obj, mask=None, DataType type=None,
- timestamps_to_ms=False,
- MemoryPool memory_pool=None):
+ timestamps_to_ms=False,
+ MemoryPool memory_pool=None):
"""
Convert pandas.Series to an Arrow Array.
http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 0b3c02e..56830a8 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -75,16 +75,25 @@ class TestPandasConversion(unittest.TestCase):
expected = df
tm.assert_frame_equal(result, expected, check_dtype=check_dtype)
- def _check_array_roundtrip(self, values, expected=None,
+ def _check_array_roundtrip(self, values, expected=None, mask=None,
timestamps_to_ms=False, type=None):
arr = A.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms,
- type=type)
+ mask=mask, type=type)
result = arr.to_pandas()
- assert arr.null_count == pd.isnull(values).sum()
+ values_nulls = pd.isnull(values)
+ if mask is None:
+ assert arr.null_count == values_nulls.sum()
+ else:
+ assert arr.null_count == (mask | values_nulls).sum()
- tm.assert_series_equal(pd.Series(result), pd.Series(values),
- check_names=False)
+ if mask is None:
+ tm.assert_series_equal(pd.Series(result), pd.Series(values),
+ check_names=False)
+ else:
+ expected = pd.Series(np.ma.masked_array(values, mask=mask))
+ tm.assert_series_equal(pd.Series(result), expected,
+ check_names=False)
def test_float_no_nulls(self):
data = {}
@@ -402,3 +411,43 @@ class TestPandasConversion(unittest.TestCase):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
with self.assertRaises(A.error.ArrowException):
A.Table.from_pandas(data)
+
+ def test_strided_data_import(self):
+ cases = []
+
+ columns = ['a', 'b', 'c']
+ N, K = 100, 3
+ random_numbers = np.random.randn(N, K).copy() * 100
+
+ numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
+ 'f4', 'f8']
+
+ for type_name in numeric_dtypes:
+ cases.append(random_numbers.astype(type_name))
+
+ # strings
+ cases.append(np.array([tm.rands(10) for i in range(N * K)],
+ dtype=object)
+ .reshape(N, K).copy())
+
+ # booleans
+ boolean_objects = (np.array([True, False, True] * N, dtype=object)
+ .reshape(N, K).copy())
+
+ # add some nulls, so dtype comes back as objects
+ boolean_objects[5] = None
+ cases.append(boolean_objects)
+
+ cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
+ dtype='datetime64[ms]')
+ .reshape(N, K).copy())
+
+ strided_mask = (random_numbers > 0).astype(bool)[:, 0]
+
+ for case in cases:
+ df = pd.DataFrame(case, columns=columns)
+ col = df['a']
+
+ self._check_pandas_roundtrip(df)
+ self._check_array_roundtrip(col)
+ self._check_array_roundtrip(col, mask=strided_mask)