You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/06/28 15:35:49 UTC
arrow git commit: ARROW-834: Python Support creating from iterables
Repository: arrow
Updated Branches:
refs/heads/master a58893882 -> bddb2197d
ARROW-834: Python Support creating from iterables
Support creating arrow arrays from iterables.
Possible follow-up TODO (or possibly belongs in this issue): throw a clear exception when passed an iterator rather than an iterable.
Author: Holden Karau <ho...@us.ibm.com>
Closes #602 from holdenk/ARROW-834-csupport-creating-from-iterables and squashes the following commits:
750e7f4c [Holden Karau] Switch AppendItem to pure virtual for TypedConverterVisitor
0b72e956 [Holden Karau] Remove unnecessary file after merge
2ed00d91 [Holden Karau] Fix long line
ee2afaa4 [Holden Karau] Comment the built in converter type inference code a bit.
dddf57db [Holden Karau] Make a note about the resize/realloc in underflow with size
1fd9588a [Holden Karau] Do dynamic resize on the array buffer if size ended up being larger (e.g. support underflow from iterator constructors).
ad935e9d [Holden Karau] Have size override the size of the iterator if the iterator is larger.
42f06996 [Holden Karau] Style fix
fa0abcc2 [Holden Karau] Add ConvertPySequence to other side
01e462c2 [Holden Karau] Naive merge, lets see if it works
9eb3f106 [Holden Karau] Return the append inside of the decimal convert case/switch business
a571ad4b [Holden Karau] Merge in changes to timestamp/datetime builtin converter
8c42fdc2 [Holden Karau] Feedback from wes (fix some previously unchecked appends, fix long line )
389976cb [Holden Karau] Use CRTP in the iterator
52b03e3e [Holden Karau] Use a const ownedref
1d970bdb [Holden Karau] Switch the SeqVisitor to use OwnedRef
c429f9a5 [Holden Karau] Style fixes
d392daa8 [Holden Karau] Add limited pure iterator support and a note
be58bc0f [Holden Karau] Restore ArrowBlock (unrelated change)
3a55e824 [Holden Karau] Update array function description
80cc971e [Holden Karau] Cleanup debugging
63c0b7fa [Holden Karau] Tests pass (TODO cleanup debugging)
82ec3c3d [Holden Karau] revert changes to _array.pyx
ca0d5303 [Holden Karau] In theory this works ok now for iterables as well
b6c72f5c [Holden Karau] Make TypedConverterVisitor work on PySequence or Python Iterators
48b08aa5 [Holden Karau] Switch remaining converters
a1bf4bd1 [Holden Karau] Move over timestamp and byte converters
15cdfe34 [Holden Karau] Move more of the convertors to the visitor version
76e08ca5 [Holden Karau] Part of the way along adding iterable support
77c935b9 [Holden Karau] Revert accidental java change
5c0fa0b5 [Holden Karau] Start adding iterable support
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/bddb2197
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/bddb2197
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/bddb2197
Branch: refs/heads/master
Commit: bddb2197df4e3cba4a27da27cd15917fe30d3d45
Parents: a588938
Author: Holden Karau <ho...@us.ibm.com>
Authored: Wed Jun 28 11:35:45 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed Jun 28 11:35:45 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/python/builtin_convert.cc | 434 +++++++++++-----------
cpp/src/arrow/python/builtin_convert.h | 3 +-
cpp/src/arrow/python/pandas_convert.cc | 4 +-
python/pyarrow/array.pxi | 29 +-
python/pyarrow/includes/libarrow.pxd | 4 +
python/pyarrow/tests/test_convert_builtin.py | 50 +++
6 files changed, 307 insertions(+), 217 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 3197c2a..d3bfb37 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -145,48 +145,50 @@ class ScalarVisitor {
static constexpr int MAX_NESTING_LEVELS = 32;
+// SeqVisitor is used to infer the type.
class SeqVisitor {
public:
SeqVisitor() : max_nesting_level_(0) {
memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int));
}
+ // co-recursive with VisitElem
Status Visit(PyObject* obj, int level = 0) {
- Py_ssize_t size = PySequence_Size(obj);
-
if (level > max_nesting_level_) { max_nesting_level_ = level; }
- for (int64_t i = 0; i < size; ++i) {
- // TODO(wesm): Error checking?
- // TODO(wesm): Specialize for PyList_GET_ITEM?
- OwnedRef item_ref(PySequence_GetItem(obj, i));
- PyObject* item = item_ref.obj();
-
- if (PyList_Check(item)) {
- RETURN_NOT_OK(Visit(item, level + 1));
- } else if (PyDict_Check(item)) {
- return Status::NotImplemented("No type inference for dicts");
- } else {
- // We permit nulls at any level of nesting
- if (item == Py_None) {
- // TODO
- } else {
- ++nesting_histogram_[level];
- scalars_.Visit(item);
- }
+ // Loop through either a sequence or an iterator.
+ if (PySequence_Check(obj)) {
+ Py_ssize_t size = PySequence_Size(obj);
+ for (int64_t i = 0; i < size; ++i) {
+ // TODO(wesm): Specialize for PyList_GET_ITEM?
+ OwnedRef ref = OwnedRef(PySequence_GetItem(obj, i));
+ RETURN_NOT_OK(VisitElem(ref, level));
}
+ } else if (PyObject_HasAttrString(obj, "__iter__")) {
+ OwnedRef iter = OwnedRef(PyObject_GetIter(obj));
+ PyObject* item;
+ while ((item = PyIter_Next(iter.obj()))) {
+ OwnedRef ref = OwnedRef(item);
+ RETURN_NOT_OK(VisitElem(ref, level));
+ }
+ } else {
+ return Status::TypeError("Object is not a sequence or iterable");
}
return Status::OK();
}
std::shared_ptr<DataType> GetType() {
+ // If all the non-list inputs were null (or there were no inputs)
if (scalars_.total_count() == 0) {
if (max_nesting_level_ == 0) {
+ // If it's just a single empty list or list of nulls, return null.
return null();
} else {
+ // Error, if we have nesting but no concrete base type.
return nullptr;
}
} else {
+ // Lists of Lists of [X]
std::shared_ptr<DataType> result = scalars_.GetType();
for (int i = 0; i < max_nesting_level_; ++i) {
result = std::make_shared<ListType>(result);
@@ -199,6 +201,7 @@ class SeqVisitor {
if (scalars_.total_count() > 0) {
if (num_nesting_levels() > 1) {
return Status::Invalid("Mixed nesting levels not supported");
+ // If the nesting goes deeper than the deepest scalar
} else if (max_observed_level() < max_nesting_level_) {
return Status::Invalid("Mixed nesting levels not supported");
}
@@ -206,6 +209,7 @@ class SeqVisitor {
return Status::OK();
}
+ // Returns the deepest level which has scalar elements.
int max_observed_level() const {
int result = 0;
for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
@@ -214,6 +218,7 @@ class SeqVisitor {
return result;
}
+ // Returns the number of nesting levels which have scalar elements.
int num_nesting_levels() const {
int result = 0;
for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
@@ -226,16 +231,50 @@ class SeqVisitor {
ScalarVisitor scalars_;
// Track observed
+ // Deepest nesting level (regardless of scalars)
int max_nesting_level_;
+ // Number of scalar elements at each nesting level.
+ // (TODO: We really only need to know if a scalar is present, not the count).
int nesting_histogram_[MAX_NESTING_LEVELS];
+
+ // Visits a specific element (inner part of the loop).
+ Status VisitElem(const OwnedRef &item_ref, int level) {
+ if (PyList_Check(item_ref.obj())) {
+ RETURN_NOT_OK(Visit(item_ref.obj(), level + 1));
+ } else if (PyDict_Check(item_ref.obj())) {
+ return Status::NotImplemented("No type inference for dicts");
+ } else {
+ // We permit nulls at any level of nesting
+ if (item_ref.obj() == Py_None) {
+ // TODO
+ } else {
+ ++nesting_histogram_[level];
+ scalars_.Visit(item_ref.obj());
+ }
+ }
+ return Status::OK();
+ }
};
Status InferArrowSize(PyObject* obj, int64_t* size) {
- *size = static_cast<int64_t>(PySequence_Size(obj));
+ if (PySequence_Check(obj)) {
+ *size = static_cast<int64_t>(PySequence_Size(obj));
+ } else if (PyObject_HasAttrString(obj, "__iter__")) {
+ PyObject* iter = PyObject_GetIter(obj);
+ OwnedRef iter_ref(iter);
+ *size = 0;
+ PyObject* item;
+ while ((item = PyIter_Next(iter))) {
+ OwnedRef item_ref(item);
+ *size += 1;
+ }
+ } else {
+ return Status::TypeError("Object is not a sequence or iterable");
+ }
if (PyErr_Occurred()) {
// Not a sequence
PyErr_Clear();
- return Status::TypeError("Object is not a sequence");
+ return Status::TypeError("Object is not a sequence or iterable");
}
return Status::OK();
}
@@ -243,6 +282,7 @@ Status InferArrowSize(PyObject* obj, int64_t* size) {
// Non-exhaustive type inference
Status InferArrowTypeAndSize(
PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
+
RETURN_NOT_OK(InferArrowSize(obj, size));
// For 0-length sequences, refuse to guess
@@ -268,7 +308,9 @@ class SeqConverter {
return Status::OK();
}
- virtual Status AppendData(PyObject* seq) = 0;
+ virtual Status AppendData(PyObject* seq, int64_t size) = 0;
+
+ virtual ~SeqConverter() {}
protected:
std::shared_ptr<ArrayBuilder> builder_;
@@ -287,221 +329,210 @@ class TypedConverter : public SeqConverter {
BuilderType* typed_builder_;
};
-class BoolConverter : public TypedConverter<BooleanBuilder> {
+template <typename BuilderType, class Derived>
+class TypedConverterVisitor : public TypedConverter<BuilderType> {
public:
- Status AppendData(PyObject* seq) override {
- int64_t size = static_cast<int64_t>(PySequence_Size(seq));
- RETURN_NOT_OK(typed_builder_->Reserve(size));
- for (int64_t i = 0; i < size; ++i) {
- OwnedRef item(PySequence_GetItem(seq, i));
- if (item.obj() == Py_None) {
- typed_builder_->AppendNull();
- } else {
- if (item.obj() == Py_True) {
- typed_builder_->Append(true);
- } else {
- typed_builder_->Append(false);
- }
+ Status AppendData(PyObject* obj, int64_t size) override {
+ /// Ensure we've allocated enough space
+ RETURN_NOT_OK(this->typed_builder_->Reserve(size));
+ // Iterate over the items adding each one
+ if (PySequence_Check(obj)) {
+ for (int64_t i = 0; i < size; ++i) {
+ OwnedRef ref(PySequence_GetItem(obj, i));
+ RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+ }
+ } else if (PyObject_HasAttrString(obj, "__iter__")) {
+ PyObject* iter = PyObject_GetIter(obj);
+ OwnedRef iter_ref(iter);
+ PyObject* item;
+ int64_t i = 0;
+ // To allow people with long generators to only convert a subset, stop
+ // consuming at size.
+ while ((item = PyIter_Next(iter)) && i < size) {
+ OwnedRef ref(item);
+ RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+ ++i;
}
+ if (size != i) {
+ RETURN_NOT_OK(this->typed_builder_->Resize(i));
+ }
+ } else {
+ return Status::TypeError("Object is not a sequence or iterable");
}
return Status::OK();
}
+
+ virtual Status AppendItem(const OwnedRef& item) = 0;
};
-class Int64Converter : public TypedConverter<Int64Builder> {
+class BoolConverter : public TypedConverterVisitor<
+ BooleanBuilder, BoolConverter> {
public:
- Status AppendData(PyObject* seq) override {
- int64_t val;
- int64_t size = static_cast<int64_t>(PySequence_Size(seq));
- RETURN_NOT_OK(typed_builder_->Reserve(size));
- for (int64_t i = 0; i < size; ++i) {
- OwnedRef item(PySequence_GetItem(seq, i));
- if (item.obj() == Py_None) {
- typed_builder_->AppendNull();
+ inline Status AppendItem(const OwnedRef& item) {
+ if (item.obj() == Py_None) {
+ return typed_builder_->AppendNull();
+ } else {
+ if (item.obj() == Py_True) {
+ return typed_builder_->Append(true);
} else {
- val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
- RETURN_IF_PYERROR();
- typed_builder_->Append(val);
+ return typed_builder_->Append(false);
}
}
- return Status::OK();
}
};
-class DateConverter : public TypedConverter<Date64Builder> {
+class Int64Converter : public TypedConverterVisitor<
+ Int64Builder, Int64Converter> {
public:
- Status AppendData(PyObject* seq) override {
- int64_t size = static_cast<int64_t>(PySequence_Size(seq));
- RETURN_NOT_OK(typed_builder_->Reserve(size));
- for (int64_t i = 0; i < size; ++i) {
- OwnedRef item(PySequence_GetItem(seq, i));
- if (item.obj() == Py_None) {
- typed_builder_->AppendNull();
- } else {
- PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
- typed_builder_->Append(PyDate_to_ms(pydate));
- }
+ inline Status AppendItem(const OwnedRef& item) {
+ int64_t val;
+ if (item.obj() == Py_None) {
+ return typed_builder_->AppendNull();
+ } else {
+ val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+ RETURN_IF_PYERROR();
+ return typed_builder_->Append(val);
}
- return Status::OK();
}
};
-class TimestampConverter : public TypedConverter<TimestampBuilder> {
+class DateConverter : public TypedConverterVisitor<
+ Date64Builder, DateConverter> {
public:
- Status AppendData(PyObject* seq) override {
- int64_t size = static_cast<int64_t>(PySequence_Size(seq));
- RETURN_NOT_OK(typed_builder_->Reserve(size));
- for (int64_t i = 0; i < size; ++i) {
- OwnedRef item(PySequence_GetItem(seq, i));
- if (item.obj() == Py_None) {
- typed_builder_->AppendNull();
- } else {
- PyDateTime_DateTime* pydatetime =
- reinterpret_cast<PyDateTime_DateTime*>(item.obj());
- typed_builder_->Append(PyDateTime_to_us(pydatetime));
- RETURN_IF_PYERROR();
- }
+ inline Status AppendItem(const OwnedRef& item) {
+ if (item.obj() == Py_None) {
+ return typed_builder_->AppendNull();
+ } else {
+ PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
+ return typed_builder_->Append(PyDate_to_ms(pydate));
+ }
+ }
+};
+
+class TimestampConverter : public TypedConverterVisitor<
+ Date64Builder, TimestampConverter> {
+ public:
+ inline Status AppendItem(const OwnedRef& item) {
+ if (item.obj() == Py_None) {
+ return typed_builder_->AppendNull();
+ } else {
+ PyDateTime_DateTime* pydatetime =
+ reinterpret_cast<PyDateTime_DateTime*>(item.obj());
+ return typed_builder_->Append(PyDateTime_to_us(pydatetime));
}
- return Status::OK();
}
};
-class DoubleConverter : public TypedConverter<DoubleBuilder> {
+class DoubleConverter : public TypedConverterVisitor<
+ DoubleBuilder, DoubleConverter> {
public:
- Status AppendData(PyObject* seq) override {
+ inline Status AppendItem(const OwnedRef& item) {
double val;
- int64_t size = static_cast<int64_t>(PySequence_Size(seq));
- RETURN_NOT_OK(typed_builder_->Reserve(size));
- for (int64_t i = 0; i < size; ++i) {
- OwnedRef item(PySequence_GetItem(seq, i));
- if (item.obj() == Py_None) {
- typed_builder_->AppendNull();
- } else {
- val = PyFloat_AsDouble(item.obj());
- RETURN_IF_PYERROR();
- typed_builder_->Append(val);
- }
+ if (item.obj() == Py_None) {
+ return typed_builder_->AppendNull();
+ } else {
+ val = PyFloat_AsDouble(item.obj());
+ RETURN_IF_PYERROR();
+ return typed_builder_->Append(val);
}
- return Status::OK();
}
};
-class BytesConverter : public TypedConverter<BinaryBuilder> {
+class BytesConverter : public TypedConverterVisitor<
+ BinaryBuilder, BytesConverter> {
public:
- Status AppendData(PyObject* seq) override {
- PyObject* item;
+ inline Status AppendItem(const OwnedRef& item) {
PyObject* bytes_obj;
- OwnedRef tmp;
const char* bytes;
Py_ssize_t length;
- Py_ssize_t size = PySequence_Size(seq);
- for (int64_t i = 0; i < size; ++i) {
- item = PySequence_GetItem(seq, i);
- OwnedRef holder(item);
-
- if (item == Py_None) {
- RETURN_NOT_OK(typed_builder_->AppendNull());
- continue;
- } else if (PyUnicode_Check(item)) {
- tmp.reset(PyUnicode_AsUTF8String(item));
- RETURN_IF_PYERROR();
- bytes_obj = tmp.obj();
- } else if (PyBytes_Check(item)) {
- bytes_obj = item;
- } else {
- return InvalidConversion(item, "bytes");
- }
- // No error checking
- length = PyBytes_GET_SIZE(bytes_obj);
- bytes = PyBytes_AS_STRING(bytes_obj);
- RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast<int32_t>(length)));
+ OwnedRef tmp;
+
+ if (item.obj() == Py_None) {
+ RETURN_NOT_OK(typed_builder_->AppendNull());
+ return Status::OK();
+ } else if (PyUnicode_Check(item.obj())) {
+ tmp.reset(PyUnicode_AsUTF8String(item.obj()));
+ RETURN_IF_PYERROR();
+ bytes_obj = tmp.obj();
+ } else if (PyBytes_Check(item.obj())) {
+ bytes_obj = item.obj();
+ } else {
+ return InvalidConversion(item.obj(), "bytes");
}
- return Status::OK();
+ // No error checking
+ length = PyBytes_GET_SIZE(bytes_obj);
+ bytes = PyBytes_AS_STRING(bytes_obj);
+ return typed_builder_->Append(bytes, static_cast<int32_t>(length));
}
};
-class FixedWidthBytesConverter : public TypedConverter<FixedSizeBinaryBuilder> {
+class FixedWidthBytesConverter : public TypedConverterVisitor<
+ FixedSizeBinaryBuilder, FixedWidthBytesConverter> {
public:
- Status AppendData(PyObject* seq) override {
- PyObject* item;
+ inline Status AppendItem(const OwnedRef& item) {
PyObject* bytes_obj;
OwnedRef tmp;
Py_ssize_t expected_length = std::dynamic_pointer_cast<FixedSizeBinaryType>(
typed_builder_->type())->byte_width();
- Py_ssize_t size = PySequence_Size(seq);
- for (int64_t i = 0; i < size; ++i) {
- item = PySequence_GetItem(seq, i);
- OwnedRef holder(item);
-
- if (item == Py_None) {
- RETURN_NOT_OK(typed_builder_->AppendNull());
- continue;
- } else if (PyUnicode_Check(item)) {
- tmp.reset(PyUnicode_AsUTF8String(item));
- RETURN_IF_PYERROR();
- bytes_obj = tmp.obj();
- } else if (PyBytes_Check(item)) {
- bytes_obj = item;
- } else {
- return InvalidConversion(item, "bytes");
- }
- // No error checking
- RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
- RETURN_NOT_OK(typed_builder_->Append(
- reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(bytes_obj))));
+ if (item.obj() == Py_None) {
+ RETURN_NOT_OK(typed_builder_->AppendNull());
+ return Status::OK();
+ } else if (PyUnicode_Check(item.obj())) {
+ tmp.reset(PyUnicode_AsUTF8String(item.obj()));
+ RETURN_IF_PYERROR();
+ bytes_obj = tmp.obj();
+ } else if (PyBytes_Check(item.obj())) {
+ bytes_obj = item.obj();
+ } else {
+ return InvalidConversion(item.obj(), "bytes");
}
- return Status::OK();
+ // No error checking
+ RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
+ return typed_builder_->Append(
+ reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(bytes_obj)));
}
};
-class UTF8Converter : public TypedConverter<StringBuilder> {
+class UTF8Converter : public TypedConverterVisitor<
+ StringBuilder, UTF8Converter> {
public:
- Status AppendData(PyObject* seq) override {
- PyObject* item;
+ inline Status AppendItem(const OwnedRef& item) {
PyObject* bytes_obj;
OwnedRef tmp;
const char* bytes;
Py_ssize_t length;
- Py_ssize_t size = PySequence_Size(seq);
- for (int64_t i = 0; i < size; ++i) {
- item = PySequence_GetItem(seq, i);
- OwnedRef holder(item);
-
- if (item == Py_None) {
- RETURN_NOT_OK(typed_builder_->AppendNull());
- continue;
- } else if (!PyUnicode_Check(item)) {
- return Status::Invalid("Non-unicode value encountered");
- }
- tmp.reset(PyUnicode_AsUTF8String(item));
- RETURN_IF_PYERROR();
- bytes_obj = tmp.obj();
- // No error checking
- length = PyBytes_GET_SIZE(bytes_obj);
- bytes = PyBytes_AS_STRING(bytes_obj);
- RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast<int32_t>(length)));
+ if (item.obj() == Py_None) {
+ return typed_builder_->AppendNull();
+ } else if (!PyUnicode_Check(item.obj())) {
+ return Status::Invalid("Non-unicode value encountered");
}
- return Status::OK();
+ tmp.reset(PyUnicode_AsUTF8String(item.obj()));
+ RETURN_IF_PYERROR();
+ bytes_obj = tmp.obj();
+
+ // No error checking
+ length = PyBytes_GET_SIZE(bytes_obj);
+ bytes = PyBytes_AS_STRING(bytes_obj);
+ return typed_builder_->Append(bytes, static_cast<int32_t>(length));
}
};
-class ListConverter : public TypedConverter<ListBuilder> {
+class ListConverter : public TypedConverterVisitor<
+ ListBuilder, ListConverter> {
public:
Status Init(const std::shared_ptr<ArrayBuilder>& builder) override;
- Status AppendData(PyObject* seq) override {
- Py_ssize_t size = PySequence_Size(seq);
- for (int64_t i = 0; i < size; ++i) {
- OwnedRef item(PySequence_GetItem(seq, i));
- if (item.obj() == Py_None) {
- RETURN_NOT_OK(typed_builder_->AppendNull());
- } else {
- typed_builder_->Append();
- RETURN_NOT_OK(value_converter_->AppendData(item.obj()));
- }
+ inline Status AppendItem(const OwnedRef& item) {
+ if (item.obj() == Py_None) {
+ return typed_builder_->AppendNull();
+ } else {
+ typed_builder_->Append();
+ PyObject* item_obj = item.obj();
+ int64_t list_size =
+ static_cast<int64_t>(PySequence_Size(item_obj));
+ return value_converter_->AppendData(item_obj, list_size);
}
- return Status::OK();
}
protected:
@@ -512,45 +543,33 @@ class ListConverter : public TypedConverter<ListBuilder> {
case bit_width: { \
arrow::decimal::Decimal##bit_width out; \
RETURN_NOT_OK(PythonDecimalToArrowDecimal((item), &out)); \
- RETURN_NOT_OK((builder)->Append(out)); \
+ return ((builder)->Append(out)); \
break; \
}
-class DecimalConverter : public TypedConverter<arrow::DecimalBuilder> {
+class DecimalConverter : public TypedConverterVisitor<
+ arrow::DecimalBuilder, DecimalConverter> {
public:
- Status AppendData(PyObject* seq) override {
- /// Ensure we've allocated enough space
- Py_ssize_t size = PySequence_Size(seq);
- RETURN_NOT_OK(typed_builder_->Reserve(size));
-
+ inline Status AppendItem(const OwnedRef& item) {
/// Can the compiler figure out that the case statement below isn't necessary
/// once we're running?
const int bit_width =
std::dynamic_pointer_cast<arrow::DecimalType>(typed_builder_->type())
->bit_width();
- OwnedRef ref;
- PyObject* item = nullptr;
- for (int64_t i = 0; i < size; ++i) {
- ref.reset(PySequence_GetItem(seq, i));
- item = ref.obj();
-
- /// TODO(phillipc): Check for nan?
- if (item != Py_None) {
- switch (bit_width) {
- DECIMAL_CONVERT_CASE(32, item, typed_builder_)
- DECIMAL_CONVERT_CASE(64, item, typed_builder_)
- DECIMAL_CONVERT_CASE(128, item, typed_builder_)
- default:
- break;
- }
- RETURN_IF_PYERROR();
- } else {
- RETURN_NOT_OK(typed_builder_->AppendNull());
+ /// TODO(phillipc): Check for nan?
+ if (item.obj() != Py_None) {
+ switch (bit_width) {
+ DECIMAL_CONVERT_CASE(32, item.obj(), typed_builder_)
+ DECIMAL_CONVERT_CASE(64, item.obj(), typed_builder_)
+ DECIMAL_CONVERT_CASE(128, item.obj(), typed_builder_)
+ default:
+ return Status::OK();
}
+ RETURN_IF_PYERROR();
+ } else {
+ return typed_builder_->AppendNull();
}
-
- return Status::OK();
}
};
@@ -601,7 +620,8 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
}
Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
- const std::shared_ptr<ArrayBuilder>& builder) {
+ const std::shared_ptr<ArrayBuilder>& builder,
+ int64_t size) {
PyDateTime_IMPORT;
std::shared_ptr<SeqConverter> converter = GetConverter(type);
if (converter == nullptr) {
@@ -611,7 +631,7 @@ Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
}
converter->Init(builder);
- return converter->AppendData(obj);
+ return converter->AppendData(obj, size);
}
Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out) {
@@ -632,7 +652,7 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>
// Give the sequence converter an array builder
std::shared_ptr<ArrayBuilder> builder;
RETURN_NOT_OK(MakeBuilder(pool, type, &builder));
- RETURN_NOT_OK(AppendPySequence(obj, type, builder));
+ RETURN_NOT_OK(AppendPySequence(obj, type, builder, size));
return builder->Finish(out);
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/cpp/src/arrow/python/builtin_convert.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index a6180d4..7f42c33 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -44,7 +44,8 @@ ARROW_EXPORT arrow::Status InferArrowSize(PyObject* obj, int64_t* size);
ARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj,
const std::shared_ptr<arrow::DataType>& type,
- const std::shared_ptr<arrow::ArrayBuilder>& builder);
+ const std::shared_ptr<arrow::ArrayBuilder>& builder,
+ int64_t size);
// Type and size inference
ARROW_EXPORT
http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 654c392..ea23496 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -953,7 +953,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
return Status::TypeError(ss.str());
}
- RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder));
+ RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder, size));
} else {
return Status::TypeError("Unsupported Python type for list items");
}
@@ -1002,7 +1002,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
ss << inferred_type->ToString() << " cannot be converted to STRING.";
return Status::TypeError(ss.str());
}
- RETURN_NOT_OK(AppendPySequence(objects[i], inferred_type, value_builder));
+ RETURN_NOT_OK(AppendPySequence(objects[i], inferred_type, value_builder, size));
} else {
return Status::TypeError("Unsupported Python type for list items");
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 2a0b0a7..e1be5b1 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1059,18 +1059,26 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type,
-def array(object sequence, DataType type=None, MemoryPool memory_pool=None):
+def array(object sequence, DataType type=None, MemoryPool memory_pool=None,
+ size=None):
"""
Create pyarrow.Array instance from a Python sequence
Parameters
----------
- sequence : sequence-like object of Python objects
+ sequence : sequence-like or iterable object of Python objects.
+ If both type and size are specified, this may be a single-use iterable.
type : pyarrow.DataType, optional
If not passed, will be inferred from the data
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool
+ size : int64, optional
+ Number of elements to convert. If the input is longer than size, conversion
+ stops at this length. For iterators, if size is larger than the input iterator
+ it will be treated as a "max size", but this involves an initial allocation
+ of size followed by a resize to the actual size (so if you know the
+ exact size, specifying it correctly will give you better performance).
Returns
-------
@@ -1084,11 +1092,18 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None):
if type is None:
check_status(ConvertPySequence(sequence, pool, &sp_array))
else:
- check_status(
- ConvertPySequence(
- sequence, pool, &sp_array, type.sp_type
- )
- )
+ if size is None:
+ check_status(
+ ConvertPySequence(
+ sequence, pool, &sp_array, type.sp_type
+ )
+ )
+ else:
+ check_status(
+ ConvertPySequence(
+ sequence, pool, &sp_array, type.sp_type, size
+ )
+ )
return pyarrow_wrap_array(sp_array)
http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index f712274..741d832 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -650,6 +650,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
CStatus ConvertPySequence(object obj, CMemoryPool* pool,
shared_ptr[CArray]* out,
const shared_ptr[CDataType]& type)
+ CStatus ConvertPySequence(object obj, CMemoryPool* pool,
+ shared_ptr[CArray]* out,
+ const shared_ptr[CDataType]& type,
+ int64_t size)
CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type)
http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index d25055d..bf14c4f 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -22,6 +22,44 @@ import pyarrow as pa
import datetime
import decimal
+class StrangeIterable:
+ def __init__(self, lst):
+ self.lst = lst
+
+ def __iter__(self):
+ return self.lst.__iter__()
+
+class TestConvertIterable(unittest.TestCase):
+
+ def test_iterable_types(self):
+ arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
+ arr2 = pa.array((0, 1, 2, 3))
+
+ assert arr1.equals(arr2)
+
+ def test_empty_iterable(self):
+ arr = pa.array(StrangeIterable([]))
+ assert len(arr) == 0
+ assert arr.null_count == 0
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == []
+
+
+class TestLimitedConvertIterator(unittest.TestCase):
+ def test_iterator_types(self):
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
+
+ def test_iterator_size_overflow(self):
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
+ arr2 = pa.array((0, 1))
+ assert arr1.equals(arr2)
+
+ def test_iterator_size_underflow(self):
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
class TestConvertSequence(unittest.TestCase):
@@ -208,3 +246,15 @@ class TestConvertSequence(unittest.TestCase):
type = pa.decimal(precision=23, scale=5)
arr = pa.array(data, type=type)
assert arr.to_pylist() == data
+
+ def test_range_types(self):
+ arr1 = pa.array(range(3))
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
+
+ def test_empty_range(self):
+ arr = pa.array(range(0))
+ assert len(arr) == 0
+ assert arr.null_count == 0
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == []