You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/06/28 15:35:49 UTC

arrow git commit: ARROW-834: Python Support creating from iterables

Repository: arrow
Updated Branches:
  refs/heads/master a58893882 -> bddb2197d


ARROW-834: Python Support creating from iterables

Support creating arrow arrays from iterables.
Possible follow-up TODO (or it possibly belongs in this issue): throw a clear exception when passed an iterator rather than an iterable.

Author: Holden Karau <ho...@us.ibm.com>

Closes #602 from holdenk/ARROW-834-csupport-creating-from-iterables and squashes the following commits:

750e7f4c [Holden Karau] Switch AppendItem to pure virtual for TypedConverterVisitor
0b72e956 [Holden Karau] Remove unecessary file after merge
2ed00d91 [Holden Karau] Fix long line
ee2afaa4 [Holden Karau] Comment the built in converter type inferance code a bit.
dddf57db [Holden Karau] Make a note about the resize/realloc in underflow with size
1fd9588a [Holden Karau] Do dynamic resize on the array buffer if size ended up being larger (e.g. support underflow from iterator constructors).
ad935e9d [Holden Karau] Have size override the size of the iterator if the iterator is larger.
42f06996 [Holden Karau] Style fix
fa0abcc2 [Holden Karau] Add ConvertPySequence to other side
01e462c2 [Holden Karau] Naive merge, lets see if it works
9eb3f106 [Holden Karau] Return the append inside of the decimal convert case/switch business
a571ad4b [Holden Karau] Merge in changes to timestamp/datetime builtin converter
8c42fdc2 [Holden Karau] Feedback from wes (fix some previously unchecked appends, fix long line )
389976cb [Holden Karau] Use CRTP in the iterator
52b03e3e [Holden Karau] Use a const ownedref
1d970bdb [Holden Karau] Switch the SeqVisitor to use OwnedRef
c429f9a5 [Holden Karau] Style fixes
d392daa8 [Holden Karau] Add limmited pure iterator support and a note
be58bc0f [Holden Karau] Restore ArrowBlock (unreleated change)
3a55e824 [Holden Karau] Update array function description
80cc971e [Holden Karau] Cleanup debugging
63c0b7fa [Holden Karau] Tests pass (TODO cleanup debugging)
82ec3c3d [Holden Karau] revert changes to _array.pyx
ca0d5303 [Holden Karau] In theory this works ok now for iterables as well
b6c72f5c [Holden Karau] Make TypedConverterVisitor work on PySequence or Python Iterators
48b08aa5 [Holden Karau] Switch remaining converters
a1bf4bd1 [Holden Karau] Move over timestamp and byte converters
15cdfe34 [Holden Karau] Move more of the convertors to the visitor version
76e08ca5 [Holden Karau] Part of the way along adding iterable support
77c935b9 [Holden Karau] Revert accidently java change
5c0fa0b5 [Holden Karau] Start adding iterable support


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/bddb2197
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/bddb2197
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/bddb2197

Branch: refs/heads/master
Commit: bddb2197df4e3cba4a27da27cd15917fe30d3d45
Parents: a588938
Author: Holden Karau <ho...@us.ibm.com>
Authored: Wed Jun 28 11:35:45 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed Jun 28 11:35:45 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/builtin_convert.cc      | 434 +++++++++++-----------
 cpp/src/arrow/python/builtin_convert.h       |   3 +-
 cpp/src/arrow/python/pandas_convert.cc       |   4 +-
 python/pyarrow/array.pxi                     |  29 +-
 python/pyarrow/includes/libarrow.pxd         |   4 +
 python/pyarrow/tests/test_convert_builtin.py |  50 +++
 6 files changed, 307 insertions(+), 217 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 3197c2a..d3bfb37 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -145,48 +145,50 @@ class ScalarVisitor {
 
 static constexpr int MAX_NESTING_LEVELS = 32;
 
+// SeqVisitor is used to infer the type.
 class SeqVisitor {
  public:
   SeqVisitor() : max_nesting_level_(0) {
     memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int));
   }
 
+  // co-recursive with VisitElem
   Status Visit(PyObject* obj, int level = 0) {
-    Py_ssize_t size = PySequence_Size(obj);
-
     if (level > max_nesting_level_) { max_nesting_level_ = level; }
 
-    for (int64_t i = 0; i < size; ++i) {
-      // TODO(wesm): Error checking?
-      // TODO(wesm): Specialize for PyList_GET_ITEM?
-      OwnedRef item_ref(PySequence_GetItem(obj, i));
-      PyObject* item = item_ref.obj();
-
-      if (PyList_Check(item)) {
-        RETURN_NOT_OK(Visit(item, level + 1));
-      } else if (PyDict_Check(item)) {
-        return Status::NotImplemented("No type inference for dicts");
-      } else {
-        // We permit nulls at any level of nesting
-        if (item == Py_None) {
-          // TODO
-        } else {
-          ++nesting_histogram_[level];
-          scalars_.Visit(item);
-        }
+    // Loop through either a sequence or an iterator.
+    if (PySequence_Check(obj)) {
+      Py_ssize_t size = PySequence_Size(obj);
+      for (int64_t i = 0; i < size; ++i) {
+	// TODO(wesm): Specialize for PyList_GET_ITEM?
+	OwnedRef ref = OwnedRef(PySequence_GetItem(obj, i));
+	RETURN_NOT_OK(VisitElem(ref, level));
       }
+    } else if (PyObject_HasAttrString(obj, "__iter__")) {
+      OwnedRef iter = OwnedRef(PyObject_GetIter(obj));
+      PyObject* item;
+      while ((item = PyIter_Next(iter.obj()))) {
+	OwnedRef ref = OwnedRef(item);
+	RETURN_NOT_OK(VisitElem(ref, level));
+      }
+    } else {
+      return Status::TypeError("Object is not a sequence or iterable");
     }
     return Status::OK();
   }
 
   std::shared_ptr<DataType> GetType() {
+    // If all the non-list inputs were null (or there were no inputs)
     if (scalars_.total_count() == 0) {
       if (max_nesting_level_ == 0) {
+	// If it's just a single empty list or list of nulls, return null.
         return null();
       } else {
+	// Error, if we have nesting but no concrete base type.
         return nullptr;
       }
     } else {
+      // Lists of Lists of [X]
       std::shared_ptr<DataType> result = scalars_.GetType();
       for (int i = 0; i < max_nesting_level_; ++i) {
         result = std::make_shared<ListType>(result);
@@ -199,6 +201,7 @@ class SeqVisitor {
     if (scalars_.total_count() > 0) {
       if (num_nesting_levels() > 1) {
         return Status::Invalid("Mixed nesting levels not supported");
+      // If the nesting goes deeper than the deepest scalar
       } else if (max_observed_level() < max_nesting_level_) {
         return Status::Invalid("Mixed nesting levels not supported");
       }
@@ -206,6 +209,7 @@ class SeqVisitor {
     return Status::OK();
   }
 
+  // Returns the deepest level which has scalar elements.
   int max_observed_level() const {
     int result = 0;
     for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
@@ -214,6 +218,7 @@ class SeqVisitor {
     return result;
   }
 
+  // Returns the number of nesting levels which have scalar elements.
   int num_nesting_levels() const {
     int result = 0;
     for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
@@ -226,16 +231,50 @@ class SeqVisitor {
   ScalarVisitor scalars_;
 
   // Track observed
+  // Deepest nesting level (regardless of scalars)
   int max_nesting_level_;
+  // Number of scalar elements at each nesting level.
+  // (TODO: We really only need to know if a scalar is present, not the count).
   int nesting_histogram_[MAX_NESTING_LEVELS];
+
+  // Visits a specific element (inner part of the loop).
+  Status VisitElem(const OwnedRef &item_ref, int level) {
+    if (PyList_Check(item_ref.obj())) {
+      RETURN_NOT_OK(Visit(item_ref.obj(), level + 1));
+    } else if (PyDict_Check(item_ref.obj())) {
+      return Status::NotImplemented("No type inference for dicts");
+    } else {
+      // We permit nulls at any level of nesting
+      if (item_ref.obj() == Py_None) {
+	// TODO
+      } else {
+	++nesting_histogram_[level];
+	scalars_.Visit(item_ref.obj());
+      }
+    }
+    return Status::OK();
+  }
 };
 
 Status InferArrowSize(PyObject* obj, int64_t* size) {
-  *size = static_cast<int64_t>(PySequence_Size(obj));
+  if (PySequence_Check(obj)) {
+    *size = static_cast<int64_t>(PySequence_Size(obj));
+  } else if (PyObject_HasAttrString(obj, "__iter__")) {
+    PyObject* iter = PyObject_GetIter(obj);
+    OwnedRef iter_ref(iter);
+    *size = 0;
+    PyObject* item;
+    while ((item = PyIter_Next(iter))) {
+      OwnedRef item_ref(item);
+      *size += 1;
+    }
+  } else {
+    return Status::TypeError("Object is not a sequence or iterable");
+  }
   if (PyErr_Occurred()) {
     // Not a sequence
     PyErr_Clear();
-    return Status::TypeError("Object is not a sequence");
+    return Status::TypeError("Object is not a sequence or iterable");
   }
   return Status::OK();
 }
@@ -243,6 +282,7 @@ Status InferArrowSize(PyObject* obj, int64_t* size) {
 // Non-exhaustive type inference
 Status InferArrowTypeAndSize(
     PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
+
   RETURN_NOT_OK(InferArrowSize(obj, size));
 
   // For 0-length sequences, refuse to guess
@@ -268,7 +308,9 @@ class SeqConverter {
     return Status::OK();
   }
 
-  virtual Status AppendData(PyObject* seq) = 0;
+  virtual Status AppendData(PyObject* seq, int64_t size) = 0;
+
+  virtual ~SeqConverter() {}
 
  protected:
   std::shared_ptr<ArrayBuilder> builder_;
@@ -287,221 +329,210 @@ class TypedConverter : public SeqConverter {
   BuilderType* typed_builder_;
 };
 
-class BoolConverter : public TypedConverter<BooleanBuilder> {
+template <typename BuilderType, class Derived>
+class TypedConverterVisitor : public TypedConverter<BuilderType> {
  public:
-  Status AppendData(PyObject* seq) override {
-    int64_t size = static_cast<int64_t>(PySequence_Size(seq));
-    RETURN_NOT_OK(typed_builder_->Reserve(size));
-    for (int64_t i = 0; i < size; ++i) {
-      OwnedRef item(PySequence_GetItem(seq, i));
-      if (item.obj() == Py_None) {
-        typed_builder_->AppendNull();
-      } else {
-        if (item.obj() == Py_True) {
-          typed_builder_->Append(true);
-        } else {
-          typed_builder_->Append(false);
-        }
+  Status AppendData(PyObject* obj, int64_t size) override {
+    /// Ensure we've allocated enough space
+    RETURN_NOT_OK(this->typed_builder_->Reserve(size));
+    // Iterate over the items adding each one
+    if (PySequence_Check(obj)) {
+      for (int64_t i = 0; i < size; ++i) {
+	OwnedRef ref(PySequence_GetItem(obj, i));
+	RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+      }
+    } else if (PyObject_HasAttrString(obj, "__iter__")) {
+      PyObject* iter = PyObject_GetIter(obj);
+      OwnedRef iter_ref(iter);
+      PyObject* item;
+      int64_t i = 0;
+      // To allow people with long generators to only convert a subset, stop
+      // consuming at size.
+      while ((item = PyIter_Next(iter)) && i < size) {
+	OwnedRef ref(item);
+	RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+	++i;
       }
+      if (size != i) {
+	RETURN_NOT_OK(this->typed_builder_->Resize(i));
+      }
+    } else {
+      return Status::TypeError("Object is not a sequence or iterable");
     }
     return Status::OK();
   }
+
+  virtual Status AppendItem(const OwnedRef& item) = 0;
 };
 
-class Int64Converter : public TypedConverter<Int64Builder> {
+class BoolConverter : public TypedConverterVisitor<
+  BooleanBuilder, BoolConverter> {
  public:
-  Status AppendData(PyObject* seq) override {
-    int64_t val;
-    int64_t size = static_cast<int64_t>(PySequence_Size(seq));
-    RETURN_NOT_OK(typed_builder_->Reserve(size));
-    for (int64_t i = 0; i < size; ++i) {
-      OwnedRef item(PySequence_GetItem(seq, i));
-      if (item.obj() == Py_None) {
-        typed_builder_->AppendNull();
+  inline Status AppendItem(const OwnedRef& item) {
+    if (item.obj() == Py_None) {
+      return typed_builder_->AppendNull();
+    } else {
+      if (item.obj() == Py_True) {
+	return typed_builder_->Append(true);
       } else {
-        val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
-        RETURN_IF_PYERROR();
-        typed_builder_->Append(val);
+	return typed_builder_->Append(false);
       }
     }
-    return Status::OK();
   }
 };
 
-class DateConverter : public TypedConverter<Date64Builder> {
+class Int64Converter : public TypedConverterVisitor<
+  Int64Builder, Int64Converter> {
  public:
-  Status AppendData(PyObject* seq) override {
-    int64_t size = static_cast<int64_t>(PySequence_Size(seq));
-    RETURN_NOT_OK(typed_builder_->Reserve(size));
-    for (int64_t i = 0; i < size; ++i) {
-      OwnedRef item(PySequence_GetItem(seq, i));
-      if (item.obj() == Py_None) {
-        typed_builder_->AppendNull();
-      } else {
-        PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
-        typed_builder_->Append(PyDate_to_ms(pydate));
-      }
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val;
+    if (item.obj() == Py_None) {
+      return typed_builder_->AppendNull();
+    } else {
+      val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+      RETURN_IF_PYERROR();
+      return typed_builder_->Append(val);
     }
-    return Status::OK();
   }
 };
 
-class TimestampConverter : public TypedConverter<TimestampBuilder> {
+class DateConverter : public TypedConverterVisitor<
+  Date64Builder, DateConverter> {
  public:
-  Status AppendData(PyObject* seq) override {
-    int64_t size = static_cast<int64_t>(PySequence_Size(seq));
-    RETURN_NOT_OK(typed_builder_->Reserve(size));
-    for (int64_t i = 0; i < size; ++i) {
-      OwnedRef item(PySequence_GetItem(seq, i));
-      if (item.obj() == Py_None) {
-        typed_builder_->AppendNull();
-      } else {
-        PyDateTime_DateTime* pydatetime =
-            reinterpret_cast<PyDateTime_DateTime*>(item.obj());
-        typed_builder_->Append(PyDateTime_to_us(pydatetime));
-        RETURN_IF_PYERROR();
-      }
+  inline Status AppendItem(const OwnedRef& item) {
+    if (item.obj() == Py_None) {
+      return typed_builder_->AppendNull();
+    } else {
+      PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
+      return typed_builder_->Append(PyDate_to_ms(pydate));
+    }
+  }
+};
+
+class TimestampConverter : public TypedConverterVisitor<
+  Date64Builder, TimestampConverter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    if (item.obj() == Py_None) {
+      return typed_builder_->AppendNull();
+    } else {
+      PyDateTime_DateTime* pydatetime =
+	reinterpret_cast<PyDateTime_DateTime*>(item.obj());
+      return typed_builder_->Append(PyDateTime_to_us(pydatetime));
     }
-    return Status::OK();
   }
 };
 
-class DoubleConverter : public TypedConverter<DoubleBuilder> {
+class DoubleConverter : public TypedConverterVisitor<
+  DoubleBuilder, DoubleConverter> {
  public:
-  Status AppendData(PyObject* seq) override {
+  inline Status AppendItem(const OwnedRef& item) {
     double val;
-    int64_t size = static_cast<int64_t>(PySequence_Size(seq));
-    RETURN_NOT_OK(typed_builder_->Reserve(size));
-    for (int64_t i = 0; i < size; ++i) {
-      OwnedRef item(PySequence_GetItem(seq, i));
-      if (item.obj() == Py_None) {
-        typed_builder_->AppendNull();
-      } else {
-        val = PyFloat_AsDouble(item.obj());
-        RETURN_IF_PYERROR();
-        typed_builder_->Append(val);
-      }
+    if (item.obj() == Py_None) {
+      return typed_builder_->AppendNull();
+    } else {
+      val = PyFloat_AsDouble(item.obj());
+      RETURN_IF_PYERROR();
+      return typed_builder_->Append(val);
     }
-    return Status::OK();
   }
 };
 
-class BytesConverter : public TypedConverter<BinaryBuilder> {
+class BytesConverter : public TypedConverterVisitor<
+  BinaryBuilder, BytesConverter> {
  public:
-  Status AppendData(PyObject* seq) override {
-    PyObject* item;
+  inline Status AppendItem(const OwnedRef& item) {
     PyObject* bytes_obj;
-    OwnedRef tmp;
     const char* bytes;
     Py_ssize_t length;
-    Py_ssize_t size = PySequence_Size(seq);
-    for (int64_t i = 0; i < size; ++i) {
-      item = PySequence_GetItem(seq, i);
-      OwnedRef holder(item);
-
-      if (item == Py_None) {
-        RETURN_NOT_OK(typed_builder_->AppendNull());
-        continue;
-      } else if (PyUnicode_Check(item)) {
-        tmp.reset(PyUnicode_AsUTF8String(item));
-        RETURN_IF_PYERROR();
-        bytes_obj = tmp.obj();
-      } else if (PyBytes_Check(item)) {
-        bytes_obj = item;
-      } else {
-        return InvalidConversion(item, "bytes");
-      }
-      // No error checking
-      length = PyBytes_GET_SIZE(bytes_obj);
-      bytes = PyBytes_AS_STRING(bytes_obj);
-      RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast<int32_t>(length)));
+    OwnedRef tmp;
+
+    if (item.obj() == Py_None) {
+      RETURN_NOT_OK(typed_builder_->AppendNull());
+      return Status::OK();
+    } else if (PyUnicode_Check(item.obj())) {
+      tmp.reset(PyUnicode_AsUTF8String(item.obj()));
+      RETURN_IF_PYERROR();
+      bytes_obj = tmp.obj();
+    } else if (PyBytes_Check(item.obj())) {
+      bytes_obj = item.obj();
+    } else {
+      return InvalidConversion(item.obj(), "bytes");
     }
-    return Status::OK();
+    // No error checking
+    length = PyBytes_GET_SIZE(bytes_obj);
+    bytes = PyBytes_AS_STRING(bytes_obj);
+    return typed_builder_->Append(bytes, static_cast<int32_t>(length));
   }
 };
 
-class FixedWidthBytesConverter : public TypedConverter<FixedSizeBinaryBuilder> {
+class FixedWidthBytesConverter : public TypedConverterVisitor<
+  FixedSizeBinaryBuilder, FixedWidthBytesConverter> {
  public:
-  Status AppendData(PyObject* seq) override {
-    PyObject* item;
+  inline Status AppendItem(const OwnedRef& item) {
     PyObject* bytes_obj;
     OwnedRef tmp;
     Py_ssize_t expected_length = std::dynamic_pointer_cast<FixedSizeBinaryType>(
         typed_builder_->type())->byte_width();
-    Py_ssize_t size = PySequence_Size(seq);
-    for (int64_t i = 0; i < size; ++i) {
-      item = PySequence_GetItem(seq, i);
-      OwnedRef holder(item);
-
-      if (item == Py_None) {
-        RETURN_NOT_OK(typed_builder_->AppendNull());
-        continue;
-      } else if (PyUnicode_Check(item)) {
-        tmp.reset(PyUnicode_AsUTF8String(item));
-        RETURN_IF_PYERROR();
-        bytes_obj = tmp.obj();
-      } else if (PyBytes_Check(item)) {
-        bytes_obj = item;
-      } else {
-        return InvalidConversion(item, "bytes");
-      }
-      // No error checking
-      RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
-      RETURN_NOT_OK(typed_builder_->Append(
-          reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(bytes_obj))));
+    if (item.obj() == Py_None) {
+      RETURN_NOT_OK(typed_builder_->AppendNull());
+      return Status::OK();
+    } else if (PyUnicode_Check(item.obj())) {
+      tmp.reset(PyUnicode_AsUTF8String(item.obj()));
+      RETURN_IF_PYERROR();
+      bytes_obj = tmp.obj();
+    } else if (PyBytes_Check(item.obj())) {
+      bytes_obj = item.obj();
+    } else {
+      return InvalidConversion(item.obj(), "bytes");
     }
-    return Status::OK();
+    // No error checking
+    RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
+    return typed_builder_->Append(
+	reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(bytes_obj)));
   }
 };
 
-class UTF8Converter : public TypedConverter<StringBuilder> {
+class UTF8Converter : public TypedConverterVisitor<
+  StringBuilder, UTF8Converter> {
  public:
-  Status AppendData(PyObject* seq) override {
-    PyObject* item;
+  inline Status AppendItem(const OwnedRef& item) {
     PyObject* bytes_obj;
     OwnedRef tmp;
     const char* bytes;
     Py_ssize_t length;
-    Py_ssize_t size = PySequence_Size(seq);
-    for (int64_t i = 0; i < size; ++i) {
-      item = PySequence_GetItem(seq, i);
-      OwnedRef holder(item);
-
-      if (item == Py_None) {
-        RETURN_NOT_OK(typed_builder_->AppendNull());
-        continue;
-      } else if (!PyUnicode_Check(item)) {
-        return Status::Invalid("Non-unicode value encountered");
-      }
-      tmp.reset(PyUnicode_AsUTF8String(item));
-      RETURN_IF_PYERROR();
-      bytes_obj = tmp.obj();
 
-      // No error checking
-      length = PyBytes_GET_SIZE(bytes_obj);
-      bytes = PyBytes_AS_STRING(bytes_obj);
-      RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast<int32_t>(length)));
+    if (item.obj() == Py_None) {
+      return typed_builder_->AppendNull();
+    } else if (!PyUnicode_Check(item.obj())) {
+      return Status::Invalid("Non-unicode value encountered");
     }
-    return Status::OK();
+    tmp.reset(PyUnicode_AsUTF8String(item.obj()));
+    RETURN_IF_PYERROR();
+    bytes_obj = tmp.obj();
+
+    // No error checking
+    length = PyBytes_GET_SIZE(bytes_obj);
+    bytes = PyBytes_AS_STRING(bytes_obj);
+    return typed_builder_->Append(bytes, static_cast<int32_t>(length));
   }
 };
 
-class ListConverter : public TypedConverter<ListBuilder> {
+class ListConverter : public TypedConverterVisitor<
+  ListBuilder, ListConverter> {
  public:
   Status Init(const std::shared_ptr<ArrayBuilder>& builder) override;
 
-  Status AppendData(PyObject* seq) override {
-    Py_ssize_t size = PySequence_Size(seq);
-    for (int64_t i = 0; i < size; ++i) {
-      OwnedRef item(PySequence_GetItem(seq, i));
-      if (item.obj() == Py_None) {
-        RETURN_NOT_OK(typed_builder_->AppendNull());
-      } else {
-        typed_builder_->Append();
-        RETURN_NOT_OK(value_converter_->AppendData(item.obj()));
-      }
+  inline Status AppendItem(const OwnedRef& item) {
+    if (item.obj() == Py_None) {
+      return typed_builder_->AppendNull();
+    } else {
+      typed_builder_->Append();
+      PyObject* item_obj = item.obj();
+      int64_t list_size =
+	static_cast<int64_t>(PySequence_Size(item_obj));
+      return value_converter_->AppendData(item_obj, list_size);
     }
-    return Status::OK();
   }
 
  protected:
@@ -512,45 +543,33 @@ class ListConverter : public TypedConverter<ListBuilder> {
   case bit_width: {                                           \
     arrow::decimal::Decimal##bit_width out;                   \
     RETURN_NOT_OK(PythonDecimalToArrowDecimal((item), &out)); \
-    RETURN_NOT_OK((builder)->Append(out));                    \
+    return ((builder)->Append(out));                          \
     break;                                                    \
   }
 
-class DecimalConverter : public TypedConverter<arrow::DecimalBuilder> {
+class DecimalConverter : public TypedConverterVisitor<
+  arrow::DecimalBuilder, DecimalConverter> {
  public:
-  Status AppendData(PyObject* seq) override {
-    /// Ensure we've allocated enough space
-    Py_ssize_t size = PySequence_Size(seq);
-    RETURN_NOT_OK(typed_builder_->Reserve(size));
-
+  inline Status AppendItem(const OwnedRef& item) {
     /// Can the compiler figure out that the case statement below isn't necessary
     /// once we're running?
     const int bit_width =
         std::dynamic_pointer_cast<arrow::DecimalType>(typed_builder_->type())
             ->bit_width();
 
-    OwnedRef ref;
-    PyObject* item = nullptr;
-    for (int64_t i = 0; i < size; ++i) {
-      ref.reset(PySequence_GetItem(seq, i));
-      item = ref.obj();
-
-      /// TODO(phillipc): Check for nan?
-      if (item != Py_None) {
-        switch (bit_width) {
-          DECIMAL_CONVERT_CASE(32, item, typed_builder_)
-          DECIMAL_CONVERT_CASE(64, item, typed_builder_)
-          DECIMAL_CONVERT_CASE(128, item, typed_builder_)
-          default:
-            break;
-        }
-        RETURN_IF_PYERROR();
-      } else {
-        RETURN_NOT_OK(typed_builder_->AppendNull());
+    /// TODO(phillipc): Check for nan?
+    if (item.obj() != Py_None) {
+      switch (bit_width) {
+	  DECIMAL_CONVERT_CASE(32, item.obj(), typed_builder_)
+	  DECIMAL_CONVERT_CASE(64, item.obj(), typed_builder_)
+	  DECIMAL_CONVERT_CASE(128, item.obj(), typed_builder_)
+      default:
+	  return Status::OK();
       }
+      RETURN_IF_PYERROR();
+    } else {
+      return typed_builder_->AppendNull();
     }
-
-    return Status::OK();
   }
 };
 
@@ -601,7 +620,8 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
 }
 
 Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
-    const std::shared_ptr<ArrayBuilder>& builder) {
+			const std::shared_ptr<ArrayBuilder>& builder,
+			int64_t size) {
   PyDateTime_IMPORT;
   std::shared_ptr<SeqConverter> converter = GetConverter(type);
   if (converter == nullptr) {
@@ -611,7 +631,7 @@ Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
   }
   converter->Init(builder);
 
-  return converter->AppendData(obj);
+  return converter->AppendData(obj, size);
 }
 
 Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out) {
@@ -632,7 +652,7 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>
   // Give the sequence converter an array builder
   std::shared_ptr<ArrayBuilder> builder;
   RETURN_NOT_OK(MakeBuilder(pool, type, &builder));
-  RETURN_NOT_OK(AppendPySequence(obj, type, builder));
+  RETURN_NOT_OK(AppendPySequence(obj, type, builder, size));
   return builder->Finish(out);
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/cpp/src/arrow/python/builtin_convert.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index a6180d4..7f42c33 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -44,7 +44,8 @@ ARROW_EXPORT arrow::Status InferArrowSize(PyObject* obj, int64_t* size);
 
 ARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj,
     const std::shared_ptr<arrow::DataType>& type,
-    const std::shared_ptr<arrow::ArrayBuilder>& builder);
+    const std::shared_ptr<arrow::ArrayBuilder>& builder,
+    int64_t size);
 
 // Type and size inference
 ARROW_EXPORT

http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 654c392..ea23496 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -953,7 +953,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
         ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
         return Status::TypeError(ss.str());
       }
-      RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder));
+      RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder, size));
     } else {
       return Status::TypeError("Unsupported Python type for list items");
     }
@@ -1002,7 +1002,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
         ss << inferred_type->ToString() << " cannot be converted to STRING.";
         return Status::TypeError(ss.str());
       }
-      RETURN_NOT_OK(AppendPySequence(objects[i], inferred_type, value_builder));
+      RETURN_NOT_OK(AppendPySequence(objects[i], inferred_type, value_builder, size));
     } else {
       return Status::TypeError("Unsupported Python type for list items");
     }

http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 2a0b0a7..e1be5b1 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1059,18 +1059,26 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type,
 
 
 
-def array(object sequence, DataType type=None, MemoryPool memory_pool=None):
+def array(object sequence, DataType type=None, MemoryPool memory_pool=None,
+          size=None):
     """
     Create pyarrow.Array instance from a Python sequence
 
     Parameters
     ----------
-    sequence : sequence-like object of Python objects
+    sequence : sequence-like or iterable object of Python objects.
+        If both type and size are specified, may be a single-use iterable.
     type : pyarrow.DataType, optional
         If not passed, will be inferred from the data
     memory_pool : pyarrow.MemoryPool, optional
         If not passed, will allocate memory from the currently-set default
         memory pool
+    size : int64, optional
+        Size of the elements. If the input is larger than size, bail at this
+        length. For iterators, if size is larger than the input iterator this
+        will be treated as a "max size", but will involve an initial allocation
+        of size followed by a resize to the actual size (so if you know the
+        exact size specifying it correctly will give you better performance).
 
     Returns
     -------
@@ -1084,11 +1092,18 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None):
     if type is None:
         check_status(ConvertPySequence(sequence, pool, &sp_array))
     else:
-        check_status(
-            ConvertPySequence(
-                sequence, pool, &sp_array, type.sp_type
-            )
-        )
+        if size is None:
+            check_status(
+                ConvertPySequence(
+                    sequence, pool, &sp_array, type.sp_type
+                )
+             )
+        else:
+            check_status(
+                ConvertPySequence(
+                    sequence, pool, &sp_array, type.sp_type, size
+                )
+             )
 
     return pyarrow_wrap_array(sp_array)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index f712274..741d832 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -650,6 +650,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     CStatus ConvertPySequence(object obj, CMemoryPool* pool,
                               shared_ptr[CArray]* out,
                               const shared_ptr[CDataType]& type)
+    CStatus ConvertPySequence(object obj, CMemoryPool* pool,
+                              shared_ptr[CArray]* out,
+                              const shared_ptr[CDataType]& type,
+			      int64_t size)
 
     CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/bddb2197/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index d25055d..bf14c4f 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -22,6 +22,44 @@ import pyarrow as pa
 import datetime
 import decimal
 
+class StrangeIterable:
+    def __init__(self, lst):
+        self.lst = lst
+
+    def __iter__(self):
+        return self.lst.__iter__()
+
+class TestConvertIterable(unittest.TestCase):
+
+    def test_iterable_types(self):
+        arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
+        arr2 = pa.array((0, 1, 2, 3))
+
+        assert arr1.equals(arr2)
+
+    def test_empty_iterable(self):
+        arr = pa.array(StrangeIterable([]))
+        assert len(arr) == 0
+        assert arr.null_count == 0
+        assert arr.type == pa.null()
+        assert arr.to_pylist() == []
+
+
+class TestLimitedConvertIterator(unittest.TestCase):
+    def test_iterator_types(self):
+        arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
+        arr2 = pa.array((0, 1, 2))
+        assert arr1.equals(arr2)
+
+    def test_iterator_size_overflow(self):
+        arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
+        arr2 = pa.array((0, 1))
+        assert arr1.equals(arr2)
+
+    def test_iterator_size_underflow(self):
+        arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
+        arr2 = pa.array((0, 1, 2))
+        assert arr1.equals(arr2)
 
 class TestConvertSequence(unittest.TestCase):
 
@@ -208,3 +246,15 @@ class TestConvertSequence(unittest.TestCase):
         type = pa.decimal(precision=23, scale=5)
         arr = pa.array(data, type=type)
         assert arr.to_pylist() == data
+
+    def test_range_types(self):
+        arr1 = pa.array(range(3))
+        arr2 = pa.array((0, 1, 2))
+        assert arr1.equals(arr2)
+
+    def test_empty_range(self):
+        arr = pa.array(range(0))
+        assert len(arr) == 0
+        assert arr.null_count == 0
+        assert arr.type == pa.null()
+        assert arr.to_pylist() == []