You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/07/17 16:32:35 UTC

arrow git commit: ARROW-1177: [C++] Check for int32 offset overflow in ListBuilder, BinaryBuilder

Repository: arrow
Updated Branches:
  refs/heads/master 0396240b5 -> 1541a08c7


ARROW-1177: [C++] Check for int32 offset overflow in ListBuilder, BinaryBuilder

I also refactored BinaryBuilder to not inherit from ListBuilder, which is a bit cleaner. I added a draft of ARROW-507; it needs a unit test and to handle the case where some passed offsets are null (so they need to be sanitized)

Author: Wes McKinney <we...@twosigma.com>

Closes #853 from wesm/ARROW-1177 and squashes the following commits:

f6be04f [Wes McKinney] Fix DCHECKs in ListBuilder, BinaryBuilder
28f17ab [Wes McKinney] Use binary strings for py2.7
c9e7502 [Wes McKinney] Fix some off-by-one errors
5a8be84 [Wes McKinney] Fix another warning
23adefc [Wes McKinney] Fix compiler warning
35ab4f2 [Wes McKinney] Refactoring BinaryBuilder. Add check for int32 offset overflow for List, Binary, String. Add basic ListArray::FromArrays method, add Python binding


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1541a08c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1541a08c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1541a08c

Branch: refs/heads/master
Commit: 1541a08c795f03b2ce5884a00fc83d1fb79a448e
Parents: 0396240
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Jul 17 18:32:30 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Jul 17 18:32:30 2017 +0200

----------------------------------------------------------------------
 cpp/src/arrow/array-test.cc              |  11 +-
 cpp/src/arrow/array.cc                   |  26 ++++
 cpp/src/arrow/array.h                    |  13 ++
 cpp/src/arrow/buffer.h                   |  73 ++++++-----
 cpp/src/arrow/builder.cc                 | 168 ++++++++++++++++----------
 cpp/src/arrow/builder.h                  |  60 ++++++---
 cpp/src/arrow/ipc/ipc-json-test.cc       |   4 +-
 cpp/src/arrow/ipc/ipc-read-write-test.cc |   5 +-
 cpp/src/arrow/ipc/test-common.h          |  21 +++-
 cpp/src/arrow/test-util.h                |   9 +-
 python/pyarrow/array.pxi                 |  36 +++++-
 python/pyarrow/includes/libarrow.pxd     |   6 +
 python/pyarrow/tests/test_array.py       |  12 ++
 13 files changed, 313 insertions(+), 131 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 01bef03..acb4819 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -71,13 +71,14 @@ Status MakeArrayFromValidBytes(
   std::shared_ptr<Buffer> null_buf;
   RETURN_NOT_OK(BitUtil::BytesToBits(v, &null_buf));
 
-  BufferBuilder value_builder(pool);
+  TypedBufferBuilder<int32_t> value_builder(pool);
   for (size_t i = 0; i < v.size(); ++i) {
-    RETURN_NOT_OK(value_builder.Append<int32_t>(0));
+    RETURN_NOT_OK(value_builder.Append(0));
   }
 
-  *out = std::make_shared<Int32Array>(
-      v.size(), value_builder.Finish(), null_buf, null_count);
+  std::shared_ptr<Buffer> values;
+  RETURN_NOT_OK(value_builder.Finish(&values));
+  *out = std::make_shared<Int32Array>(v.size(), values, null_buf, null_count);
   return Status::OK();
 }
 
@@ -996,7 +997,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
   vector<uint8_t> is_null = {0, 0, 0, 1, 0};
 
   int N = static_cast<int>(strings.size());
-  int reps = 1000;
+  int reps = 10;
 
   for (int j = 0; j < reps; ++j) {
     for (int i = 0; i < N; ++i) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 48a3bd5..f20b849 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -190,6 +190,32 @@ ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
   SetData(internal_data);
 }
 
+Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
+    std::shared_ptr<Array>* out) {
+  if (ARROW_PREDICT_FALSE(offsets.length() == 0)) {
+    return Status::Invalid("List offsets must have non-zero length");
+  }
+
+  if (ARROW_PREDICT_FALSE(offsets.null_count() > 0)) {
+    return Status::Invalid("Null offsets in ListArray::FromArrays not yet implemented");
+  }
+
+  if (ARROW_PREDICT_FALSE(offsets.type_id() != Type::INT32)) {
+    return Status::Invalid("List offsets must be signed int32");
+  }
+
+  BufferVector buffers = {
+      offsets.null_bitmap(), static_cast<const Int32Array&>(offsets).values()};
+
+  auto list_type = list(values.type());
+  auto internal_data = std::make_shared<internal::ArrayData>(list_type,
+      offsets.length() - 1, std::move(buffers), offsets.null_count(), offsets.offset());
+  internal_data->child_data.push_back(values.data());
+
+  *out = std::make_shared<ListArray>(internal_data);
+  return Status::OK();
+}
+
 void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
   this->Array::SetData(data);
   auto value_offsets = data->buffers[1];

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 80284cd..2da0b54 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -371,6 +371,19 @@ class ARROW_EXPORT ListArray : public Array {
       const std::shared_ptr<Buffer>& null_bitmap = nullptr, int64_t null_count = 0,
       int64_t offset = 0);
 
+  /// \brief Construct ListArray from array of offsets and child value array
+  ///
+  /// Note: does not validate input beyond sanity checks. Use
+  /// arrow::ValidateArray if you need stronger validation of inputs
+  ///
+  /// \param[in] offsets Array containing n + 1 offsets encoding length and size
+  /// \param[in] values Array containing
+  /// \param[in] pool MemoryPool in case new offsets array needs to be
+  /// allocated because of null values
+  /// \param[out] out Will have length equal to offsets.length() - 1
+  static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
+      std::shared_ptr<Array>* out);
+
   /// \brief Return array object containing the list's values
   std::shared_ptr<Array> values() const;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index b117b24..488a4c0 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -212,58 +212,71 @@ class ARROW_EXPORT BufferBuilder {
     return Status::OK();
   }
 
-  template <typename T>
+  // Unsafe methods don't check existing size
+  void UnsafeAppend(const uint8_t* data, int64_t length) {
+    memcpy(data_ + size_, data, static_cast<size_t>(length));
+    size_ += length;
+  }
+
+  Status Finish(std::shared_ptr<Buffer>* out) {
+    // Do not shrink to fit to avoid unneeded realloc
+    if (size_ > 0) { RETURN_NOT_OK(buffer_->Resize(size_, false)); }
+    *out = buffer_;
+    Reset();
+    return Status::OK();
+  }
+
+  void Reset() {
+    buffer_ = nullptr;
+    capacity_ = size_ = 0;
+  }
+
+  int64_t capacity() const { return capacity_; }
+  int64_t length() const { return size_; }
+  const uint8_t* data() const { return data_; }
+
+ protected:
+  std::shared_ptr<PoolBuffer> buffer_;
+  MemoryPool* pool_;
+  uint8_t* data_;
+  int64_t capacity_;
+  int64_t size_;
+};
+
+template <typename T>
+class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {
+ public:
+  explicit TypedBufferBuilder(MemoryPool* pool) : BufferBuilder(pool) {}
+
   Status Append(T arithmetic_value) {
     static_assert(std::is_arithmetic<T>::value,
         "Convenience buffer append only supports arithmetic types");
-    return Append(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
+    return BufferBuilder::Append(
+        reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
   }
 
-  template <typename T>
   Status Append(const T* arithmetic_values, int64_t num_elements) {
     static_assert(std::is_arithmetic<T>::value,
         "Convenience buffer append only supports arithmetic types");
-    return Append(
+    return BufferBuilder::Append(
         reinterpret_cast<const uint8_t*>(arithmetic_values), num_elements * sizeof(T));
   }
 
-  // Unsafe methods don't check existing size
-  void UnsafeAppend(const uint8_t* data, int64_t length) {
-    memcpy(data_ + size_, data, static_cast<size_t>(length));
-    size_ += length;
-  }
-
-  template <typename T>
   void UnsafeAppend(T arithmetic_value) {
     static_assert(std::is_arithmetic<T>::value,
         "Convenience buffer append only supports arithmetic types");
-    UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
+    BufferBuilder::UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
   }
 
-  template <typename T>
   void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) {
     static_assert(std::is_arithmetic<T>::value,
         "Convenience buffer append only supports arithmetic types");
-    UnsafeAppend(
+    BufferBuilder::UnsafeAppend(
         reinterpret_cast<const uint8_t*>(arithmetic_values), num_elements * sizeof(T));
   }
 
-  std::shared_ptr<Buffer> Finish() {
-    auto result = buffer_;
-    buffer_ = nullptr;
-    capacity_ = size_ = 0;
-    return result;
-  }
-  int64_t capacity() const { return capacity_; }
-  int64_t length() const { return size_; }
-  const uint8_t* data() const { return data_; }
-
- private:
-  std::shared_ptr<PoolBuffer> buffer_;
-  MemoryPool* pool_;
-  uint8_t* data_;
-  int64_t capacity_;
-  int64_t size_;
+  const T* data() const { return reinterpret_cast<const T*>(data_); }
+  int64_t length() const { return size_ / sizeof(T); }
 };
 
 /// Allocate a new mutable buffer from a memory pool

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 885c650..a2f24a7 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -21,6 +21,8 @@
 #include <cstdint>
 #include <cstring>
 #include <limits>
+#include <sstream>
+#include <vector>
 
 #include "arrow/array.h"
 #include "arrow/buffer.h"
@@ -99,21 +101,17 @@ Status ArrayBuilder::Reserve(int64_t elements) {
   return Status::OK();
 }
 
+void ArrayBuilder::Reset() {
+  capacity_ = length_ = null_count_ = 0;
+  null_bitmap_ = nullptr;
+}
+
 Status ArrayBuilder::SetNotNull(int64_t length) {
   RETURN_NOT_OK(Reserve(length));
   UnsafeSetNotNull(length);
   return Status::OK();
 }
 
-void ArrayBuilder::UnsafeAppendToBitmap(bool is_valid) {
-  if (is_valid) {
-    BitUtil::SetBit(null_bitmap_data_, length_);
-  } else {
-    ++null_count_;
-  }
-  ++length_;
-}
-
 void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
   if (valid_bytes == nullptr) {
     UnsafeSetNotNull(length);
@@ -971,7 +969,8 @@ Status DecimalBuilder::Resize(int64_t capacity) {
 }
 
 Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) {
-  std::shared_ptr<Buffer> data = byte_builder_.Finish();
+  std::shared_ptr<Buffer> data;
+  RETURN_NOT_OK(byte_builder_.Finish(&data));
 
   /// TODO(phillipc): not sure where to get the offset argument here
   *out = std::make_shared<DecimalArray>(
@@ -987,65 +986,66 @@ ListBuilder::ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_b
     : ArrayBuilder(pool,
           type ? type : std::static_pointer_cast<DataType>(
                             std::make_shared<ListType>(value_builder->type()))),
-      offset_builder_(pool),
+      offsets_builder_(pool),
       value_builder_(std::move(value_builder)) {}
 
-ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values,
-    const std::shared_ptr<DataType>& type)
-    : ArrayBuilder(pool,
-          type ? type : std::static_pointer_cast<DataType>(
-                            std::make_shared<ListType>(values->type()))),
-      offset_builder_(pool),
-      values_(values) {}
-
 Status ListBuilder::Append(
     const int32_t* offsets, int64_t length, const uint8_t* valid_bytes) {
   RETURN_NOT_OK(Reserve(length));
   UnsafeAppendToBitmap(valid_bytes, length);
-  offset_builder_.UnsafeAppend<int32_t>(offsets, length);
+  offsets_builder_.UnsafeAppend(offsets, length);
   return Status::OK();
 }
 
+Status ListBuilder::AppendNextOffset() {
+  int64_t num_values = value_builder_->length();
+  if (ARROW_PREDICT_FALSE(num_values >= std::numeric_limits<int32_t>::max())) {
+    std::stringstream ss;
+    ss << "ListArray cannot contain more then INT32_MAX - 1 child elements,"
+       << " have " << num_values;
+    return Status::Invalid(ss.str());
+  }
+  return offsets_builder_.Append(static_cast<int32_t>(num_values));
+}
+
 Status ListBuilder::Append(bool is_valid) {
   RETURN_NOT_OK(Reserve(1));
   UnsafeAppendToBitmap(is_valid);
-  RETURN_NOT_OK(
-      offset_builder_.Append<int32_t>(static_cast<int32_t>(value_builder_->length())));
-  return Status::OK();
+  return AppendNextOffset();
 }
 
 Status ListBuilder::Init(int64_t elements) {
-  DCHECK_LT(elements, std::numeric_limits<int64_t>::max());
+  DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
-  return offset_builder_.Resize((elements + 1) * sizeof(int64_t));
+  return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
 }
 
 Status ListBuilder::Resize(int64_t capacity) {
-  DCHECK_LT(capacity, std::numeric_limits<int64_t>::max());
+  DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
   // one more then requested for offsets
-  RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
   return ArrayBuilder::Resize(capacity);
 }
 
 Status ListBuilder::Finish(std::shared_ptr<Array>* out) {
+  RETURN_NOT_OK(AppendNextOffset());
+
+  std::shared_ptr<Buffer> offsets;
+  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+
   std::shared_ptr<Array> items = values_;
   if (!items) { RETURN_NOT_OK(value_builder_->Finish(&items)); }
 
-  RETURN_NOT_OK(offset_builder_.Append<int64_t>(items->length()));
-  std::shared_ptr<Buffer> offsets = offset_builder_.Finish();
-
   *out = std::make_shared<ListArray>(
       type_, length_, offsets, items, null_bitmap_, null_count_);
 
   Reset();
-
   return Status::OK();
 }
 
 void ListBuilder::Reset() {
-  capacity_ = length_ = null_count_ = 0;
-  null_bitmap_ = nullptr;
+  ArrayBuilder::Reset();
   values_ = nullptr;
 }
 
@@ -1057,52 +1057,97 @@ ArrayBuilder* ListBuilder::value_builder() const {
 // ----------------------------------------------------------------------
 // String and binary
 
-BinaryBuilder::BinaryBuilder(MemoryPool* pool)
-    : ListBuilder(pool, std::unique_ptr<ArrayBuilder>(new UInt8Builder(pool, uint8())),
-          binary()) {
-  byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get());
+BinaryBuilder::BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type)
+    : ArrayBuilder(pool, type), offsets_builder_(pool), value_data_builder_(pool) {}
+
+BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(pool, binary()) {}
+
+Status BinaryBuilder::Init(int64_t elements) {
+  DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+  // one more then requested for offsets
+  return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
+}
+
+Status BinaryBuilder::Resize(int64_t capacity) {
+  DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
+  // one more then requested for offsets
+  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+  return ArrayBuilder::Resize(capacity);
 }
 
-BinaryBuilder::BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type)
-    : ListBuilder(
-          pool, std::unique_ptr<ArrayBuilder>(new UInt8Builder(pool, uint8())), type) {
-  byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get());
+Status BinaryBuilder::AppendNextOffset() {
+  const int64_t num_bytes = value_data_builder_.length();
+  if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
+    std::stringstream ss;
+    ss << "BinaryArray cannot contain more than " << kMaximumCapacity << " bytes, have "
+       << num_bytes;
+    return Status::Invalid(ss.str());
+  }
+  return offsets_builder_.Append(static_cast<int32_t>(num_bytes));
 }
 
-Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) {
-  std::shared_ptr<Array> result;
-  RETURN_NOT_OK(ListBuilder::Finish(&result));
+Status BinaryBuilder::Append(const uint8_t* value, int32_t length) {
+  RETURN_NOT_OK(Reserve(1));
+  RETURN_NOT_OK(AppendNextOffset());
+  RETURN_NOT_OK(value_data_builder_.Append(value, length));
+  UnsafeAppendToBitmap(true);
+  return Status::OK();
+}
 
-  const auto list = std::dynamic_pointer_cast<ListArray>(result);
-  auto values = std::dynamic_pointer_cast<UInt8Array>(list->values());
+Status BinaryBuilder::AppendNull() {
+  RETURN_NOT_OK(AppendNextOffset());
+  RETURN_NOT_OK(Reserve(1));
+  UnsafeAppendToBitmap(false);
+  return Status::OK();
+}
+
+Status BinaryBuilder::FinishInternal(std::shared_ptr<internal::ArrayData>* out) {
+  // Write final offset (values length)
+  RETURN_NOT_OK(AppendNextOffset());
+  std::shared_ptr<Buffer> offsets, value_data;
 
-  *out = std::make_shared<BinaryArray>(list->length(), list->value_offsets(),
-      values->values(), list->null_bitmap(), list->null_count());
+  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+  RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
+
+  BufferVector buffers = {null_bitmap_, offsets, value_data};
+  *out = std::make_shared<internal::ArrayData>(
+      type_, length_, std::move(buffers), null_count_, 0);
   return Status::OK();
 }
 
+Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) {
+  std::shared_ptr<internal::ArrayData> data;
+  RETURN_NOT_OK(FinishInternal(&data));
+  *out = std::make_shared<BinaryArray>(data);
+  Reset();
+  return Status::OK();
+}
+
+void BinaryBuilder::Reset() {
+  ArrayBuilder::Reset();
+  offsets_builder_.Reset();
+  value_data_builder_.Reset();
+}
+
 const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const {
-  const int32_t* offsets = reinterpret_cast<const int32_t*>(offset_builder_.data());
+  const int32_t* offsets = offsets_builder_.data();
   int32_t offset = offsets[i];
   if (i == (length_ - 1)) {
-    *out_length = static_cast<int32_t>(value_builder_->length()) - offset;
+    *out_length = static_cast<int32_t>(value_data_builder_.length()) - offset;
   } else {
     *out_length = offsets[i + 1] - offset;
   }
-  return byte_builder_->data()->data() + offset;
+  return value_data_builder_.data() + offset;
 }
 
 StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {}
 
 Status StringBuilder::Finish(std::shared_ptr<Array>* out) {
-  std::shared_ptr<Array> result;
-  RETURN_NOT_OK(ListBuilder::Finish(&result));
-
-  const auto list = std::dynamic_pointer_cast<ListArray>(result);
-  auto values = std::dynamic_pointer_cast<UInt8Array>(list->values());
-
-  *out = std::make_shared<StringArray>(list->length(), list->value_offsets(),
-      values->values(), list->null_bitmap(), list->null_count());
+  std::shared_ptr<internal::ArrayData> data;
+  RETURN_NOT_OK(FinishInternal(&data));
+  *out = std::make_shared<StringArray>(data);
+  Reset();
   return Status::OK();
 }
 
@@ -1139,19 +1184,18 @@ Status FixedSizeBinaryBuilder::AppendNull() {
 }
 
 Status FixedSizeBinaryBuilder::Init(int64_t elements) {
-  DCHECK_LT(elements, std::numeric_limits<int64_t>::max());
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   return byte_builder_.Resize(elements * byte_width_);
 }
 
 Status FixedSizeBinaryBuilder::Resize(int64_t capacity) {
-  DCHECK_LT(capacity, std::numeric_limits<int64_t>::max());
   RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_));
   return ArrayBuilder::Resize(capacity);
 }
 
 Status FixedSizeBinaryBuilder::Finish(std::shared_ptr<Array>* out) {
-  std::shared_ptr<Buffer> data = byte_builder_.Finish();
+  std::shared_ptr<Buffer> data;
+  RETURN_NOT_OK(byte_builder_.Finish(&data));
   *out = std::make_shared<FixedSizeBinaryArray>(
       type_, length_, data, null_bitmap_, null_count_);
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index c0a075b..6b54c9f 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -38,6 +38,12 @@ namespace arrow {
 
 class Array;
 
+namespace internal {
+
+struct ArrayData;
+
+}  // namespace internal
+
 namespace decimal {
 
 template <typename T>
@@ -127,12 +133,20 @@ class ARROW_EXPORT ArrayBuilder {
   // Child value array builders. These are owned by this class
   std::vector<std::unique_ptr<ArrayBuilder>> children_;
 
-  //
+  void Reset();
+
   // Unsafe operations (don't check capacity/don't resize)
-  //
 
   // Append to null bitmap.
-  void UnsafeAppendToBitmap(bool is_valid);
+  void UnsafeAppendToBitmap(bool is_valid) {
+    if (is_valid) {
+      BitUtil::SetBit(null_bitmap_data_, length_);
+    } else {
+      ++null_count_;
+    }
+    ++length_;
+  }
+
   // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
   // assume all of length bits are valid.
   void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
@@ -494,7 +508,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
 // ----------------------------------------------------------------------
 // List builder
 
-/// Builder class for variable-length list array value types
+/// \class ListBuilder
+/// \brief Builder class for variable-length list array value types
 ///
 /// To use this class, you must append values to the child array builder and use
 /// the Append function to delimit each distinct list value (once the values
@@ -513,22 +528,18 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder {
   ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_builder,
       const std::shared_ptr<DataType>& type = nullptr);
 
-  /// Use this constructor to build the list with a pre-existing values array
-  ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values,
-      const std::shared_ptr<DataType>& type = nullptr);
-
   Status Init(int64_t elements) override;
   Status Resize(int64_t capacity) override;
   Status Finish(std::shared_ptr<Array>* out) override;
 
-  /// Vector append
+  /// \brief Vector append
   ///
   /// If passed, valid_bytes is of equal length to values, and any zero byte
   /// will be considered as a null for that slot
   Status Append(
       const int32_t* offsets, int64_t length, const uint8_t* valid_bytes = nullptr);
 
-  /// Start a new variable-length list slot
+  /// \brief Start a new variable-length list slot
   ///
   /// This function should be called before beginning to append elements to the
   /// value builder
@@ -539,25 +550,26 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder {
   ArrayBuilder* value_builder() const;
 
  protected:
-  BufferBuilder offset_builder_;
+  TypedBufferBuilder<int32_t> offsets_builder_;
   std::unique_ptr<ArrayBuilder> value_builder_;
   std::shared_ptr<Array> values_;
 
+  Status AppendNextOffset();
+
   void Reset();
 };
 
 // ----------------------------------------------------------------------
 // Binary and String
 
-class ARROW_EXPORT BinaryBuilder : public ListBuilder {
+/// \class BinaryBuilder
+/// \brief Builder class for variable-length binary data
+class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
  public:
   explicit BinaryBuilder(MemoryPool* pool);
   explicit BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type);
 
-  Status Append(const uint8_t* value, int32_t length) {
-    RETURN_NOT_OK(ListBuilder::Append());
-    return byte_builder_->Append(value, length);
-  }
+  Status Append(const uint8_t* value, int32_t length);
 
   Status Append(const char* value, int32_t length) {
     return Append(reinterpret_cast<const uint8_t*>(value), length);
@@ -567,6 +579,10 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder {
     return Append(value.c_str(), static_cast<int32_t>(value.size()));
   }
 
+  Status AppendNull();
+
+  Status Init(int64_t elements) override;
+  Status Resize(int64_t capacity) override;
   Status Finish(std::shared_ptr<Array>* out) override;
 
   /// Temporary access to a value.
@@ -575,10 +591,18 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder {
   const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
 
  protected:
-  UInt8Builder* byte_builder_;
+  TypedBufferBuilder<int32_t> offsets_builder_;
+  TypedBufferBuilder<uint8_t> value_data_builder_;
+
+  static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1;
+
+  Status AppendNextOffset();
+  Status FinishInternal(std::shared_ptr<internal::ArrayData>* out);
+  void Reset();
 };
 
-// String builder
+/// \class StringBuilder
+/// \brief Builder class for UTF8 strings
 class ARROW_EXPORT StringBuilder : public BinaryBuilder {
  public:
   using BinaryBuilder::BinaryBuilder;

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/ipc-json-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc
index 318e318..79344df 100644
--- a/cpp/src/arrow/ipc/ipc-json-test.cc
+++ b/cpp/src/arrow/ipc/ipc-json-test.cc
@@ -169,7 +169,7 @@ TEST(TestJsonArrayWriter, NestedTypes) {
   std::vector<int32_t> offsets = {0, 0, 0, 1, 4, 7};
 
   std::shared_ptr<Buffer> list_bitmap;
-  ASSERT_OK(test::GetBitmapFromBoolVector(list_is_valid, &list_bitmap));
+  ASSERT_OK(test::GetBitmapFromVector(list_is_valid, &list_bitmap));
   std::shared_ptr<Buffer> offsets_buffer = test::GetBufferFromVector(offsets);
 
   ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap, 1);
@@ -179,7 +179,7 @@ TEST(TestJsonArrayWriter, NestedTypes) {
   // Struct
   std::vector<bool> struct_is_valid = {true, false, true, true, true, false, true};
   std::shared_ptr<Buffer> struct_bitmap;
-  ASSERT_OK(test::GetBitmapFromBoolVector(struct_is_valid, &struct_bitmap));
+  ASSERT_OK(test::GetBitmapFromVector(struct_is_valid, &struct_bitmap));
 
   auto struct_type =
       struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())});

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/ipc-read-write-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index 42f14b0..2119ff7 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -343,7 +343,7 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) {
   auto union_type = union_({field("f0", a0->type())}, {0});
   std::vector<int32_t> type_ids(a0->length());
   std::shared_ptr<Buffer> ids_buffer;
-  ASSERT_OK(test::CopyBufferFromVector(type_ids, &ids_buffer));
+  ASSERT_OK(test::CopyBufferFromVector(type_ids, default_memory_pool(), &ids_buffer));
   a1 =
       std::make_shared<UnionArray>(union_type, a0->length(), struct_children, ids_buffer);
   CheckArray(a1);
@@ -355,7 +355,8 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) {
     type_offsets.push_back(i);
   }
   std::shared_ptr<Buffer> offsets_buffer;
-  ASSERT_OK(test::CopyBufferFromVector(type_offsets, &offsets_buffer));
+  ASSERT_OK(
+      test::CopyBufferFromVector(type_offsets, default_memory_pool(), &offsets_buffer));
   a1 = std::make_shared<UnionArray>(
       dense_union_type, a0->length(), struct_children, ids_buffer, offsets_buffer);
   CheckArray(a1);

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index a542c87..67a41ba 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -139,9 +139,16 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
     std::replace_if(offsets.begin(), offsets.end(),
         [child_length](int32_t offset) { return offset > child_length; }, child_length);
   }
-  ListBuilder builder(pool, child_array);
-  RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data()));
-  RETURN_NOT_OK(builder.Finish(out));
+
+  offsets[num_lists] = static_cast<int32_t>(child_array->length());
+
+  /// TODO(wesm): Implement support for nulls in ListArray::FromArrays
+  std::shared_ptr<Buffer> null_bitmap, offsets_buffer;
+  RETURN_NOT_OK(test::GetBitmapFromVector(valid_lists, &null_bitmap));
+  RETURN_NOT_OK(test::CopyBufferFromVector(offsets, pool, &offsets_buffer));
+
+  *out = std::make_shared<ListArray>(list(child_array->type()), num_lists, offsets_buffer,
+      child_array, null_bitmap, kUnknownNullCount);
   return ValidateArray(**out);
 }
 
@@ -398,7 +405,8 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
 
   std::shared_ptr<Buffer> type_ids_buffer;
   std::vector<uint8_t> type_ids = {5, 10, 5, 5, 10, 10, 5};
-  RETURN_NOT_OK(test::CopyBufferFromVector(type_ids, &type_ids_buffer));
+  RETURN_NOT_OK(
+      test::CopyBufferFromVector(type_ids, default_memory_pool(), &type_ids_buffer));
 
   std::vector<int32_t> u0_values = {0, 1, 2, 3, 4, 5, 6};
   ArrayFromVector<Int32Type, int32_t>(u0_values, &sparse_children[0]);
@@ -415,7 +423,8 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
 
   std::shared_ptr<Buffer> offsets_buffer;
   std::vector<int32_t> offsets = {0, 0, 1, 2, 1, 2, 3};
-  RETURN_NOT_OK(test::CopyBufferFromVector(offsets, &offsets_buffer));
+  RETURN_NOT_OK(
+      test::CopyBufferFromVector(offsets, default_memory_pool(), &offsets_buffer));
 
   std::vector<uint8_t> null_bytes(length, 1);
   null_bytes[2] = 0;
@@ -479,7 +488,7 @@ Status MakeDictionary(std::shared_ptr<RecordBatch>* out) {
   ArrayFromVector<Int8Type, int8_t>(is_valid3, indices3_values, &indices3);
 
   std::shared_ptr<Buffer> null_bitmap;
-  RETURN_NOT_OK(test::GetBitmapFromBoolVector(is_valid, &null_bitmap));
+  RETURN_NOT_OK(test::GetBitmapFromVector(is_valid, &null_bitmap));
 
   std::shared_ptr<Array> a3 = std::make_shared<ListArray>(f3_type, length,
       std::static_pointer_cast<PrimitiveArray>(offsets)->values(),

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index 2cff97a..2bc6625 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -102,10 +102,10 @@ std::shared_ptr<Buffer> GetBufferFromVector(const std::vector<T>& values) {
 
 template <typename T>
 inline Status CopyBufferFromVector(
-    const std::vector<T>& values, std::shared_ptr<Buffer>* result) {
+    const std::vector<T>& values, MemoryPool* pool, std::shared_ptr<Buffer>* result) {
   int64_t nbytes = static_cast<int>(values.size()) * sizeof(T);
 
-  auto buffer = std::make_shared<PoolBuffer>(default_memory_pool());
+  auto buffer = std::make_shared<PoolBuffer>(pool);
   RETURN_NOT_OK(buffer->Resize(nbytes));
   memcpy(buffer->mutable_data(), values.data(), nbytes);
 
@@ -113,8 +113,9 @@ inline Status CopyBufferFromVector(
   return Status::OK();
 }
 
-static inline Status GetBitmapFromBoolVector(
-    const std::vector<bool>& is_valid, std::shared_ptr<Buffer>* result) {
+template <typename T>
+static inline Status GetBitmapFromVector(
+    const std::vector<T>& is_valid, std::shared_ptr<Buffer>* result) {
   size_t length = is_valid.size();
 
   std::shared_ptr<MutableBuffer> buffer;

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index f8bccc7..c7a4415 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -40,7 +40,6 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type,
     return values, type
 
 
-
 def array(object sequence, DataType type=None, MemoryPool memory_pool=None,
           size=None):
     """
@@ -302,6 +301,19 @@ cdef class Array:
         """
         return [x.as_py() for x in self]
 
+    def validate(self):
+        """
+        Perform any validation checks implemented by
+        arrow::ValidateArray. Raises exception with error message if array does
+        not validate
+
+        Raises
+        ------
+        ArrowInvalid
+        """
+        with nogil:
+            check_status(ValidateArray(deref(self.ap)))
+
 
 cdef class Tensor:
 
@@ -478,7 +490,27 @@ cdef class DecimalArray(FixedSizeBinaryArray):
 
 
 cdef class ListArray(Array):
-    pass
+
+    @staticmethod
+    def from_arrays(Array offsets, Array values, MemoryPool pool=None):
+        """
+        Construct ListArray from arrays of int32 offsets and values
+
+        Parameters
+        ----------
+        offset : Array (int32 type)
+        values : Array (any type)
+
+        Returns
+        -------
+        list_array : ListArray
+        """
+        cdef shared_ptr[CArray] out
+        cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
+        with nogil:
+            check_status(CListArray.FromArrays(
+                deref(offsets.ap), deref(values.ap), cpool, &out))
+        return pyarrow_wrap_array(out)
 
 
 cdef class StringArray(Array):

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index dd791cd..44d83da 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -283,6 +283,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         c_string FormatValue(int i)
 
     cdef cppclass CListArray" arrow::ListArray"(CArray):
+        @staticmethod
+        CStatus FromArrays(const CArray& offsets, const CArray& values,
+                           CMemoryPool* pool, shared_ptr[CArray]* out)
+
         const int32_t* raw_value_offsets()
         int32_t value_offset(int i)
         int32_t value_length(int i)
@@ -304,6 +308,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] field(int pos)
         const vector[shared_ptr[CArray]] fields()
 
+    CStatus ValidateArray(const CArray& array)
+
     cdef cppclass CChunkedArray" arrow::ChunkedArray":
         int64_t length()
         int64_t null_count()

http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 1a0ee61..5a373b4 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -199,6 +199,18 @@ def test_dictionary_with_pandas():
     tm.assert_series_equal(pd.Series(pandas2), pd.Series(ex_pandas2))
 
 
+def test_list_from_arrays():
+    offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
+    offsets = pa.Array.from_pandas(offsets_arr, type=pa.int32())
+    pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
+    values = pa.array(pyvalues, type=pa.binary())
+
+    result = pa.ListArray.from_arrays(offsets, values)
+    expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]])
+
+    assert result.equals(expected)
+
+
 def test_simple_type_construction():
     result = pa.lib.TimestampType()
     with pytest.raises(TypeError):