You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/07/17 16:32:35 UTC
arrow git commit: ARROW-1177: [C++] Check for int32 offset overflow
in ListBuilder, BinaryBuilder
Repository: arrow
Updated Branches:
refs/heads/master 0396240b5 -> 1541a08c7
ARROW-1177: [C++] Check for int32 offset overflow in ListBuilder, BinaryBuilder
I also refactored BinaryBuilder to not inherit from ListBuilder, which is a bit cleaner. I added a draft of ARROW-507; it needs a unit test and to handle the case where some passed offsets are null (so they need to be sanitized)
Author: Wes McKinney <we...@twosigma.com>
Closes #853 from wesm/ARROW-1177 and squashes the following commits:
f6be04f [Wes McKinney] Fix DCHECKs in ListBuilder, BinaryBuilder
28f17ab [Wes McKinney] Use binary strings for py2.7
c9e7502 [Wes McKinney] Fix some off-by-one errors
5a8be84 [Wes McKinney] Fix another warning
23adefc [Wes McKinney] Fix compiler warning
35ab4f2 [Wes McKinney] Refactoring BinaryBuilder. Add check for int32 offset overflow for List, Binary, String. Add basic ListArray::FromArrays method, add Python binding
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1541a08c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1541a08c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1541a08c
Branch: refs/heads/master
Commit: 1541a08c795f03b2ce5884a00fc83d1fb79a448e
Parents: 0396240
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Jul 17 18:32:30 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Jul 17 18:32:30 2017 +0200
----------------------------------------------------------------------
cpp/src/arrow/array-test.cc | 11 +-
cpp/src/arrow/array.cc | 26 ++++
cpp/src/arrow/array.h | 13 ++
cpp/src/arrow/buffer.h | 73 ++++++-----
cpp/src/arrow/builder.cc | 168 ++++++++++++++++----------
cpp/src/arrow/builder.h | 60 ++++++---
cpp/src/arrow/ipc/ipc-json-test.cc | 4 +-
cpp/src/arrow/ipc/ipc-read-write-test.cc | 5 +-
cpp/src/arrow/ipc/test-common.h | 21 +++-
cpp/src/arrow/test-util.h | 9 +-
python/pyarrow/array.pxi | 36 +++++-
python/pyarrow/includes/libarrow.pxd | 6 +
python/pyarrow/tests/test_array.py | 12 ++
13 files changed, 313 insertions(+), 131 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 01bef03..acb4819 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -71,13 +71,14 @@ Status MakeArrayFromValidBytes(
std::shared_ptr<Buffer> null_buf;
RETURN_NOT_OK(BitUtil::BytesToBits(v, &null_buf));
- BufferBuilder value_builder(pool);
+ TypedBufferBuilder<int32_t> value_builder(pool);
for (size_t i = 0; i < v.size(); ++i) {
- RETURN_NOT_OK(value_builder.Append<int32_t>(0));
+ RETURN_NOT_OK(value_builder.Append(0));
}
- *out = std::make_shared<Int32Array>(
- v.size(), value_builder.Finish(), null_buf, null_count);
+ std::shared_ptr<Buffer> values;
+ RETURN_NOT_OK(value_builder.Finish(&values));
+ *out = std::make_shared<Int32Array>(v.size(), values, null_buf, null_count);
return Status::OK();
}
@@ -996,7 +997,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
vector<uint8_t> is_null = {0, 0, 0, 1, 0};
int N = static_cast<int>(strings.size());
- int reps = 1000;
+ int reps = 10;
for (int j = 0; j < reps; ++j) {
for (int i = 0; i < N; ++i) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 48a3bd5..f20b849 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -190,6 +190,32 @@ ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
SetData(internal_data);
}
+Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ if (ARROW_PREDICT_FALSE(offsets.length() == 0)) {
+ return Status::Invalid("List offsets must have non-zero length");
+ }
+
+ if (ARROW_PREDICT_FALSE(offsets.null_count() > 0)) {
+ return Status::Invalid("Null offsets in ListArray::FromArrays not yet implemented");
+ }
+
+ if (ARROW_PREDICT_FALSE(offsets.type_id() != Type::INT32)) {
+ return Status::Invalid("List offsets must be signed int32");
+ }
+
+ BufferVector buffers = {
+ offsets.null_bitmap(), static_cast<const Int32Array&>(offsets).values()};
+
+ auto list_type = list(values.type());
+ auto internal_data = std::make_shared<internal::ArrayData>(list_type,
+ offsets.length() - 1, std::move(buffers), offsets.null_count(), offsets.offset());
+ internal_data->child_data.push_back(values.data());
+
+ *out = std::make_shared<ListArray>(internal_data);
+ return Status::OK();
+}
+
void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
auto value_offsets = data->buffers[1];
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 80284cd..2da0b54 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -371,6 +371,19 @@ class ARROW_EXPORT ListArray : public Array {
const std::shared_ptr<Buffer>& null_bitmap = nullptr, int64_t null_count = 0,
int64_t offset = 0);
+ /// \brief Construct ListArray from array of offsets and child value array
+ ///
+ /// Note: does not validate input beyond sanity checks. Use
+ /// arrow::ValidateArray if you need stronger validation of inputs
+ ///
+ /// \param[in] offsets Array containing n + 1 offsets encoding length and size
+ /// \param[in] values Array containing
+ /// \param[in] pool MemoryPool in case new offsets array needs to be
+ /// allocated because of null values
+ /// \param[out] out Will have length equal to offsets.length() - 1
+ static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
+ std::shared_ptr<Array>* out);
+
/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const;
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index b117b24..488a4c0 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -212,58 +212,71 @@ class ARROW_EXPORT BufferBuilder {
return Status::OK();
}
- template <typename T>
+ // Unsafe methods don't check existing size
+ void UnsafeAppend(const uint8_t* data, int64_t length) {
+ memcpy(data_ + size_, data, static_cast<size_t>(length));
+ size_ += length;
+ }
+
+ Status Finish(std::shared_ptr<Buffer>* out) {
+ // Do not shrink to fit to avoid unneeded realloc
+ if (size_ > 0) { RETURN_NOT_OK(buffer_->Resize(size_, false)); }
+ *out = buffer_;
+ Reset();
+ return Status::OK();
+ }
+
+ void Reset() {
+ buffer_ = nullptr;
+ capacity_ = size_ = 0;
+ }
+
+ int64_t capacity() const { return capacity_; }
+ int64_t length() const { return size_; }
+ const uint8_t* data() const { return data_; }
+
+ protected:
+ std::shared_ptr<PoolBuffer> buffer_;
+ MemoryPool* pool_;
+ uint8_t* data_;
+ int64_t capacity_;
+ int64_t size_;
+};
+
+template <typename T>
+class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {
+ public:
+ explicit TypedBufferBuilder(MemoryPool* pool) : BufferBuilder(pool) {}
+
Status Append(T arithmetic_value) {
static_assert(std::is_arithmetic<T>::value,
"Convenience buffer append only supports arithmetic types");
- return Append(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
+ return BufferBuilder::Append(
+ reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
}
- template <typename T>
Status Append(const T* arithmetic_values, int64_t num_elements) {
static_assert(std::is_arithmetic<T>::value,
"Convenience buffer append only supports arithmetic types");
- return Append(
+ return BufferBuilder::Append(
reinterpret_cast<const uint8_t*>(arithmetic_values), num_elements * sizeof(T));
}
- // Unsafe methods don't check existing size
- void UnsafeAppend(const uint8_t* data, int64_t length) {
- memcpy(data_ + size_, data, static_cast<size_t>(length));
- size_ += length;
- }
-
- template <typename T>
void UnsafeAppend(T arithmetic_value) {
static_assert(std::is_arithmetic<T>::value,
"Convenience buffer append only supports arithmetic types");
- UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
+ BufferBuilder::UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
}
- template <typename T>
void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) {
static_assert(std::is_arithmetic<T>::value,
"Convenience buffer append only supports arithmetic types");
- UnsafeAppend(
+ BufferBuilder::UnsafeAppend(
reinterpret_cast<const uint8_t*>(arithmetic_values), num_elements * sizeof(T));
}
- std::shared_ptr<Buffer> Finish() {
- auto result = buffer_;
- buffer_ = nullptr;
- capacity_ = size_ = 0;
- return result;
- }
- int64_t capacity() const { return capacity_; }
- int64_t length() const { return size_; }
- const uint8_t* data() const { return data_; }
-
- private:
- std::shared_ptr<PoolBuffer> buffer_;
- MemoryPool* pool_;
- uint8_t* data_;
- int64_t capacity_;
- int64_t size_;
+ const T* data() const { return reinterpret_cast<const T*>(data_); }
+ int64_t length() const { return size_ / sizeof(T); }
};
/// Allocate a new mutable buffer from a memory pool
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 885c650..a2f24a7 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -21,6 +21,8 @@
#include <cstdint>
#include <cstring>
#include <limits>
+#include <sstream>
+#include <vector>
#include "arrow/array.h"
#include "arrow/buffer.h"
@@ -99,21 +101,17 @@ Status ArrayBuilder::Reserve(int64_t elements) {
return Status::OK();
}
+void ArrayBuilder::Reset() {
+ capacity_ = length_ = null_count_ = 0;
+ null_bitmap_ = nullptr;
+}
+
Status ArrayBuilder::SetNotNull(int64_t length) {
RETURN_NOT_OK(Reserve(length));
UnsafeSetNotNull(length);
return Status::OK();
}
-void ArrayBuilder::UnsafeAppendToBitmap(bool is_valid) {
- if (is_valid) {
- BitUtil::SetBit(null_bitmap_data_, length_);
- } else {
- ++null_count_;
- }
- ++length_;
-}
-
void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
if (valid_bytes == nullptr) {
UnsafeSetNotNull(length);
@@ -971,7 +969,8 @@ Status DecimalBuilder::Resize(int64_t capacity) {
}
Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) {
- std::shared_ptr<Buffer> data = byte_builder_.Finish();
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(byte_builder_.Finish(&data));
/// TODO(phillipc): not sure where to get the offset argument here
*out = std::make_shared<DecimalArray>(
@@ -987,65 +986,66 @@ ListBuilder::ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_b
: ArrayBuilder(pool,
type ? type : std::static_pointer_cast<DataType>(
std::make_shared<ListType>(value_builder->type()))),
- offset_builder_(pool),
+ offsets_builder_(pool),
value_builder_(std::move(value_builder)) {}
-ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values,
- const std::shared_ptr<DataType>& type)
- : ArrayBuilder(pool,
- type ? type : std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(values->type()))),
- offset_builder_(pool),
- values_(values) {}
-
Status ListBuilder::Append(
const int32_t* offsets, int64_t length, const uint8_t* valid_bytes) {
RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(valid_bytes, length);
- offset_builder_.UnsafeAppend<int32_t>(offsets, length);
+ offsets_builder_.UnsafeAppend(offsets, length);
return Status::OK();
}
+Status ListBuilder::AppendNextOffset() {
+ int64_t num_values = value_builder_->length();
+ if (ARROW_PREDICT_FALSE(num_values >= std::numeric_limits<int32_t>::max())) {
+ std::stringstream ss;
+ ss << "ListArray cannot contain more then INT32_MAX - 1 child elements,"
+ << " have " << num_values;
+ return Status::Invalid(ss.str());
+ }
+ return offsets_builder_.Append(static_cast<int32_t>(num_values));
+}
+
Status ListBuilder::Append(bool is_valid) {
RETURN_NOT_OK(Reserve(1));
UnsafeAppendToBitmap(is_valid);
- RETURN_NOT_OK(
- offset_builder_.Append<int32_t>(static_cast<int32_t>(value_builder_->length())));
- return Status::OK();
+ return AppendNextOffset();
}
Status ListBuilder::Init(int64_t elements) {
- DCHECK_LT(elements, std::numeric_limits<int64_t>::max());
+ DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
RETURN_NOT_OK(ArrayBuilder::Init(elements));
// one more then requested for offsets
- return offset_builder_.Resize((elements + 1) * sizeof(int64_t));
+ return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
}
Status ListBuilder::Resize(int64_t capacity) {
- DCHECK_LT(capacity, std::numeric_limits<int64_t>::max());
+ DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
// one more then requested for offsets
- RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+ RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
return ArrayBuilder::Resize(capacity);
}
Status ListBuilder::Finish(std::shared_ptr<Array>* out) {
+ RETURN_NOT_OK(AppendNextOffset());
+
+ std::shared_ptr<Buffer> offsets;
+ RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+
std::shared_ptr<Array> items = values_;
if (!items) { RETURN_NOT_OK(value_builder_->Finish(&items)); }
- RETURN_NOT_OK(offset_builder_.Append<int64_t>(items->length()));
- std::shared_ptr<Buffer> offsets = offset_builder_.Finish();
-
*out = std::make_shared<ListArray>(
type_, length_, offsets, items, null_bitmap_, null_count_);
Reset();
-
return Status::OK();
}
void ListBuilder::Reset() {
- capacity_ = length_ = null_count_ = 0;
- null_bitmap_ = nullptr;
+ ArrayBuilder::Reset();
values_ = nullptr;
}
@@ -1057,52 +1057,97 @@ ArrayBuilder* ListBuilder::value_builder() const {
// ----------------------------------------------------------------------
// String and binary
-BinaryBuilder::BinaryBuilder(MemoryPool* pool)
- : ListBuilder(pool, std::unique_ptr<ArrayBuilder>(new UInt8Builder(pool, uint8())),
- binary()) {
- byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get());
+BinaryBuilder::BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type)
+ : ArrayBuilder(pool, type), offsets_builder_(pool), value_data_builder_(pool) {}
+
+BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(pool, binary()) {}
+
+Status BinaryBuilder::Init(int64_t elements) {
+ DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
+ RETURN_NOT_OK(ArrayBuilder::Init(elements));
+ // one more then requested for offsets
+ return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
+}
+
+Status BinaryBuilder::Resize(int64_t capacity) {
+ DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
+ // one more then requested for offsets
+ RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+ return ArrayBuilder::Resize(capacity);
}
-BinaryBuilder::BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type)
- : ListBuilder(
- pool, std::unique_ptr<ArrayBuilder>(new UInt8Builder(pool, uint8())), type) {
- byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get());
+Status BinaryBuilder::AppendNextOffset() {
+ const int64_t num_bytes = value_data_builder_.length();
+ if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
+ std::stringstream ss;
+ ss << "BinaryArray cannot contain more than " << kMaximumCapacity << " bytes, have "
+ << num_bytes;
+ return Status::Invalid(ss.str());
+ }
+ return offsets_builder_.Append(static_cast<int32_t>(num_bytes));
}
-Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) {
- std::shared_ptr<Array> result;
- RETURN_NOT_OK(ListBuilder::Finish(&result));
+Status BinaryBuilder::Append(const uint8_t* value, int32_t length) {
+ RETURN_NOT_OK(Reserve(1));
+ RETURN_NOT_OK(AppendNextOffset());
+ RETURN_NOT_OK(value_data_builder_.Append(value, length));
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+}
- const auto list = std::dynamic_pointer_cast<ListArray>(result);
- auto values = std::dynamic_pointer_cast<UInt8Array>(list->values());
+Status BinaryBuilder::AppendNull() {
+ RETURN_NOT_OK(AppendNextOffset());
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(false);
+ return Status::OK();
+}
+
+Status BinaryBuilder::FinishInternal(std::shared_ptr<internal::ArrayData>* out) {
+ // Write final offset (values length)
+ RETURN_NOT_OK(AppendNextOffset());
+ std::shared_ptr<Buffer> offsets, value_data;
- *out = std::make_shared<BinaryArray>(list->length(), list->value_offsets(),
- values->values(), list->null_bitmap(), list->null_count());
+ RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+ RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
+
+ BufferVector buffers = {null_bitmap_, offsets, value_data};
+ *out = std::make_shared<internal::ArrayData>(
+ type_, length_, std::move(buffers), null_count_, 0);
return Status::OK();
}
+Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) {
+ std::shared_ptr<internal::ArrayData> data;
+ RETURN_NOT_OK(FinishInternal(&data));
+ *out = std::make_shared<BinaryArray>(data);
+ Reset();
+ return Status::OK();
+}
+
+void BinaryBuilder::Reset() {
+ ArrayBuilder::Reset();
+ offsets_builder_.Reset();
+ value_data_builder_.Reset();
+}
+
const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const {
- const int32_t* offsets = reinterpret_cast<const int32_t*>(offset_builder_.data());
+ const int32_t* offsets = offsets_builder_.data();
int32_t offset = offsets[i];
if (i == (length_ - 1)) {
- *out_length = static_cast<int32_t>(value_builder_->length()) - offset;
+ *out_length = static_cast<int32_t>(value_data_builder_.length()) - offset;
} else {
*out_length = offsets[i + 1] - offset;
}
- return byte_builder_->data()->data() + offset;
+ return value_data_builder_.data() + offset;
}
StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {}
Status StringBuilder::Finish(std::shared_ptr<Array>* out) {
- std::shared_ptr<Array> result;
- RETURN_NOT_OK(ListBuilder::Finish(&result));
-
- const auto list = std::dynamic_pointer_cast<ListArray>(result);
- auto values = std::dynamic_pointer_cast<UInt8Array>(list->values());
-
- *out = std::make_shared<StringArray>(list->length(), list->value_offsets(),
- values->values(), list->null_bitmap(), list->null_count());
+ std::shared_ptr<internal::ArrayData> data;
+ RETURN_NOT_OK(FinishInternal(&data));
+ *out = std::make_shared<StringArray>(data);
+ Reset();
return Status::OK();
}
@@ -1139,19 +1184,18 @@ Status FixedSizeBinaryBuilder::AppendNull() {
}
Status FixedSizeBinaryBuilder::Init(int64_t elements) {
- DCHECK_LT(elements, std::numeric_limits<int64_t>::max());
RETURN_NOT_OK(ArrayBuilder::Init(elements));
return byte_builder_.Resize(elements * byte_width_);
}
Status FixedSizeBinaryBuilder::Resize(int64_t capacity) {
- DCHECK_LT(capacity, std::numeric_limits<int64_t>::max());
RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_));
return ArrayBuilder::Resize(capacity);
}
Status FixedSizeBinaryBuilder::Finish(std::shared_ptr<Array>* out) {
- std::shared_ptr<Buffer> data = byte_builder_.Finish();
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(byte_builder_.Finish(&data));
*out = std::make_shared<FixedSizeBinaryArray>(
type_, length_, data, null_bitmap_, null_count_);
return Status::OK();
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index c0a075b..6b54c9f 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -38,6 +38,12 @@ namespace arrow {
class Array;
+namespace internal {
+
+struct ArrayData;
+
+} // namespace internal
+
namespace decimal {
template <typename T>
@@ -127,12 +133,20 @@ class ARROW_EXPORT ArrayBuilder {
// Child value array builders. These are owned by this class
std::vector<std::unique_ptr<ArrayBuilder>> children_;
- //
+ void Reset();
+
// Unsafe operations (don't check capacity/don't resize)
- //
// Append to null bitmap.
- void UnsafeAppendToBitmap(bool is_valid);
+ void UnsafeAppendToBitmap(bool is_valid) {
+ if (is_valid) {
+ BitUtil::SetBit(null_bitmap_data_, length_);
+ } else {
+ ++null_count_;
+ }
+ ++length_;
+ }
+
// Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
// assume all of length bits are valid.
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
@@ -494,7 +508,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
// ----------------------------------------------------------------------
// List builder
-/// Builder class for variable-length list array value types
+/// \class ListBuilder
+/// \brief Builder class for variable-length list array value types
///
/// To use this class, you must append values to the child array builder and use
/// the Append function to delimit each distinct list value (once the values
@@ -513,22 +528,18 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder {
ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_builder,
const std::shared_ptr<DataType>& type = nullptr);
- /// Use this constructor to build the list with a pre-existing values array
- ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values,
- const std::shared_ptr<DataType>& type = nullptr);
-
Status Init(int64_t elements) override;
Status Resize(int64_t capacity) override;
Status Finish(std::shared_ptr<Array>* out) override;
- /// Vector append
+ /// \brief Vector append
///
/// If passed, valid_bytes is of equal length to values, and any zero byte
/// will be considered as a null for that slot
Status Append(
const int32_t* offsets, int64_t length, const uint8_t* valid_bytes = nullptr);
- /// Start a new variable-length list slot
+ /// \brief Start a new variable-length list slot
///
/// This function should be called before beginning to append elements to the
/// value builder
@@ -539,25 +550,26 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder {
ArrayBuilder* value_builder() const;
protected:
- BufferBuilder offset_builder_;
+ TypedBufferBuilder<int32_t> offsets_builder_;
std::unique_ptr<ArrayBuilder> value_builder_;
std::shared_ptr<Array> values_;
+ Status AppendNextOffset();
+
void Reset();
};
// ----------------------------------------------------------------------
// Binary and String
-class ARROW_EXPORT BinaryBuilder : public ListBuilder {
+/// \class BinaryBuilder
+/// \brief Builder class for variable-length binary data
+class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
public:
explicit BinaryBuilder(MemoryPool* pool);
explicit BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type);
- Status Append(const uint8_t* value, int32_t length) {
- RETURN_NOT_OK(ListBuilder::Append());
- return byte_builder_->Append(value, length);
- }
+ Status Append(const uint8_t* value, int32_t length);
Status Append(const char* value, int32_t length) {
return Append(reinterpret_cast<const uint8_t*>(value), length);
@@ -567,6 +579,10 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder {
return Append(value.c_str(), static_cast<int32_t>(value.size()));
}
+ Status AppendNull();
+
+ Status Init(int64_t elements) override;
+ Status Resize(int64_t capacity) override;
Status Finish(std::shared_ptr<Array>* out) override;
/// Temporary access to a value.
@@ -575,10 +591,18 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder {
const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
protected:
- UInt8Builder* byte_builder_;
+ TypedBufferBuilder<int32_t> offsets_builder_;
+ TypedBufferBuilder<uint8_t> value_data_builder_;
+
+ static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1;
+
+ Status AppendNextOffset();
+ Status FinishInternal(std::shared_ptr<internal::ArrayData>* out);
+ void Reset();
};
-// String builder
+/// \class StringBuilder
+/// \brief Builder class for UTF8 strings
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
public:
using BinaryBuilder::BinaryBuilder;
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/ipc-json-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc
index 318e318..79344df 100644
--- a/cpp/src/arrow/ipc/ipc-json-test.cc
+++ b/cpp/src/arrow/ipc/ipc-json-test.cc
@@ -169,7 +169,7 @@ TEST(TestJsonArrayWriter, NestedTypes) {
std::vector<int32_t> offsets = {0, 0, 0, 1, 4, 7};
std::shared_ptr<Buffer> list_bitmap;
- ASSERT_OK(test::GetBitmapFromBoolVector(list_is_valid, &list_bitmap));
+ ASSERT_OK(test::GetBitmapFromVector(list_is_valid, &list_bitmap));
std::shared_ptr<Buffer> offsets_buffer = test::GetBufferFromVector(offsets);
ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap, 1);
@@ -179,7 +179,7 @@ TEST(TestJsonArrayWriter, NestedTypes) {
// Struct
std::vector<bool> struct_is_valid = {true, false, true, true, true, false, true};
std::shared_ptr<Buffer> struct_bitmap;
- ASSERT_OK(test::GetBitmapFromBoolVector(struct_is_valid, &struct_bitmap));
+ ASSERT_OK(test::GetBitmapFromVector(struct_is_valid, &struct_bitmap));
auto struct_type =
struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())});
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/ipc-read-write-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index 42f14b0..2119ff7 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -343,7 +343,7 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) {
auto union_type = union_({field("f0", a0->type())}, {0});
std::vector<int32_t> type_ids(a0->length());
std::shared_ptr<Buffer> ids_buffer;
- ASSERT_OK(test::CopyBufferFromVector(type_ids, &ids_buffer));
+ ASSERT_OK(test::CopyBufferFromVector(type_ids, default_memory_pool(), &ids_buffer));
a1 =
std::make_shared<UnionArray>(union_type, a0->length(), struct_children, ids_buffer);
CheckArray(a1);
@@ -355,7 +355,8 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) {
type_offsets.push_back(i);
}
std::shared_ptr<Buffer> offsets_buffer;
- ASSERT_OK(test::CopyBufferFromVector(type_offsets, &offsets_buffer));
+ ASSERT_OK(
+ test::CopyBufferFromVector(type_offsets, default_memory_pool(), &offsets_buffer));
a1 = std::make_shared<UnionArray>(
dense_union_type, a0->length(), struct_children, ids_buffer, offsets_buffer);
CheckArray(a1);
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index a542c87..67a41ba 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -139,9 +139,16 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
std::replace_if(offsets.begin(), offsets.end(),
[child_length](int32_t offset) { return offset > child_length; }, child_length);
}
- ListBuilder builder(pool, child_array);
- RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data()));
- RETURN_NOT_OK(builder.Finish(out));
+
+ offsets[num_lists] = static_cast<int32_t>(child_array->length());
+
+ /// TODO(wesm): Implement support for nulls in ListArray::FromArrays
+ std::shared_ptr<Buffer> null_bitmap, offsets_buffer;
+ RETURN_NOT_OK(test::GetBitmapFromVector(valid_lists, &null_bitmap));
+ RETURN_NOT_OK(test::CopyBufferFromVector(offsets, pool, &offsets_buffer));
+
+ *out = std::make_shared<ListArray>(list(child_array->type()), num_lists, offsets_buffer,
+ child_array, null_bitmap, kUnknownNullCount);
return ValidateArray(**out);
}
@@ -398,7 +405,8 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
std::shared_ptr<Buffer> type_ids_buffer;
std::vector<uint8_t> type_ids = {5, 10, 5, 5, 10, 10, 5};
- RETURN_NOT_OK(test::CopyBufferFromVector(type_ids, &type_ids_buffer));
+ RETURN_NOT_OK(
+ test::CopyBufferFromVector(type_ids, default_memory_pool(), &type_ids_buffer));
std::vector<int32_t> u0_values = {0, 1, 2, 3, 4, 5, 6};
ArrayFromVector<Int32Type, int32_t>(u0_values, &sparse_children[0]);
@@ -415,7 +423,8 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
std::shared_ptr<Buffer> offsets_buffer;
std::vector<int32_t> offsets = {0, 0, 1, 2, 1, 2, 3};
- RETURN_NOT_OK(test::CopyBufferFromVector(offsets, &offsets_buffer));
+ RETURN_NOT_OK(
+ test::CopyBufferFromVector(offsets, default_memory_pool(), &offsets_buffer));
std::vector<uint8_t> null_bytes(length, 1);
null_bytes[2] = 0;
@@ -479,7 +488,7 @@ Status MakeDictionary(std::shared_ptr<RecordBatch>* out) {
ArrayFromVector<Int8Type, int8_t>(is_valid3, indices3_values, &indices3);
std::shared_ptr<Buffer> null_bitmap;
- RETURN_NOT_OK(test::GetBitmapFromBoolVector(is_valid, &null_bitmap));
+ RETURN_NOT_OK(test::GetBitmapFromVector(is_valid, &null_bitmap));
std::shared_ptr<Array> a3 = std::make_shared<ListArray>(f3_type, length,
std::static_pointer_cast<PrimitiveArray>(offsets)->values(),
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index 2cff97a..2bc6625 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -102,10 +102,10 @@ std::shared_ptr<Buffer> GetBufferFromVector(const std::vector<T>& values) {
template <typename T>
inline Status CopyBufferFromVector(
- const std::vector<T>& values, std::shared_ptr<Buffer>* result) {
+ const std::vector<T>& values, MemoryPool* pool, std::shared_ptr<Buffer>* result) {
int64_t nbytes = static_cast<int>(values.size()) * sizeof(T);
- auto buffer = std::make_shared<PoolBuffer>(default_memory_pool());
+ auto buffer = std::make_shared<PoolBuffer>(pool);
RETURN_NOT_OK(buffer->Resize(nbytes));
memcpy(buffer->mutable_data(), values.data(), nbytes);
@@ -113,8 +113,9 @@ inline Status CopyBufferFromVector(
return Status::OK();
}
-static inline Status GetBitmapFromBoolVector(
- const std::vector<bool>& is_valid, std::shared_ptr<Buffer>* result) {
+template <typename T>
+static inline Status GetBitmapFromVector(
+ const std::vector<T>& is_valid, std::shared_ptr<Buffer>* result) {
size_t length = is_valid.size();
std::shared_ptr<MutableBuffer> buffer;
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index f8bccc7..c7a4415 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -40,7 +40,6 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type,
return values, type
-
def array(object sequence, DataType type=None, MemoryPool memory_pool=None,
size=None):
"""
@@ -302,6 +301,19 @@ cdef class Array:
"""
return [x.as_py() for x in self]
+ def validate(self):
+ """
+ Perform any validation checks implemented by
+ arrow::ValidateArray. Raises exception with error message if array does
+ not validate
+
+ Raises
+ ------
+ ArrowInvalid
+ """
+ with nogil:
+ check_status(ValidateArray(deref(self.ap)))
+
cdef class Tensor:
@@ -478,7 +490,27 @@ cdef class DecimalArray(FixedSizeBinaryArray):
cdef class ListArray(Array):
- pass
+
+ @staticmethod
+ def from_arrays(Array offsets, Array values, MemoryPool pool=None):
+ """
+ Construct ListArray from arrays of int32 offsets and values
+
+ Parameters
+ ----------
+ offset : Array (int32 type)
+ values : Array (any type)
+
+ Returns
+ -------
+ list_array : ListArray
+ """
+ cdef shared_ptr[CArray] out
+ cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
+ with nogil:
+ check_status(CListArray.FromArrays(
+ deref(offsets.ap), deref(values.ap), cpool, &out))
+ return pyarrow_wrap_array(out)
cdef class StringArray(Array):
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index dd791cd..44d83da 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -283,6 +283,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_string FormatValue(int i)
cdef cppclass CListArray" arrow::ListArray"(CArray):
+ @staticmethod
+ CStatus FromArrays(const CArray& offsets, const CArray& values,
+ CMemoryPool* pool, shared_ptr[CArray]* out)
+
const int32_t* raw_value_offsets()
int32_t value_offset(int i)
int32_t value_length(int i)
@@ -304,6 +308,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CArray] field(int pos)
const vector[shared_ptr[CArray]] fields()
+ CStatus ValidateArray(const CArray& array)
+
cdef cppclass CChunkedArray" arrow::ChunkedArray":
int64_t length()
int64_t null_count()
http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 1a0ee61..5a373b4 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -199,6 +199,18 @@ def test_dictionary_with_pandas():
tm.assert_series_equal(pd.Series(pandas2), pd.Series(ex_pandas2))
+def test_list_from_arrays():
+ offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
+ offsets = pa.Array.from_pandas(offsets_arr, type=pa.int32())
+ pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
+ values = pa.array(pyvalues, type=pa.binary())
+
+ result = pa.ListArray.from_arrays(offsets, values)
+ expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]])
+
+ assert result.equals(expected)
+
+
def test_simple_type_construction():
result = pa.lib.TimestampType()
with pytest.raises(TypeError):