You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/04 13:02:43 UTC
arrow git commit: ARROW-1383: [C++] Add vector append variant to
primitive array builders that accepts std::vector
Repository: arrow
Updated Branches:
refs/heads/master 9968d95df -> e5aeb9001
ARROW-1383: [C++] Add vector append variant to primitive array builders that accepts std::vector<bool>
Other libraries may have null indicators in the form of bits or bytes. `std::vector<bool>` is a bit-packed container like Arrow's internal representation.
Author: Wes McKinney <we...@twosigma.com>
Closes #1033 from wesm/ARROW-1383 and squashes the following commits:
0f05f2d0 [Wes McKinney] Fix another MSVC compiler warning
cba48fc6 [Wes McKinney] More precise bool checks in BooleanBuilder::Append
6f97b063 [Wes McKinney] Add variant of vector Append in primitive array builders that accepts std::vector<bool>
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e5aeb900
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e5aeb900
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e5aeb900
Branch: refs/heads/master
Commit: e5aeb900161a068e665792a678729fee4e69f13c
Parents: 9968d95
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Sep 4 09:02:41 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Sep 4 09:02:41 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/array-test.cc | 69 ++++++++++++++++++++++++++++++++
cpp/src/arrow/builder.cc | 86 +++++++++++++++++++++++++++++++++++-----
cpp/src/arrow/builder.h | 75 ++++++++++++++++++++++++++---------
3 files changed, 202 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/e5aeb900/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 0eb19d3..3a3d28c 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -593,6 +593,42 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) {
this->Check(this->builder_nn_, false);
}
+TYPED_TEST(TestPrimitiveBuilder, TestAppendVectorStdBool) {
+ // ARROW-1383
+ DECL_T();
+
+ int64_t size = 10000;
+ this->RandomData(size);
+
+ vector<T>& draws = this->draws_;
+
+ std::vector<bool> is_valid;
+
+ // first slug
+ int64_t K = 1000;
+
+ for (int64_t i = 0; i < K; ++i) {
+ is_valid.push_back(this->valid_bytes_[i] != 0);
+ }
+ ASSERT_OK(this->builder_->Append(draws.data(), K, is_valid));
+
+ ASSERT_EQ(1000, this->builder_->length());
+ ASSERT_EQ(1024, this->builder_->capacity());
+
+ // Append the next 9000
+ is_valid.clear();
+ for (int64_t i = K; i < size; ++i) {
+ is_valid.push_back(this->valid_bytes_[i] != 0);
+ }
+
+ ASSERT_OK(this->builder_->Append(draws.data() + K, size - K, is_valid));
+
+ ASSERT_EQ(size, this->builder_->length());
+ ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity());
+
+ this->Check(this->builder_, true);
+}
+
TYPED_TEST(TestPrimitiveBuilder, TestAdvance) {
int64_t n = 1000;
ASSERT_OK(this->builder_->Reserve(n));
@@ -630,6 +666,39 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) {
ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity());
}
+TEST(TestBooleanBuilder, TestStdBoolVectorAppend) {
+ BooleanBuilder builder;
+
+ std::vector<bool> values, is_valid;
+
+ const int length = 10000;
+ test::random_is_valid(length, 0.5, &values);
+ test::random_is_valid(length, 0.1, &is_valid);
+
+ const int chunksize = 1000;
+ for (int chunk = 0; chunk < length / chunksize; ++chunk) {
+ std::vector<bool> chunk_values, chunk_is_valid;
+ for (int i = chunk * chunksize; i < (chunk + 1) * chunksize; ++i) {
+ chunk_values.push_back(values[i]);
+ chunk_is_valid.push_back(is_valid[i]);
+ }
+ ASSERT_OK(builder.Append(chunk_values, chunk_is_valid));
+ }
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ const auto& arr = static_cast<const BooleanArray&>(*result);
+ for (int i = 0; i < length; ++i) {
+ if (is_valid[i]) {
+ ASSERT_FALSE(arr.IsNull(i));
+ ASSERT_EQ(values[i], arr.Value(i));
+ } else {
+ ASSERT_TRUE(arr.IsNull(i));
+ }
+ }
+}
+
template <typename TYPE>
void CheckSliceApproxEquals() {
using T = typename TYPE::c_type;
http://git-wip-us.apache.org/repos/asf/arrow/blob/e5aeb900/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index a1d2366..24d26d4 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -154,6 +154,37 @@ void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t leng
length_ += length;
}
+void ArrayBuilder::UnsafeAppendToBitmap(const std::vector<bool>& is_valid) {
+ int64_t byte_offset = length_ / 8;
+ int64_t bit_offset = length_ % 8;
+ uint8_t bitset = null_bitmap_data_[byte_offset];
+
+ const int64_t length = static_cast<int64_t>(is_valid.size());
+
+ for (int64_t i = 0; i < length; ++i) {
+ if (bit_offset == 8) {
+ bit_offset = 0;
+ null_bitmap_data_[byte_offset] = bitset;
+ byte_offset++;
+ // TODO: Except for the last byte, this shouldn't be needed
+ bitset = null_bitmap_data_[byte_offset];
+ }
+
+ if (is_valid[i]) {
+ bitset |= BitUtil::kBitmask[bit_offset];
+ } else {
+ bitset &= BitUtil::kFlippedBitmask[bit_offset];
+ ++null_count_;
+ }
+
+ bit_offset++;
+ }
+ if (bit_offset != 0) {
+ null_bitmap_data_[byte_offset] = bitset;
+ }
+ length_ += length;
+}
+
void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
const int64_t new_length = length + length_;
@@ -244,6 +275,23 @@ Status PrimitiveBuilder<T>::Append(const value_type* values, int64_t length,
}
template <typename T>
+Status PrimitiveBuilder<T>::Append(const value_type* values, int64_t length,
+ const std::vector<bool>& is_valid) {
+ RETURN_NOT_OK(Reserve(length));
+ DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
+
+ if (length > 0) {
+ std::memcpy(raw_data_ + length_, values,
+ static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
+ }
+
+ // length_ is update by these
+ ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+
+ return Status::OK();
+}
+
+template <typename T>
Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) {
const int64_t bytes_required = TypeTraits<T>::bytes_required(length_);
if (bytes_required > 0 && bytes_required < data_->size()) {
@@ -691,16 +739,7 @@ Status BooleanBuilder::Append(const uint8_t* values, int64_t length,
RETURN_NOT_OK(Reserve(length));
for (int64_t i = 0; i < length; ++i) {
- // Skip reading from unitialised memory
- // TODO: This actually is only to keep valgrind happy but may or may not
- // have a performance impact.
- if ((valid_bytes != nullptr) && !valid_bytes[i]) continue;
-
- if (values[i] > 0) {
- BitUtil::SetBit(raw_data_, length_ + i);
- } else {
- BitUtil::ClearBit(raw_data_, length_ + i);
- }
+ BitUtil::SetBitTo(raw_data_, length_ + i, values[i] != 0);
}
// this updates length_
@@ -708,6 +747,33 @@ Status BooleanBuilder::Append(const uint8_t* values, int64_t length,
return Status::OK();
}
+Status BooleanBuilder::Append(const uint8_t* values, int64_t length,
+ const std::vector<bool>& is_valid) {
+ RETURN_NOT_OK(Reserve(length));
+
+ for (int64_t i = 0; i < length; ++i) {
+ BitUtil::SetBitTo(raw_data_, length_ + i, values[i] != 0);
+ }
+
+ // this updates length_
+ ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+ return Status::OK();
+}
+
+Status BooleanBuilder::Append(const std::vector<bool>& values,
+ const std::vector<bool>& is_valid) {
+ const int64_t length = static_cast<int64_t>(values.size());
+ RETURN_NOT_OK(Reserve(length));
+
+ for (int64_t i = 0; i < length; ++i) {
+ BitUtil::SetBitTo(raw_data_, length_ + i, values[i]);
+ }
+
+ // this updates length_
+ ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+ return Status::OK();
+}
+
// ----------------------------------------------------------------------
// DictionaryBuilder
http://git-wip-us.apache.org/repos/asf/arrow/blob/e5aeb900/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index d1b51d6..a649d62 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -83,9 +83,11 @@ class ARROW_EXPORT ArrayBuilder {
/// Append to null bitmap
Status AppendToBitmap(bool is_valid);
+
/// Vector append. Treat each zero byte as a null. If valid_bytes is null
/// assume all of length bits are valid.
Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
+
/// Set the next length bits to not null (i.e. valid).
Status SetNotNull(int64_t length);
@@ -151,6 +153,9 @@ class ARROW_EXPORT ArrayBuilder {
// Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
// assume all of length bits are valid.
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
+
+ void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
+
// Set the next length bits to not null (i.e. valid).
void UnsafeSetNotNull(int64_t length);
@@ -202,13 +207,24 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
std::shared_ptr<Buffer> data() const { return data_; }
- /// Vector append
- ///
- /// If passed, valid_bytes is of equal length to values, and any zero byte
- /// will be considered as a null for that slot
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value)
+ /// \return Status
Status Append(const value_type* values, int64_t length,
const uint8_t* valid_bytes = nullptr);
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+ /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+ /// (0). Equal in length to values
+ /// \return Status
+ Status Append(const value_type* values, int64_t length,
+ const std::vector<bool>& is_valid);
+
Status Finish(std::shared_ptr<Array>* out) override;
Status Init(int64_t capacity) override;
@@ -395,17 +411,20 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase
return Status::OK();
}
- /// Vector append
- ///
- /// If passed, valid_bytes is of equal length to values, and any zero byte
- /// will be considered as a null for that slot
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value)
+ /// \return Status
Status Append(const uint64_t* values, int64_t length,
const uint8_t* valid_bytes = nullptr);
- Status ExpandIntSize(uint8_t new_int_size);
Status Finish(std::shared_ptr<Array>* out) override;
protected:
+ Status ExpandIntSize(uint8_t new_int_size);
+
template <typename new_type, typename old_type>
typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
ExpandIntSizeInternal();
@@ -454,17 +473,20 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase
return Status::OK();
}
- /// Vector append
- ///
- /// If passed, valid_bytes is of equal length to values, and any zero byte
- /// will be considered as a null for that slot
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value)
+ /// \return Status
Status Append(const int64_t* values, int64_t length,
const uint8_t* valid_bytes = nullptr);
- Status ExpandIntSize(uint8_t new_int_size);
Status Finish(std::shared_ptr<Array>* out) override;
protected:
+ Status ExpandIntSize(uint8_t new_int_size);
+
template <typename new_type, typename old_type>
typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
ExpandIntSizeInternal();
@@ -521,13 +543,30 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
Status Append(const uint8_t val) { return Append(val != 0); }
- /// Vector append
- ///
- /// If passed, valid_bytes is of equal length to values, and any zero byte
- /// will be considered as a null for that slot
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous array of bytes (non-zero is 1)
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value)
+ /// \return Status
Status Append(const uint8_t* values, int64_t length,
const uint8_t* valid_bytes = nullptr);
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+ /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+ /// (0). Equal in length to values
+ /// \return Status
+ Status Append(const uint8_t* values, int64_t length, const std::vector<bool>& is_valid);
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values an std::vector<bool> indicating true (1) or false
+ /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+ /// (0). Equal in length to values
+ /// \return Status
+ Status Append(const std::vector<bool>& values, const std::vector<bool>& is_valid);
+
Status Finish(std::shared_ptr<Array>* out) override;
Status Init(int64_t capacity) override;