You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/04 13:02:43 UTC

arrow git commit: ARROW-1383: [C++] Add vector append variant to primitive array builders that accepts std::vector

Repository: arrow
Updated Branches:
  refs/heads/master 9968d95df -> e5aeb9001


ARROW-1383: [C++] Add vector append variant to primitive array builders that accepts std::vector<bool>

Other libraries may have null indicators in the form of bits or bytes. `std::vector<bool>` is a bit-packed container like Arrow's internal representation.

Author: Wes McKinney <we...@twosigma.com>

Closes #1033 from wesm/ARROW-1383 and squashes the following commits:

0f05f2d0 [Wes McKinney] Fix another MSVC compiler warning
cba48fc6 [Wes McKinney] More precise bool checks in BooleanBuilder::Append
6f97b063 [Wes McKinney] Add variant of vector Append in primitive array builders that accepts std::vector<bool>


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e5aeb900
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e5aeb900
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e5aeb900

Branch: refs/heads/master
Commit: e5aeb900161a068e665792a678729fee4e69f13c
Parents: 9968d95
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Sep 4 09:02:41 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Sep 4 09:02:41 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/array-test.cc | 69 ++++++++++++++++++++++++++++++++
 cpp/src/arrow/builder.cc    | 86 +++++++++++++++++++++++++++++++++++-----
 cpp/src/arrow/builder.h     | 75 ++++++++++++++++++++++++++---------
 3 files changed, 202 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/e5aeb900/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 0eb19d3..3a3d28c 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -593,6 +593,42 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) {
   this->Check(this->builder_nn_, false);
 }
 
+TYPED_TEST(TestPrimitiveBuilder, TestAppendVectorStdBool) {
+  // ARROW-1383
+  DECL_T();
+
+  int64_t size = 10000;
+  this->RandomData(size);
+
+  vector<T>& draws = this->draws_;
+
+  std::vector<bool> is_valid;
+
+  // first slug
+  int64_t K = 1000;
+
+  for (int64_t i = 0; i < K; ++i) {
+    is_valid.push_back(this->valid_bytes_[i] != 0);
+  }
+  ASSERT_OK(this->builder_->Append(draws.data(), K, is_valid));
+
+  ASSERT_EQ(1000, this->builder_->length());
+  ASSERT_EQ(1024, this->builder_->capacity());
+
+  // Append the next 9000
+  is_valid.clear();
+  for (int64_t i = K; i < size; ++i) {
+    is_valid.push_back(this->valid_bytes_[i] != 0);
+  }
+
+  ASSERT_OK(this->builder_->Append(draws.data() + K, size - K, is_valid));
+
+  ASSERT_EQ(size, this->builder_->length());
+  ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity());
+
+  this->Check(this->builder_, true);
+}
+
 TYPED_TEST(TestPrimitiveBuilder, TestAdvance) {
   int64_t n = 1000;
   ASSERT_OK(this->builder_->Reserve(n));
@@ -630,6 +666,39 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) {
   ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity());
 }
 
+TEST(TestBooleanBuilder, TestStdBoolVectorAppend) {
+  BooleanBuilder builder;
+
+  std::vector<bool> values, is_valid;
+
+  const int length = 10000;
+  test::random_is_valid(length, 0.5, &values);
+  test::random_is_valid(length, 0.1, &is_valid);
+
+  const int chunksize = 1000;
+  for (int chunk = 0; chunk < length / chunksize; ++chunk) {
+    std::vector<bool> chunk_values, chunk_is_valid;
+    for (int i = chunk * chunksize; i < (chunk + 1) * chunksize; ++i) {
+      chunk_values.push_back(values[i]);
+      chunk_is_valid.push_back(is_valid[i]);
+    }
+    ASSERT_OK(builder.Append(chunk_values, chunk_is_valid));
+  }
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  const auto& arr = static_cast<const BooleanArray&>(*result);
+  for (int i = 0; i < length; ++i) {
+    if (is_valid[i]) {
+      ASSERT_FALSE(arr.IsNull(i));
+      ASSERT_EQ(values[i], arr.Value(i));
+    } else {
+      ASSERT_TRUE(arr.IsNull(i));
+    }
+  }
+}
+
 template <typename TYPE>
 void CheckSliceApproxEquals() {
   using T = typename TYPE::c_type;

http://git-wip-us.apache.org/repos/asf/arrow/blob/e5aeb900/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index a1d2366..24d26d4 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -154,6 +154,37 @@ void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t leng
   length_ += length;
 }
 
+void ArrayBuilder::UnsafeAppendToBitmap(const std::vector<bool>& is_valid) {
+  int64_t byte_offset = length_ / 8;
+  int64_t bit_offset = length_ % 8;
+  uint8_t bitset = null_bitmap_data_[byte_offset];
+
+  const int64_t length = static_cast<int64_t>(is_valid.size());
+
+  for (int64_t i = 0; i < length; ++i) {
+    if (bit_offset == 8) {
+      bit_offset = 0;
+      null_bitmap_data_[byte_offset] = bitset;
+      byte_offset++;
+      // TODO: Except for the last byte, this shouldn't be needed
+      bitset = null_bitmap_data_[byte_offset];
+    }
+
+    if (is_valid[i]) {
+      bitset |= BitUtil::kBitmask[bit_offset];
+    } else {
+      bitset &= BitUtil::kFlippedBitmask[bit_offset];
+      ++null_count_;
+    }
+
+    bit_offset++;
+  }
+  if (bit_offset != 0) {
+    null_bitmap_data_[byte_offset] = bitset;
+  }
+  length_ += length;
+}
+
 void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
   const int64_t new_length = length + length_;
 
@@ -244,6 +275,23 @@ Status PrimitiveBuilder<T>::Append(const value_type* values, int64_t length,
 }
 
 template <typename T>
+Status PrimitiveBuilder<T>::Append(const value_type* values, int64_t length,
+                                   const std::vector<bool>& is_valid) {
+  RETURN_NOT_OK(Reserve(length));
+  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
+
+  if (length > 0) {
+    std::memcpy(raw_data_ + length_, values,
+                static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
+  }
+
+  // length_ is update by these
+  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+
+  return Status::OK();
+}
+
+template <typename T>
 Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) {
   const int64_t bytes_required = TypeTraits<T>::bytes_required(length_);
   if (bytes_required > 0 && bytes_required < data_->size()) {
@@ -691,16 +739,7 @@ Status BooleanBuilder::Append(const uint8_t* values, int64_t length,
   RETURN_NOT_OK(Reserve(length));
 
   for (int64_t i = 0; i < length; ++i) {
-    // Skip reading from unitialised memory
-    // TODO: This actually is only to keep valgrind happy but may or may not
-    // have a performance impact.
-    if ((valid_bytes != nullptr) && !valid_bytes[i]) continue;
-
-    if (values[i] > 0) {
-      BitUtil::SetBit(raw_data_, length_ + i);
-    } else {
-      BitUtil::ClearBit(raw_data_, length_ + i);
-    }
+    BitUtil::SetBitTo(raw_data_, length_ + i, values[i] != 0);
   }
 
   // this updates length_
@@ -708,6 +747,33 @@ Status BooleanBuilder::Append(const uint8_t* values, int64_t length,
   return Status::OK();
 }
 
+Status BooleanBuilder::Append(const uint8_t* values, int64_t length,
+                              const std::vector<bool>& is_valid) {
+  RETURN_NOT_OK(Reserve(length));
+
+  for (int64_t i = 0; i < length; ++i) {
+    BitUtil::SetBitTo(raw_data_, length_ + i, values[i] != 0);
+  }
+
+  // this updates length_
+  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+  return Status::OK();
+}
+
+Status BooleanBuilder::Append(const std::vector<bool>& values,
+                              const std::vector<bool>& is_valid) {
+  const int64_t length = static_cast<int64_t>(values.size());
+  RETURN_NOT_OK(Reserve(length));
+
+  for (int64_t i = 0; i < length; ++i) {
+    BitUtil::SetBitTo(raw_data_, length_ + i, values[i]);
+  }
+
+  // this updates length_
+  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+  return Status::OK();
+}
+
 // ----------------------------------------------------------------------
 // DictionaryBuilder
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/e5aeb900/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index d1b51d6..a649d62 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -83,9 +83,11 @@ class ARROW_EXPORT ArrayBuilder {
 
   /// Append to null bitmap
   Status AppendToBitmap(bool is_valid);
+
   /// Vector append. Treat each zero byte as a null.   If valid_bytes is null
   /// assume all of length bits are valid.
   Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
+
   /// Set the next length bits to not null (i.e. valid).
   Status SetNotNull(int64_t length);
 
@@ -151,6 +153,9 @@ class ARROW_EXPORT ArrayBuilder {
   // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
   // assume all of length bits are valid.
   void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
+
+  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
+
   // Set the next length bits to not null (i.e. valid).
   void UnsafeSetNotNull(int64_t length);
 
@@ -202,13 +207,24 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
 
   std::shared_ptr<Buffer> data() const { return data_; }
 
-  /// Vector append
-  ///
-  /// If passed, valid_bytes is of equal length to values, and any zero byte
-  /// will be considered as a null for that slot
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value)
+  /// \return Status
   Status Append(const value_type* values, int64_t length,
                 const uint8_t* valid_bytes = nullptr);
 
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status Append(const value_type* values, int64_t length,
+                const std::vector<bool>& is_valid);
+
   Status Finish(std::shared_ptr<Array>* out) override;
   Status Init(int64_t capacity) override;
 
@@ -395,17 +411,20 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase
     return Status::OK();
   }
 
-  /// Vector append
-  ///
-  /// If passed, valid_bytes is of equal length to values, and any zero byte
-  /// will be considered as a null for that slot
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value)
+  /// \return Status
   Status Append(const uint64_t* values, int64_t length,
                 const uint8_t* valid_bytes = nullptr);
 
-  Status ExpandIntSize(uint8_t new_int_size);
   Status Finish(std::shared_ptr<Array>* out) override;
 
  protected:
+  Status ExpandIntSize(uint8_t new_int_size);
+
   template <typename new_type, typename old_type>
   typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
   ExpandIntSizeInternal();
@@ -454,17 +473,20 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase
     return Status::OK();
   }
 
-  /// Vector append
-  ///
-  /// If passed, valid_bytes is of equal length to values, and any zero byte
-  /// will be considered as a null for that slot
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value)
+  /// \return Status
   Status Append(const int64_t* values, int64_t length,
                 const uint8_t* valid_bytes = nullptr);
 
-  Status ExpandIntSize(uint8_t new_int_size);
   Status Finish(std::shared_ptr<Array>* out) override;
 
  protected:
+  Status ExpandIntSize(uint8_t new_int_size);
+
   template <typename new_type, typename old_type>
   typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
   ExpandIntSizeInternal();
@@ -521,13 +543,30 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
 
   Status Append(const uint8_t val) { return Append(val != 0); }
 
-  /// Vector append
-  ///
-  /// If passed, valid_bytes is of equal length to values, and any zero byte
-  /// will be considered as a null for that slot
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous array of bytes (non-zero is 1)
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value)
+  /// \return Status
   Status Append(const uint8_t* values, int64_t length,
                 const uint8_t* valid_bytes = nullptr);
 
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status Append(const uint8_t* values, int64_t length, const std::vector<bool>& is_valid);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values an std::vector<bool> indicating true (1) or false
+  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status Append(const std::vector<bool>& values, const std::vector<bool>& is_valid);
+
   Status Finish(std::shared_ptr<Array>* out) override;
   Status Init(int64_t capacity) override;