You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/01/24 03:03:54 UTC

[arrow] branch master updated: ARROW-1712: [C++] Add method to BinaryBuilder to reserve space for value data

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0a49022  ARROW-1712: [C++] Add method to BinaryBuilder to reserve space for value data
0a49022 is described below

commit 0a490222c4078a7cb0ff085cd5c9884fdda57998
Author: Panchen Xue <pa...@gmail.com>
AuthorDate: Tue Jan 23 22:03:50 2018 -0500

    ARROW-1712: [C++] Add method to BinaryBuilder to reserve space for value data
    
    Modified BinaryBuilder::Resize(int64_t) so that when building BinaryArrays with a known size, space is also reserved for value_data_builder_ to prevent internal reallocation.
    
    Author: Panchen Xue <pa...@gmail.com>
    
    Closes #1481 from xuepanchen/master and squashes the following commits:
    
    707b67bf [Panchen Xue] ARROW-1712: [C++] Fix lint errors
    360e6018 [Panchen Xue] Merge branch 'master' of https://github.com/xuepanchen/arrow
    d4bbd151 [Panchen Xue] ARROW-1712: [C++] Modify test case for BinaryBuilder::ReserveData() and change arguments for offsets_builder_.Resize()
    77f8f3c1 [Panchen Xue] Merge pull request #5 from apache/master
    bc5db7d3 [Panchen Xue] ARROW-1712: [C++] Remove unneeded data member in BinaryBuilder and modify test case
    5a5b70e2 [Panchen Xue] Merge pull request #4 from apache/master
    8e4c8925 [Panchen Xue] Merge pull request #3 from xuepanchen/xuepanchen-arrow-1712
    d3c8202b [Panchen Xue] ARROW-1945: [C++] Fix a small typo
    0b078955 [Panchen Xue] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data
    18f90fb8 [Panchen Xue] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data
    bbc65270 [Panchen Xue] ARROW-1945: [C++] Update test case for BinaryBuild data value space reservation
    15e045cc [Panchen Xue] Add test case for array-test.cc
    5a5593e5 [Panchen Xue] Update again ReserveData(int64_t) method for BinaryBuilder
    9b5e8059 [Panchen Xue] Update ReserveData(int64_t) method signature for BinaryBuilder
    8dd5eaa9 [Panchen Xue] Update builder.cc
    b002e0bd [Panchen Xue] Remove override keyword from ReserveData(int64_t) method for BinaryBuilder
    de318f47 [Panchen Xue] Implement ReserveData(int64_t) method for BinaryBuilder
    e0434e61 [Panchen Xue] Add ReserveData(int64_t) and value_data_capacity() for methods for BinaryBuilder
    5ebfb320 [Panchen Xue] Add capacity() method for TypedBufferBuilder
    5b73c1c5 [Panchen Xue] Update again BinaryBuilder::Resize(int64_t capacity) in builder.cc
    d021c54b [Panchen Xue] Merge pull request #2 from xuepanchen/xuepanchen-arrow-1712
    232024e3 [Panchen Xue] Update BinaryBuilder::Resize(int64_t capacity) in builder.cc
    c2f8dc4e [Panchen Xue] Merge pull request #1 from apache/master
---
 cpp/src/arrow/array-test.cc | 39 +++++++++++++++++++++++++++++++++++++++
 cpp/src/arrow/buffer.h      |  1 +
 cpp/src/arrow/builder.cc    | 18 ++++++++++++++----
 cpp/src/arrow/builder.h     |  5 +++++
 4 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 7ff3261..c53da85 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1155,6 +1155,45 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
   }
 }
 
+TEST_F(TestBinaryBuilder, TestCapacityReserve) {
+  vector<string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", "dddddddddd"};
+  int N = static_cast<int>(strings.size());
+  int reps = 15;
+  int64_t length = 0;
+  int64_t capacity = 1000;
+  int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
+
+  ASSERT_OK(builder_->ReserveData(capacity));
+
+  ASSERT_EQ(length, builder_->value_data_length());
+  ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+
+  for (int j = 0; j < reps; ++j) {
+    for (int i = 0; i < N; ++i) {
+      ASSERT_OK(builder_->Append(strings[i]));
+      length += static_cast<int>(strings[i].size());
+
+      ASSERT_EQ(length, builder_->value_data_length());
+      ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+    }
+  }
+
+  int extra_capacity = 500;
+  expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity);
+
+  ASSERT_OK(builder_->ReserveData(extra_capacity));
+
+  ASSERT_EQ(length, builder_->value_data_length());
+  ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+
+  Done();
+
+  ASSERT_EQ(reps * N, result_->length());
+  ASSERT_EQ(0, result_->null_count());
+  ASSERT_EQ(reps * 40, result_->value_data()->size());
+  ASSERT_EQ(expected_capacity, result_->value_data()->capacity());
+}
+
 TEST_F(TestBinaryBuilder, TestZeroLength) {
   // All buffers are null
   Done();
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index b50b1a1..44c352a 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -333,6 +333,7 @@ class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {
 
   const T* data() const { return reinterpret_cast<const T*>(data_); }
   int64_t length() const { return size_ / sizeof(T); }
+  int64_t capacity() const { return capacity_ / sizeof(T); }
 };
 
 /// \brief Allocate a fixed size mutable buffer from a memory pool
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index de132b5..db90152 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -1165,13 +1165,13 @@ Status ListBuilder::Init(int64_t elements) {
   DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
-  return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
+  return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
 }
 
 Status ListBuilder::Resize(int64_t capacity) {
   DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
   // one more then requested for offsets
-  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
   return ArrayBuilder::Resize(capacity);
 }
 
@@ -1216,16 +1216,26 @@ Status BinaryBuilder::Init(int64_t elements) {
   DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
-  return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
+  return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
 }
 
 Status BinaryBuilder::Resize(int64_t capacity) {
   DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
   // one more then requested for offsets
-  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
   return ArrayBuilder::Resize(capacity);
 }
 
+Status BinaryBuilder::ReserveData(int64_t elements) {
+  if (value_data_length() + elements > value_data_capacity()) {
+    if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) {
+      return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary");
+    }
+    RETURN_NOT_OK(value_data_builder_.Reserve(elements));
+  }
+  return Status::OK();
+}
+
 Status BinaryBuilder::AppendNextOffset() {
   const int64_t num_bytes = value_data_builder_.length();
   if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index ce7b8cd..d1611f6 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -682,10 +682,15 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
 
   Status Init(int64_t elements) override;
   Status Resize(int64_t capacity) override;
+  /// \brief Ensures there is enough allocated capacity to append the indicated
+  /// number of bytes to the value data buffer without additional allocations
+  Status ReserveData(int64_t elements);
   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
   /// \return size of values buffer so far
   int64_t value_data_length() const { return value_data_builder_.length(); }
+  /// \return capacity of values buffer
+  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
 
   /// Temporary access to a value.
   ///

-- 
To stop receiving notification emails like this one, please contact
wesm@apache.org.