You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/02/09 13:50:19 UTC

[GitHub] [arrow] lidavidm commented on a change in pull request #12116: ARROW-7051: [C++] Improve MakeArrayOfNull to support creation of multiple arrays

lidavidm commented on a change in pull request #12116:
URL: https://github.com/apache/arrow/pull/12116#discussion_r802665698



##########
File path: cpp/src/arrow/array/util.h
##########
@@ -37,7 +37,20 @@ namespace arrow {
 ARROW_EXPORT
 std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
 
-/// \brief Create a strongly-typed Array instance with all elements null
+/// \brief Create a strongly-typed mutable Array instance with all elements initially set
+/// to null
+/// \param[in] type the array type \param[in] length the array length
+/// \param[in] pool the memory pool to allocate memory from
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeMutableArrayOfNull(
+    const std::shared_ptr<DataType>& type, int64_t length,
+    MemoryPool* pool = default_memory_pool());
+
+/// \brief Create a strongly-typed immutable Array instance with all elements null
+///
+/// This function may reuse a single zero buffer, but may also defer to
+/// MakeArrayOfNull().

Review comment:
       Was this meant to be MakeMutableArrayOfNull()? It sounds a little odd for a function to defer to itself.

##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -534,6 +533,151 @@ class NullArrayFactory {
   std::shared_ptr<Buffer> buffer_;
 };
 
+// mutable version of ImmutableNullArrayFactory
+class NullArrayFactory {
+ private:
+  Result<std::shared_ptr<Buffer>> CreateZeroByteBuffer(size_t scalar_size_bytes) const {
+    ARROW_ASSIGN_OR_RAISE(auto buffer,
+                          AllocateBuffer(length_ * scalar_size_bytes, pool_));
+    std::memset(buffer->mutable_data(), 0, buffer->size());
+    return std::shared_ptr<Buffer>(std::move(buffer));
+  }
+
+  Result<std::shared_ptr<Buffer>> CreateZeroOffsetBuffer(size_t index_size_bytes) const {
+    ARROW_ASSIGN_OR_RAISE(auto buffer,
+                          AllocateBuffer((length_ + 1) * index_size_bytes, pool_));
+    std::memset(buffer->mutable_data(), 0, buffer->size());
+    return std::shared_ptr<Buffer>(std::move(buffer));
+  }
+
+  Result<std::shared_ptr<Buffer>> CreateZeroBitBuffer(size_t scalar_size_bits) const {
+    ARROW_ASSIGN_OR_RAISE(auto buffer,
+                          AllocateBuffer((length_ * scalar_size_bits + 7) / 8, pool_));
+    std::memset(buffer->mutable_data(), 0, buffer->size());
+    return std::shared_ptr<Buffer>(std::move(buffer));
+  }
+
+  static Result<std::shared_ptr<Buffer>> CreateEmptyBuffer() { return AllocateBuffer(0); }
+
+ public:
+  NullArrayFactory(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                   int64_t length)
+      : pool_(pool), type_(type), length_(length) {}
+
+  Result<std::shared_ptr<ArrayData>> Create() {
+    std::vector<std::shared_ptr<ArrayData>> child_data(type_->num_fields());
+    ARROW_ASSIGN_OR_RAISE(auto validity, CreateZeroBitBuffer(1));
+    out_ = ArrayData::Make(type_, length_, {validity}, child_data, length_, 0);
+    RETURN_NOT_OK(VisitTypeInline(*type_, this));
+    return out_;
+  }
+
+  Status Visit(const NullType&) {
+    out_->buffers.resize(1, nullptr);
+    return Status::OK();
+  }
+
+  Status Visit(const FixedWidthType& type) {
+    out_->buffers.resize(2);
+    // values
+    ARROW_ASSIGN_OR_RAISE(out_->buffers[1], CreateZeroBitBuffer(type.bit_width()));
+    return Status::OK();
+  }
+
+  template <typename T>
+  enable_if_base_binary<T, Status> Visit(const T&) {
+    out_->buffers.resize(3);
+    // offsets
+    ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
+                          CreateZeroOffsetBuffer(sizeof(typename T::offset_type)));
+    // values
+    ARROW_ASSIGN_OR_RAISE(out_->buffers[2], CreateEmptyBuffer());
+    return Status::OK();
+  }
+
+  template <typename T>
+  enable_if_var_size_list<T, Status> Visit(const T& type) {
+    out_->buffers.resize(2);
+    // offsets
+    ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
+                          CreateZeroOffsetBuffer(sizeof(typename T::offset_type)));
+    // values
+    ARROW_ASSIGN_OR_RAISE(out_->child_data[0], CreateChild(type, 0, /*length=*/0));
+    return Status::OK();
+  }
+
+  Status Visit(const FixedSizeListType& type) {
+    ARROW_ASSIGN_OR_RAISE(out_->child_data[0],
+                          CreateChild(type, 0, length_ * type.list_size()));
+    return Status::OK();
+  }
+
+  Status Visit(const StructType& type) {
+    for (int i = 0; i < type_->num_fields(); ++i) {
+      ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(type, i, length_));
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const UnionType& type) {
+    out_->buffers.resize(2);
+
+    // First buffer is always null
+    out_->buffers[0] = nullptr;
+
+    // type ID buffer
+    ARROW_ASSIGN_OR_RAISE(out_->buffers[1], AllocateBuffer(length_, pool_));
+    std::memset(out_->buffers[1]->mutable_data(), type.type_codes()[0], length_);
+
+    // For sparse unions, we now create children with the same length as the
+    // parent
+    int64_t child_length = length_;
+    if (type.mode() == UnionMode::DENSE) {
+      // For dense unions, we set the offsets to all zero and create children
+      // with length 1
+      out_->buffers.resize(3);
+      ARROW_ASSIGN_OR_RAISE(out_->buffers[2], CreateZeroByteBuffer(sizeof(int32_t)));
+
+      child_length = 1;
+    }
+    for (int i = 0; i < type_->num_fields(); ++i) {
+      ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(type, i, child_length));
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryType& type) {
+    out_->buffers.resize(2);
+    // dictionary indices
+    ARROW_ASSIGN_OR_RAISE(out_->buffers[1], CreateZeroBitBuffer(type.bit_width()));
+    // dictionary data
+    ARROW_ASSIGN_OR_RAISE(auto typed_null_dict, MakeArrayOfNull(type.value_type(), 0));
+    out_->dictionary = typed_null_dict->data();
+    return Status::OK();
+  }
+
+  Status Visit(const ExtensionType& type) {
+    out_->child_data.resize(type.storage_type()->num_fields());
+    RETURN_NOT_OK(VisitTypeInline(*type.storage_type(), this));
+    return Status::OK();
+  }
+
+  Status Visit(const DataType& type) {
+    return Status::NotImplemented("construction of all-null ", type);
+  }
+
+  Result<std::shared_ptr<ArrayData>> CreateChild(const DataType& type, int i,
+                                                 int64_t length) {
+    ImmutableNullArrayFactory child_factory(pool_, type.field(i)->type(), length);

Review comment:
       Should the child also use NullArrayFactory?

##########
File path: cpp/src/arrow/memory_pool.h
##########
@@ -63,6 +63,42 @@ class MemoryPoolStats {
 /// take care of the required 64-byte alignment.
 class ARROW_EXPORT MemoryPool {
  public:
+  class ARROW_EXPORT ImmutableZeros {

Review comment:
       I'm probably missing something, but why is the existing Buffer interface not sufficient for this?

##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -534,6 +533,151 @@ class NullArrayFactory {
   std::shared_ptr<Buffer> buffer_;
 };
 
+// mutable version of ImmutableNullArrayFactory
+class NullArrayFactory {
+ private:
+  Result<std::shared_ptr<Buffer>> CreateZeroByteBuffer(size_t scalar_size_bytes) const {
+    ARROW_ASSIGN_OR_RAISE(auto buffer,
+                          AllocateBuffer(length_ * scalar_size_bytes, pool_));
+    std::memset(buffer->mutable_data(), 0, buffer->size());
+    return std::shared_ptr<Buffer>(std::move(buffer));
+  }
+
+  Result<std::shared_ptr<Buffer>> CreateZeroOffsetBuffer(size_t index_size_bytes) const {
+    ARROW_ASSIGN_OR_RAISE(auto buffer,
+                          AllocateBuffer((length_ + 1) * index_size_bytes, pool_));
+    std::memset(buffer->mutable_data(), 0, buffer->size());
+    return std::shared_ptr<Buffer>(std::move(buffer));
+  }
+
+  Result<std::shared_ptr<Buffer>> CreateZeroBitBuffer(size_t scalar_size_bits) const {
+    ARROW_ASSIGN_OR_RAISE(auto buffer,
+                          AllocateBuffer((length_ * scalar_size_bits + 7) / 8, pool_));

Review comment:
       Could use BytesForBits here: https://github.com/apache/arrow/blob/d59dbbc36c7950e58332d081d47c2d43ea898215/cpp/src/arrow/util/bit_util.h#L82-L86

##########
File path: cpp/src/arrow/array/util.h
##########
@@ -46,7 +59,20 @@ Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>&
                                                int64_t length,
                                                MemoryPool* pool = default_memory_pool());
 
-/// \brief Create an Array instance whose slots are the given scalar
+/// \brief Create a mutable Array instance whose slots are initialized with the given
+/// scalar
+/// \param[in] scalar the value with which to fill the array
+/// \param[in] length the array length
+/// \param[in] pool the memory pool to allocate memory from
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeMutableArrayFromScalar(
+    const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
+
+/// \brief Create an immutable Array instance whose slots are set to the given scalar
+///
+/// This function may reuse buffers if they contain the same (repeated) value to save
+/// memory, but may also defer to MakeArrayFromScalar().

Review comment:
       Ditto here: was this meant to be MakeMutableArrayFromScalar()? As written, the docstring says the function may defer to itself.

##########
File path: cpp/src/arrow/memory_pool.h
##########
@@ -109,6 +157,14 @@ class ARROW_EXPORT MemoryPool {
 
  protected:
   MemoryPool() = default;
+
+  /// Free a memory region allocated by GetImmutableZeros().
+  ///
+  /// @param buffer Pointer to the start of the allocated memory region
+  /// @param size Allocated size located at buffer. An allocator implementation

Review comment:
       Nit: most docstrings use the \param syntax rather than @param.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org