You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/06/16 05:15:47 UTC

[GitHub] [arrow] nirandaperera opened a new pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

nirandaperera opened a new pull request #10538:
URL: https://github.com/apache/arrow/pull/10538


   This PR adds variable binary type support 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
lidavidm commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r662573252



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());
+    if (!cond.is_valid) {
+      // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+      return Status::OK();
+    }
+
+    const auto& valid_data = cond.value ? left : right;
+    if (valid_data.is_array()) {
+      *out = valid_data;
+    } else {
+      // valid data is a scalar that needs to be broadcasted
+      ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+                                                      ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively
+    int64_t data_buff_alloc = std::max(left_offsets[left.length] - left_offsets[0],
+                                       right_offsets[right.length] - right_offsets[0]);
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> out_data_buf,
+                          ctx->Allocate(data_buff_alloc));
+    uint8_t* out_data = out_data_buf->mutable_data();
+
+    RunIfElseLoop(
+        cond,
+        [&](int64_t offset, int64_t length) {  // from left bulk
+          auto bytes_written = left_offsets[offset + length] - left_offsets[offset];
+          std::memcpy(out_data + out_offsets[offset], left_data + left_offsets[offset],
+                      bytes_written);
+          // normalize the out_offsets by reducing input start offset, and adding the
+          // offset upto the word
+          std::transform(left_offsets + offset + 1, left_offsets + offset + length + 1,
+                         out_offsets + offset + 1, [&](const OffsetType& src_offset) {
+                           return src_offset - left_offsets[offset] + out_offsets[offset];
+                         });
+        },
+        [&](int64_t offset, int64_t length) {  // from right bulk
+          auto bytes_written = right_offsets[offset + length] - right_offsets[offset];
+          std::memcpy(out_data + out_offsets[offset], right_data + right_offsets[offset],
+                      bytes_written);
+          // normalize the out_offsets by reducing input start offset, and adding the
+          // offset upto the word
+          std::transform(right_offsets + offset + 1, right_offsets + offset + length + 1,
+                         out_offsets + offset + 1, [&](const OffsetType& src_offset) {
+                           return src_offset - right_offsets[offset] +
+                                  out_offsets[offset];
+                         });
+        },
+        [&](int64_t offset) {  // left each
+          auto bytes_written = left_offsets[offset + 1] - left_offsets[offset];
+          std::memcpy(out_data + out_offsets[offset], left_data + left_offsets[offset],
+                      bytes_written);
+          out_offsets[offset + 1] = out_offsets[offset] + bytes_written;
+        },
+        [&](int64_t offset) {  // right each
+          auto bytes_written = right_offsets[offset + 1] - right_offsets[offset];
+          std::memcpy(out_data + out_offsets[offset], right_data + right_offsets[offset],
+                      bytes_written);
+          out_offsets[offset + 1] = out_offsets[offset] + bytes_written;
+        });
+    // resize the data buffer
+    ARROW_RETURN_NOT_OK(out_data_buf->Resize(out_offsets[cond.length]));
+
+    out->buffers[1] = std::move(out_offset_buf);
+    out->buffers[2] = std::move(out_data_buf);
+    return Status::OK();
+  }
+
+  // ASA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
+    size_t left_size = left_data.size();
+
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively
+    auto data_buff_alloc =
+        std::max(left_size * cond.length,

Review comment:
       It looks like clang wants you to pass the type explicitly here

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -607,15 +1050,16 @@ struct ResolveIfElseExec {
   }
 };
 
-template <>
-struct ResolveIfElseExec<NullType> {
+template <typename AllocateMem>
+struct ResolveIfElseExec<NullType, AllocateMem> {
   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    if (batch[0].is_scalar()) {
+    // if all are scalars, return a null scalar
+    if (batch[0].is_scalar() && batch[1].is_scalar() && batch[2].is_scalar()) {
       *out = MakeNullScalar(null());
     } else {
-      const std::shared_ptr<ArrayData>& cond_array = batch[0].array();
-      ARROW_ASSIGN_OR_RAISE(
-          *out, MakeArrayOfNull(null(), cond_array->length, ctx->memory_pool()));
+      int64_t len =
+          std::max(batch[0].length(), std::max(batch[1].length(), batch[2].length()));

Review comment:
       In this case it should just be `batch.length`

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());
+    if (!cond.is_valid) {
+      // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+      return Status::OK();
+    }
+
+    const auto& valid_data = cond.value ? left : right;
+    if (valid_data.is_array()) {
+      *out = valid_data;
+    } else {
+      // valid data is a scalar that needs to be broadcasted
+      ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+                                                      ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively

Review comment:
       Note that for 'coalesce' and 'choose', what I've gone with is using a builder (with ReserveData) for binary types, and using something akin to your 'handlebulk' only for fixed-width types.

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());

Review comment:
       Just use `batch.length`

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -58,8 +58,10 @@ inline Bitmap GetBitmap(const Datum& datum, int i) {
 // if the condition is null then output is null otherwise we take validity from the
 // selected argument
 // ie. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
-Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum& left_d,
-                           const Datum& right_d, ArrayData* output) {
+template <typename AllocateMem>

Review comment:
       nit: maybe AllocateNullBitmap?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -36,7 +36,7 @@ namespace {
 constexpr uint64_t kAllNull = 0;
 constexpr uint64_t kAllValid = ~kAllNull;
 
-util::optional<uint64_t> GetConstantValidityWord(const Datum& data) {

Review comment:
       AIUI, they don't need to be declared static if inside an anonymous namespace?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());
+    if (!cond.is_valid) {
+      // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+      return Status::OK();
+    }
+
+    const auto& valid_data = cond.value ? left : right;
+    if (valid_data.is_array()) {
+      *out = valid_data;
+    } else {
+      // valid data is a scalar that needs to be broadcasted
+      ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+                                                      ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively

Review comment:
       Is this sufficient? For `IfElse([true, false], ['asdf', ''], ['', 'asdf'])` this will allocate only a length 4 buffer but a length 8 buffer is needed.

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -78,19 +80,37 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
   // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
   // In the following cases, we dont need to allocate out_valid bitmap
 
-  // if cond & left & right all ones, then output is all valid. output validity buffer
-  // is already allocated, hence set all bits
+  // if cond & left & right all ones, then output is all valid.
+  // if output validity buffer is already allocated (NullHandling::
+  // COMPUTED_PREALLOCATE) -> set all bits
+  // else, return nullptr
   if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) {
-    BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset,
-                       output->length);
+    if (AllocateMem::value) {
+      output->buffers[0] = nullptr;
+    } else {  // NullHandling::COMPUTED_NO_PREALLOCATE

Review comment:
       These comments contradict each other?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());
+    if (!cond.is_valid) {
+      // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+      return Status::OK();
+    }
+
+    const auto& valid_data = cond.value ? left : right;
+    if (valid_data.is_array()) {
+      *out = valid_data;
+    } else {
+      // valid data is a scalar that needs to be broadcasted
+      ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+                                                      ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively
+    int64_t data_buff_alloc = std::max(left_offsets[left.length] - left_offsets[0],
+                                       right_offsets[right.length] - right_offsets[0]);
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> out_data_buf,
+                          ctx->Allocate(data_buff_alloc));
+    uint8_t* out_data = out_data_buf->mutable_data();
+
+    RunIfElseLoop(
+        cond,
+        [&](int64_t offset, int64_t length) {  // from left bulk

Review comment:
       minor nit but I wonder if the compiler could generate similar (enough) code from a
   
   ```
   struct CopyBinaryBulk {
       const offset_type* offsets;
       const uint8_t* values;
       offset_type* out_offsets;
       uint8_t* out_values;
   
       operator()(...) { ... }
   };
   ```
   
   so you don't have to copy-paste these everywhere

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -274,18 +310,129 @@ static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
 }
 
 template <typename HandleBulk, typename HandleEach>
-static void RunIfElseLoopInverted(const ArrayData& cond, HandleBulk handle_bulk,
-                                  HandleEach handle_each) {
-  return RunIfElseLoop<HandleBulk, HandleEach, true>(cond, handle_bulk, handle_each);
+static void RunIfElseLoopInverted(const ArrayData& cond, const HandleBulk& handle_bulk,
+                                  const HandleEach& handle_each) {
+  RunIfElseLoop<HandleBulk, HandleEach, true>(cond, handle_bulk, handle_each);
+}
+
+/// Runs the main if_else loop.
+///
+/// `HandleBulk` has the signature:
+///     [](int64_t offset, int64_t length){...}
+/// It should copy `length` number of elements from source array to output array with
+/// `offset` offset in both arrays
+///
+/// `HandleEach` has the signature:
+///     [](int64_t offset){...}
+/// It should copy single element from source array to output array with `offset`
+/// offset in both arrays
+template <typename HandleBulkLeft, typename HandleBulkRight, typename HandleEachLeft,
+          typename HandleEachRight, bool invert = false>
+static void RunIfElseLoop(const ArrayData& cond, const HandleBulkLeft& handle_bulk_left,
+                          const HandleBulkRight& handle_bulk_right,
+                          const HandleEachLeft& handle_each_left,
+                          const HandleEachRight& handle_each_right) {
+  int64_t offset = 0;
+  const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
+
+  // There are multiple options for this one. Ex: BitBlockCounter, BitmapWordReader,
+  // BitRunReader, etc. BitRunReader would be efficient for longer contiguous values in
+  // the cond data buffer.
+  // BitmapWordReader was slightly better performant that BitBlockCounter.
+  BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
+
+  int64_t cnt = cond_reader.words();
+  while (cnt--) {
+    Word word = cond_reader.NextWord();
+    if (invert) {
+      if (word == UINT64_MAX) {
+        handle_bulk_right(offset, word_len);
+      } else if (word == 0) {
+        handle_bulk_left(offset, word_len);
+      } else {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (!BitUtil::GetBit(cond_data, cond.offset + offset + i)) {
+            handle_each_right(offset + i);
+          } else {
+            handle_each_left(offset + i);
+          }
+        }
+      }
+    } else {
+      if (word == UINT64_MAX) {
+        handle_bulk_left(offset, word_len);
+      } else if (word == 0) {
+        handle_bulk_right(offset, word_len);
+      } else {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (BitUtil::GetBit(cond_data, cond.offset + offset + i)) {
+            handle_each_left(offset + i);
+          } else {
+            handle_each_right(offset + i);
+          }
+        }
+      }
+    }
+    offset += word_len;
+  }
+
+  cnt = cond_reader.trailing_bytes();
+  while (cnt--) {
+    int valid_bits;
+    uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+    if (invert) {
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        handle_bulk_right(offset, 8);
+      } else if (byte == 0 && valid_bits == 8) {
+        handle_bulk_left(offset, 8);
+      } else {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (!BitUtil::GetBit(cond_data, cond.offset + offset + i)) {
+            handle_each_right(offset + i);
+          } else {
+            handle_each_left(offset + i);
+          }
+        }
+      }
+    } else {
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        handle_bulk_left(offset, 8);
+      } else if (byte == 0 && valid_bits == 8) {
+        handle_bulk_right(offset, 8);
+      } else {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (BitUtil::GetBit(cond_data, cond.offset + offset + i)) {
+            handle_each_left(offset + i);
+          } else {
+            handle_each_right(offset + i);
+          }
+        }
+      }
+    }
+    offset += 8;  // doesn't necessarily have to be valid_bits here. Because it
+    // valid_bits < 8, then the loop will exit

Review comment:
       this is a little confusing but the intent is, valid_bits < 8 => cnt == 0?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());
+    if (!cond.is_valid) {
+      // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+      return Status::OK();
+    }
+
+    const auto& valid_data = cond.value ? left : right;
+    if (valid_data.is_array()) {
+      *out = valid_data;
+    } else {
+      // valid data is a scalar that needs to be broadcasted
+      ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+                                                      ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively

Review comment:
       The 'most conservative' would be to allocate the sum of the value lengths.

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -316,5 +313,98 @@ TEST_F(TestIfElseKernel, IfElseDispatchBest) {
   CheckDispatchBest(name, {null(), uint8(), int8()}, {boolean(), int16(), int16()});
 }
 
+template <typename Type>
+class TestIfElseBaseBinary : public ::testing::Test {};
+
+TYPED_TEST_SUITE(TestIfElseBaseBinary, BinaryTypes);
+
+TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinary) {

Review comment:
       IMO these cases could be consolidated into one or two checks since there are only 12 cases here (or 5 if you want to prune: null cond, true cond with null/non-null left array, false cond with null/non-null right array).

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
##########
@@ -48,13 +48,13 @@ static void IfElseBench(benchmark::State& state) {
     ABORT_NOT_OK(IfElse(cond->Slice(offset), left->Slice(offset), right->Slice(offset)));
   }
 
-  state.SetBytesProcessed(state.iterations() *

Review comment:
       You could use BaseBinaryArray::values_length() if you want to write a specialized case.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r659863256



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -353,6 +353,332 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array
+  // S - Scalar
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively
+    auto data_buff_alloc =
+        static_cast<int64_t>((left_offsets[left.length] - left_offsets[0]) +
+                             (right_offsets[right.length] - right_offsets[0]));
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> out_data_buf,
+                          ctx->Allocate(data_buff_alloc));
+    uint8_t* out_data = out_data_buf->mutable_data();
+
+    int64_t offset = cond.offset;
+    OffsetType total_bytes_written = 0;
+    while (offset < cond.offset + cond.length) {

Review comment:
       It seems like this is copying data even if the output would be null. That doesn't sound like a good idea (if the filter bitmap has 99% nulls, you want those 99% null string slots in the output to take 0 data space).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663215493



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -274,18 +310,129 @@ static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
 }
 
 template <typename HandleBulk, typename HandleEach>
-static void RunIfElseLoopInverted(const ArrayData& cond, HandleBulk handle_bulk,
-                                  HandleEach handle_each) {
-  return RunIfElseLoop<HandleBulk, HandleEach, true>(cond, handle_bulk, handle_each);
+static void RunIfElseLoopInverted(const ArrayData& cond, const HandleBulk& handle_bulk,
+                                  const HandleEach& handle_each) {
+  RunIfElseLoop<HandleBulk, HandleEach, true>(cond, handle_bulk, handle_each);
+}
+
+/// Runs the main if_else loop.
+///
+/// `HandleBulk` has the signature:
+///     [](int64_t offset, int64_t length){...}
+/// It should copy `length` number of elements from source array to output array with
+/// `offset` offset in both arrays
+///
+/// `HandleEach` has the signature:
+///     [](int64_t offset){...}
+/// It should copy single element from source array to output array with `offset`
+/// offset in both arrays
+template <typename HandleBulkLeft, typename HandleBulkRight, typename HandleEachLeft,
+          typename HandleEachRight, bool invert = false>
+static void RunIfElseLoop(const ArrayData& cond, const HandleBulkLeft& handle_bulk_left,
+                          const HandleBulkRight& handle_bulk_right,
+                          const HandleEachLeft& handle_each_left,
+                          const HandleEachRight& handle_each_right) {
+  int64_t offset = 0;
+  const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
+
+  // There are multiple options for this one. Ex: BitBlockCounter, BitmapWordReader,
+  // BitRunReader, etc. BitRunReader would be efficient for longer contiguous values in
+  // the cond data buffer.
+  // BitmapWordReader was slightly better performant that BitBlockCounter.
+  BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
+
+  int64_t cnt = cond_reader.words();
+  while (cnt--) {
+    Word word = cond_reader.NextWord();
+    if (invert) {
+      if (word == UINT64_MAX) {
+        handle_bulk_right(offset, word_len);
+      } else if (word == 0) {
+        handle_bulk_left(offset, word_len);
+      } else {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (!BitUtil::GetBit(cond_data, cond.offset + offset + i)) {
+            handle_each_right(offset + i);
+          } else {
+            handle_each_left(offset + i);
+          }
+        }
+      }
+    } else {
+      if (word == UINT64_MAX) {
+        handle_bulk_left(offset, word_len);
+      } else if (word == 0) {
+        handle_bulk_right(offset, word_len);
+      } else {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (BitUtil::GetBit(cond_data, cond.offset + offset + i)) {
+            handle_each_left(offset + i);
+          } else {
+            handle_each_right(offset + i);
+          }
+        }
+      }
+    }
+    offset += word_len;
+  }
+
+  cnt = cond_reader.trailing_bytes();
+  while (cnt--) {
+    int valid_bits;
+    uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+    if (invert) {
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        handle_bulk_right(offset, 8);
+      } else if (byte == 0 && valid_bits == 8) {
+        handle_bulk_left(offset, 8);
+      } else {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (!BitUtil::GetBit(cond_data, cond.offset + offset + i)) {
+            handle_each_right(offset + i);
+          } else {
+            handle_each_left(offset + i);
+          }
+        }
+      }
+    } else {
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        handle_bulk_left(offset, 8);
+      } else if (byte == 0 && valid_bits == 8) {
+        handle_bulk_right(offset, 8);
+      } else {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (BitUtil::GetBit(cond_data, cond.offset + offset + i)) {
+            handle_each_left(offset + i);
+          } else {
+            handle_each_right(offset + i);
+          }
+        }
+      }
+    }
+    offset += 8;  // doesn't necessarily have to be valid_bits here. Because it
+    // valid_bits < 8, then the loop will exit

Review comment:
       No, this is only done for the bytes at the end. So, we had a 100 lengthed bitmap. Then, first 64 would fall into the `Word` block. Rest 36, would be handled byte-by-byte (8 + 8+ 8+ 8 + 4). So, `cnt=5`. When `valid_bits< 8` that means the `while` loop end on the next iteration. And `offset` value is not used beyond the loop. So, we can just keep on incrementing by 8. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663218611



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());

Review comment:
       `batch` is not available at this point :slightly_smiling_face: 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
lidavidm commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663273941



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -201,70 +226,61 @@ static constexpr int64_t word_len = sizeof(Word) * 8;
 /// Runs the main if_else loop. Here, it is expected that the right data has already
 /// been copied to the output.
 /// If `invert` is meant to invert the cond.data. If is set to `true`, then the
-/// buffer will be inverted before calling the handle_bulk or handle_each functions.
+/// buffer will be inverted before calling the handle_block or handle_each functions.
 /// This is useful, when left is an array and right is scalar. Then rather than
 /// copying data from the right to output, we can copy left data to the output and
 /// invert the cond data to fill right values. Filling out with a scalar is presumed to
 /// be more efficient than filling with an array
-template <typename HandleBulk, typename HandleEach, bool invert = false>
-static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
-                          HandleEach handle_each) {
+///
+/// `HandleBlock` has the signature:
+///     [](int64_t offset, int64_t length){...}
+/// It should copy `length` number of elements from source array to output array with
+/// `offset` offset in both arrays
+template <typename HandleBlock, bool invert = false>
+void RunIfElseLoop(const ArrayData& cond, const HandleBlock& handle_block) {
   int64_t data_offset = 0;
   int64_t bit_offset = cond.offset;
   const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
 
   BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
 
+  constexpr Word pickAll = invert ? 0 : UINT64_MAX;
+  constexpr Word pickNone = ~pickAll;
+
   int64_t cnt = cond_reader.words();
   while (cnt--) {
     Word word = cond_reader.NextWord();
-    if (invert) {
-      if (word == 0) {
-        handle_bulk(data_offset, word_len);
-      } else if (word != UINT64_MAX) {
-        for (int64_t i = 0; i < word_len; ++i) {
-          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
-            handle_each(data_offset + i);
-          }
-        }
-      }
-    } else {
-      if (word == UINT64_MAX) {
-        handle_bulk(data_offset, word_len);
-      } else if (word) {
-        for (int64_t i = 0; i < word_len; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            handle_each(data_offset + i);
-          }
+
+    // todo: check if this passes MSVC

Review comment:
       Seems like it does.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
lidavidm commented on pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#issuecomment-873158635


   N.B. it looks like this segfaults in the tests on Windows MSVC 2019 (https://github.com/apache/arrow/pull/10538/checks?check_run_id=2966481230#step:8:339) and there are various errors on Clang, MSVC, and MinGW (mostly related to implicit casts or needing to explicitly parameterize std::max).


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm closed pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
lidavidm closed pull request #10538:
URL: https://github.com/apache/arrow/pull/10538


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r670798383



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -893,6 +1031,21 @@ void AddBinaryIfElseKernels(const std::shared_ptr<IfElseFunction>& scalar_functi
   }
 }
 
+void AddFSBinaryIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+  // cond array needs to be boolean always
+  ScalarKernel kernel(
+      {boolean(), InputType(Type::FIXED_SIZE_BINARY), InputType(Type::FIXED_SIZE_BINARY)},
+      OutputType([](KernelContext*, const std::vector<ValueDescr>& descrs) {
+        return ValueDescr(descrs[1].type, ValueDescr::ANY);
+      }),

Review comment:
       I think this needs to be checked! thanks 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663241697



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -316,5 +313,98 @@ TEST_F(TestIfElseKernel, IfElseDispatchBest) {
   CheckDispatchBest(name, {null(), uint8(), int8()}, {boolean(), int16(), int16()});
 }
 
+template <typename Type>
+class TestIfElseBaseBinary : public ::testing::Test {};
+
+TYPED_TEST_SUITE(TestIfElseBaseBinary, BinaryTypes);
+
+TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinary) {

Review comment:
       `CheckWithDifferentShapes` actually checks for all those cases + with offsets. But I agree, in `CheckWithDifferentShapes` it could check similar cases redundantly in several iterations. But we decided to leave it as it is. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
lidavidm commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663193637



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -443,16 +451,217 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
 
     // selectively copy values from left data
     T left_data = internal::UnboxScalar<Type>::Unbox(left);
-    RunIfElseLoop(
-        cond,
-        [&](int64_t data_offset, int64_t num_elems) {
-          std::fill(out_values + data_offset, out_values + data_offset + num_elems,
-                    left_data);
+    RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+      std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+                left_data);
+    });
+
+    return Status::OK();
+  }
+};
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());
+    if (!cond.is_valid) {
+      // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+      return Status::OK();
+    }
+
+    const auto& valid_data = cond.value ? left : right;
+    if (valid_data.is_array()) {
+      *out = valid_data;
+    } else {
+      // valid data is a scalar that needs to be broadcasted
+      ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+                                                      ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // allocate data buffer conservatively
+    int64_t data_buff_alloc = left_offsets[left.length] - left_offsets[0] +
+                              right_offsets[right.length] - right_offsets[0];
+
+    BuilderType builder(ctx->memory_pool());
+    ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+    ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+    RunLoop(
+        cond, *out,
+        [&](int64_t i) {
+          builder.UnsafeAppend(left_data + left_offsets[i],
+                               left_offsets[i + 1] - left_offsets[i]);
+        },
+        [&](int64_t i) {
+          builder.UnsafeAppend(right_data + right_offsets[i],
+                               right_offsets[i + 1] - right_offsets[i]);
+        },
+        [&]() { builder.AppendNull(); });
+    ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+    out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+    out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+    out->buffers[2] = std::move(out_arr->data()->buffers[2]);

Review comment:
       nit: maybe also copy over null_count?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r662606715



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());
+    if (!cond.is_valid) {
+      // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+      return Status::OK();
+    }
+
+    const auto& valid_data = cond.value ? left : right;
+    if (valid_data.is_array()) {
+      *out = valid_data;
+    } else {
+      // valid data is a scalar that needs to be broadcasted
+      ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+                                                      ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively

Review comment:
       Ah! I actually had sum previously and I changed it to max! :disappointed: Thank you for catching this!!!




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
lidavidm commented on pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#issuecomment-880869305


   Looks like this also needs to be rebased now. But once that's done and the nits above are addressed I think this can be merged.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r659863256



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -353,6 +353,332 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array
+  // S - Scalar
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively
+    auto data_buff_alloc =
+        static_cast<int64_t>((left_offsets[left.length] - left_offsets[0]) +
+                             (right_offsets[right.length] - right_offsets[0]));
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> out_data_buf,
+                          ctx->Allocate(data_buff_alloc));
+    uint8_t* out_data = out_data_buf->mutable_data();
+
+    int64_t offset = cond.offset;
+    OffsetType total_bytes_written = 0;
+    while (offset < cond.offset + cond.length) {

Review comment:
       It seems like this is copying data even if the output would be null. That doesn't sound like a good idea (if the filter bitmap has 99% nulls, you want those 99% null string slots in the output to take 0 data space).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
lidavidm commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r668693928



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -770,6 +770,144 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_fixed_size_binary<Type>> {
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    auto byte_width =
+        std::static_pointer_cast<FixedSizeBinaryType>(left.type())->byte_width();

Review comment:
       maybe `checked_cast<const FixedSizeBinaryType&>(*left.type())` or `checked_pointer_cast<FixedSizeBinaryType>(left.type())` (will be dynamic cast in debug, static cast in release)

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -893,6 +1031,21 @@ void AddBinaryIfElseKernels(const std::shared_ptr<IfElseFunction>& scalar_functi
   }
 }
 
+void AddFSBinaryIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+  // cond array needs to be boolean always
+  ScalarKernel kernel(
+      {boolean(), InputType(Type::FIXED_SIZE_BINARY), InputType(Type::FIXED_SIZE_BINARY)},
+      OutputType([](KernelContext*, const std::vector<ValueDescr>& descrs) {
+        return ValueDescr(descrs[1].type, ValueDescr::ANY);
+      }),

Review comment:
       Hmm, do we need to check that the two branches have the same type width?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -770,6 +770,144 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_fixed_size_binary<Type>> {
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    auto byte_width =
+        std::static_pointer_cast<FixedSizeBinaryType>(left.type())->byte_width();
+    return RunIfElseScalar(
+        cond, left, right, out,
+        /*CopyArrayData*/
+        [&](const ArrayData& valid_array, ArrayData* out_array) {
+          std::memcpy(
+              out_array->buffers[1]->mutable_data() + out_array->offset * byte_width,
+              valid_array.buffers[1]->mutable_data() + valid_array.offset * byte_width,

Review comment:
       nit: just `data()` here?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#issuecomment-862046118


   https://issues.apache.org/jira/browse/ARROW-12955


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663116352



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());
+    if (!cond.is_valid) {
+      // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+      return Status::OK();
+    }
+
+    const auto& valid_data = cond.value ? left : right;
+    if (valid_data.is_array()) {
+      *out = valid_data;
+    } else {
+      // valid data is a scalar that needs to be broadcasted
+      ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+                                                      ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively

Review comment:
       @lidavidm I tried with the builders. There's some perf penalty but the code is much more readable and manageable. So, I'll change binary ones to that. I simplified the code a lot TBH. Let me push that changes 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663240445



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -316,5 +313,98 @@ TEST_F(TestIfElseKernel, IfElseDispatchBest) {
   CheckDispatchBest(name, {null(), uint8(), int8()}, {boolean(), int16(), int16()});
 }
 
+template <typename Type>
+class TestIfElseBaseBinary : public ::testing::Test {};
+
+TYPED_TEST_SUITE(TestIfElseBaseBinary, BinaryTypes);
+
+TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinary) {

Review comment:
       well, there are 64 cases TBH. Each param can be null/ non-null and scalar/ array (4 variations). Since we have 3 params we have 4x4x4 = 64 permutations. :slightly_smiling_face: 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663113754



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -78,19 +80,37 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
   // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
   // In the following cases, we dont need to allocate out_valid bitmap
 
-  // if cond & left & right all ones, then output is all valid. output validity buffer
-  // is already allocated, hence set all bits
+  // if cond & left & right all ones, then output is all valid.
+  // if output validity buffer is already allocated (NullHandling::
+  // COMPUTED_PREALLOCATE) -> set all bits
+  // else, return nullptr
   if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) {
-    BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset,
-                       output->length);
+    if (AllocateMem::value) {
+      output->buffers[0] = nullptr;
+    } else {  // NullHandling::COMPUTED_NO_PREALLOCATE

Review comment:
       yes it does. My bad! 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663240445



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -316,5 +313,98 @@ TEST_F(TestIfElseKernel, IfElseDispatchBest) {
   CheckDispatchBest(name, {null(), uint8(), int8()}, {boolean(), int16(), int16()});
 }
 
+template <typename Type>
+class TestIfElseBaseBinary : public ::testing::Test {};
+
+TYPED_TEST_SUITE(TestIfElseBaseBinary, BinaryTypes);
+
+TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinary) {

Review comment:
       well, there are 64 cases TBH. Each param can be null/ non-null and scalar/ array (4 variations). Since we have 3 params we have 4*4*4 = 64 permutations. :slightly_smiling_face: 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
lidavidm commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r663219563



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -455,6 +602,302 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    if (left.is_scalar() && right.is_scalar()) {
+      if (cond.is_valid) {
+        *out = cond.value ? left.scalar() : right.scalar();
+      } else {
+        *out = MakeNullScalar(left.type());
+      }
+      return Status::OK();
+    }
+    // either left or right is an array. Output is always an array
+    int64_t out_arr_len = std::max(left.length(), right.length());

Review comment:
       Oh whoops.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#issuecomment-872519526


   @pitrou @lidavidm I made some changes to the code. As Antoine pointed out previously, the null values in the output string array would have non-empty slots in the data buffer. I think it requires a small fix. I'll add that here as well. But I'd like to get your feedback on the code flow. 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r660103166



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -353,6 +353,332 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array
+  // S - Scalar
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively
+    auto data_buff_alloc =
+        static_cast<int64_t>((left_offsets[left.length] - left_offsets[0]) +
+                             (right_offsets[right.length] - right_offsets[0]));
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> out_data_buf,
+                          ctx->Allocate(data_buff_alloc));
+    uint8_t* out_data = out_data_buf->mutable_data();
+
+    int64_t offset = cond.offset;
+    OffsetType total_bytes_written = 0;
+    while (offset < cond.offset + cond.length) {

Review comment:
       Agreed. There are some improvements I am planning to add to this. I'll ping you, once I have pushed them. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nirandaperera commented on a change in pull request #10538: ARROW-12955: [C++] Add additional type support for if_else kernel

Posted by GitBox <gi...@apache.org>.
nirandaperera commented on a change in pull request #10538:
URL: https://github.com/apache/arrow/pull/10538#discussion_r660103166



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -353,6 +353,332 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   }
 };
 
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+  using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // A - Array
+  // S - Scalar
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    const uint8_t* cond_data = cond.buffers[1]->data();
+    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+
+    const auto* left_offsets = left.GetValues<OffsetType>(1);
+    const uint8_t* left_data = left.buffers[2]->data();
+    const auto* right_offsets = right.GetValues<OffsetType>(1);
+    const uint8_t* right_data = right.buffers[2]->data();
+
+    // reserve an additional space
+    ARROW_ASSIGN_OR_RAISE(auto out_offset_buf,
+                          ctx->Allocate((cond.length + 1) * sizeof(OffsetType)));
+    auto* out_offsets = reinterpret_cast<OffsetType*>(out_offset_buf->mutable_data());
+    out_offsets[0] = 0;
+
+    // allocate data buffer conservatively
+    auto data_buff_alloc =
+        static_cast<int64_t>((left_offsets[left.length] - left_offsets[0]) +
+                             (right_offsets[right.length] - right_offsets[0]));
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> out_data_buf,
+                          ctx->Allocate(data_buff_alloc));
+    uint8_t* out_data = out_data_buf->mutable_data();
+
+    int64_t offset = cond.offset;
+    OffsetType total_bytes_written = 0;
+    while (offset < cond.offset + cond.length) {

Review comment:
       Agreed. There are some improvements I am planning to add to this. I'll ping you, once I have pushed them. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org