You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/07/27 09:16:20 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #10642: ARROW-13220: [C++] Implement 'choose' function

pitrou commented on a change in pull request #10642:
URL: https://github.com/apache/arrow/pull/10642#discussion_r677253863



##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -1182,6 +1182,20 @@ void CopyOneArrayValue(const DataType& type, const uint8_t* in_valid,
                                   out_offset);
 }
 
+template <typename Type>
+void CopyOneValue(const Datum& in_values, const int64_t in_offset, uint8_t* out_valid,
+                  uint8_t* out_values, const int64_t out_offset) {
+  if (in_values.is_array()) {
+    const ArrayData& array = *in_values.array();
+    CopyOneArrayValue<Type>(*array.type, array.GetValues<uint8_t>(0, 0),
+                            array.GetValues<uint8_t>(1, 0), array.offset + in_offset,
+                            out_valid, out_values, out_offset);
+  } else {
+    CopyValues<Type>(in_values, in_offset, /*length=*/1, out_valid, out_values,

Review comment:
       Should we factor out the scalar copy from `CopyValues<Type>` into a `CopyScalarValue<Type>`?

##########
File path: docs/source/cpp/compute.rst
##########
@@ -917,36 +919,43 @@ Structural transforms
   the first value datum for which the corresponding Boolean is true, or the
   corresponding value from the 'default' input, or null otherwise.
 
-* \(2) Each row of the output will be the corresponding value of the first
+* \(2) The first input must be an integral type. The rest of the arguments can be
+  any type, but must all be the same type or promotable to a common type. Each
+  value of the first input (the 'index') is used as a zero-based index into the
+  remaining arguments (i.e. index 0 is the second argument, index 1 is the third
+  argument, etc.), and the value of the output for that row will be the
+  corresponding value of the selected input at that row.

Review comment:
       Mention behaviour for null indices?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -1606,6 +1620,203 @@ struct CoalesceFunctor<Type, enable_if_base_binary<Type>> {
   }
 };
 
+template <typename Type>
+Status ExecScalarChoose(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  const auto& index_scalar = *batch[0].scalar();
+  if (!index_scalar.is_valid) {
+    if (out->is_array()) {
+      auto source = MakeNullScalar(out->type());
+      ArrayData* output = out->mutable_array();
+      CopyValues<Type>(source, /*row=*/0, batch.length,
+                       output->GetMutableValues<uint8_t>(0, /*absolute_offset=*/0),
+                       output->GetMutableValues<uint8_t>(1, /*absolute_offset=*/0),
+                       output->offset);
+    }
+    return Status::OK();
+  }
+  auto index = UnboxScalar<Int64Type>::Unbox(index_scalar);
+  if (index < 0 || static_cast<size_t>(index + 1) >= batch.values.size()) {
+    return Status::IndexError("choose: index ", index, " out of range");
+  }
+  auto source = batch.values[index + 1];
+  if (out->is_scalar()) {
+    *out = source;
+  } else {
+    ArrayData* output = out->mutable_array();
+    CopyValues<Type>(source, /*row=*/0, batch.length,
+                     output->GetMutableValues<uint8_t>(0, /*absolute_offset=*/0),
+                     output->GetMutableValues<uint8_t>(1, /*absolute_offset=*/0),
+                     output->offset);
+  }
+  return Status::OK();
+}
+
+template <typename Type>
+Status ExecArrayChoose(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  ArrayData* output = out->mutable_array();
+  const int64_t out_offset = output->offset;
+  // Need a null bitmap if any input has nulls
+  uint8_t* out_valid = nullptr;
+  if (std::any_of(batch.values.begin(), batch.values.end(),
+                  [](const Datum& d) { return d.null_count() > 0; })) {
+    out_valid = output->buffers[0]->mutable_data();
+  } else {
+    BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), out_offset, batch.length,
+                       true);
+  }
+  uint8_t* out_values = output->buffers[1]->mutable_data();
+  int64_t row = 0;
+  return VisitArrayValuesInline<Int64Type>(
+      *batch[0].array(),
+      [&](int64_t index) {
+        if (index < 0 || static_cast<size_t>(index + 1) >= batch.values.size()) {
+          return Status::IndexError("choose: index ", index, " out of range");
+        }
+        const auto& source = batch.values[index + 1];
+        CopyOneValue<Type>(source, row, out_valid, out_values, out_offset + row);
+        row++;
+        return Status::OK();
+      },
+      [&]() {
+        // Index is null, but we should still initialize the output with some value
+        const auto& source = batch.values[1];
+        CopyOneValue<Type>(source, row, out_valid, out_values, out_offset + row);
+        BitUtil::ClearBit(out_valid, out_offset + row);
+        row++;
+        return Status::OK();
+      });
+}
+
+template <typename Type, typename Enable = void>
+struct ChooseFunctor {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch.values[0].is_scalar()) {
+      return ExecScalarChoose<Type>(ctx, batch, out);
+    }
+    return ExecArrayChoose<Type>(ctx, batch, out);
+  }
+};
+
+template <>
+struct ChooseFunctor<NullType> {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    return Status::OK();
+  }
+};
+
+template <typename Type>
+struct ChooseFunctor<Type, enable_if_base_binary<Type>> {
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch.values[0].is_scalar()) {
+      const auto& index_scalar = *batch[0].scalar();
+      if (!index_scalar.is_valid) {
+        if (out->is_array()) {
+          auto null_scalar = MakeNullScalar(out->type());
+          ARROW_ASSIGN_OR_RAISE(
+              auto temp_array,
+              MakeArrayFromScalar(*null_scalar, batch.length, ctx->memory_pool()));

Review comment:
       Why not reuse `MakeArrayOfNull`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -1723,6 +1963,22 @@ void RegisterScalarIfElse(FunctionRegistry* registry) {
     }
     DCHECK_OK(registry->AddFunction(std::move(func)));
   }
+  {
+    auto func = std::make_shared<ChooseFunction>("choose", Arity::VarArgs(/*min_args=*/2),
+                                                 &choose_doc);
+    AddPrimitiveChooseKernels(func, NumericTypes());
+    AddPrimitiveChooseKernels(func, TemporalTypes());
+    AddPrimitiveChooseKernels(func,
+                              {boolean(), null(), day_time_interval(), month_interval()});
+    AddChooseKernel(func, Type::FIXED_SIZE_BINARY,
+                    ChooseFunctor<FixedSizeBinaryType>::Exec);
+    AddChooseKernel(func, Type::DECIMAL128, ChooseFunctor<Decimal128Type>::Exec);
+    AddChooseKernel(func, Type::DECIMAL256, ChooseFunctor<Decimal256Type>::Exec);

Review comment:
       Perhaps we want a `GenerateTypeAgnosticFixedSizeBinaryBase`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -1040,5 +1040,192 @@ TEST(TestCoalesce, FixedSizeBinary) {
               ArrayFromJSON(type, R"(["abc", "abc", "abc", "abc"])"));
 }
 
+template <typename Type>
+class TestChooseNumeric : public ::testing::Test {};
+template <typename Type>
+class TestChooseBinary : public ::testing::Test {};
+
+TYPED_TEST_SUITE(TestChooseNumeric, NumericBasedTypes);
+TYPED_TEST_SUITE(TestChooseBinary, BinaryTypes);
+
+TYPED_TEST(TestChooseNumeric, FixedSize) {
+  auto type = default_type_instance<TypeParam>();
+  auto indices1 = ArrayFromJSON(int64(), "[0, 1, 0, 1, null]");
+  auto values1 = ArrayFromJSON(type, "[10, 11, null, null, 14]");
+  auto values2 = ArrayFromJSON(type, "[20, 21, null, null, 24]");
+  auto nulls = ArrayFromJSON(type, "[null, null, null, null, null]");
+  CheckScalar("choose", {indices1, values1, values2},
+              ArrayFromJSON(type, "[10, 21, null, null, null]"));
+  // Mixed scalar and array (note CheckScalar checks all-scalar cases for us)
+  CheckScalar("choose", {ScalarFromJSON(int64(), "0"), values1, values2}, values1);
+  CheckScalar("choose", {ScalarFromJSON(int64(), "1"), values1, values2}, values2);
+  CheckScalar("choose", {ScalarFromJSON(int64(), "null"), values1, values2}, nulls);
+  auto scalar1 = ScalarFromJSON(type, "42");
+  CheckScalar("choose", {ScalarFromJSON(int64(), "0"), scalar1, values2},
+              *MakeArrayFromScalar(*scalar1, 5));
+  CheckScalar("choose", {ScalarFromJSON(int64(), "1"), scalar1, values2}, values2);
+  CheckScalar("choose", {ScalarFromJSON(int64(), "null"), values1, values2}, nulls);
+  auto scalar_null = ScalarFromJSON(type, "null");
+  CheckScalar("choose", {ScalarFromJSON(int64(), "0"), scalar_null, values2},
+              *MakeArrayOfNull(type, 5));

Review comment:
       Can you also test where the indices are an array but one of the values is a scalar?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -1723,6 +1963,22 @@ void RegisterScalarIfElse(FunctionRegistry* registry) {
     }
     DCHECK_OK(registry->AddFunction(std::move(func)));
   }
+  {
+    auto func = std::make_shared<ChooseFunction>("choose", Arity::VarArgs(/*min_args=*/2),
+                                                 &choose_doc);
+    AddPrimitiveChooseKernels(func, NumericTypes());
+    AddPrimitiveChooseKernels(func, TemporalTypes());
+    AddPrimitiveChooseKernels(func,
+                              {boolean(), null(), day_time_interval(), month_interval()});
+    AddChooseKernel(func, Type::FIXED_SIZE_BINARY,
+                    ChooseFunctor<FixedSizeBinaryType>::Exec);
+    AddChooseKernel(func, Type::DECIMAL128, ChooseFunctor<Decimal128Type>::Exec);
+    AddChooseKernel(func, Type::DECIMAL256, ChooseFunctor<Decimal256Type>::Exec);

Review comment:
       Can decimal types uses `ChooseFunctor<FixedSizeBinaryType>` instead?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
##########
@@ -312,5 +343,8 @@ BENCHMARK(CoalesceBench64)->Args({elems, 99});
 BENCHMARK(CoalesceNonNullBench64)->Args({elems, 0});
 BENCHMARK(CoalesceNonNullBench64)->Args({elems, 99});
 
+BENCHMARK(ChooseBench64)->Args({elems, 0});
+BENCHMARK(ChooseBench64)->Args({elems, 99});

Review comment:
       Since we are at it, can we rename `elems` to `kNumItems` or something similar?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -1723,6 +1963,22 @@ void RegisterScalarIfElse(FunctionRegistry* registry) {
     }
     DCHECK_OK(registry->AddFunction(std::move(func)));
   }
+  {
+    auto func = std::make_shared<ChooseFunction>("choose", Arity::VarArgs(/*min_args=*/2),
+                                                 &choose_doc);
+    AddPrimitiveChooseKernels(func, NumericTypes());
+    AddPrimitiveChooseKernels(func, TemporalTypes());
+    AddPrimitiveChooseKernels(func,
+                              {boolean(), null(), day_time_interval(), month_interval()});

Review comment:
       Should we have a `IntervalTypes()` helper?

##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -1606,6 +1620,203 @@ struct CoalesceFunctor<Type, enable_if_base_binary<Type>> {
   }
 };
 
+template <typename Type>
+Status ExecScalarChoose(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  const auto& index_scalar = *batch[0].scalar();
+  if (!index_scalar.is_valid) {
+    if (out->is_array()) {
+      auto source = MakeNullScalar(out->type());
+      ArrayData* output = out->mutable_array();
+      CopyValues<Type>(source, /*row=*/0, batch.length,
+                       output->GetMutableValues<uint8_t>(0, /*absolute_offset=*/0),
+                       output->GetMutableValues<uint8_t>(1, /*absolute_offset=*/0),
+                       output->offset);
+    }
+    return Status::OK();
+  }
+  auto index = UnboxScalar<Int64Type>::Unbox(index_scalar);
+  if (index < 0 || static_cast<size_t>(index + 1) >= batch.values.size()) {
+    return Status::IndexError("choose: index ", index, " out of range");
+  }
+  auto source = batch.values[index + 1];
+  if (out->is_scalar()) {
+    *out = source;
+  } else {
+    ArrayData* output = out->mutable_array();
+    CopyValues<Type>(source, /*row=*/0, batch.length,
+                     output->GetMutableValues<uint8_t>(0, /*absolute_offset=*/0),
+                     output->GetMutableValues<uint8_t>(1, /*absolute_offset=*/0),
+                     output->offset);
+  }
+  return Status::OK();
+}
+
+template <typename Type>
+Status ExecArrayChoose(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  ArrayData* output = out->mutable_array();
+  const int64_t out_offset = output->offset;
+  // Need a null bitmap if any input has nulls
+  uint8_t* out_valid = nullptr;
+  if (std::any_of(batch.values.begin(), batch.values.end(),
+                  [](const Datum& d) { return d.null_count() > 0; })) {
+    out_valid = output->buffers[0]->mutable_data();
+  } else {
+    BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), out_offset, batch.length,
+                       true);

Review comment:
       Also set `output->null_count` to 0?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org