You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/13 16:39:29 UTC

[GitHub] [arrow] arw2019 commented on a change in pull request #8294: ARROW-1846: [C++][Compute] Implement "any" reduction kernel for boolean data

arw2019 commented on a change in pull request #8294:
URL: https://github.com/apache/arrow/pull/8294#discussion_r504080604



##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -584,6 +614,33 @@ struct MinMaxInitState {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct AnyInitState {
+  std::unique_ptr<KernelState> state;
+  KernelContext* ctx;
+  const DataType& in_type;
+  const std::shared_ptr<DataType>& out_type;
+  const MinMaxOptions& options;
+
+  AnyInitState(KernelContext* ctx, const DataType& in_type,
+               const std::shared_ptr<DataType>& out_type, const MinMaxOptions& options)
+      : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {}
+
+  Status Visit(const DataType&) {
+    return Status::NotImplemented("No any kernel implemented");
+  }
+
+  Status Visit(const BooleanType&) {
+    state.reset(new BooleanAnyImpl<SimdLevel>(out_type, options));
+    return Status::OK();
+  }
+
+  std::unique_ptr<KernelState> Create() {
+    ctx->SetStatus(VisitTypeInline(in_type, this));
+    return std::move(state);
+  }
+};

Review comment:
       It's not. I've simplified it

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -549,6 +549,28 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct BooleanAnyImpl : public MinMaxImpl<BooleanType, SimdLevel> {
+  using MinMaxImpl<BooleanType, SimdLevel>::MinMaxImpl;
+
+  void Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if seen a True already
+    if (this->state.max == true) {
+      return;
+    }
+
+    BooleanArray arr(batch[0].array());
+    const auto true_count = arr.true_count();

Review comment:
       This worked pretty much straight away. Thanks @pitrou!!!

##########
File path: cpp/src/arrow/compute/api_aggregate.h
##########
@@ -142,6 +142,20 @@ Result<Datum> MinMax(const Datum& value,
                      const MinMaxOptions& options = MinMaxOptions::Defaults(),
                      ExecContext* ctx = NULLPTR);
 
+/// \brief Test whether any element in a boolean array evaluates to true.
+///
+/// This function returns true if any of the elements in the array evaluates
+/// to true and false otherwise. Null values are taken to evaluate to false.

Review comment:
       I'm not sure about the desired behavior either (although if it was up to me I would want to shoot for consistency with existing kernels).
   
   That said I may need to improve the phrasing in the docstring. I think he current kernel behavior is what you describe: we skip nulls and return whether we saw any `True` values so perhaps it's better to just say that. I *think* that as is treating `null` as `false` or skipping is the same in this case, since neither evaluate to true.
   
   A bit off-topic, but for an `all` kernel (which could be nice to have) I think we'd want to have null handling options, so that users could switch between
   ```
   any([true, null]) = true # skip nulls
   any([true, null]) = false # null evaluates as false
   any([true, null]) = false # kleene logic (I think?)
   ```
   (PS Apologies for  not replying to your comment directly - I have opened ARROW-10291 to track that discussion )

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -549,6 +549,28 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct BooleanAnyImpl : public MinMaxImpl<BooleanType, SimdLevel> {

Review comment:
       It doesn't. I've rewritten it so it inherits from `ScalarAggregator` directly
   
   Also since it's no longer a template I moved it to `aggregate_basic.cc`

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -726,6 +726,57 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) {
   }
 }
 
+
+//
+// Any
+//
+
+class TestPrimitiveAnyKernel : public ::testing::Test {
+ public:
+  void AssertAnyIs(const Datum& array, bool expected) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Any(array));
+    const BooleanScalar& out_any = out.scalar_as<BooleanScalar>();
+    const auto expected_any = static_cast<const BooleanScalar>(expected);
+    ASSERT_EQ(out_any, expected_any);
+  }
+
+  void AssertAnyIs(const std::string& json, bool expected) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  void AssertAnyIs(const std::vector<std::string>& json, bool expected) {
+    auto array = ChunkedArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  std::shared_ptr<DataType> type_singleton() {
+    return TypeTraits<BooleanType>::type_singleton();
+  }
+};
+
+class TestAnyKernel : public TestPrimitiveAnyKernel {};
+
+TEST_F(TestAnyKernel, Basics) {
+  std::vector<std::string> chunked_input0 = {"[]", "[true]"};
+  std::vector<std::string> chunked_input1 = {"[true, true, null]", "[true, null]"};
+  std::vector<std::string> chunked_input2 = {"[false, false, false]", "[false]"};
+  std::vector<std::string> chunked_input3 = {"[false, null]", "[null, false]"};
+
+  this->AssertAnyIs("[false]", false);

Review comment:
       Done




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org