You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/01 15:52:34 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #8294: ARROW-1846: [C++][Compute] Implement "any" reduction kernel for boolean data

pitrou commented on a change in pull request #8294:
URL: https://github.com/apache/arrow/pull/8294#discussion_r498344896



##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -549,6 +549,28 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct BooleanAnyImpl : public MinMaxImpl<BooleanType, SimdLevel> {

Review comment:
       I don't really understand why this is inheriting from MinMax. Does it help reduce the code size in any way?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -726,6 +726,57 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) {
   }
 }
 
+
+//
+// Any
+//
+
+class TestPrimitiveAnyKernel : public ::testing::Test {
+ public:
+  void AssertAnyIs(const Datum& array, bool expected) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Any(array));
+    const BooleanScalar& out_any = out.scalar_as<BooleanScalar>();
+    const auto expected_any = static_cast<const BooleanScalar>(expected);
+    ASSERT_EQ(out_any, expected_any);
+  }
+
+  void AssertAnyIs(const std::string& json, bool expected) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  void AssertAnyIs(const std::vector<std::string>& json, bool expected) {
+    auto array = ChunkedArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  std::shared_ptr<DataType> type_singleton() {
+    return TypeTraits<BooleanType>::type_singleton();
+  }
+};
+
+class TestAnyKernel : public TestPrimitiveAnyKernel {};
+
+TEST_F(TestAnyKernel, Basics) {
+  std::vector<std::string> chunked_input0 = {"[]", "[true]"};
+  std::vector<std::string> chunked_input1 = {"[true, true, null]", "[true, null]"};
+  std::vector<std::string> chunked_input2 = {"[false, false, false]", "[false]"};
+  std::vector<std::string> chunked_input3 = {"[false, null]", "[null, false]"};
+
+  this->AssertAnyIs("[false]", false);

Review comment:
       Please also test with an empty array.

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -549,6 +549,28 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct BooleanAnyImpl : public MinMaxImpl<BooleanType, SimdLevel> {
+  using MinMaxImpl<BooleanType, SimdLevel>::MinMaxImpl;
+
+  void Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if seen a True already
+    if (this->state.max == true) {
+      return;
+    }
+
+    BooleanArray arr(batch[0].array());
+    const auto true_count = arr.true_count();

Review comment:
       This forces counting the set bits in the whole array, but we're only interested in the presence of a single set bit, so we should be able to shortcut much more aggressively.
   
   You may try to use `OptionalBinaryBitBlockCounter` for that. Untested:
   ```c++
   const auto& data = *batch[0].array();
   OptionalBinaryBitBlockCounter counter(data.buffers[0], data.offset, data.buffers[1], data.offset, data.length);
   int64_t position = 0;
   while (position < data.length) {
     const auto block = counter.NextAndBlock();
     if (block.popcount > 0) {
       this->state.max = true;
       break;
     }
     position += block.length;
   }
   ```
   }
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org