You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/09/30 06:18:04 UTC

[GitHub] [arrow] arw2019 commented on a change in pull request #8294: ARROW-1846: [C++][Compute] Implement "any" reduction kernel for boolean data

arw2019 commented on a change in pull request #8294:
URL: https://github.com/apache/arrow/pull/8294#discussion_r497262235



##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -547,6 +547,36 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct BooleanAnyImpl : public MinMaxImpl<BooleanType, SimdLevel> {
+  using StateType = MinMaxState<BooleanType, SimdLevel>;
+  using ArrayType = typename TypeTraits<BooleanType>::ArrayType;
+  using MinMaxImpl<BooleanType, SimdLevel>::MinMaxImpl;
+
+  void Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if seen a True already
+    if (this->state.max == true) {
+      return;
+    }
+
+    ArrayType arr(batch[0].array());
+    const auto true_count = arr.true_count();
+    if (true_count > 0) {
+      this->state.max = true;
+    }
+  }
+
+  void Finalize(KernelContext*, Datum* out) override {
+    using ScalarType = typename TypeTraits<BooleanType>::ScalarType;

Review comment:
       Switched to that here (and in other places)

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -547,6 +547,36 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct BooleanAnyImpl : public MinMaxImpl<BooleanType, SimdLevel> {
+  using StateType = MinMaxState<BooleanType, SimdLevel>;
+  using ArrayType = typename TypeTraits<BooleanType>::ArrayType;
+  using MinMaxImpl<BooleanType, SimdLevel>::MinMaxImpl;
+
+  void Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if seen a True already
+    if (this->state.max == true) {
+      return;
+    }
+
+    ArrayType arr(batch[0].array());

Review comment:
       Switched to that here (and in other places)

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -726,6 +726,109 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) {
   }
 }
 
+//
+// Any
+//
+
+class TestPrimitiveAnyKernel : public ::testing::Test {
+  using Traits = TypeTraits<BooleanType>;
+  using ArrayType = typename Traits::ArrayType;
+  using c_type = typename BooleanType::c_type;
+  using ScalarType = typename Traits::ScalarType;

Review comment:
       Got rid of all these

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -547,6 +547,36 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct BooleanAnyImpl : public MinMaxImpl<BooleanType, SimdLevel> {
+  using StateType = MinMaxState<BooleanType, SimdLevel>;
+  using ArrayType = typename TypeTraits<BooleanType>::ArrayType;
+  using MinMaxImpl<BooleanType, SimdLevel>::MinMaxImpl;
+
+  void Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if seen a True already
+    if (this->state.max == true) {
+      return;
+    }
+
+    ArrayType arr(batch[0].array());
+    const auto true_count = arr.true_count();
+    if (true_count > 0) {
+      this->state.max = true;
+    }
+  }
+
+  void Finalize(KernelContext*, Datum* out) override {
+    using ScalarType = typename TypeTraits<BooleanType>::ScalarType;
+
+    if (this->state.max == true) {
+      out->value = std::make_shared<ScalarType>(true);
+    } else {
+      out->value = std::make_shared<ScalarType>(false);
+    }

Review comment:
       Done!

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -547,6 +547,36 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
   }
 };
 
+template <SimdLevel::type SimdLevel>
+struct BooleanAnyImpl : public MinMaxImpl<BooleanType, SimdLevel> {
+  using StateType = MinMaxState<BooleanType, SimdLevel>;

Review comment:
       Done

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -726,6 +726,109 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) {
   }
 }
 
+//
+// Any
+//
+
+class TestPrimitiveAnyKernel : public ::testing::Test {
+  using Traits = TypeTraits<BooleanType>;
+  using ArrayType = typename Traits::ArrayType;
+  using c_type = typename BooleanType::c_type;
+  using ScalarType = typename Traits::ScalarType;
+
+ public:
+  void AssertAnyIs(const Datum& array, c_type expected) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Any(array));
+    const ScalarType& out_any = out.scalar_as<ScalarType>();
+    const auto expected_any = static_cast<const ScalarType>(expected);
+    ASSERT_EQ(out_any, expected_any);
+  }
+
+  void AssertAnyIs(const std::string& json, c_type expected) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  void AssertAnyIs(const std::vector<std::string>& json, c_type expected) {
+    auto array = ChunkedArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  std::shared_ptr<DataType> type_singleton() { return Traits::type_singleton(); }
+};
+
+class TestAnyKernel : public TestPrimitiveAnyKernel {};
+
+TEST_F(TestAnyKernel, Basics) {
+  std::vector<std::string> chunked_input0 = {"[]", "[true]"};
+  std::vector<std::string> chunked_input1 = {"[true, true, null]", "[true, null]"};
+  std::vector<std::string> chunked_input2 = {"[false, false, false]", "[false]"};
+  std::vector<std::string> chunked_input3 = {"[false, null]", "[null, false]"};
+
+  this->AssertAnyIs("[false]", false);
+  this->AssertAnyIs("[true, false]", true);
+  this->AssertAnyIs("[null, null, null]", false);
+  this->AssertAnyIs("[false, false, false]", false);
+  this->AssertAnyIs("[false, false, false, null]", false);
+  this->AssertAnyIs("[true, null, true, true]", true);
+  this->AssertAnyIs("[false, null, false, true]", true);
+  this->AssertAnyIs("[true, null, false, true]", true);
+  this->AssertAnyIs(chunked_input0, true);
+  this->AssertAnyIs(chunked_input1, true);
+  this->AssertAnyIs(chunked_input2, false);
+  this->AssertAnyIs(chunked_input3, false);
+}
+
+struct AnyResult {
+  using c_type = typename BooleanType::c_type;
+
+  c_type any = false;
+};
+
+static AnyResult NaiveAny(const Array& array) {
+  using ArrayType = typename TypeTraits<BooleanType>::ArrayType;
+
+  AnyResult result;
+
+  const auto& array_numeric = reinterpret_cast<const ArrayType&>(array);
+  const auto true_count = array_numeric.true_count();
+
+  if (true_count > 0) {
+    result.any = true;
+  }
+  return result;
+}
+
+void ValidateAny(const Array& array) {
+  using Traits = TypeTraits<BooleanType>;
+  using ScalarType = typename Traits::ScalarType;
+
+  ASSERT_OK_AND_ASSIGN(Datum out, Any(array));
+  const ScalarType& out_any = out.scalar_as<ScalarType>();
+
+  auto expected = NaiveAny(array);
+  const auto& expected_any = static_cast<const ScalarType>(expected.any);
+
+  ASSERT_EQ(out_any, expected_any);
+}
+
+class TestRandomBooleanAnyKernel : public ::testing::Test {};
+
+TEST_F(TestRandomBooleanAnyKernel, RandomArrayAny) {
+  auto rand = random::RandomArrayGenerator(0x8afc055);
+  // Test size up to 1<<11 (2048).
+  for (size_t i = 3; i < 12; i += 2) {
+    for (auto null_probability : {0.0, 0.01, 0.1, 0.5, 0.99, 1.0}) {
+      int64_t base_length = (1UL << i) + 2;
+      auto array = rand.Boolean(base_length, null_probability, null_probability);
+      for (auto length_adjust : {-2, -1, 0, 1, 2}) {
+        int64_t length = (1UL << i) + length_adjust;
+        ValidateAny(*array->Slice(0, length));
+      }
+    }
+  }
+}

Review comment:
       Right. Should I drop this part of the test altogether?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -726,6 +726,109 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) {
   }
 }
 
+//
+// Any
+//
+
+class TestPrimitiveAnyKernel : public ::testing::Test {
+  using Traits = TypeTraits<BooleanType>;
+  using ArrayType = typename Traits::ArrayType;
+  using c_type = typename BooleanType::c_type;
+  using ScalarType = typename Traits::ScalarType;
+
+ public:
+  void AssertAnyIs(const Datum& array, c_type expected) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Any(array));
+    const ScalarType& out_any = out.scalar_as<ScalarType>();
+    const auto expected_any = static_cast<const ScalarType>(expected);
+    ASSERT_EQ(out_any, expected_any);
+  }
+
+  void AssertAnyIs(const std::string& json, c_type expected) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  void AssertAnyIs(const std::vector<std::string>& json, c_type expected) {
+    auto array = ChunkedArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  std::shared_ptr<DataType> type_singleton() { return Traits::type_singleton(); }
+};
+
+class TestAnyKernel : public TestPrimitiveAnyKernel {};
+
+TEST_F(TestAnyKernel, Basics) {
+  std::vector<std::string> chunked_input0 = {"[]", "[true]"};
+  std::vector<std::string> chunked_input1 = {"[true, true, null]", "[true, null]"};
+  std::vector<std::string> chunked_input2 = {"[false, false, false]", "[false]"};
+  std::vector<std::string> chunked_input3 = {"[false, null]", "[null, false]"};
+
+  this->AssertAnyIs("[false]", false);
+  this->AssertAnyIs("[true, false]", true);
+  this->AssertAnyIs("[null, null, null]", false);
+  this->AssertAnyIs("[false, false, false]", false);
+  this->AssertAnyIs("[false, false, false, null]", false);
+  this->AssertAnyIs("[true, null, true, true]", true);
+  this->AssertAnyIs("[false, null, false, true]", true);
+  this->AssertAnyIs("[true, null, false, true]", true);
+  this->AssertAnyIs(chunked_input0, true);
+  this->AssertAnyIs(chunked_input1, true);
+  this->AssertAnyIs(chunked_input2, false);
+  this->AssertAnyIs(chunked_input3, false);
+}
+
+struct AnyResult {
+  using c_type = typename BooleanType::c_type;
+
+  c_type any = false;
+};
+
+static AnyResult NaiveAny(const Array& array) {
+  using ArrayType = typename TypeTraits<BooleanType>::ArrayType;
+
+  AnyResult result;
+
+  const auto& array_numeric = reinterpret_cast<const ArrayType&>(array);

Review comment:
       Done!

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -726,6 +726,109 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) {
   }
 }
 
+//
+// Any
+//
+
+class TestPrimitiveAnyKernel : public ::testing::Test {
+  using Traits = TypeTraits<BooleanType>;
+  using ArrayType = typename Traits::ArrayType;
+  using c_type = typename BooleanType::c_type;
+  using ScalarType = typename Traits::ScalarType;
+
+ public:
+  void AssertAnyIs(const Datum& array, c_type expected) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Any(array));
+    const ScalarType& out_any = out.scalar_as<ScalarType>();
+    const auto expected_any = static_cast<const ScalarType>(expected);
+    ASSERT_EQ(out_any, expected_any);
+  }
+
+  void AssertAnyIs(const std::string& json, c_type expected) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  void AssertAnyIs(const std::vector<std::string>& json, c_type expected) {
+    auto array = ChunkedArrayFromJSON(type_singleton(), json);
+    AssertAnyIs(array, expected);
+  }
+
+  std::shared_ptr<DataType> type_singleton() { return Traits::type_singleton(); }
+};
+
+class TestAnyKernel : public TestPrimitiveAnyKernel {};
+
+TEST_F(TestAnyKernel, Basics) {
+  std::vector<std::string> chunked_input0 = {"[]", "[true]"};
+  std::vector<std::string> chunked_input1 = {"[true, true, null]", "[true, null]"};
+  std::vector<std::string> chunked_input2 = {"[false, false, false]", "[false]"};
+  std::vector<std::string> chunked_input3 = {"[false, null]", "[null, false]"};
+
+  this->AssertAnyIs("[false]", false);
+  this->AssertAnyIs("[true, false]", true);
+  this->AssertAnyIs("[null, null, null]", false);
+  this->AssertAnyIs("[false, false, false]", false);
+  this->AssertAnyIs("[false, false, false, null]", false);
+  this->AssertAnyIs("[true, null, true, true]", true);
+  this->AssertAnyIs("[false, null, false, true]", true);
+  this->AssertAnyIs("[true, null, false, true]", true);
+  this->AssertAnyIs(chunked_input0, true);
+  this->AssertAnyIs(chunked_input1, true);
+  this->AssertAnyIs(chunked_input2, false);
+  this->AssertAnyIs(chunked_input3, false);
+}
+
+struct AnyResult {
+  using c_type = typename BooleanType::c_type;
+
+  c_type any = false;
+};

Review comment:
       No - switched to plain bool variable in the new commit as you suggest below




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org