You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/06/15 12:19:29 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #7410: ARROW-971: [C++][Compute] IsValid, IsNull kernels

pitrou commented on a change in pull request #7410:
URL: https://github.com/apache/arrow/pull/7410#discussion_r440130579



##########
File path: cpp/src/arrow/testing/random.h
##########
@@ -250,6 +250,9 @@ class ARROW_EXPORT RandomArrayGenerator {
                                            int32_t min_length, int32_t max_length,
                                            double null_probability = 0);
 
+  std::shared_ptr<Array> Of(std::shared_ptr<DataType> type, int64_t size,

Review comment:
       Call it `Array`?

##########
File path: cpp/src/arrow/compute/kernels/test_util.h
##########
@@ -97,5 +99,63 @@ using TestingStringTypes =
 
 static constexpr random::SeedType kRandomSeed = 0x0ff1ce;
 
+struct ScalarFunctionPropertyTestParam {

Review comment:
       I'm not sure what the name "property" is supposed to refer to.

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity.cc
##########
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/kernels/common.h"
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+namespace compute {
+
+struct IsValid {
+  static void Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    checked_cast<BooleanScalar*>(out)->value = in.is_valid;
+  }
+
+  static void Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
+    if (arr.buffers[0] != nullptr && out->offset == arr.offset &&
+        out->length == arr.length) {
+      out->buffers[1] = arr.buffers[0];
+      return;
+    }
+
+    if (arr.null_count == 0 || arr.buffers[0] == nullptr) {
+      BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, true);
+      return;
+    }
+
+    internal::CopyBitmap(arr.buffers[0]->data(), arr.offset, arr.length,
+                         out->buffers[1]->mutable_data(), out->offset);
+  }
+};
+
+struct IsNull {
+  static void Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    checked_cast<BooleanScalar*>(out)->value = !in.is_valid;
+  }
+
+  static void Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
+    if (arr.null_count == 0 || arr.buffers[0] == nullptr) {
+      BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length,
+                         false);
+      return;
+    }
+
+    internal::InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length,
+                           out->buffers[1]->mutable_data(), out->offset);
+  }
+};
+
+namespace codegen {
+
+void MakeFunction(std::string name, std::vector<InputType> in_types, OutputType out_type,
+                  ArrayKernelExec exec, FunctionRegistry* registry,
+                  NullHandling::type null_handling) {
+  Arity arity{static_cast<int>(in_types.size())};
+  auto func = std::make_shared<ScalarFunction>(name, arity);
+
+  ScalarKernel kernel(std::move(in_types), out_type, exec);
+  kernel.null_handling = null_handling;
+  kernel.can_write_into_slices = true;
+
+  DCHECK_OK(func->AddKernel(std::move(kernel)));
+  DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+}  // namespace codegen

Review comment:
       Looks like most of this file should be under the anonymous namespace.

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity_test.cc
##########
@@ -0,0 +1,151 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/compute/api.h"
+#include "arrow/compute/kernels/test_util.h"
+#include "arrow/testing/gtest_common.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace compute {
+
+class TestValidityKernels : public ::testing::Test {
+ protected:
+  // XXX Since IsValid and IsNull don't touch any buffers but the null bitmap
+  // testing multiple types seems redundant.
+  using ArrowType = BooleanType;
+
+  using CType = typename ArrowType::c_type;
+
+  static std::shared_ptr<DataType> type_singleton() {
+    return TypeTraits<ArrowType>::type_singleton();
+  }
+
+  void AssertUnary(Datum arg, Datum expected) {
+    ASSERT_OK_AND_ASSIGN(auto actual, func_(arg, nullptr));
+    ASSERT_EQ(actual.kind(), expected.kind());
+    if (actual.kind() == Datum::ARRAY) {
+      ASSERT_OK(actual.make_array()->ValidateFull());
+      AssertArraysApproxEqual(*expected.make_array(), *actual.make_array());
+    } else {
+      AssertScalarsEqual(*expected.scalar(), *actual.scalar());
+    }
+  }
+
+  void AssertUnary(const std::string& arg_json, const std::string& expected_json) {
+    AssertUnary(ArrayFromJSON(type_singleton(), arg_json),
+                ArrayFromJSON(type_singleton(), expected_json));
+  }
+
+  using UnaryFunction = std::function<Result<Datum>(const Datum&, ExecContext*)>;
+  UnaryFunction func_;
+};
+
+TEST_F(TestValidityKernels, ArrayIsValid) {
+  func_ = arrow::compute::IsValid;
+
+  this->AssertUnary("[]", "[]");
+  this->AssertUnary("[null]", "[false]");
+  this->AssertUnary("[1]", "[true]");
+  this->AssertUnary("[null, 1, 0, null]", "[false, true, true, false]");
+}
+
+TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) {
+  Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]");
+  ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg));
+  ASSERT_EQ(validity.array()->buffers[1], arg.array()->buffers[0]);
+}
+
+TEST_F(TestValidityKernels, ScalarIsValid) {
+  func_ = arrow::compute::IsValid;
+
+  AssertUnary(Datum(19.7), Datum(true));
+  AssertUnary(MakeNullScalar(float64()), Datum(false));
+}
+
+TEST_F(TestValidityKernels, ArrayIsNull) {
+  func_ = arrow::compute::IsNull;
+
+  this->AssertUnary("[]", "[]");
+  this->AssertUnary("[null]", "[true]");
+  this->AssertUnary("[1]", "[false]");
+  this->AssertUnary("[null, 1, 0, null]", "[true, false, false, true]");
+}
+
+TEST_F(TestValidityKernels, DISABLED_ScalarIsNull) {
+  func_ = arrow::compute::IsNull;
+
+  AssertUnary(Datum(19.7), Datum(false));
+  AssertUnary(MakeNullScalar(float64()), Datum(true));
+}
+
+class IsValidProperty : public ScalarFunctionPropertyMixin {
+ public:
+  std::shared_ptr<ScalarFunction> GetFunction() override {
+    return internal::checked_pointer_cast<ScalarFunction>(
+        *GetFunctionRegistry()->GetFunction("is_valid"));
+  }
+
+  Result<std::shared_ptr<Scalar>> Contract(const ScalarVector& args,
+                                           const FunctionOptions*) override {
+    return std::make_shared<BooleanScalar>(args[0]->is_valid);
+  }
+};
+
+TEST_P(IsValidProperty, TestIsValidProperty) { Validate(); }
+
+INSTANTIATE_TEST_SUITE_P(IsValidPropertyTest, IsValidProperty,
+                         ScalarFunctionPropertyTestParam::Values({
+                             {0, 0.0},
+                             {1, 0.0},
+                             {2, 0.0},
+                             {1024, 0.25},
+                         }));
+
+class IsNullProperty : public ScalarFunctionPropertyMixin {
+ public:
+  std::shared_ptr<ScalarFunction> GetFunction() override {
+    return internal::checked_pointer_cast<ScalarFunction>(
+        *GetFunctionRegistry()->GetFunction("is_null"));
+  }
+
+  Result<std::shared_ptr<Scalar>> Contract(const ScalarVector& args,
+                                           const FunctionOptions*) override {
+    return std::make_shared<BooleanScalar>(!args[0]->is_valid);
+  }
+};
+
+TEST_P(IsNullProperty, TestIsNullProperty) { Validate(); }
+
+INSTANTIATE_TEST_SUITE_P(IsNullPropertyTest, IsNullProperty,
+                         ScalarFunctionPropertyTestParam::Values({
+                             {0, 0.0},
+                             {1, 0.0},
+                             {2, 0.0},
+                             {1024, 0.25},
+                         }));

Review comment:
       I agree that this looks over-engineered for what it seems to be doing (I may be misunderstanding) -- basically generating test data and checking expected values.

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity_test.cc
##########
@@ -0,0 +1,151 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/compute/api.h"
+#include "arrow/compute/kernels/test_util.h"
+#include "arrow/testing/gtest_common.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace compute {
+
+class TestValidityKernels : public ::testing::Test {
+ protected:
+  // XXX Since IsValid and IsNull don't touch any buffers but the null bitmap
+  // testing multiple types seems redundant.
+  using ArrowType = BooleanType;
+
+  using CType = typename ArrowType::c_type;
+
+  static std::shared_ptr<DataType> type_singleton() {
+    return TypeTraits<ArrowType>::type_singleton();
+  }
+
+  void AssertUnary(Datum arg, Datum expected) {
+    ASSERT_OK_AND_ASSIGN(auto actual, func_(arg, nullptr));
+    ASSERT_EQ(actual.kind(), expected.kind());
+    if (actual.kind() == Datum::ARRAY) {
+      ASSERT_OK(actual.make_array()->ValidateFull());
+      AssertArraysApproxEqual(*expected.make_array(), *actual.make_array());
+    } else {
+      AssertScalarsEqual(*expected.scalar(), *actual.scalar());
+    }
+  }
+
+  void AssertUnary(const std::string& arg_json, const std::string& expected_json) {
+    AssertUnary(ArrayFromJSON(type_singleton(), arg_json),
+                ArrayFromJSON(type_singleton(), expected_json));
+  }
+
+  using UnaryFunction = std::function<Result<Datum>(const Datum&, ExecContext*)>;
+  UnaryFunction func_;
+};
+
+TEST_F(TestValidityKernels, ArrayIsValid) {
+  func_ = arrow::compute::IsValid;
+
+  this->AssertUnary("[]", "[]");
+  this->AssertUnary("[null]", "[false]");
+  this->AssertUnary("[1]", "[true]");
+  this->AssertUnary("[null, 1, 0, null]", "[false, true, true, false]");
+}
+
+TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) {
+  Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]");
+  ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg));
+  ASSERT_EQ(validity.array()->buffers[1], arg.array()->buffers[0]);
+}
+
+TEST_F(TestValidityKernels, ScalarIsValid) {
+  func_ = arrow::compute::IsValid;
+
+  AssertUnary(Datum(19.7), Datum(true));
+  AssertUnary(MakeNullScalar(float64()), Datum(false));
+}
+
+TEST_F(TestValidityKernels, ArrayIsNull) {
+  func_ = arrow::compute::IsNull;
+
+  this->AssertUnary("[]", "[]");
+  this->AssertUnary("[null]", "[true]");
+  this->AssertUnary("[1]", "[false]");
+  this->AssertUnary("[null, 1, 0, null]", "[true, false, false, true]");
+}
+
+TEST_F(TestValidityKernels, DISABLED_ScalarIsNull) {
+  func_ = arrow::compute::IsNull;
+
+  AssertUnary(Datum(19.7), Datum(false));
+  AssertUnary(MakeNullScalar(float64()), Datum(true));
+}
+
+class IsValidProperty : public ScalarFunctionPropertyMixin {
+ public:
+  std::shared_ptr<ScalarFunction> GetFunction() override {
+    return internal::checked_pointer_cast<ScalarFunction>(
+        *GetFunctionRegistry()->GetFunction("is_valid"));
+  }
+
+  Result<std::shared_ptr<Scalar>> Contract(const ScalarVector& args,
+                                           const FunctionOptions*) override {
+    return std::make_shared<BooleanScalar>(args[0]->is_valid);
+  }
+};
+
+TEST_P(IsValidProperty, TestIsValidProperty) { Validate(); }
+
+INSTANTIATE_TEST_SUITE_P(IsValidPropertyTest, IsValidProperty,
+                         ScalarFunctionPropertyTestParam::Values({
+                             {0, 0.0},
+                             {1, 0.0},
+                             {2, 0.0},
+                             {1024, 0.25},
+                         }));
+
+class IsNullProperty : public ScalarFunctionPropertyMixin {
+ public:
+  std::shared_ptr<ScalarFunction> GetFunction() override {
+    return internal::checked_pointer_cast<ScalarFunction>(
+        *GetFunctionRegistry()->GetFunction("is_null"));
+  }
+
+  Result<std::shared_ptr<Scalar>> Contract(const ScalarVector& args,
+                                           const FunctionOptions*) override {
+    return std::make_shared<BooleanScalar>(!args[0]->is_valid);
+  }
+};
+
+TEST_P(IsNullProperty, TestIsNullProperty) { Validate(); }
+
+INSTANTIATE_TEST_SUITE_P(IsNullPropertyTest, IsNullProperty,
+                         ScalarFunctionPropertyTestParam::Values({
+                             {0, 0.0},
+                             {1, 0.0},
+                             {2, 0.0},
+                             {1024, 0.25},

Review comment:
       Am I wrong, or is this generating a new test subclass for simply different parameter values? This does not seem sound, especially as it will blow up compile times.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org