You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/05 13:46:48 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #9023: ARROW-11043: [C++] Add "is_nan" kernel

pitrou commented on a change in pull request #9023:
URL: https://github.com/apache/arrow/pull/9023#discussion_r551938144



##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -345,6 +345,18 @@ Result<Datum> IsValid(const Datum& values, ExecContext* ctx = NULLPTR);
 ARROW_EXPORT
 Result<Datum> IsNull(const Datum& values, ExecContext* ctx = NULLPTR);
 
+/// \brief IsNan returns true for each element of `values` that is NaN,
+/// false otherwise
+///
+/// \param[in] values input to look for NaN
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since X.X.X

Review comment:
       This would be 3.0.0 if this PR is merged in a week or two.

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity_test.cc
##########
@@ -31,61 +31,98 @@
 namespace arrow {
 namespace compute {
 
+template <typename ArrowType>
 class TestValidityKernels : public ::testing::Test {
  protected:
-  // XXX Since IsValid and IsNull don't touch any buffers but the null bitmap
-  // testing multiple types seems redundant.
-  using ArrowType = BooleanType;
-
   static std::shared_ptr<DataType> type_singleton() {
     return TypeTraits<ArrowType>::type_singleton();
   }
 };
 
-TEST_F(TestValidityKernels, ArrayIsValid) {
+typedef TestValidityKernels<BooleanType> TestBooleanValidityKernels;
+typedef TestValidityKernels<FloatType> TestFloatValidityKernels;
+typedef TestValidityKernels<DoubleType> TestDoubleValidityKernels;
+
+TEST_F(TestBooleanValidityKernels, ArrayIsValid) {
   CheckScalarUnary("is_valid", type_singleton(), "[]", type_singleton(), "[]");
   CheckScalarUnary("is_valid", type_singleton(), "[null]", type_singleton(), "[false]");
   CheckScalarUnary("is_valid", type_singleton(), "[1]", type_singleton(), "[true]");
   CheckScalarUnary("is_valid", type_singleton(), "[null, 1, 0, null]", type_singleton(),
                    "[false, true, true, false]");
 }
 
-TEST_F(TestValidityKernels, IsValidIsNullNullType) {
+TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) {
   CheckScalarUnary("is_null", std::make_shared<NullArray>(5),
                    ArrayFromJSON(boolean(), "[true, true, true, true, true]"));
   CheckScalarUnary("is_valid", std::make_shared<NullArray>(5),
                    ArrayFromJSON(boolean(), "[false, false, false, false, false]"));
 }
 
-TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) {
+TEST_F(TestBooleanValidityKernels, ArrayIsValidBufferPassthruOptimization) {
   Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]");
   ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg));
   ASSERT_EQ(validity.array()->buffers[1], arg.array()->buffers[0]);
 }
 
-TEST_F(TestValidityKernels, ScalarIsValid) {
+TEST_F(TestBooleanValidityKernels, ScalarIsValid) {
   CheckScalarUnary("is_valid", MakeScalar(19.7), MakeScalar(true));
   CheckScalarUnary("is_valid", MakeNullScalar(float64()), MakeScalar(false));
 }
 
-TEST_F(TestValidityKernels, ArrayIsNull) {
+TEST_F(TestBooleanValidityKernels, ArrayIsNull) {
   CheckScalarUnary("is_null", type_singleton(), "[]", type_singleton(), "[]");
   CheckScalarUnary("is_null", type_singleton(), "[null]", type_singleton(), "[true]");
   CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]");
   CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(),
                    "[true, false, false, true]");
 }
 
-TEST_F(TestValidityKernels, IsNullSetsZeroNullCount) {
+TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) {
   auto arr = ArrayFromJSON(int32(), "[1, 2, 3, 4]");
   std::shared_ptr<ArrayData> result = (*IsNull(arr)).array();
   ASSERT_EQ(result->null_count, 0);
 }
 
-TEST_F(TestValidityKernels, ScalarIsNull) {
+TEST_F(TestBooleanValidityKernels, ScalarIsNull) {
   CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false));
   CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true));
 }
 
+TEST_F(TestFloatValidityKernels, FloatArrayIsNan) {

Review comment:
       Note that you can write type-parameterized tests, though it's perhaps not very useful here:
   https://github.com/google/googletest/blob/master/googletest/docs/advanced.md#typed-tests

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity_test.cc
##########
@@ -31,61 +31,98 @@
 namespace arrow {
 namespace compute {
 
+template <typename ArrowType>
 class TestValidityKernels : public ::testing::Test {
  protected:
-  // XXX Since IsValid and IsNull don't touch any buffers but the null bitmap
-  // testing multiple types seems redundant.
-  using ArrowType = BooleanType;
-
   static std::shared_ptr<DataType> type_singleton() {
     return TypeTraits<ArrowType>::type_singleton();
   }
 };
 
-TEST_F(TestValidityKernels, ArrayIsValid) {
+typedef TestValidityKernels<BooleanType> TestBooleanValidityKernels;
+typedef TestValidityKernels<FloatType> TestFloatValidityKernels;
+typedef TestValidityKernels<DoubleType> TestDoubleValidityKernels;
+
+TEST_F(TestBooleanValidityKernels, ArrayIsValid) {
   CheckScalarUnary("is_valid", type_singleton(), "[]", type_singleton(), "[]");
   CheckScalarUnary("is_valid", type_singleton(), "[null]", type_singleton(), "[false]");
   CheckScalarUnary("is_valid", type_singleton(), "[1]", type_singleton(), "[true]");
   CheckScalarUnary("is_valid", type_singleton(), "[null, 1, 0, null]", type_singleton(),
                    "[false, true, true, false]");
 }
 
-TEST_F(TestValidityKernels, IsValidIsNullNullType) {
+TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) {
   CheckScalarUnary("is_null", std::make_shared<NullArray>(5),
                    ArrayFromJSON(boolean(), "[true, true, true, true, true]"));
   CheckScalarUnary("is_valid", std::make_shared<NullArray>(5),
                    ArrayFromJSON(boolean(), "[false, false, false, false, false]"));
 }
 
-TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) {
+TEST_F(TestBooleanValidityKernels, ArrayIsValidBufferPassthruOptimization) {
   Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]");
   ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg));
   ASSERT_EQ(validity.array()->buffers[1], arg.array()->buffers[0]);
 }
 
-TEST_F(TestValidityKernels, ScalarIsValid) {
+TEST_F(TestBooleanValidityKernels, ScalarIsValid) {
   CheckScalarUnary("is_valid", MakeScalar(19.7), MakeScalar(true));
   CheckScalarUnary("is_valid", MakeNullScalar(float64()), MakeScalar(false));
 }
 
-TEST_F(TestValidityKernels, ArrayIsNull) {
+TEST_F(TestBooleanValidityKernels, ArrayIsNull) {
   CheckScalarUnary("is_null", type_singleton(), "[]", type_singleton(), "[]");
   CheckScalarUnary("is_null", type_singleton(), "[null]", type_singleton(), "[true]");
   CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]");
   CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(),
                    "[true, false, false, true]");
 }
 
-TEST_F(TestValidityKernels, IsNullSetsZeroNullCount) {
+TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) {
   auto arr = ArrayFromJSON(int32(), "[1, 2, 3, 4]");
   std::shared_ptr<ArrayData> result = (*IsNull(arr)).array();
   ASSERT_EQ(result->null_count, 0);
 }
 
-TEST_F(TestValidityKernels, ScalarIsNull) {
+TEST_F(TestBooleanValidityKernels, ScalarIsNull) {
   CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false));
   CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true));
 }
 
+TEST_F(TestFloatValidityKernels, FloatArrayIsNan) {
+  // All NaN
+  CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[true, true, true, true, true]"));
+  // No NaN
+  CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, 4.0, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, null]"));
+  // Some NaNs
+  CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, 4.0, null]"),
+                   ArrayFromJSON(boolean(), "[false, true, false, true, false, null]"));
+}
+
+TEST_F(TestDoubleValidityKernels, DoubleArrayIsNan) {
+  // All NaN
+  CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[true, true, true, true, true]"));
+  // No NaN
+  CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, 4.0, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, null]"));
+  // Some NaNs
+  CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, 4.0, null]"),
+                   ArrayFromJSON(boolean(), "[false, true, false, true, false, null]"));
+}
+
+TEST_F(TestFloatValidityKernels, FloatScalarIsNan) {
+  CheckScalarUnary("is_nan", MakeNullScalar(float32()), MakeNullScalar(boolean()));
+  CheckScalarUnary("is_nan", MakeScalar(42.0), MakeScalar(false));

Review comment:
       You probably mean `42.0f`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity.cc
##########
@@ -132,6 +156,11 @@ const FunctionDoc is_null_doc("Return true if null",
                               ("For each input value, emit true iff the value is null."),
                               {"values"});
 
+const FunctionDoc is_nan_doc("Return true if NaN",
+                             ("For each input value, emit true iff the value is NaN "
+                              "(according to std::isnan(value))."),

Review comment:
       I don't think "according to" is necessary, NaN values are standardized by IEEE floating point.

##########
File path: docs/source/cpp/compute.rst
##########
@@ -453,22 +453,26 @@ Structural transforms
 +==========================+============+================================================+=====================+=========+
 | fill_null                | Binary     | Boolean, Null, Numeric, Temporal, String-like  | Input type          | \(1)    |
 +--------------------------+------------+------------------------------------------------+---------------------+---------+
-| is_null                  | Unary      | Any                                            | Boolean             | \(2)    |
+| is_nan                   | Unary      | Float, Double                                  | Boolean             | \(2)    |
 +--------------------------+------------+------------------------------------------------+---------------------+---------+
-| is_valid                 | Unary      | Any                                            | Boolean             | \(2)    |
+| is_null                  | Unary      | Any                                            | Boolean             | \(3)    |
 +--------------------------+------------+------------------------------------------------+---------------------+---------+
-| list_value_length        | Unary      | List-like                                      | Int32 or Int64      | \(4)    |
+| is_valid                 | Unary      | Any                                            | Boolean             | \(4)    |
++--------------------------+------------+------------------------------------------------+---------------------+---------+
+| list_value_length        | Unary      | List-like                                      | Int32 or Int64      | \(5)    |
 +--------------------------+------------+------------------------------------------------+---------------------+---------+
 
 * \(1) First input must be an array, second input a scalar of the same type.
   Output is an array of the same type as the inputs, and with the same values
   as the first input, except for nulls replaced with the second input value.
 
-* \(2) Output is true iff the corresponding input element is non-null.
+* \(2) Output is true iff the corresponding input element is NaN according to std::isnan(value).

Review comment:
       As mentioned above, "according to..." isn't necessary.

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity_test.cc
##########
@@ -31,61 +31,98 @@
 namespace arrow {
 namespace compute {
 
+template <typename ArrowType>
 class TestValidityKernels : public ::testing::Test {
  protected:
-  // XXX Since IsValid and IsNull don't touch any buffers but the null bitmap
-  // testing multiple types seems redundant.
-  using ArrowType = BooleanType;
-
   static std::shared_ptr<DataType> type_singleton() {
     return TypeTraits<ArrowType>::type_singleton();
   }
 };
 
-TEST_F(TestValidityKernels, ArrayIsValid) {
+typedef TestValidityKernels<BooleanType> TestBooleanValidityKernels;
+typedef TestValidityKernels<FloatType> TestFloatValidityKernels;
+typedef TestValidityKernels<DoubleType> TestDoubleValidityKernels;

Review comment:
       Style nit: we would rather use `using TestDoubleValidityKernels = TestValidityKernels<DoubleType>`.

##########
File path: cpp/src/arrow/compute/kernels/common.h
##########
@@ -19,6 +19,7 @@
 
 // IWYU pragma: begin_exports
 
+#include <cmath>

Review comment:
       Why add this to `common.h`?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org