You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/08/26 13:08:51 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #10896: ARROW-12959: [C++][R] Option for is_null(NaN) to evaluate to true

pitrou commented on a change in pull request #10896:
URL: https://github.com/apache/arrow/pull/10896#discussion_r696574146



##########
File path: cpp/src/arrow/compute/kernels/scalar_validity_test.cc
##########
@@ -77,17 +77,105 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) {
                    "[true, false, false, true]");
 }
 
+TEST_F(TestFloatValidityKernels, FloatArrayIsNull) {
+  // Default NanNullOptions (nan_is_null = false)
+  NanNullOptions options;
+  // All NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false]"),
+                   &options);
+  // No NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+  // Some NaNs
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+
+  // Set nan_is_null value to true
+  options.nan_is_null = true;
+  // All NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options);
+  // No NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+  // Some NaNs
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"),
+                   &options);
+}
+
+TEST_F(TestDoubleValidityKernels, DoubleArrayIsNull) {
+  // All NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false]"));
+
+  NanNullOptions options{false};
+  // No NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+  // Some NaNs
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+
+  options.nan_is_null = true;
+  // All NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options);
+  // No NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+  // Some NaNs
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"),
+                   &options);
+}
+
+TEST_F(TestFloatValidityKernels, FloatScalarIsNull) {
+  CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false));
+  CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(false));
+
+  NanNullOptions options{false};
+  CheckScalarUnary("is_null", MakeScalar(std::numeric_limits<float>::infinity()),
+                   MakeScalar(false), &options);
+  CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true), &options);
+
+  options.nan_is_null = true;
+  CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false), &options);
+  CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(true), &options);
+  CheckScalarUnary("is_null", MakeScalar(std::numeric_limits<float>::infinity()),
+                   MakeScalar(false), &options);
+  CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true), &options);
+}
+
+TEST_F(TestDoubleValidityKernels, DoubleScalarIsNull) {
+  NanNullOptions options{false};
+  CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options);
+  CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(false), &options);
+  CheckScalarUnary("is_null", MakeScalar(std::numeric_limits<double>::infinity()),
+                   MakeScalar(false));
+  CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true));
+
+  options.nan_is_null = true;
+  CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options);
+  CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(true), &options);
+  CheckScalarUnary("is_null", MakeScalar(std::numeric_limits<double>::infinity()),
+                   MakeScalar(false), &options);
+  CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true), &options);
+}

Review comment:
       I think the repetition can easily be avoided here, for example by making this a templated test.

##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -224,6 +224,15 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions {
   int64_t start, stop, step;
 };
 
+class ARROW_EXPORT NanNullOptions : public FunctionOptions {

Review comment:
       I suggest renaming this `NullOptions`, in case other values than NaN need to be considered null at some point.

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity.cc
##########
@@ -202,9 +256,12 @@ const FunctionDoc is_inf_doc(
     ("For each input value, emit true iff the value is infinite (inf or -inf)."),
     {"values"});
 
-const FunctionDoc is_null_doc("Return true if null",
-                              ("For each input value, emit true iff the value is null."),
-                              {"values"});
+const FunctionDoc is_null_doc(
+    "Return true if null, NaN values can be considered as null",
+    ("For each input value, emit true if the value is null. Default behavior is to emit "

Review comment:
       By convention, function descriptions are explicitly line-wrapped with `\n` characters. I suggest doing the same here.

##########
File path: cpp/src/arrow/compute/kernels/scalar_validity_test.cc
##########
@@ -77,17 +77,105 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) {
                    "[true, false, false, true]");
 }
 
+TEST_F(TestFloatValidityKernels, FloatArrayIsNull) {
+  // Default NanNullOptions (nan_is_null = false)
+  NanNullOptions options;
+  // All NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false]"),
+                   &options);
+  // No NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+  // Some NaNs
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+
+  // Set nan_is_null value to true
+  options.nan_is_null = true;
+  // All NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options);
+  // No NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+  // Some NaNs
+  CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"),
+                   &options);
+}
+
+TEST_F(TestDoubleValidityKernels, DoubleArrayIsNull) {
+  // All NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false]"));
+
+  NanNullOptions options{false};
+  // No NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+  // Some NaNs
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+
+  options.nan_is_null = true;
+  // All NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"),
+                   ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options);
+  // No NaN
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"),
+                   &options);
+  // Some NaNs
+  CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"),
+                   ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"),
+                   &options);
+}
+
+TEST_F(TestFloatValidityKernels, FloatScalarIsNull) {
+  CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false));
+  CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(false));

Review comment:
       These separate tests shouldn't be necessary, as `CheckScalarUnary` already tests Scalar values implicitly when Array values are given.

##########
File path: python/pyarrow/array.pxi
##########
@@ -1039,11 +1039,12 @@ cdef class Array(_PandasConvertible):
         else:
             return 0
 
-    def is_null(self):
+    def is_null(self, nan_is_null=False):

Review comment:
       Same here.

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -234,9 +234,13 @@ cdef class Expression(_Weakrefable):
         """Checks whether the expression is not-null (valid)"""
         return Expression._call("is_valid", [self])
 
-    def is_null(self):
+    def is_null(self, bint nan_is_null=False):

Review comment:
       I would make this argument keyword-only.

##########
File path: python/pyarrow/compute.py
##########
@@ -55,6 +55,7 @@
     StrptimeOptions,
     StrftimeOptions,
     DayOfWeekOptions,
+    NanNullOptions,
     TakeOptions,

Review comment:
       We should try to maintain this alphabetically-ordered (though `DayOfWeekOptions` already violates it :-/).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org