You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/03/02 20:11:28 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #5446: Fix is_distinct from for float NaN values

alamb commented on code in PR #5446:
URL: https://github.com/apache/arrow-datafusion/pull/5446#discussion_r1123643959


##########
datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs:
##########
@@ -87,25 +78,120 @@ pub(crate) fn is_not_distinct_from<T>(
 where
     T: ArrowNumericType,
 {
-    let left_data = left.data();
-    let right_data = right.data();
-    let array_len = left_data.len().min(right_data.len());
+    distinct(
+        left,
+        right,
+        |left_value, right_value, left_isnull, right_isnull| {
+            !(left_isnull != right_isnull || left_value != right_value)
+        },
+    )
+}
 
+fn distinct<
+    T,
+    F: FnMut(
+        <T as ArrowPrimitiveType>::Native,
+        <T as ArrowPrimitiveType>::Native,
+        bool,
+        bool,
+    ) -> bool,
+>(
+    left: &PrimitiveArray<T>,
+    right: &PrimitiveArray<T>,
+    mut op: F,
+) -> Result<BooleanArray>
+where
+    T: ArrowNumericType,
+{
     let left_values = left.values();
     let right_values = right.values();
+    let left_data = left.data();
+    let right_data = right.data();
 
+    let array_len = left_data.len().min(right_data.len());
     let distinct = arrow_buffer::MutableBuffer::collect_bool(array_len, |i| {
-        !(left_data.is_null(i) != right_data.is_null(i)
-            || left_values[i] != right_values[i])
+        op(
+            left_values[i],
+            right_values[i],
+            left_data.is_null(i),
+            right_data.is_null(i),
+        )
     });
-
     let array_data = ArrayData::builder(arrow_schema::DataType::Boolean)
         .len(array_len)
         .add_buffer(distinct.into());
 
     Ok(BooleanArray::from(unsafe { array_data.build_unchecked() }))
 }
 
+pub(crate) fn is_distinct_from_f32(
+    left: &Float32Array,
+    right: &Float32Array,
+) -> Result<BooleanArray> {
+    distinct(
+        left,
+        right,
+        |left_value, right_value, left_isnull, right_isnull| {
+            left_isnull != right_isnull
+                || left_value.is_nan() != right_value.is_nan()
+                || (!left_value.is_nan()
+                    && !right_value.is_nan()
+                    && left_value != right_value)
+        },
+    )
+}
+
+pub(crate) fn is_not_distinct_from_f32(
+    left: &Float32Array,
+    right: &Float32Array,
+) -> Result<BooleanArray> {
+    distinct(
+        left,
+        right,
+        |left_value, right_value, left_isnull, right_isnull| {
+            !(left_isnull != right_isnull
+                || left_value.is_nan() != right_value.is_nan()
+                || (!left_value.is_nan()
+                    && !right_value.is_nan()
+                    && left_value != right_value))
+        },

Review Comment:
   It took me quite some time to see that the only difference here is that there is a `!` in front of the entire expression. Would it be possible to factor out the duplication so the difference between the kernels are clearer?



##########
datafusion/core/tests/sqllogictests/test_files/select.slt:
##########
@@ -80,3 +80,29 @@ select '1' from foo order by column1;
 # foo distinct order by
 statement error DataFusion error: Error during planning: For SELECT DISTINCT, ORDER BY expressions column1 must appear in select list
 select distinct '1' from foo order by column1;
+
+# distincts for float nan

Review Comment:
   This might be a good test to add to https://github.com/apache/arrow-datafusion/tree/main/datafusion/core/tests/sqllogictests/test_files/pg_compat so they are automatically compared with postgres as part of CI (kudos to @melgenek  for help there)
   
   https://github.com/apache/arrow-datafusion/blob/main/datafusion/core/tests/sqllogictests/test_files/pg_compat/pg_compat_simple.slt perhaps?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org