You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/05/22 13:38:12 UTC

[arrow-datafusion] branch main updated: Support is [not] distinct from for binaryarray types (#6394)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 53d6987ac8 Support is [not] distinct from for binaryarray types (#6394)
53d6987ac8 is described below

commit 53d6987ac8db265009daa1db5432ca3345d6c8d5
Author: Daniël Heres <da...@gmail.com>
AuthorDate: Mon May 22 15:38:05 2023 +0200

    Support is [not] distinct from for binaryarray types (#6394)
    
    * Support is distinct from binary
    
    * Add tests
    
    * Tests
    
    * Fix test
    
    ---------
    
    Co-authored-by: Daniël Heres <da...@coralogix.com>
---
 .../core/tests/sqllogictests/test_files/select.slt | 28 ++++++++++++++++++
 datafusion/physical-expr/src/expressions/binary.rs | 33 +++++++++++++++++-----
 .../src/expressions/binary/kernels_arrow.rs        | 22 +++++++++++++++
 3 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/datafusion/core/tests/sqllogictests/test_files/select.slt b/datafusion/core/tests/sqllogictests/test_files/select.slt
index 03f96bad95..399c212210 100644
--- a/datafusion/core/tests/sqllogictests/test_files/select.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/select.slt
@@ -298,6 +298,34 @@ select column1 is not distinct from column2 from t;
 false
 
 
+# Binary Expression for Binary
+statement ok
+CREATE TABLE binary_t as select arrow_cast('Bar', 'Binary') as column1, arrow_cast('B%', 'Binary') as column2;
+
+query B
+select column1 is distinct from column2 from binary_t;
+----
+true
+
+query B
+select column1 is not distinct from column2 from binary_t;
+----
+false
+
+# Binary Expression for LargeBinary
+statement ok
+CREATE TABLE large_binary_t as select arrow_cast('Bar', 'LargeBinary') as column1, arrow_cast('B%', 'LargeBinary') as column2;
+
+query B
+select column1 is distinct from column2 from large_binary_t;
+----
+true
+
+query B
+select column1 is not distinct from column2 from large_binary_t;
+----
+false
+
 # select all
 # these two queries should return the same result
 query R
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index 7bdbba88a8..46acbe6b5e 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -70,13 +70,14 @@ use kernels::{
 use kernels_arrow::{
     add_decimal_dyn_scalar, add_dyn_decimal, add_dyn_temporal, add_dyn_temporal_scalar,
     divide_decimal_dyn_scalar, divide_dyn_opt_decimal, is_distinct_from,
-    is_distinct_from_bool, is_distinct_from_decimal, is_distinct_from_f32,
-    is_distinct_from_f64, is_distinct_from_null, is_distinct_from_utf8,
-    is_not_distinct_from, is_not_distinct_from_bool, is_not_distinct_from_decimal,
-    is_not_distinct_from_f32, is_not_distinct_from_f64, is_not_distinct_from_null,
-    is_not_distinct_from_utf8, modulus_decimal_dyn_scalar, modulus_dyn_decimal,
-    multiply_decimal_dyn_scalar, multiply_dyn_decimal, subtract_decimal_dyn_scalar,
-    subtract_dyn_decimal, subtract_dyn_temporal, subtract_dyn_temporal_scalar,
+    is_distinct_from_binary, is_distinct_from_bool, is_distinct_from_decimal,
+    is_distinct_from_f32, is_distinct_from_f64, is_distinct_from_null,
+    is_distinct_from_utf8, is_not_distinct_from, is_not_distinct_from_binary,
+    is_not_distinct_from_bool, is_not_distinct_from_decimal, is_not_distinct_from_f32,
+    is_not_distinct_from_f64, is_not_distinct_from_null, is_not_distinct_from_utf8,
+    modulus_decimal_dyn_scalar, modulus_dyn_decimal, multiply_decimal_dyn_scalar,
+    multiply_dyn_decimal, subtract_decimal_dyn_scalar, subtract_dyn_decimal,
+    subtract_dyn_temporal, subtract_dyn_temporal_scalar,
 };
 
 use arrow::datatypes::{DataType, Schema, TimeUnit};
@@ -245,6 +246,21 @@ macro_rules! compute_utf8_op {
     }};
 }
 
+/// Invoke the `<$OP>_binary` compute kernel on a pair of binary data arrays, both downcast to `$DT`
+macro_rules! compute_binary_op {
+    ($LEFT:expr, $RIGHT:expr, $OP:ident, $DT:ident) => {{
+        let ll = $LEFT
+            .as_any()
+            .downcast_ref::<$DT>()
+            .expect("compute_op failed to downcast left side array"); // panics if $LEFT is not a $DT
+        let rr = $RIGHT
+            .as_any()
+            .downcast_ref::<$DT>()
+            .expect("compute_op failed to downcast right side array"); // panics if $RIGHT is not a $DT
+        Ok(Arc::new(paste::expr! {[<$OP _binary>]}(&ll, &rr)?)) // kernel name is $OP + "_binary"; its error is propagated by `?`
+    }};
+}
+
 /// Invoke a compute kernel on a data array and a scalar value
 macro_rules! compute_utf8_op_scalar {
     ($LEFT:expr, $RIGHT:expr, $OP:ident, $DT:ident, $OP_TYPE:expr) => {{
@@ -510,7 +526,10 @@ macro_rules! binary_array_op {
             DataType::Float32 => compute_f32_op!($LEFT, $RIGHT, $OP, Float32Array),
             DataType::Float64 => compute_f64_op!($LEFT, $RIGHT, $OP, Float64Array),
             DataType::Utf8 => compute_utf8_op!($LEFT, $RIGHT, $OP, StringArray),
+            DataType::Binary => compute_binary_op!($LEFT, $RIGHT, $OP, BinaryArray),
+            DataType::LargeBinary => compute_binary_op!($LEFT, $RIGHT, $OP, LargeBinaryArray),
             DataType::LargeUtf8 => compute_utf8_op!($LEFT, $RIGHT, $OP, LargeStringArray),
+
             DataType::Timestamp(TimeUnit::Nanosecond, _) => {
                 compute_op!($LEFT, $RIGHT, $OP, TimestampNanosecondArray)
             }
diff --git a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs
index 90fca17157..50a9f86c06 100644
--- a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs
+++ b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs
@@ -210,6 +210,17 @@ pub(crate) fn is_distinct_from_utf8<OffsetSize: OffsetSizeTrait>(
         .collect())
 }
 
+pub(crate) fn is_distinct_from_binary<OffsetSize: OffsetSizeTrait>(
+    left: &GenericBinaryArray<OffsetSize>,
+    right: &GenericBinaryArray<OffsetSize>,
+) -> Result<BooleanArray> {
+    Ok(left // element-wise `left IS DISTINCT FROM right`; this body always returns Ok
+        .iter()
+        .zip(right.iter()) // pairs slots positionally; zip stops at the shorter input
+        .map(|(x, y)| Some(x != y)) // always Some => no null outputs; NOTE(review): x/y presumably Option<&[u8]> (None = null) - confirm
+        .collect())
+}
+
 pub(crate) fn is_distinct_from_null(
     left: &NullArray,
     _right: &NullArray,
@@ -241,6 +252,17 @@ pub(crate) fn is_not_distinct_from_utf8<OffsetSize: OffsetSizeTrait>(
         .collect())
 }
 
+pub(crate) fn is_not_distinct_from_binary<OffsetSize: OffsetSizeTrait>(
+    left: &GenericBinaryArray<OffsetSize>,
+    right: &GenericBinaryArray<OffsetSize>,
+) -> Result<BooleanArray> {
+    Ok(left // element-wise `left IS NOT DISTINCT FROM right`; this body always returns Ok
+        .iter()
+        .zip(right.iter()) // pairs slots positionally; zip stops at the shorter input
+        .map(|(x, y)| Some(x == y)) // always Some => no null outputs; NOTE(review): x/y presumably Option<&[u8]> (None = null) - confirm
+        .collect())
+}
+
 pub(crate) fn is_distinct_from_decimal(
     left: &Decimal128Array,
     right: &Decimal128Array,