You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/27 21:20:53 UTC

[arrow-rs] branch master updated: Use SlicesIterator for ArrayData Equality (#3198)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new f98581801 Use SlicesIterator for ArrayData Equality (#3198)
f98581801 is described below

commit f985818012e6d0a56fca49487dbc13c516f4613c
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Sun Nov 27 13:20:47 2022 -0800

    Use SlicesIterator for ArrayData Equality (#3198)
    
    * Use SlicesIterator for ArrayData Equality
    
    * Use BitSliceIterator
---
 arrow-data/src/equal/fixed_binary.rs | 72 ++++++++++++++++++++++++----------
 arrow-data/src/equal/primitive.rs    | 75 ++++++++++++++++++++++++++----------
 arrow-select/src/filter.rs           |  2 +-
 arrow/benches/equal.rs               |  3 ++
 4 files changed, 110 insertions(+), 42 deletions(-)

diff --git a/arrow-data/src/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs
index d6af20801..17e470b5c 100644
--- a/arrow-data/src/equal/fixed_binary.rs
+++ b/arrow-data/src/equal/fixed_binary.rs
@@ -15,7 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::data::{contains_nulls, ArrayData};
+use crate::bit_iterator::BitSliceIterator;
+use crate::contains_nulls;
+use crate::data::ArrayData;
+use crate::equal::primitive::NULL_SLICES_SELECTIVITY_THRESHOLD;
 use arrow_buffer::bit_util::get_bit;
 use arrow_schema::DataType;
 
@@ -47,26 +50,55 @@ pub(super) fn fixed_binary_equal(
             size * len,
         )
     } else {
-        // get a ref of the null buffer bytes, to use in testing for nullness
-        let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
-        let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
-        // with nulls, we need to compare item by item whenever it is not null
-        (0..len).all(|i| {
-            let lhs_pos = lhs_start + i;
-            let rhs_pos = rhs_start + i;
+        let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64;
 
-            let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
-            let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
+        if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD {
+            // get a ref of the null buffer bytes, to use in testing for nullness
+            let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
+            let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
+            // with nulls, we need to compare item by item whenever it is not null
+            (0..len).all(|i| {
+                let lhs_pos = lhs_start + i;
+                let rhs_pos = rhs_start + i;
 
-            lhs_is_null
-                || (lhs_is_null == rhs_is_null)
-                    && equal_len(
-                        lhs_values,
-                        rhs_values,
-                        lhs_pos * size,
-                        rhs_pos * size,
-                        size, // 1 * size since we are comparing a single entry
-                    )
-        })
+                let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
+                let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
+
+                lhs_is_null
+                    || (lhs_is_null == rhs_is_null)
+                        && equal_len(
+                            lhs_values,
+                            rhs_values,
+                            lhs_pos * size,
+                            rhs_pos * size,
+                            size, // 1 * size since we are comparing a single entry
+                        )
+            })
+        } else {
+            let lhs_slices_iter = BitSliceIterator::new(
+                lhs.null_buffer().as_ref().unwrap(),
+                lhs_start + lhs.offset(),
+                len,
+            );
+            let rhs_slices_iter = BitSliceIterator::new(
+                rhs.null_buffer().as_ref().unwrap(),
+                rhs_start + rhs.offset(),
+                len,
+            );
+
+            lhs_slices_iter.zip(rhs_slices_iter).all(
+                |((l_start, l_end), (r_start, r_end))| {
+                    l_start == r_start
+                        && l_end == r_end
+                        && equal_len(
+                            lhs_values,
+                            rhs_values,
+                            (lhs_start + l_start) * size,
+                            (rhs_start + r_start) * size,
+                            (l_end - l_start) * size,
+                        )
+                },
+            )
+        }
     }
 }
diff --git a/arrow-data/src/equal/primitive.rs b/arrow-data/src/equal/primitive.rs
index e619375d5..f52541e28 100644
--- a/arrow-data/src/equal/primitive.rs
+++ b/arrow-data/src/equal/primitive.rs
@@ -15,13 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::bit_iterator::BitSliceIterator;
+use crate::contains_nulls;
+use arrow_buffer::bit_util::get_bit;
 use std::mem::size_of;
 
-use crate::data::{contains_nulls, ArrayData};
-use arrow_buffer::bit_util::get_bit;
+use crate::data::ArrayData;
 
 use super::utils::equal_len;
 
+pub(crate) const NULL_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.4;
+
 pub(super) fn primitive_equal<T>(
     lhs: &ArrayData,
     rhs: &ArrayData,
@@ -45,25 +49,54 @@ pub(super) fn primitive_equal<T>(
             len * byte_width,
         )
     } else {
-        // get a ref of the null buffer bytes, to use in testing for nullness
-        let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
-        let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
-        // with nulls, we need to compare item by item whenever it is not null
-        (0..len).all(|i| {
-            let lhs_pos = lhs_start + i;
-            let rhs_pos = rhs_start + i;
-            let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
-            let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
+        let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64;
+
+        if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD {
+            // get a ref of the null buffer bytes, to use in testing for nullness
+            let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
+            let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
+            // with nulls, we need to compare item by item whenever it is not null
+            (0..len).all(|i| {
+                let lhs_pos = lhs_start + i;
+                let rhs_pos = rhs_start + i;
+                let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
+                let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
+
+                lhs_is_null
+                    || (lhs_is_null == rhs_is_null)
+                        && equal_len(
+                            lhs_values,
+                            rhs_values,
+                            lhs_pos * byte_width,
+                            rhs_pos * byte_width,
+                            byte_width, // 1 * byte_width since we are comparing a single entry
+                        )
+            })
+        } else {
+            let lhs_slices_iter = BitSliceIterator::new(
+                lhs.null_buffer().as_ref().unwrap(),
+                lhs_start + lhs.offset(),
+                len,
+            );
+            let rhs_slices_iter = BitSliceIterator::new(
+                rhs.null_buffer().as_ref().unwrap(),
+                rhs_start + rhs.offset(),
+                len,
+            );
 
-            lhs_is_null
-                || (lhs_is_null == rhs_is_null)
-                    && equal_len(
-                        lhs_values,
-                        rhs_values,
-                        lhs_pos * byte_width,
-                        rhs_pos * byte_width,
-                        byte_width, // 1 * byte_width since we are comparing a single entry
-                    )
-        })
+            lhs_slices_iter.zip(rhs_slices_iter).all(
+                |((l_start, l_end), (r_start, r_end))| {
+                    l_start == r_start
+                        && l_end == r_end
+                        && equal_len(
+                            lhs_values,
+                            rhs_values,
+                            (lhs_start + l_start) * byte_width,
+                            (rhs_start + r_start) * byte_width,
+                            (l_end - l_start) * byte_width,
+                        )
+                },
+            )
+        }
     }
 }
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index 41d93aefa..fde4b41b0 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -39,7 +39,7 @@ use arrow_schema::*;
 const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8;
 
 /// An iterator of `(usize, usize)` each representing an interval
-/// `[start, end)` whose slots of a [BooleanArray] are true. Each
+/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each
 /// interval corresponds to a contiguous region of memory to be
 /// "taken" from an array to be filtered.
 ///
diff --git a/arrow/benches/equal.rs b/arrow/benches/equal.rs
index f54aff1b5..2f4e2fada 100644
--- a/arrow/benches/equal.rs
+++ b/arrow/benches/equal.rs
@@ -43,6 +43,9 @@ fn add_benchmark(c: &mut Criterion) {
     let arr_a_nulls = create_primitive_array::<Float32Type>(512, 0.5);
     c.bench_function("equal_nulls_512", |b| b.iter(|| bench_equal(&arr_a_nulls)));
 
+    let arr_a = create_primitive_array::<Float32Type>(51200, 0.1);
+    c.bench_function("equal_51200", |b| b.iter(|| bench_equal(&arr_a)));
+
     let arr_a = create_string_array::<i32>(512, 0.0);
     c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a)));