You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/27 21:20:53 UTC
[arrow-rs] branch master updated: Use SlicesIterator for ArrayData Equality (#3198)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new f98581801 Use SlicesIterator for ArrayData Equality (#3198)
f98581801 is described below
commit f985818012e6d0a56fca49487dbc13c516f4613c
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Sun Nov 27 13:20:47 2022 -0800
Use SlicesIterator for ArrayData Equality (#3198)
* Use SlicesIterator for ArrayData Equality
* Use BitSliceIterator
---
arrow-data/src/equal/fixed_binary.rs | 72 ++++++++++++++++++++++++----------
arrow-data/src/equal/primitive.rs | 75 ++++++++++++++++++++++++++----------
arrow-select/src/filter.rs | 2 +-
arrow/benches/equal.rs | 3 ++
4 files changed, 110 insertions(+), 42 deletions(-)
diff --git a/arrow-data/src/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs
index d6af20801..17e470b5c 100644
--- a/arrow-data/src/equal/fixed_binary.rs
+++ b/arrow-data/src/equal/fixed_binary.rs
@@ -15,7 +15,10 @@
// specific language governing permissions and limitations
// under the License.
-use crate::data::{contains_nulls, ArrayData};
+use crate::bit_iterator::BitSliceIterator;
+use crate::contains_nulls;
+use crate::data::ArrayData;
+use crate::equal::primitive::NULL_SLICES_SELECTIVITY_THRESHOLD;
use arrow_buffer::bit_util::get_bit;
use arrow_schema::DataType;
@@ -47,26 +50,55 @@ pub(super) fn fixed_binary_equal(
size * len,
)
} else {
- // get a ref of the null buffer bytes, to use in testing for nullness
- let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
- let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
- // with nulls, we need to compare item by item whenever it is not null
- (0..len).all(|i| {
- let lhs_pos = lhs_start + i;
- let rhs_pos = rhs_start + i;
+ let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64;
- let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
- let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
+ if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD {
+ // get a ref of the null buffer bytes, to use in testing for nullness
+ let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
+ let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
+ // with nulls, we need to compare item by item whenever it is not null
+ (0..len).all(|i| {
+ let lhs_pos = lhs_start + i;
+ let rhs_pos = rhs_start + i;
- lhs_is_null
- || (lhs_is_null == rhs_is_null)
- && equal_len(
- lhs_values,
- rhs_values,
- lhs_pos * size,
- rhs_pos * size,
- size, // 1 * size since we are comparing a single entry
- )
- })
+ let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
+ let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
+
+ lhs_is_null
+ || (lhs_is_null == rhs_is_null)
+ && equal_len(
+ lhs_values,
+ rhs_values,
+ lhs_pos * size,
+ rhs_pos * size,
+ size, // 1 * size since we are comparing a single entry
+ )
+ })
+ } else {
+ let lhs_slices_iter = BitSliceIterator::new(
+ lhs.null_buffer().as_ref().unwrap(),
+ lhs_start + lhs.offset(),
+ len,
+ );
+ let rhs_slices_iter = BitSliceIterator::new(
+ rhs.null_buffer().as_ref().unwrap(),
+ rhs_start + rhs.offset(),
+ len,
+ );
+
+ lhs_slices_iter.zip(rhs_slices_iter).all(
+ |((l_start, l_end), (r_start, r_end))| {
+ l_start == r_start
+ && l_end == r_end
+ && equal_len(
+ lhs_values,
+ rhs_values,
+ (lhs_start + l_start) * size,
+ (rhs_start + r_start) * size,
+ (l_end - l_start) * size,
+ )
+ },
+ )
+ }
}
}
diff --git a/arrow-data/src/equal/primitive.rs b/arrow-data/src/equal/primitive.rs
index e619375d5..f52541e28 100644
--- a/arrow-data/src/equal/primitive.rs
+++ b/arrow-data/src/equal/primitive.rs
@@ -15,13 +15,17 @@
// specific language governing permissions and limitations
// under the License.
+use crate::bit_iterator::BitSliceIterator;
+use crate::contains_nulls;
+use arrow_buffer::bit_util::get_bit;
use std::mem::size_of;
-use crate::data::{contains_nulls, ArrayData};
-use arrow_buffer::bit_util::get_bit;
+use crate::data::ArrayData;
use super::utils::equal_len;
+pub(crate) const NULL_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.4;
+
pub(super) fn primitive_equal<T>(
lhs: &ArrayData,
rhs: &ArrayData,
@@ -45,25 +49,54 @@ pub(super) fn primitive_equal<T>(
len * byte_width,
)
} else {
- // get a ref of the null buffer bytes, to use in testing for nullness
- let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
- let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
- // with nulls, we need to compare item by item whenever it is not null
- (0..len).all(|i| {
- let lhs_pos = lhs_start + i;
- let rhs_pos = rhs_start + i;
- let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
- let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
+ let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64;
+
+ if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD {
+ // get a ref of the null buffer bytes, to use in testing for nullness
+ let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice();
+ let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice();
+ // with nulls, we need to compare item by item whenever it is not null
+ (0..len).all(|i| {
+ let lhs_pos = lhs_start + i;
+ let rhs_pos = rhs_start + i;
+ let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset());
+ let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset());
+
+ lhs_is_null
+ || (lhs_is_null == rhs_is_null)
+ && equal_len(
+ lhs_values,
+ rhs_values,
+ lhs_pos * byte_width,
+ rhs_pos * byte_width,
+ byte_width, // 1 * byte_width since we are comparing a single entry
+ )
+ })
+ } else {
+ let lhs_slices_iter = BitSliceIterator::new(
+ lhs.null_buffer().as_ref().unwrap(),
+ lhs_start + lhs.offset(),
+ len,
+ );
+ let rhs_slices_iter = BitSliceIterator::new(
+ rhs.null_buffer().as_ref().unwrap(),
+ rhs_start + rhs.offset(),
+ len,
+ );
- lhs_is_null
- || (lhs_is_null == rhs_is_null)
- && equal_len(
- lhs_values,
- rhs_values,
- lhs_pos * byte_width,
- rhs_pos * byte_width,
- byte_width, // 1 * byte_width since we are comparing a single entry
- )
- })
+ lhs_slices_iter.zip(rhs_slices_iter).all(
+ |((l_start, l_end), (r_start, r_end))| {
+ l_start == r_start
+ && l_end == r_end
+ && equal_len(
+ lhs_values,
+ rhs_values,
+ (lhs_start + l_start) * byte_width,
+ (rhs_start + r_start) * byte_width,
+ (l_end - l_start) * byte_width,
+ )
+ },
+ )
+ }
}
}
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index 41d93aefa..fde4b41b0 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -39,7 +39,7 @@ use arrow_schema::*;
const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8;
/// An iterator of `(usize, usize)` each representing an interval
-/// `[start, end)` whose slots of a [BooleanArray] are true. Each
+/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each
/// interval corresponds to a contiguous region of memory to be
/// "taken" from an array to be filtered.
///
diff --git a/arrow/benches/equal.rs b/arrow/benches/equal.rs
index f54aff1b5..2f4e2fada 100644
--- a/arrow/benches/equal.rs
+++ b/arrow/benches/equal.rs
@@ -43,6 +43,9 @@ fn add_benchmark(c: &mut Criterion) {
let arr_a_nulls = create_primitive_array::<Float32Type>(512, 0.5);
c.bench_function("equal_nulls_512", |b| b.iter(|| bench_equal(&arr_a_nulls)));
+ let arr_a = create_primitive_array::<Float32Type>(51200, 0.1);
+ c.bench_function("equal_51200", |b| b.iter(|| bench_equal(&arr_a)));
+
let arr_a = create_string_array::<i32>(512, 0.0);
c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a)));