You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/06/30 11:45:21 UTC
[arrow-rs] branch master updated: Add DictionaryArray::occupancy (#4415)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 3354a4c5d Add DictionaryArray::occupancy (#4415)
3354a4c5d is described below
commit 3354a4c5d5b5b6fd3ec55c49fd2f0930b935d07e
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri Jun 30 12:45:16 2023 +0100
Add DictionaryArray::occupancy (#4415)
---
arrow-array/Cargo.toml | 5 +++
arrow-array/benches/occupancy.rs | 57 +++++++++++++++++++++++++++++++
arrow-array/src/array/dictionary_array.rs | 48 +++++++++++++++++++++++++-
3 files changed, 109 insertions(+), 1 deletion(-)
diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml
index f2703bb6f..d4f0f9fa0 100644
--- a/arrow-array/Cargo.toml
+++ b/arrow-array/Cargo.toml
@@ -56,5 +56,10 @@ simd = ["packed_simd"]
[dev-dependencies]
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
+criterion = { version = "0.5", default-features = false }
[build-dependencies]
+
+[[bench]]
+name = "occupancy"
+harness = false
diff --git a/arrow-array/benches/occupancy.rs b/arrow-array/benches/occupancy.rs
new file mode 100644
index 000000000..ed4b94351
--- /dev/null
+++ b/arrow-array/benches/occupancy.rs
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::types::Int32Type;
+use arrow_array::{DictionaryArray, Int32Array};
+use arrow_buffer::NullBuffer;
+use criterion::*;
+use rand::{thread_rng, Rng};
+use std::sync::Arc;
+
+fn gen_dict(
+ len: usize,
+ values_len: usize,
+ occupancy: f64,
+ null_percent: f64,
+) -> DictionaryArray<Int32Type> {
+ let mut rng = thread_rng();
+ let values = Int32Array::from(vec![0; values_len]);
+ let max_key = (values_len as f64 * occupancy) as i32;
+ let keys = (0..len).map(|_| rng.gen_range(0..max_key)).collect();
+ let nulls = (0..len).map(|_| !rng.gen_bool(null_percent)).collect();
+
+ let keys = Int32Array::new(keys, Some(NullBuffer::new(nulls)));
+ DictionaryArray::new(keys, Arc::new(values))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+ for values in [10, 100, 512] {
+ for occupancy in [1., 0.5, 0.1] {
+ for null_percent in [0.0, 0.1, 0.5, 0.9] {
+ let dict = gen_dict(1024, values, occupancy, null_percent);
+ c.bench_function(&format!("occupancy(values: {values}, occupancy: {occupancy}, null_percent: {null_percent})"), |b| {
+ b.iter(|| {
+ black_box(&dict).occupancy()
+ });
+ });
+ }
+ }
+ }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs
index b9112d103..5a2f439a8 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -23,8 +23,9 @@ use crate::{
make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType,
PrimitiveArray, StringArray,
};
+use arrow_buffer::bit_util::set_bit;
use arrow_buffer::buffer::NullBuffer;
-use arrow_buffer::ArrowNativeType;
+use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
@@ -549,6 +550,29 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
.for_each(|v| *v = op(*v));
Ok(builder.finish())
}
+
+ /// Computes an occupancy mask for this dictionary's values
+ ///
+ /// For each value in [`Self::values`] the corresponding bit will be set in the
+ /// returned mask if it is referenced by a key in this [`DictionaryArray`]
+ pub fn occupancy(&self) -> BooleanBuffer {
+ let len = self.values.len();
+ let mut builder = BooleanBufferBuilder::new(len);
+ builder.resize(len);
+ let slice = builder.as_slice_mut();
+ match self.keys.nulls().filter(|n| n.null_count() > 0) {
+ Some(n) => {
+ let v = self.keys.values();
+ n.valid_indices()
+ .for_each(|idx| set_bit(slice, v[idx].as_usize()))
+ }
+ None => {
+ let v = self.keys.values();
+ v.iter().for_each(|v| set_bit(slice, v.as_usize()))
+ }
+ }
+ builder.finish()
+ }
}
/// Constructs a `DictionaryArray` from an array data reference.
@@ -1207,4 +1231,26 @@ mod tests {
let expected = DictionaryArray::new(keys, Arc::new(values));
assert_eq!(expected, returned);
}
+
+ #[test]
+ fn test_occupancy() {
+ let keys = Int32Array::new((100..200).collect(), None);
+ let values = Int32Array::from(vec![0; 1024]);
+ let dict = DictionaryArray::new(keys, Arc::new(values));
+ for (idx, v) in dict.occupancy().iter().enumerate() {
+ let expected = (100..200).contains(&idx);
+ assert_eq!(v, expected, "{idx}");
+ }
+
+ let keys = Int32Array::new(
+ (0..100).collect(),
+ Some((0..100).map(|x| x % 4 == 0).collect()),
+ );
+ let values = Int32Array::from(vec![0; 1024]);
+ let dict = DictionaryArray::new(keys, Arc::new(values));
+ for (idx, v) in dict.occupancy().iter().enumerate() {
+ let expected = idx % 4 == 0 && idx < 100;
+ assert_eq!(v, expected, "{idx}");
+ }
+ }
}