You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/06/30 11:45:21 UTC

[arrow-rs] branch master updated: Add DictionaryArray::occupancy (#4415)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 3354a4c5d Add DictionaryArray::occupancy (#4415)
3354a4c5d is described below

commit 3354a4c5d5b5b6fd3ec55c49fd2f0930b935d07e
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri Jun 30 12:45:16 2023 +0100

    Add DictionaryArray::occupancy (#4415)
---
 arrow-array/Cargo.toml                    |  5 +++
 arrow-array/benches/occupancy.rs          | 57 +++++++++++++++++++++++++++++++
 arrow-array/src/array/dictionary_array.rs | 48 +++++++++++++++++++++++++-
 3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml
index f2703bb6f..d4f0f9fa0 100644
--- a/arrow-array/Cargo.toml
+++ b/arrow-array/Cargo.toml
@@ -56,5 +56,10 @@ simd = ["packed_simd"]
 
 [dev-dependencies]
 rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
+criterion = { version = "0.5", default-features = false }
 
 [build-dependencies]
+
+[[bench]]
+name = "occupancy"
+harness = false
diff --git a/arrow-array/benches/occupancy.rs b/arrow-array/benches/occupancy.rs
new file mode 100644
index 000000000..ed4b94351
--- /dev/null
+++ b/arrow-array/benches/occupancy.rs
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::types::Int32Type;
+use arrow_array::{DictionaryArray, Int32Array};
+use arrow_buffer::NullBuffer;
+use criterion::*;
+use rand::{thread_rng, Rng};
+use std::sync::Arc;
+
+fn gen_dict(
+    len: usize,
+    values_len: usize,
+    occupancy: f64,
+    null_percent: f64,
+) -> DictionaryArray<Int32Type> {
+    let mut rng = thread_rng();
+    let values = Int32Array::from(vec![0; values_len]);
+    let max_key = (values_len as f64 * occupancy) as i32;
+    let keys = (0..len).map(|_| rng.gen_range(0..max_key)).collect();
+    let nulls = (0..len).map(|_| !rng.gen_bool(null_percent)).collect();
+
+    let keys = Int32Array::new(keys, Some(NullBuffer::new(nulls)));
+    DictionaryArray::new(keys, Arc::new(values))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for values in [10, 100, 512] {
+        for occupancy in [1., 0.5, 0.1] {
+            for null_percent in [0.0, 0.1, 0.5, 0.9] {
+                let dict = gen_dict(1024, values, occupancy, null_percent);
+                c.bench_function(&format!("occupancy(values: {values}, occupancy: {occupancy}, null_percent: {null_percent})"), |b| {
+                    b.iter(|| {
+                        black_box(&dict).occupancy()
+                    });
+                });
+            }
+        }
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs
index b9112d103..5a2f439a8 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -23,8 +23,9 @@ use crate::{
     make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType,
     PrimitiveArray, StringArray,
 };
+use arrow_buffer::bit_util::set_bit;
 use arrow_buffer::buffer::NullBuffer;
-use arrow_buffer::ArrowNativeType;
+use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder};
 use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType};
 use std::any::Any;
@@ -549,6 +550,29 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
             .for_each(|v| *v = op(*v));
         Ok(builder.finish())
     }
+
+    /// Computes an occupancy mask for this dictionary's values
+    ///
+    /// For each value in [`Self::values`] the corresponding bit will be set in the
+    /// returned mask if it is referenced by a key in this [`DictionaryArray`]
+    pub fn occupancy(&self) -> BooleanBuffer {
+        let len = self.values.len();
+        let mut builder = BooleanBufferBuilder::new(len);
+        builder.resize(len);
+        let slice = builder.as_slice_mut();
+        match self.keys.nulls().filter(|n| n.null_count() > 0) {
+            Some(n) => {
+                let v = self.keys.values();
+                n.valid_indices()
+                    .for_each(|idx| set_bit(slice, v[idx].as_usize()))
+            }
+            None => {
+                let v = self.keys.values();
+                v.iter().for_each(|v| set_bit(slice, v.as_usize()))
+            }
+        }
+        builder.finish()
+    }
 }
 
 /// Constructs a `DictionaryArray` from an array data reference.
@@ -1207,4 +1231,26 @@ mod tests {
         let expected = DictionaryArray::new(keys, Arc::new(values));
         assert_eq!(expected, returned);
     }
+
+    #[test]
+    fn test_occupancy() {
+        let keys = Int32Array::new((100..200).collect(), None);
+        let values = Int32Array::from(vec![0; 1024]);
+        let dict = DictionaryArray::new(keys, Arc::new(values));
+        for (idx, v) in dict.occupancy().iter().enumerate() {
+            let expected = (100..200).contains(&idx);
+            assert_eq!(v, expected, "{idx}");
+        }
+
+        let keys = Int32Array::new(
+            (0..100).collect(),
+            Some((0..100).map(|x| x % 4 == 0).collect()),
+        );
+        let values = Int32Array::from(vec![0; 1024]);
+        let dict = DictionaryArray::new(keys, Arc::new(values));
+        for (idx, v) in dict.occupancy().iter().enumerate() {
+            let expected = idx % 4 == 0 && idx < 100;
+            assert_eq!(v, expected, "{idx}");
+        }
+    }
 }