You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/29 18:12:44 UTC
[arrow-rs] branch master updated: Faster StringDictionaryBuilder (#1851) (#1861)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 903b24ac9 Faster StringDictionaryBuilder (#1851) (#1861)
903b24ac9 is described below
commit 903b24ac9e4f85e17e38e97f505501c10762ad9f
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Jun 29 19:12:40 2022 +0100
Faster StringDictionaryBuilder (#1851) (#1861)
---
arrow/Cargo.toml | 2 +
arrow/benches/string_dictionary_builder.rs | 70 ++++++++++
arrow/src/array/builder/generic_list_builder.rs | 10 ++
arrow/src/array/builder/generic_string_builder.rs | 10 ++
arrow/src/array/builder/primitive_builder.rs | 5 +
.../src/array/builder/string_dictionary_builder.rs | 141 ++++++++++++++-------
6 files changed, 195 insertions(+), 43 deletions(-)
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index c0bc514ee..268ef0f94 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -38,6 +38,7 @@ path = "src/lib.rs"
bench = false
[dependencies]
+ahash = { version = "0.7", default-features = false }
serde = { version = "1.0", default-features = false }
serde_derive = { version = "1.0", default-features = false }
serde_json = { version = "1.0", default-features = false, features = ["preserve_order"] }
@@ -45,6 +46,7 @@ indexmap = { version = "1.9", default-features = false, features = ["std"] }
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
num = { version = "0.4", default-features = false, features = ["std"] }
half = { version = "2.0", default-features = false }
+hashbrown = { version = "0.12", default-features = false }
csv_crate = { version = "1.1", default-features = false, optional = true, package="csv" }
regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] }
lazy_static = { version = "1.4", default-features = false }
diff --git a/arrow/benches/string_dictionary_builder.rs b/arrow/benches/string_dictionary_builder.rs
new file mode 100644
index 000000000..bc014bec1
--- /dev/null
+++ b/arrow/benches/string_dictionary_builder.rs
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Int32Builder, StringBuilder, StringDictionaryBuilder};
+use criterion::{criterion_group, criterion_main, Criterion};
+use rand::{thread_rng, Rng};
+
+/// Note: this is best effort, not all keys are necessarily present or unique
+fn build_strings(dict_size: usize, total_size: usize, key_len: usize) -> Vec<String> {
+ let mut rng = thread_rng();
+ let values: Vec<String> = (0..dict_size)
+ .map(|_| (0..key_len).map(|_| rng.gen::<char>()).collect())
+ .collect();
+
+ (0..total_size)
+ .map(|_| values[rng.gen_range(0..dict_size)].clone())
+ .collect()
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+ let mut group = c.benchmark_group("string_dictionary_builder");
+
+ let mut do_bench = |dict_size: usize, total_size: usize, key_len: usize| {
+ group.bench_function(
+ format!(
+ "(dict_size:{}, len:{}, key_len: {})",
+ dict_size, total_size, key_len
+ ),
+ |b| {
+ let strings = build_strings(dict_size, total_size, key_len);
+ b.iter(|| {
+ let keys = Int32Builder::new(strings.len());
+ let values = StringBuilder::new((key_len + 1) * dict_size);
+ let mut builder = StringDictionaryBuilder::new(keys, values);
+
+ for val in &strings {
+ builder.append(val).unwrap();
+ }
+
+ builder.finish();
+ })
+ },
+ );
+ };
+
+ do_bench(20, 1000, 5);
+ do_bench(100, 1000, 5);
+ do_bench(100, 1000, 10);
+ do_bench(100, 10000, 10);
+ do_bench(100, 10000, 100);
+
+ group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/arrow/src/array/builder/generic_list_builder.rs b/arrow/src/array/builder/generic_list_builder.rs
index 1449b5c09..d1333b7bf 100644
--- a/arrow/src/array/builder/generic_list_builder.rs
+++ b/arrow/src/array/builder/generic_list_builder.rs
@@ -107,6 +107,11 @@ where
&mut self.values_builder
}
+ /// Returns the child array builder as an immutable reference
+ pub fn values_ref(&self) -> &T {
+ &self.values_builder
+ }
+
/// Finish the current variable-length list array slot
#[inline]
pub fn append(&mut self, is_valid: bool) -> Result<()> {
@@ -152,6 +157,11 @@ where
GenericListArray::<OffsetSize>::from(array_data)
}
+
+ /// Returns the current offsets buffer as a slice
+ pub fn offsets_slice(&self) -> &[OffsetSize] {
+ self.offsets_builder.as_slice()
+ }
}
#[cfg(test)]
diff --git a/arrow/src/array/builder/generic_string_builder.rs b/arrow/src/array/builder/generic_string_builder.rs
index ee391c4d4..c09ffd66e 100644
--- a/arrow/src/array/builder/generic_string_builder.rs
+++ b/arrow/src/array/builder/generic_string_builder.rs
@@ -87,6 +87,16 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
GenericStringArray::<OffsetSize>::from(self.builder.finish())
}
+
+ /// Returns the current values buffer as a slice
+ pub fn values_slice(&self) -> &[u8] {
+ self.builder.values_ref().values_slice()
+ }
+
+ /// Returns the current offsets buffer as a slice
+ pub fn offsets_slice(&self) -> &[OffsetSize] {
+ self.builder.offsets_slice()
+ }
}
impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSize> {
diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs
index 83c62509c..b18239b75 100644
--- a/arrow/src/array/builder/primitive_builder.rs
+++ b/arrow/src/array/builder/primitive_builder.rs
@@ -230,6 +230,11 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
b.append_n(self.values_builder.len(), true);
self.bitmap_builder = Some(b);
}
+
+ /// Returns the current values buffer as a slice
+ pub fn values_slice(&self) -> &[T::Native] {
+ self.values_builder.as_slice()
+ }
}
#[cfg(test)]
diff --git a/arrow/src/array/builder/string_dictionary_builder.rs b/arrow/src/array/builder/string_dictionary_builder.rs
index d1b872fd3..918bf9756 100644
--- a/arrow/src/array/builder/string_dictionary_builder.rs
+++ b/arrow/src/array/builder/string_dictionary_builder.rs
@@ -15,21 +15,17 @@
// specific language governing permissions and limitations
// under the License.
+use super::PrimitiveBuilder;
+use crate::array::{
+ Array, ArrayBuilder, ArrayRef, DictionaryArray, StringArray, StringBuilder,
+};
+use crate::datatypes::{ArrowDictionaryKeyType, ArrowNativeType};
+use crate::error::{ArrowError, Result};
+use hashbrown::hash_map::RawEntryMut;
+use hashbrown::HashMap;
use std::any::Any;
-use std::collections::HashMap;
use std::sync::Arc;
-use crate::array::array::Array;
-use crate::array::ArrayBuilder;
-use crate::array::ArrayRef;
-use crate::array::ArrowDictionaryKeyType;
-use crate::array::DictionaryArray;
-use crate::array::PrimitiveBuilder;
-use crate::array::StringArray;
-use crate::array::StringBuilder;
-use crate::datatypes::ArrowNativeType;
-use crate::error::{ArrowError, Result};
-
/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices
/// to String values. Note that the use of a `HashMap` here will not scale to very large
/// arrays or result in an ordered dictionary.
@@ -76,9 +72,16 @@ pub struct StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
+ state: ahash::RandomState,
+ /// Used to provide a lookup from string value to key type
+ ///
+ /// Note: K's hash implementation is not used, instead the raw entry
+ /// API is used to store keys w.r.t the hash of the strings themselves
+ ///
+ dedup: HashMap<K::Native, (), ()>,
+
keys_builder: PrimitiveBuilder<K>,
values_builder: StringBuilder,
- map: HashMap<Box<[u8]>, K::Native>,
}
impl<K> StringDictionaryBuilder<K>
@@ -88,9 +91,10 @@ where
/// Creates a new `StringDictionaryBuilder` from a keys builder and a value builder.
pub fn new(keys_builder: PrimitiveBuilder<K>, values_builder: StringBuilder) -> Self {
Self {
+ state: Default::default(),
+ dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()),
keys_builder,
values_builder,
- map: HashMap::new(),
}
}
@@ -122,27 +126,44 @@ where
keys_builder: PrimitiveBuilder<K>,
dictionary_values: &StringArray,
) -> Result<Self> {
+ let state = ahash::RandomState::default();
let dict_len = dictionary_values.len();
- let mut values_builder =
- StringBuilder::with_capacity(dict_len, dictionary_values.value_data().len());
- let mut map: HashMap<Box<[u8]>, K::Native> = HashMap::with_capacity(dict_len);
- for i in 0..dict_len {
- if dictionary_values.is_valid(i) {
- let value = dictionary_values.value(i);
- map.insert(
- value.as_bytes().into(),
- K::Native::from_usize(i)
- .ok_or(ArrowError::DictionaryKeyOverflowError)?,
- );
- values_builder.append_value(value)?;
- } else {
- values_builder.append_null()?;
+
+ let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ());
+
+ let values_len = dictionary_values.value_data().len();
+ let mut values_builder = StringBuilder::with_capacity(dict_len, values_len);
+
+ for (idx, maybe_value) in dictionary_values.iter().enumerate() {
+ match maybe_value {
+ Some(value) => {
+ let hash = compute_hash(&state, value.as_bytes());
+
+ let key = K::Native::from_usize(idx)
+ .ok_or(ArrowError::DictionaryKeyOverflowError)?;
+
+ let entry =
+ dedup.raw_entry_mut().from_hash(hash, |key: &K::Native| {
+ value.as_bytes() == get_bytes(&values_builder, key)
+ });
+
+ if let RawEntryMut::Vacant(v) = entry {
+ v.insert_with_hasher(hash, key, (), |key| {
+ compute_hash(&state, get_bytes(&values_builder, key))
+ });
+ }
+
+ values_builder.append_value(value)?;
+ }
+ None => values_builder.append_null()?,
}
}
+
Ok(Self {
+ state,
+ dedup,
keys_builder,
values_builder,
- map,
})
}
}
@@ -190,19 +211,35 @@ where
/// if already present in the values array or a new index if the
/// value is appended to the values array.
pub fn append(&mut self, value: impl AsRef<str>) -> Result<K::Native> {
- if let Some(&key) = self.map.get(value.as_ref().as_bytes()) {
- // Append existing value.
- self.keys_builder.append_value(key)?;
- Ok(key)
- } else {
- // Append new value.
- let key = K::Native::from_usize(self.values_builder.len())
- .ok_or(ArrowError::DictionaryKeyOverflowError)?;
- self.values_builder.append_value(value.as_ref())?;
- self.keys_builder.append_value(key as K::Native)?;
- self.map.insert(value.as_ref().as_bytes().into(), key);
- Ok(key)
- }
+ let value = value.as_ref();
+
+ let state = &self.state;
+ let storage = &mut self.values_builder;
+ let hash = compute_hash(state, value.as_bytes());
+
+ let entry = self
+ .dedup
+ .raw_entry_mut()
+ .from_hash(hash, |key| value.as_bytes() == get_bytes(storage, key));
+
+ let key = match entry {
+ RawEntryMut::Occupied(entry) => *entry.into_key(),
+ RawEntryMut::Vacant(entry) => {
+ let index = storage.len();
+ storage.append_value(value)?;
+ let key = K::Native::from_usize(index)
+ .ok_or(ArrowError::DictionaryKeyOverflowError)?;
+
+ *entry
+ .insert_with_hasher(hash, key, (), |key| {
+ compute_hash(state, get_bytes(storage, key))
+ })
+ .0
+ }
+ };
+ self.keys_builder.append_value(key)?;
+
+ Ok(key)
}
#[inline]
@@ -212,12 +249,30 @@ where
/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
- self.map.clear();
+ self.dedup.clear();
let value_ref: ArrayRef = Arc::new(self.values_builder.finish());
self.keys_builder.finish_dict(value_ref)
}
}
+fn compute_hash(hasher: &ahash::RandomState, value: &[u8]) -> u64 {
+ use std::hash::{BuildHasher, Hash, Hasher};
+ let mut state = hasher.build_hasher();
+ value.hash(&mut state);
+ state.finish()
+}
+
+fn get_bytes<'a, K: ArrowNativeType>(values: &'a StringBuilder, key: &K) -> &'a [u8] {
+ let offsets = values.offsets_slice();
+ let values = values.values_slice();
+
+ let idx = key.to_usize().unwrap();
+ let end_offset = offsets[idx + 1].to_usize().unwrap();
+ let start_offset = offsets[idx].to_usize().unwrap();
+
+ &values[start_offset..end_offset]
+}
+
#[cfg(test)]
mod tests {
use super::*;