You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/31 17:07:53 UTC
[arrow-rs] branch master updated: Reduce Dictionary Builder Codegen (#3616)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e80d87fbf Reduce Dictionary Builder Codegen (#3616)
e80d87fbf is described below
commit e80d87fbf7fb36fe415ae6927c5550439a85d937
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Jan 31 17:07:47 2023 +0000
Reduce Dictionary Builder Codegen (#3616)
* Reduce dictionary builder codegen
* Clippy
* Format
---
.../builder/generic_bytes_dictionary_builder.rs | 48 ++++++++++------------
.../src/builder/primitive_dictionary_builder.rs | 11 ++---
arrow-buffer/src/native.rs | 28 +++++++++++--
3 files changed, 51 insertions(+), 36 deletions(-)
diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
index 5af41a519..dd9a70b1d 100644
--- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
@@ -40,10 +40,10 @@ where
state: ahash::RandomState,
/// Used to provide a lookup from string value to key type
///
- /// Note: K's hash implementation is not used, instead the raw entry
+ /// Note: usize's hash implementation is not used, instead the raw entry
/// API is used to store keys w.r.t the hash of the strings themselves
///
- dedup: HashMap<K::Native, (), ()>,
+ dedup: HashMap<usize, (), ()>,
keys_builder: PrimitiveBuilder<K>,
values_builder: GenericByteBuilder<T>,
@@ -133,23 +133,22 @@ where
let mut values_builder =
GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
+ K::Native::from_usize(dictionary_values.len())
+ .ok_or(ArrowError::DictionaryKeyOverflowError)?;
+
for (idx, maybe_value) in dictionary_values.iter().enumerate() {
match maybe_value {
Some(value) => {
let value_bytes: &[u8] = value.as_ref();
let hash = state.hash_one(value_bytes);
- let key = K::Native::from_usize(idx)
- .ok_or(ArrowError::DictionaryKeyOverflowError)?;
-
- let entry =
- dedup.raw_entry_mut().from_hash(hash, |key: &K::Native| {
- value_bytes == get_bytes(&values_builder, key)
- });
+ let entry = dedup.raw_entry_mut().from_hash(hash, |idx: &usize| {
+ value_bytes == get_bytes(&values_builder, *idx)
+ });
if let RawEntryMut::Vacant(v) = entry {
- v.insert_with_hasher(hash, key, (), |key| {
- state.hash_one(get_bytes(&values_builder, key))
+ v.insert_with_hasher(hash, idx, (), |idx| {
+ state.hash_one(get_bytes(&values_builder, *idx))
});
}
@@ -233,21 +232,20 @@ where
let entry = self
.dedup
.raw_entry_mut()
- .from_hash(hash, |key| value_bytes == get_bytes(storage, key));
+ .from_hash(hash, |idx| value_bytes == get_bytes(storage, *idx));
let key = match entry {
- RawEntryMut::Occupied(entry) => *entry.into_key(),
+ RawEntryMut::Occupied(entry) => K::Native::usize_as(*entry.into_key()),
RawEntryMut::Vacant(entry) => {
- let index = storage.len();
+ let idx = storage.len();
storage.append_value(value);
- let key = K::Native::from_usize(index)
- .ok_or(ArrowError::DictionaryKeyOverflowError)?;
-
- *entry
- .insert_with_hasher(hash, key, (), |key| {
- state.hash_one(get_bytes(storage, key))
- })
- .0
+
+ entry.insert_with_hasher(hash, idx, (), |idx| {
+ state.hash_one(get_bytes(storage, *idx))
+ });
+
+ K::Native::from_usize(idx)
+ .ok_or(ArrowError::DictionaryKeyOverflowError)?
}
};
self.keys_builder.append_value(key);
@@ -330,14 +328,10 @@ impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Op
}
}
-fn get_bytes<'a, K: ArrowNativeType, T: ByteArrayType>(
- values: &'a GenericByteBuilder<T>,
- key: &K,
-) -> &'a [u8] {
+fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
let offsets = values.offsets_slice();
let values = values.values_slice();
- let idx = key.as_usize();
let end_offset = offsets[idx + 1].as_usize();
let start_offset = offsets[idx].as_usize();
diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs
index f44f0e306..00187cdde 100644
--- a/arrow-array/src/builder/primitive_dictionary_builder.rs
+++ b/arrow-array/src/builder/primitive_dictionary_builder.rs
@@ -86,7 +86,7 @@ where
{
keys_builder: PrimitiveBuilder<K>,
values_builder: PrimitiveBuilder<V>,
- map: HashMap<Value<V::Native>, K::Native>,
+ map: HashMap<Value<V::Native>, usize>,
}
impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
@@ -180,13 +180,13 @@ where
let key = match self.map.entry(Value(value)) {
Entry::Vacant(vacant) => {
// Append new value.
- let key = K::Native::from_usize(self.values_builder.len())
- .ok_or(ArrowError::DictionaryKeyOverflowError)?;
+ let key = self.values_builder.len();
self.values_builder.append_value(value);
vacant.insert(key);
- key
+ K::Native::from_usize(key)
+ .ok_or(ArrowError::DictionaryKeyOverflowError)?
}
- Entry::Occupied(o) => *o.get(),
+ Entry::Occupied(o) => K::Native::usize_as(*o.get()),
};
self.keys_builder.append_value(key);
@@ -198,6 +198,7 @@ where
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
+ #[inline]
pub fn append_value(&mut self, value: V::Native) {
self.append(value).expect("dictionary key overflow");
}
diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs
index 6ac11a16f..4ea06974b 100644
--- a/arrow-buffer/src/native.rs
+++ b/arrow-buffer/src/native.rs
@@ -58,6 +58,11 @@ pub trait ArrowNativeType:
/// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
fn as_usize(self) -> usize;
+ /// Convert from usize according to the [`as`] operator
+ ///
+ /// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
+ fn usize_as(i: usize) -> Self;
+
/// Convert native type to usize.
///
/// Returns `None` if [`Self`] is not an integer or conversion would result
@@ -119,6 +124,12 @@ macro_rules! native_integer {
self as _
}
+ #[inline]
+ fn usize_as(i: usize) -> Self {
+ i as _
+ }
+
+
$(
#[inline]
fn $from(v: $t) -> Option<Self> {
@@ -140,7 +151,7 @@ native_integer!(u32);
native_integer!(u64);
macro_rules! native_float {
- ($t:ty, $s:ident, $as_usize: expr) => {
+ ($t:ty, $s:ident, $as_usize: expr, $i:ident, $usize_as: expr) => {
impl private::Sealed for $t {}
impl ArrowNativeType for $t {
#[inline]
@@ -162,13 +173,18 @@ macro_rules! native_float {
fn as_usize($s) -> usize {
$as_usize
}
+
+ #[inline]
+ fn usize_as($i: usize) -> Self {
+ $usize_as
+ }
}
};
}
-native_float!(f16, self, self.to_f32() as _);
-native_float!(f32, self, self as _);
-native_float!(f64, self, self as _);
+native_float!(f16, self, self.to_f32() as _, i, f16::from_f32(i as _));
+native_float!(f32, self, self as _, i, i as _);
+native_float!(f64, self, self as _, i, i as _);
impl private::Sealed for i256 {}
impl ArrowNativeType for i256 {
@@ -180,6 +196,10 @@ impl ArrowNativeType for i256 {
self.to_parts().0 as usize
}
+ fn usize_as(i: usize) -> Self {
+ Self::from_parts(i as u128, 0)
+ }
+
fn to_usize(self) -> Option<usize> {
let (low, high) = self.to_parts();
if high != 0 {