You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/31 17:07:53 UTC

[arrow-rs] branch master updated: Reduce Dictionary Builder Codegen (#3616)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e80d87fbf Reduce Dictionary Builder Codegen (#3616)
e80d87fbf is described below

commit e80d87fbf7fb36fe415ae6927c5550439a85d937
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Jan 31 17:07:47 2023 +0000

    Reduce Dictionary Builder Codegen (#3616)
    
    * Reduce dictionary builder codegen
    
    * Clippy
    
    * Format
---
 .../builder/generic_bytes_dictionary_builder.rs    | 48 ++++++++++------------
 .../src/builder/primitive_dictionary_builder.rs    | 11 ++---
 arrow-buffer/src/native.rs                         | 28 +++++++++++--
 3 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
index 5af41a519..dd9a70b1d 100644
--- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
@@ -40,10 +40,10 @@ where
     state: ahash::RandomState,
     /// Used to provide a lookup from string value to key type
     ///
-    /// Note: K's hash implementation is not used, instead the raw entry
+    /// Note: usize's hash implementation is not used, instead the raw entry
     /// API is used to store keys w.r.t the hash of the strings themselves
     ///
-    dedup: HashMap<K::Native, (), ()>,
+    dedup: HashMap<usize, (), ()>,
 
     keys_builder: PrimitiveBuilder<K>,
     values_builder: GenericByteBuilder<T>,
@@ -133,23 +133,22 @@ where
         let mut values_builder =
             GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
 
+        K::Native::from_usize(dictionary_values.len())
+            .ok_or(ArrowError::DictionaryKeyOverflowError)?;
+
         for (idx, maybe_value) in dictionary_values.iter().enumerate() {
             match maybe_value {
                 Some(value) => {
                     let value_bytes: &[u8] = value.as_ref();
                     let hash = state.hash_one(value_bytes);
 
-                    let key = K::Native::from_usize(idx)
-                        .ok_or(ArrowError::DictionaryKeyOverflowError)?;
-
-                    let entry =
-                        dedup.raw_entry_mut().from_hash(hash, |key: &K::Native| {
-                            value_bytes == get_bytes(&values_builder, key)
-                        });
+                    let entry = dedup.raw_entry_mut().from_hash(hash, |idx: &usize| {
+                        value_bytes == get_bytes(&values_builder, *idx)
+                    });
 
                     if let RawEntryMut::Vacant(v) = entry {
-                        v.insert_with_hasher(hash, key, (), |key| {
-                            state.hash_one(get_bytes(&values_builder, key))
+                        v.insert_with_hasher(hash, idx, (), |idx| {
+                            state.hash_one(get_bytes(&values_builder, *idx))
                         });
                     }
 
@@ -233,21 +232,20 @@ where
         let entry = self
             .dedup
             .raw_entry_mut()
-            .from_hash(hash, |key| value_bytes == get_bytes(storage, key));
+            .from_hash(hash, |idx| value_bytes == get_bytes(storage, *idx));
 
         let key = match entry {
-            RawEntryMut::Occupied(entry) => *entry.into_key(),
+            RawEntryMut::Occupied(entry) => K::Native::usize_as(*entry.into_key()),
             RawEntryMut::Vacant(entry) => {
-                let index = storage.len();
+                let idx = storage.len();
                 storage.append_value(value);
-                let key = K::Native::from_usize(index)
-                    .ok_or(ArrowError::DictionaryKeyOverflowError)?;
-
-                *entry
-                    .insert_with_hasher(hash, key, (), |key| {
-                        state.hash_one(get_bytes(storage, key))
-                    })
-                    .0
+
+                entry.insert_with_hasher(hash, idx, (), |idx| {
+                    state.hash_one(get_bytes(storage, *idx))
+                });
+
+                K::Native::from_usize(idx)
+                    .ok_or(ArrowError::DictionaryKeyOverflowError)?
             }
         };
         self.keys_builder.append_value(key);
@@ -330,14 +328,10 @@ impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Op
     }
 }
 
-fn get_bytes<'a, K: ArrowNativeType, T: ByteArrayType>(
-    values: &'a GenericByteBuilder<T>,
-    key: &K,
-) -> &'a [u8] {
+fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
     let offsets = values.offsets_slice();
     let values = values.values_slice();
 
-    let idx = key.as_usize();
     let end_offset = offsets[idx + 1].as_usize();
     let start_offset = offsets[idx].as_usize();
 
diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs
index f44f0e306..00187cdde 100644
--- a/arrow-array/src/builder/primitive_dictionary_builder.rs
+++ b/arrow-array/src/builder/primitive_dictionary_builder.rs
@@ -86,7 +86,7 @@ where
 {
     keys_builder: PrimitiveBuilder<K>,
     values_builder: PrimitiveBuilder<V>,
-    map: HashMap<Value<V::Native>, K::Native>,
+    map: HashMap<Value<V::Native>, usize>,
 }
 
 impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
@@ -180,13 +180,13 @@ where
         let key = match self.map.entry(Value(value)) {
             Entry::Vacant(vacant) => {
                 // Append new value.
-                let key = K::Native::from_usize(self.values_builder.len())
-                    .ok_or(ArrowError::DictionaryKeyOverflowError)?;
+                let key = self.values_builder.len();
                 self.values_builder.append_value(value);
                 vacant.insert(key);
-                key
+                K::Native::from_usize(key)
+                    .ok_or(ArrowError::DictionaryKeyOverflowError)?
             }
-            Entry::Occupied(o) => *o.get(),
+            Entry::Occupied(o) => K::Native::usize_as(*o.get()),
         };
 
         self.keys_builder.append_value(key);
@@ -198,6 +198,7 @@ where
     /// # Panics
     ///
     /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
+    #[inline]
     pub fn append_value(&mut self, value: V::Native) {
         self.append(value).expect("dictionary key overflow");
     }
diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs
index 6ac11a16f..4ea06974b 100644
--- a/arrow-buffer/src/native.rs
+++ b/arrow-buffer/src/native.rs
@@ -58,6 +58,11 @@ pub trait ArrowNativeType:
     /// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
     fn as_usize(self) -> usize;
 
+    /// Convert from usize according to the [`as`] operator
+    ///
+    /// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
+    fn usize_as(i: usize) -> Self;
+
     /// Convert native type to usize.
     ///
     /// Returns `None` if [`Self`] is not an integer or conversion would result
@@ -119,6 +124,12 @@ macro_rules! native_integer {
                 self as _
             }
 
+            #[inline]
+            fn usize_as(i: usize) -> Self {
+                i as _
+            }
+
+
             $(
                 #[inline]
                 fn $from(v: $t) -> Option<Self> {
@@ -140,7 +151,7 @@ native_integer!(u32);
 native_integer!(u64);
 
 macro_rules! native_float {
-    ($t:ty, $s:ident, $as_usize: expr) => {
+    ($t:ty, $s:ident, $as_usize: expr, $i:ident, $usize_as: expr) => {
         impl private::Sealed for $t {}
         impl ArrowNativeType for $t {
             #[inline]
@@ -162,13 +173,18 @@ macro_rules! native_float {
             fn as_usize($s) -> usize {
                 $as_usize
             }
+
+            #[inline]
+            fn usize_as($i: usize) -> Self {
+                $usize_as
+            }
         }
     };
 }
 
-native_float!(f16, self, self.to_f32() as _);
-native_float!(f32, self, self as _);
-native_float!(f64, self, self as _);
+native_float!(f16, self, self.to_f32() as _, i, f16::from_f32(i as _));
+native_float!(f32, self, self as _, i, i as _);
+native_float!(f64, self, self as _, i, i as _);
 
 impl private::Sealed for i256 {}
 impl ArrowNativeType for i256 {
@@ -180,6 +196,10 @@ impl ArrowNativeType for i256 {
         self.to_parts().0 as usize
     }
 
+    fn usize_as(i: usize) -> Self {
+        Self::from_parts(i as u128, 0)
+    }
+
     fn to_usize(self) -> Option<usize> {
         let (low, high) = self.to_parts();
         if high != 0 {