You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/06/15 13:18:49 UTC

[arrow-rs] branch master updated: Mark typed buffer APIs safe (#996) (#1027) (#1866)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b10d58b56 Mark typed buffer APIs safe (#996) (#1027) (#1866)
b10d58b56 is described below

commit b10d58b56a862fe318501c2a2bf44833d57e4050
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Jun 15 14:18:45 2022 +0100

    Mark typed buffer APIs safe (#996) (#1027) (#1866)
    
    * Mark typed buffer APIs safe (#996) (#1027)
    
    * Fix parquet
    
    * Format
    
    * Review feedback
---
 arrow/src/array/array_union.rs                     |  8 +++----
 arrow/src/array/builder.rs                         |  4 ++--
 arrow/src/array/data.rs                            |  5 ++---
 arrow/src/buffer/immutable.rs                      | 26 +++++++++-------------
 arrow/src/buffer/mutable.rs                        | 15 ++++++-------
 arrow/src/buffer/ops.rs                            |  4 +---
 arrow/src/compute/kernels/cast.rs                  |  2 +-
 arrow/src/compute/kernels/sort.rs                  |  6 ++---
 arrow/src/compute/kernels/take.rs                  |  3 +--
 arrow/src/datatypes/native.rs                      |  3 ++-
 parquet/src/arrow/array_reader/byte_array.rs       |  4 ++--
 .../arrow/array_reader/byte_array_dictionary.rs    |  6 ++---
 parquet/src/arrow/array_reader/mod.rs              |  8 +++----
 parquet/src/arrow/arrow_writer/mod.rs              |  2 +-
 parquet/src/arrow/buffer/dictionary_buffer.rs      |  2 +-
 parquet/src/arrow/record_reader/buffer.rs          |  4 ++--
 parquet/src/arrow/record_reader/mod.rs             |  4 ++--
 17 files changed, 47 insertions(+), 59 deletions(-)

diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs
index bae771bb9..4ff0a31c6 100644
--- a/arrow/src/array/array_union.rs
+++ b/arrow/src/array/array_union.rs
@@ -185,7 +185,7 @@ impl UnionArray {
         }
 
         // Check the type_ids
-        let type_id_slice: &[i8] = unsafe { type_ids.typed_data() };
+        let type_id_slice: &[i8] = type_ids.typed_data();
         let invalid_type_ids = type_id_slice
             .iter()
             .filter(|i| *i < &0)
@@ -201,7 +201,7 @@ impl UnionArray {
         // Check the value offsets if provided
         if let Some(offset_buffer) = &value_offsets {
             let max_len = type_ids.len() as i32;
-            let offsets_slice: &[i32] = unsafe { offset_buffer.typed_data() };
+            let offsets_slice: &[i32] = offset_buffer.typed_data();
             let invalid_offsets = offsets_slice
                 .iter()
                 .filter(|i| *i < &0 || *i > &max_len)
@@ -255,9 +255,7 @@ impl UnionArray {
     pub fn value_offset(&self, index: usize) -> i32 {
         assert!(index - self.offset() < self.len());
         if self.is_dense() {
-            // safety: reinterpreting is safe since the offset buffer contains `i32` values and is
-            // properly aligned.
-            unsafe { self.data().buffers()[1].typed_data::<i32>()[index] }
+            self.data().buffers()[1].typed_data::<i32>()[index]
         } else {
             index as i32
         }
diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs
index ed26d3c2f..2df4aecf6 100644
--- a/arrow/src/array/builder.rs
+++ b/arrow/src/array/builder.rs
@@ -76,7 +76,7 @@ pub(crate) fn builder_to_mutable_buffer<T: ArrowNativeType>(
 /// builder.append(45);
 /// let buffer = builder.finish();
 ///
-/// assert_eq!(unsafe { buffer.typed_data::<u8>() }, &[42, 43, 44, 45]);
+/// assert_eq!(buffer.typed_data::<u8>(), &[42, 43, 44, 45]);
 /// # Ok(())
 /// # }
 /// ```
@@ -380,7 +380,7 @@ impl<T: ArrowNativeType> BufferBuilder<T> {
     ///
     /// let buffer = builder.finish();
     ///
-    /// assert_eq!(unsafe { buffer.typed_data::<u8>() }, &[42, 44, 46]);
+    /// assert_eq!(buffer.typed_data::<u8>(), &[42, 44, 46]);
     /// ```
     #[inline]
     pub fn finish(&mut self) -> Buffer {
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 0ccbe6a70..65fbc4df9 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -767,8 +767,7 @@ impl ArrayData {
             )));
         }
 
-        // SAFETY: Bounds checked above
-        Ok(unsafe { &(buffer.typed_data::<T>()[self.offset..self.offset + len]) })
+        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
     }
 
     /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
@@ -1161,7 +1160,7 @@ impl ArrayData {
 
         // Justification: buffer size was validated above
         let indexes: &[T] =
-            unsafe { &(buffer.typed_data::<T>()[self.offset..self.offset + self.len]) };
+            &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
 
         indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
             // Do not check the value is null (value can be arbitrary)
diff --git a/arrow/src/buffer/immutable.rs b/arrow/src/buffer/immutable.rs
index c34ea101b..f5d59c5ed 100644
--- a/arrow/src/buffer/immutable.rs
+++ b/arrow/src/buffer/immutable.rs
@@ -181,19 +181,15 @@ impl Buffer {
 
     /// View buffer as typed slice.
     ///
-    /// # Safety
+    /// # Panics
     ///
-    /// `ArrowNativeType` is public so that it can be used as a trait bound for other public
-    /// components, such as the `ToByteSlice` trait.  However, this means that it can be
-    /// implemented by user defined types, which it is not intended for.
-    pub unsafe fn typed_data<T: ArrowNativeType + num::Num>(&self) -> &[T] {
-        // JUSTIFICATION
-        //  Benefit
-        //      Many of the buffers represent specific types, and consumers of `Buffer` often need to re-interpret them.
-        //  Soundness
-        //      * The pointer is non-null by construction
-        //      * alignment asserted below.
-        let (prefix, offsets, suffix) = self.as_slice().align_to::<T>();
+    /// This function panics if the underlying buffer is not aligned
+    /// correctly for type `T`.
+    pub fn typed_data<T: ArrowNativeType>(&self) -> &[T] {
+        // SAFETY
+        // ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect
+        // implementation outside this crate, and this method checks alignment
+        let (prefix, offsets, suffix) = unsafe { self.as_slice().align_to::<T>() };
         assert!(prefix.is_empty() && suffix.is_empty());
         offsets
     }
@@ -451,7 +447,7 @@ mod tests {
     macro_rules! check_as_typed_data {
         ($input: expr, $native_t: ty) => {{
             let buffer = Buffer::from_slice_ref($input);
-            let slice: &[$native_t] = unsafe { buffer.typed_data::<$native_t>() };
+            let slice: &[$native_t] = buffer.typed_data::<$native_t>();
             assert_eq!($input, slice);
         }};
     }
@@ -573,12 +569,12 @@ mod tests {
             )
         };
 
-        let slice = unsafe { buffer.typed_data::<i32>() };
+        let slice = buffer.typed_data::<i32>();
         assert_eq!(slice, &[1, 2, 3, 4, 5]);
 
         let buffer = buffer.slice(std::mem::size_of::<i32>());
 
-        let slice = unsafe { buffer.typed_data::<i32>() };
+        let slice = buffer.typed_data::<i32>();
         assert_eq!(slice, &[2, 3, 4, 5]);
     }
 }
diff --git a/arrow/src/buffer/mutable.rs b/arrow/src/buffer/mutable.rs
index ef3e35209..5710a97f3 100644
--- a/arrow/src/buffer/mutable.rs
+++ b/arrow/src/buffer/mutable.rs
@@ -290,17 +290,16 @@ impl MutableBuffer {
 
     /// View this buffer asa slice of a specific type.
     ///
-    /// # Safety
-    ///
-    /// This function must only be used with buffers which are treated
-    /// as type `T` (e.g.  extended with items of type `T`).
-    ///
     /// # Panics
     ///
     /// This function panics if the underlying buffer is not aligned
     /// correctly for type `T`.
-    pub unsafe fn typed_data_mut<T: ArrowNativeType>(&mut self) -> &mut [T] {
-        let (prefix, offsets, suffix) = self.as_slice_mut().align_to_mut::<T>();
+    pub fn typed_data_mut<T: ArrowNativeType>(&mut self) -> &mut [T] {
+        // SAFETY
+        // ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect
+        // implementation outside this crate, and this method checks alignment
+        let (prefix, offsets, suffix) =
+            unsafe { self.as_slice_mut().align_to_mut::<T>() };
         assert!(prefix.is_empty() && suffix.is_empty());
         offsets
     }
@@ -314,7 +313,7 @@ impl MutableBuffer {
     /// assert_eq!(buffer.len(), 8) // u32 has 4 bytes
     /// ```
     #[inline]
-    pub fn extend_from_slice<T: ToByteSlice>(&mut self, items: &[T]) {
+    pub fn extend_from_slice<T: ArrowNativeType>(&mut self, items: &[T]) {
         let len = items.len();
         let additional = len * std::mem::size_of::<T>();
         self.reserve(additional);
diff --git a/arrow/src/buffer/ops.rs b/arrow/src/buffer/ops.rs
index b3571d174..ea155c8d7 100644
--- a/arrow/src/buffer/ops.rs
+++ b/arrow/src/buffer/ops.rs
@@ -68,9 +68,7 @@ where
 
     let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);
 
-    // Safety: buffer is always treated as type `u64` in the code
-    // below.
-    let result_chunks = unsafe { result.typed_data_mut::<u64>().iter_mut() };
+    let result_chunks = result.typed_data_mut::<u64>().iter_mut();
 
     result_chunks
         .zip(left_chunks.iter())
diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs
index 93a8ebcb6..9a4638d97 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -2084,7 +2084,7 @@ where
     let list_data = array.data();
     let str_values_buf = str_array.value_data();
 
-    let offsets = unsafe { list_data.buffers()[0].typed_data::<OffsetSizeFrom>() };
+    let offsets = list_data.buffers()[0].typed_data::<OffsetSizeFrom>();
 
     let mut offset_builder = BufferBuilder::<OffsetSizeTo>::new(offsets.len());
     offsets.iter().try_for_each::<_, Result<_>>(|offset| {
diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs
index 140a57f33..72ee8b68d 100644
--- a/arrow/src/compute/kernels/sort.rs
+++ b/arrow/src/compute/kernels/sort.rs
@@ -452,8 +452,7 @@ fn sort_boolean(
     let mut result = MutableBuffer::new(result_capacity);
     // sets len to capacity so we can access the whole buffer as a typed slice
     result.resize(result_capacity, 0);
-    // Safety: the buffer is always treated as `u32` in the code below
-    let result_slice: &mut [u32] = unsafe { result.typed_data_mut() };
+    let result_slice: &mut [u32] = result.typed_data_mut();
 
     if options.nulls_first {
         let size = nulls_len.min(len);
@@ -565,8 +564,7 @@ where
     let mut result = MutableBuffer::new(result_capacity);
     // sets len to capacity so we can access the whole buffer as a typed slice
     result.resize(result_capacity, 0);
-    // Safety: the buffer is always treated as `u32` in the code below
-    let result_slice: &mut [u32] = unsafe { result.typed_data_mut() };
+    let result_slice: &mut [u32] = result.typed_data_mut();
 
     if options.nulls_first {
         let size = nulls_len.min(len);
diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs
index 567bf5c8b..03637ec81 100644
--- a/arrow/src/compute/kernels/take.rs
+++ b/arrow/src/compute/kernels/take.rs
@@ -688,8 +688,7 @@ where
     let bytes_offset = (data_len + 1) * std::mem::size_of::<OffsetSize>();
     let mut offsets_buffer = MutableBuffer::from_len_zeroed(bytes_offset);
 
-    // Safety: the buffer is always treated as as a type of `OffsetSize` in the code below
-    let offsets = unsafe { offsets_buffer.typed_data_mut() };
+    let offsets = offsets_buffer.typed_data_mut();
     let mut values = MutableBuffer::new(0);
     let mut length_so_far = OffsetSize::zero();
     offsets[0] = length_so_far;
diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs
index efb1d3e6b..d9a3f667d 100644
--- a/arrow/src/datatypes/native.rs
+++ b/arrow/src/datatypes/native.rs
@@ -42,7 +42,8 @@ pub trait JsonSerializable: 'static {
 ///
 /// Note: in the case of floating point numbers this transmutation can result in a signalling
 /// NaN, which, whilst sound, can be unwieldy. In general, whilst it is perfectly sound to
-/// reinterpret bytes as different types using this trait, it is likely unwise
+/// reinterpret bytes as different types using this trait, it is likely unwise. For more information
+/// see [f32::from_bits] and [f64::from_bits].
 ///
 /// Note: `bool` is restricted to `0` or `1`, and so `bool: !ArrowNativeType`
 ///
diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs
index 2e29b6094..9e0f83fa9 100644
--- a/parquet/src/arrow/array_reader/byte_array.rs
+++ b/parquet/src/arrow/array_reader/byte_array.rs
@@ -125,13 +125,13 @@ impl<I: OffsetSizeTrait + ScalarValue> ArrayReader for ByteArrayReader<I> {
     fn get_def_levels(&self) -> Option<&[i16]> {
         self.def_levels_buffer
             .as_ref()
-            .map(|buf| unsafe { buf.typed_data() })
+            .map(|buf| buf.typed_data())
     }
 
     fn get_rep_levels(&self) -> Option<&[i16]> {
         self.rep_levels_buffer
             .as_ref()
-            .map(|buf| unsafe { buf.typed_data() })
+            .map(|buf| buf.typed_data())
     }
 }
 
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 0e64f0d25..0cd67206f 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -187,13 +187,13 @@ where
     fn get_def_levels(&self) -> Option<&[i16]> {
         self.def_levels_buffer
             .as_ref()
-            .map(|buf| unsafe { buf.typed_data() })
+            .map(|buf| buf.typed_data())
     }
 
     fn get_rep_levels(&self) -> Option<&[i16]> {
         self.rep_levels_buffer
             .as_ref()
-            .map(|buf| unsafe { buf.typed_data() })
+            .map(|buf| buf.typed_data())
     }
 }
 
@@ -356,7 +356,7 @@ where
                         assert_eq!(dict.data_type(), &self.value_type);
 
                         let dict_buffers = dict.data().buffers();
-                        let dict_offsets = unsafe { dict_buffers[0].typed_data::<V>() };
+                        let dict_offsets = dict_buffers[0].typed_data::<V>();
                         let dict_values = dict_buffers[1].as_slice();
 
                         values.extend_from_dictionary(
diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs
index 21c49b338..6207b377d 100644
--- a/parquet/src/arrow/array_reader/mod.rs
+++ b/parquet/src/arrow/array_reader/mod.rs
@@ -226,13 +226,13 @@ where
     fn get_def_levels(&self) -> Option<&[i16]> {
         self.def_levels_buffer
             .as_ref()
-            .map(|buf| unsafe { buf.typed_data() })
+            .map(|buf| buf.typed_data())
     }
 
     fn get_rep_levels(&self) -> Option<&[i16]> {
         self.rep_levels_buffer
             .as_ref()
-            .map(|buf| unsafe { buf.typed_data() })
+            .map(|buf| buf.typed_data())
     }
 }
 
@@ -447,13 +447,13 @@ where
     fn get_def_levels(&self) -> Option<&[i16]> {
         self.def_levels_buffer
             .as_ref()
-            .map(|buf| unsafe { buf.typed_data() })
+            .map(|buf| buf.typed_data())
     }
 
     fn get_rep_levels(&self) -> Option<&[i16]> {
         self.rep_levels_buffer
             .as_ref()
-            .map(|buf| unsafe { buf.typed_data() })
+            .map(|buf| buf.typed_data())
     }
 }
 
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 44631e574..b64517ad1 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -576,7 +576,7 @@ macro_rules! def_get_binary_array_fn {
         fn $name(array: &$ty) -> Vec<ByteArray> {
             let mut byte_array = ByteArray::new();
             let ptr = crate::util::memory::ByteBufferPtr::new(
-                unsafe { array.value_data().typed_data::<u8>() }.to_vec(),
+                array.value_data().as_slice().to_vec(),
             );
             byte_array.set_data(ptr);
             array
diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs
index 7f4458507..ffa3a4843 100644
--- a/parquet/src/arrow/buffer/dictionary_buffer.rs
+++ b/parquet/src/arrow/buffer/dictionary_buffer.rs
@@ -106,7 +106,7 @@ impl<K: ScalarValue + ArrowNativeType + Ord, V: ScalarValue + OffsetSizeTrait>
             Self::Dict { keys, values } => {
                 let mut spilled = OffsetBuffer::default();
                 let dict_buffers = values.data().buffers();
-                let dict_offsets = unsafe { dict_buffers[0].typed_data::<V>() };
+                let dict_offsets = dict_buffers[0].typed_data::<V>();
                 let dict_values = dict_buffers[1].as_slice();
 
                 if values.is_empty() {
diff --git a/parquet/src/arrow/record_reader/buffer.rs b/parquet/src/arrow/record_reader/buffer.rs
index fa0f91991..7101eaa9c 100644
--- a/parquet/src/arrow/record_reader/buffer.rs
+++ b/parquet/src/arrow/record_reader/buffer.rs
@@ -19,7 +19,7 @@ use std::marker::PhantomData;
 
 use crate::arrow::buffer::bit_util::iter_set_bits_rev;
 use arrow::buffer::{Buffer, MutableBuffer};
-use arrow::datatypes::ToByteSlice;
+use arrow::datatypes::ArrowNativeType;
 
 /// A buffer that supports writing new data to the end, and removing data from the front
 ///
@@ -172,7 +172,7 @@ impl<T: ScalarValue> ScalarBuffer<T> {
     }
 }
 
-impl<T: ScalarValue + ToByteSlice> ScalarBuffer<T> {
+impl<T: ScalarValue + ArrowNativeType> ScalarBuffer<T> {
     pub fn push(&mut self, v: T) {
         self.buffer.push(v);
         self.len += 1;
diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs
index 89d782b1a..023a538a2 100644
--- a/parquet/src/arrow/record_reader/mod.rs
+++ b/parquet/src/arrow/record_reader/mod.rs
@@ -573,7 +573,7 @@ mod tests {
 
         // Verify result record data
         let actual = record_reader.consume_record_data().unwrap();
-        let actual_values = unsafe { actual.typed_data::<i32>() };
+        let actual_values = actual.typed_data::<i32>();
 
         let expected = &[0, 7, 0, 6, 3, 0, 8];
         assert_eq!(actual_values.len(), expected.len());
@@ -687,7 +687,7 @@ mod tests {
 
         // Verify result record data
         let actual = record_reader.consume_record_data().unwrap();
-        let actual_values = unsafe { actual.typed_data::<i32>() };
+        let actual_values = actual.typed_data::<i32>();
         let expected = &[4, 0, 0, 7, 6, 3, 2, 8, 9];
         assert_eq!(actual_values.len(), expected.len());