You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/06/09 08:43:41 UTC

[arrow-rs] branch master updated: Consolidate ByteArray::from_iterator (#4386)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 2846cde87 Consolidate ByteArray::from_iterator (#4386)
2846cde87 is described below

commit 2846cde87de2d51afbe4ab642b31c861f152de6f
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri Jun 9 09:43:34 2023 +0100

    Consolidate ByteArray::from_iterator (#4386)
---
 arrow-array/src/array/binary_array.rs | 46 ++------------------------
 arrow-array/src/array/byte_array.rs   | 23 +++++++++++++
 arrow-array/src/array/string_array.rs | 61 +----------------------------------
 3 files changed, 26 insertions(+), 104 deletions(-)

diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs
index a4d64040c..e809d3a6d 100644
--- a/arrow-array/src/array/binary_array.rs
+++ b/arrow-array/src/array/binary_array.rs
@@ -19,7 +19,7 @@ use crate::types::{ByteArrayType, GenericBinaryType};
 use crate::{
     Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait,
 };
-use arrow_buffer::{bit_util, Buffer, MutableBuffer};
+use arrow_buffer::MutableBuffer;
 use arrow_data::ArrayData;
 use arrow_schema::DataType;
 
@@ -174,49 +174,6 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericStringArray<OffsetSize>>
     }
 }
 
-impl<Ptr, OffsetSize: OffsetSizeTrait> FromIterator<Option<Ptr>>
-    for GenericBinaryArray<OffsetSize>
-where
-    Ptr: AsRef<[u8]>,
-{
-    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
-        let iter = iter.into_iter();
-        let (_, data_len) = iter.size_hint();
-        let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
-
-        let mut offsets = Vec::with_capacity(data_len + 1);
-        let mut values = Vec::new();
-        let mut null_buf = MutableBuffer::new_null(data_len);
-        let mut length_so_far: OffsetSize = OffsetSize::zero();
-        offsets.push(length_so_far);
-
-        {
-            let null_slice = null_buf.as_slice_mut();
-
-            for (i, s) in iter.enumerate() {
-                if let Some(s) = s {
-                    let s = s.as_ref();
-                    bit_util::set_bit(null_slice, i);
-                    length_so_far += OffsetSize::from_usize(s.len()).unwrap();
-                    values.extend_from_slice(s);
-                }
-                // always add an element in offsets
-                offsets.push(length_so_far);
-            }
-        }
-
-        // calculate actual data_len, which may be different from the iterator's upper bound
-        let data_len = offsets.len() - 1;
-        let array_data = ArrayData::builder(Self::DATA_TYPE)
-            .len(data_len)
-            .add_buffer(Buffer::from_vec(offsets))
-            .add_buffer(Buffer::from_vec(values))
-            .null_bit_buffer(Some(null_buf.into()));
-        let array_data = unsafe { array_data.build_unchecked() };
-        Self::from(array_data)
-    }
-}
-
 /// An array of `[u8]` using `i32` offsets
 ///
 /// The byte length of each element is represented by an i32.
@@ -301,6 +258,7 @@ pub type LargeBinaryArray = GenericBinaryArray<i64>;
 mod tests {
     use super::*;
     use crate::{ListArray, StringArray};
+    use arrow_buffer::Buffer;
     use arrow_schema::Field;
     use std::sync::Arc;
 
diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs
index 629ffd22c..563e965e5 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -456,6 +456,29 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
     }
 }
 
+impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
+where
+    Ptr: AsRef<T::Native> + 'a,
+{
+    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
+        iter.into_iter()
+            .map(|o| o.as_ref().map(|p| p.as_ref()))
+            .collect()
+    }
+}
+
+impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
+where
+    Ptr: AsRef<T::Native>,
+{
+    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
+        let iter = iter.into_iter();
+        let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024);
+        builder.extend(iter);
+        builder.finish()
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::{BinaryArray, StringArray};
diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index 8a1c0bd15..ecc3e3eab 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -17,7 +17,7 @@
 
 use crate::types::GenericStringType;
 use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
-use arrow_buffer::{bit_util, MutableBuffer};
+use arrow_buffer::MutableBuffer;
 use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType};
 
@@ -105,65 +105,6 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
     }
 }
 
-impl<'a, Ptr, OffsetSize: OffsetSizeTrait> FromIterator<&'a Option<Ptr>>
-    for GenericStringArray<OffsetSize>
-where
-    Ptr: AsRef<str> + 'a,
-{
-    /// Creates a [`GenericStringArray`] based on an iterator of `Option` references.
-    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
-        // Convert each owned Ptr into &str and wrap in an owned `Option`
-        let iter = iter.into_iter().map(|o| o.as_ref().map(|p| p.as_ref()));
-        // Build a `GenericStringArray` with the resulting iterator
-        iter.collect::<GenericStringArray<OffsetSize>>()
-    }
-}
-
-impl<Ptr, OffsetSize: OffsetSizeTrait> FromIterator<Option<Ptr>>
-    for GenericStringArray<OffsetSize>
-where
-    Ptr: AsRef<str>,
-{
-    /// Creates a [`GenericStringArray`] based on an iterator of [`Option`]s
-    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
-        let iter = iter.into_iter();
-        let (_, data_len) = iter.size_hint();
-        let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
-
-        let offset_size = std::mem::size_of::<OffsetSize>();
-        let mut offsets = MutableBuffer::new((data_len + 1) * offset_size);
-        let mut values = MutableBuffer::new(0);
-        let mut null_buf = MutableBuffer::new_null(data_len);
-        let null_slice = null_buf.as_slice_mut();
-        let mut length_so_far = OffsetSize::zero();
-        offsets.push(length_so_far);
-
-        for (i, s) in iter.enumerate() {
-            let value_bytes = if let Some(ref s) = s {
-                // set null bit
-                bit_util::set_bit(null_slice, i);
-                let s_bytes = s.as_ref().as_bytes();
-                length_so_far += OffsetSize::from_usize(s_bytes.len()).unwrap();
-                s_bytes
-            } else {
-                b""
-            };
-            values.extend_from_slice(value_bytes);
-            offsets.push(length_so_far);
-        }
-
-        // calculate actual data_len, which may be different from the iterator's upper bound
-        let data_len = (offsets.len() / offset_size) - 1;
-        let array_data = ArrayData::builder(Self::DATA_TYPE)
-            .len(data_len)
-            .add_buffer(offsets.into())
-            .add_buffer(values.into())
-            .null_bit_buffer(Some(null_buf.into()));
-        let array_data = unsafe { array_data.build_unchecked() };
-        Self::from(array_data)
-    }
-}
-
 impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
     for GenericStringArray<OffsetSize>
 {