You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/06/09 08:43:41 UTC
[arrow-rs] branch master updated: Consolidate ByteArray::from_iterator (#4386)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 2846cde87 Consolidate ByteArray::from_iterator (#4386)
2846cde87 is described below
commit 2846cde87de2d51afbe4ab642b31c861f152de6f
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri Jun 9 09:43:34 2023 +0100
Consolidate ByteArray::from_iterator (#4386)
---
arrow-array/src/array/binary_array.rs | 46 ++------------------------
arrow-array/src/array/byte_array.rs | 23 +++++++++++++
arrow-array/src/array/string_array.rs | 61 +----------------------------------
3 files changed, 26 insertions(+), 104 deletions(-)
diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs
index a4d64040c..e809d3a6d 100644
--- a/arrow-array/src/array/binary_array.rs
+++ b/arrow-array/src/array/binary_array.rs
@@ -19,7 +19,7 @@ use crate::types::{ByteArrayType, GenericBinaryType};
use crate::{
Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait,
};
-use arrow_buffer::{bit_util, Buffer, MutableBuffer};
+use arrow_buffer::MutableBuffer;
use arrow_data::ArrayData;
use arrow_schema::DataType;
@@ -174,49 +174,6 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericStringArray<OffsetSize>>
}
}
-impl<Ptr, OffsetSize: OffsetSizeTrait> FromIterator<Option<Ptr>>
- for GenericBinaryArray<OffsetSize>
-where
- Ptr: AsRef<[u8]>,
-{
- fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
- let iter = iter.into_iter();
- let (_, data_len) = iter.size_hint();
- let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
-
- let mut offsets = Vec::with_capacity(data_len + 1);
- let mut values = Vec::new();
- let mut null_buf = MutableBuffer::new_null(data_len);
- let mut length_so_far: OffsetSize = OffsetSize::zero();
- offsets.push(length_so_far);
-
- {
- let null_slice = null_buf.as_slice_mut();
-
- for (i, s) in iter.enumerate() {
- if let Some(s) = s {
- let s = s.as_ref();
- bit_util::set_bit(null_slice, i);
- length_so_far += OffsetSize::from_usize(s.len()).unwrap();
- values.extend_from_slice(s);
- }
- // always add an element in offsets
- offsets.push(length_so_far);
- }
- }
-
- // calculate actual data_len, which may be different from the iterator's upper bound
- let data_len = offsets.len() - 1;
- let array_data = ArrayData::builder(Self::DATA_TYPE)
- .len(data_len)
- .add_buffer(Buffer::from_vec(offsets))
- .add_buffer(Buffer::from_vec(values))
- .null_bit_buffer(Some(null_buf.into()));
- let array_data = unsafe { array_data.build_unchecked() };
- Self::from(array_data)
- }
-}
-
/// An array of `[u8]` using `i32` offsets
///
/// The byte length of each element is represented by an i32.
@@ -301,6 +258,7 @@ pub type LargeBinaryArray = GenericBinaryArray<i64>;
mod tests {
use super::*;
use crate::{ListArray, StringArray};
+ use arrow_buffer::Buffer;
use arrow_schema::Field;
use std::sync::Arc;
diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs
index 629ffd22c..563e965e5 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -456,6 +456,29 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
}
}
+impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
+where
+ Ptr: AsRef<T::Native> + 'a,
+{
+ fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
+ iter.into_iter()
+ .map(|o| o.as_ref().map(|p| p.as_ref()))
+ .collect()
+ }
+}
+
+impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
+where
+ Ptr: AsRef<T::Native>,
+{
+ fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
+ let iter = iter.into_iter();
+ let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024);
+ builder.extend(iter);
+ builder.finish()
+ }
+}
+
#[cfg(test)]
mod tests {
use crate::{BinaryArray, StringArray};
diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index 8a1c0bd15..ecc3e3eab 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -17,7 +17,7 @@
use crate::types::GenericStringType;
use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
-use arrow_buffer::{bit_util, MutableBuffer};
+use arrow_buffer::MutableBuffer;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
@@ -105,65 +105,6 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
}
}
-impl<'a, Ptr, OffsetSize: OffsetSizeTrait> FromIterator<&'a Option<Ptr>>
- for GenericStringArray<OffsetSize>
-where
- Ptr: AsRef<str> + 'a,
-{
- /// Creates a [`GenericStringArray`] based on an iterator of `Option` references.
- fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
- // Convert each owned Ptr into &str and wrap in an owned `Option`
- let iter = iter.into_iter().map(|o| o.as_ref().map(|p| p.as_ref()));
- // Build a `GenericStringArray` with the resulting iterator
- iter.collect::<GenericStringArray<OffsetSize>>()
- }
-}
-
-impl<Ptr, OffsetSize: OffsetSizeTrait> FromIterator<Option<Ptr>>
- for GenericStringArray<OffsetSize>
-where
- Ptr: AsRef<str>,
-{
- /// Creates a [`GenericStringArray`] based on an iterator of [`Option`]s
- fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
- let iter = iter.into_iter();
- let (_, data_len) = iter.size_hint();
- let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
-
- let offset_size = std::mem::size_of::<OffsetSize>();
- let mut offsets = MutableBuffer::new((data_len + 1) * offset_size);
- let mut values = MutableBuffer::new(0);
- let mut null_buf = MutableBuffer::new_null(data_len);
- let null_slice = null_buf.as_slice_mut();
- let mut length_so_far = OffsetSize::zero();
- offsets.push(length_so_far);
-
- for (i, s) in iter.enumerate() {
- let value_bytes = if let Some(ref s) = s {
- // set null bit
- bit_util::set_bit(null_slice, i);
- let s_bytes = s.as_ref().as_bytes();
- length_so_far += OffsetSize::from_usize(s_bytes.len()).unwrap();
- s_bytes
- } else {
- b""
- };
- values.extend_from_slice(value_bytes);
- offsets.push(length_so_far);
- }
-
- // calculate actual data_len, which may be different from the iterator's upper bound
- let data_len = (offsets.len() / offset_size) - 1;
- let array_data = ArrayData::builder(Self::DATA_TYPE)
- .len(data_len)
- .add_buffer(offsets.into())
- .add_buffer(values.into())
- .null_bit_buffer(Some(null_buf.into()));
- let array_data = unsafe { array_data.build_unchecked() };
- Self::from(array_data)
- }
-}
-
impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
for GenericStringArray<OffsetSize>
{