You are viewing a plain-text version of this content. The canonical link is available in the original (HTML) message; it was not preserved in this plain-text rendering.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/05/29 10:46:24 UTC
[arrow-rs] branch master updated: Reduce memory usage of concat
(large)utf8 (#348)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e801d4b Reduce memory usage of concat (large)utf8 (#348)
e801d4b is described below
commit e801d4b91bb8a5340b8e01849c579d84e360674c
Author: Ritchie Vink <ri...@gmail.com>
AuthorDate: Sat May 29 12:45:55 2021 +0200
Reduce memory usage of concat (large)utf8 (#348)
* reduce memory needed for concat
* reuse code for str allocation buffer
---
arrow/src/array/transform/mod.rs | 41 ++++++++++++++++++++++++++++++++++++-
arrow/src/compute/kernels/concat.rs | 19 +++++++++++++++++
2 files changed, 59 insertions(+), 1 deletion(-)
diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs
index e7ec41e..5611671 100644
--- a/arrow/src/array/transform/mod.rs
+++ b/arrow/src/array/transform/mod.rs
@@ -21,11 +21,13 @@ use crate::{
error::{ArrowError, Result},
util::bit_util,
};
+use std::mem;
use super::{
data::{into_buffers, new_buffers},
ArrayData,
};
+use crate::array::StringOffsetSizeTrait;
mod boolean;
mod fixed_binary;
@@ -324,6 +326,37 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
})
}
+fn preallocate_str_buffer<Offset: StringOffsetSizeTrait>(
+ capacity: usize,
+ arrays: &[&ArrayData],
+) -> [MutableBuffer; 2] {
+ // offsets
+ let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>());
+ // safety: `unsafe` code assumes that this buffer is initialized with one element
+ if Offset::is_large() {
+ buffer.push(0i64);
+ } else {
+ buffer.push(0i32)
+ }
+ let str_values_size = arrays
+ .iter()
+ .map(|data| {
+ // get the length of the value buffer
+ let buf_len = data.buffers()[1].len();
+ // find the offset of the buffer
+ // this returns a slice of offsets, starting from the offset of the array
+ // so we can take the first value
+ let offset = data.buffer::<Offset>(0)[0];
+ buf_len - offset.to_usize().unwrap()
+ })
+ .sum::<usize>();
+
+ [
+ buffer,
+ MutableBuffer::new(str_values_size * mem::size_of::<u8>()),
+ ]
+}
+
impl<'a> MutableArrayData<'a> {
/// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an
/// [ArrayData] from multiple `arrays`.
@@ -341,7 +374,13 @@ impl<'a> MutableArrayData<'a> {
use_nulls = true;
};
- let [buffer1, buffer2] = new_buffers(data_type, capacity);
+ // We can prevent reallocation by precomputing the needed size.
+ // This is faster and more memory efficient.
+ let [buffer1, buffer2] = match data_type {
+ DataType::LargeUtf8 => preallocate_str_buffer::<i64>(capacity, &arrays),
+ DataType::Utf8 => preallocate_str_buffer::<i32>(capacity, &arrays),
+ _ => new_buffers(data_type, capacity),
+ };
let child_data = match &data_type {
DataType::Null
diff --git a/arrow/src/compute/kernels/concat.rs b/arrow/src/compute/kernels/concat.rs
index 35ff183..83140c8 100644
--- a/arrow/src/compute/kernels/concat.rs
+++ b/arrow/src/compute/kernels/concat.rs
@@ -452,4 +452,23 @@ mod tests {
let concat = concat_dictionary(input_1, input_2);
assert_eq!(concat, expected);
}
+
+ #[test]
+ fn test_concat_string_sizes() -> Result<()> {
+ let a: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect();
+ let b: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect();
+ let c = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
+ // 150 * 3 = 450
+ // 150 * 3 = 450
+ // 3 * 3 = 9
+ // ------------+
+ // 909
+ // closest 64 byte aligned cap = 960
+
+ let arr = concat(&[&a, &b, &c])?;
+ // this would have been 1280 if we did not precompute the value lengths.
+ assert_eq!(arr.data().buffers()[1].capacity(), 960);
+
+ Ok(())
+ }
}