You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/05/29 10:46:24 UTC
[arrow-rs] branch master updated: Reduce memory usage of concat (large)utf8 (#348)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e801d4b  Reduce memory usage of concat (large)utf8 (#348)
e801d4b is described below

commit e801d4b91bb8a5340b8e01849c579d84e360674c
Author: Ritchie Vink <ri...@gmail.com>
AuthorDate: Sat May 29 12:45:55 2021 +0200

    Reduce memory usage of concat (large)utf8 (#348)
    
    * reduce memory needed for concat
    
    * reuse code for str allocation buffer
---
 arrow/src/array/transform/mod.rs    | 41 ++++++++++++++++++++++++++++++++++++-
 arrow/src/compute/kernels/concat.rs | 19 +++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs
index e7ec41e..5611671 100644
--- a/arrow/src/array/transform/mod.rs
+++ b/arrow/src/array/transform/mod.rs
@@ -21,11 +21,13 @@ use crate::{
     error::{ArrowError, Result},
     util::bit_util,
 };
+use std::mem;
 
 use super::{
     data::{into_buffers, new_buffers},
     ArrayData,
 };
+use crate::array::StringOffsetSizeTrait;
 
 mod boolean;
 mod fixed_binary;
@@ -324,6 +326,37 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
     })
 }
 
+fn preallocate_str_buffer<Offset: StringOffsetSizeTrait>(
+    capacity: usize,
+    arrays: &[&ArrayData],
+) -> [MutableBuffer; 2] {
+    // offsets buffer: sized for `capacity` offsets plus the one leading start offset
+    let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>());
+    // safety: `unsafe` code assumes this buffer already holds one element (a zero offset)
+    if Offset::is_large() {
+        buffer.push(0i64);
+    } else {
+        buffer.push(0i32)
+    }
+    let str_values_size = arrays
+        .iter()
+        .map(|data| {
+            // length of this array's values buffer (buffer index 1)
+            let buf_len = data.buffers()[1].len();
+            // first offset of this array: `buffer::<Offset>(0)` returns the offsets slice
+            // starting at the array's own offset, so values before element 0 are not counted.
+            // NOTE(review): `[0]` panics on an empty offsets slice -- confirm that cannot occur
+            let offset = data.buffer::<Offset>(0)[0];
+            buf_len - offset.to_usize().unwrap()
+        })
+        .sum::<usize>();
+
+    [
+        buffer,
+        MutableBuffer::new(str_values_size * mem::size_of::<u8>()),
+    ]
+}
+
 impl<'a> MutableArrayData<'a> {
     /// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an
     /// [ArrayData] from multiple `arrays`.
@@ -341,7 +374,13 @@ impl<'a> MutableArrayData<'a> {
             use_nulls = true;
         };
 
-        let [buffer1, buffer2] = new_buffers(data_type, capacity);
+        // We can prevent reallocation by precomputing the needed size.
+        // This is faster and more memory efficient.
+        let [buffer1, buffer2] = match data_type {
+            DataType::LargeUtf8 => preallocate_str_buffer::<i64>(capacity, &arrays),
+            DataType::Utf8 => preallocate_str_buffer::<i32>(capacity, &arrays),
+            _ => new_buffers(data_type, capacity),
+        };
 
         let child_data = match &data_type {
             DataType::Null
diff --git a/arrow/src/compute/kernels/concat.rs b/arrow/src/compute/kernels/concat.rs
index 35ff183..83140c8 100644
--- a/arrow/src/compute/kernels/concat.rs
+++ b/arrow/src/compute/kernels/concat.rs
@@ -452,4 +452,23 @@ mod tests {
         let concat = concat_dictionary(input_1, input_2);
         assert_eq!(concat, expected);
     }
+
+    #[test]
+    fn test_concat_string_sizes() -> Result<()> {
+        let a: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect();
+        let b: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect();
+        let c = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
+        // a: 150 * 3 = 450 value bytes
+        // b: 150 * 3 = 450 value bytes
+        // c:   3 * 3 =   9 value bytes (three 3-byte strings; the `None` is not counted)
+        // ------------+
+        // 909 value bytes in total
+        // rounded up to the next multiple of 64 => expected capacity of 960
+
+        let arr = concat(&[&a, &b, &c])?;
+        // without precomputing the value lengths this capacity would have been 1280.
+        assert_eq!(arr.data().buffers()[1].capacity(), 960);
+
+        Ok(())
+    }
 }