You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/07/01 21:27:21 UTC

[arrow-rs] branch active_release updated: Correct array memory usage calculation for dictionary arrays (#505) (#515)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/active_release by this push:
     new c053601  Correct array memory usage calculation for dictionary arrays (#505) (#515)
c053601 is described below

commit c0536013b87b7e58b3dcfac2b14eba7f8f04ff6f
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Thu Jul 1 17:27:13 2021 -0400

    Correct array memory usage calculation for dictionary arrays (#505) (#515)
    
    * Correct array memory usage calculation for dictionary arrays
    
    * update comments
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
    
    * update comments
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
    
    * Adjust memory usage calculation and move related tests to the same file
    
    * Comment about bitmap size
    
    * Clippy fixes
    
    * Adjust calculation for validity bitmap
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
    
    Co-authored-by: Jörn Horstmann <jo...@signavio.com>
---
 arrow/src/array/array.rs            | 118 +++++++++++++++++++++++++++++++++++-
 arrow/src/array/array_binary.rs     |  31 ----------
 arrow/src/array/array_boolean.rs    |  11 ----
 arrow/src/array/array_dictionary.rs |  13 ----
 arrow/src/array/array_list.rs       |  23 -------
 arrow/src/array/array_primitive.rs  |  16 -----
 arrow/src/array/array_string.rs     |  11 ----
 arrow/src/array/array_struct.rs     |  11 ----
 arrow/src/array/array_union.rs      |  20 ------
 arrow/src/array/data.rs             |  14 ++---
 arrow/src/array/null.rs             |  17 ------
 11 files changed, 121 insertions(+), 164 deletions(-)

diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs
index de87c3d..76b29d3 100644
--- a/arrow/src/array/array.rs
+++ b/arrow/src/array/array.rs
@@ -197,11 +197,21 @@ pub trait Array: fmt::Debug + Send + Sync + JsonEqual {
         self.data_ref().null_count()
     }
 
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this array.
-    fn get_buffer_memory_size(&self) -> usize;
+    /// Returns the total number of bytes of memory pointed to by this array.
+    /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map.
+    fn get_buffer_memory_size(&self) -> usize {
+        self.data_ref().get_buffer_memory_size()
+    }
 
     /// Returns the total number of bytes of memory occupied physically by this array.
-    fn get_array_memory_size(&self) -> usize;
+    /// This value will always be greater than returned by `get_buffer_memory_size()` and
+    /// includes the overhead of the data structures that contain the pointers to the various buffers.
+    fn get_array_memory_size(&self) -> usize {
+        // both data.get_array_memory_size and size_of_val(self) include ArrayData fields,
+        // to only count additional fields of this array substract size_of(ArrayData)
+        self.data_ref().get_array_memory_size() + std::mem::size_of_val(self)
+            - std::mem::size_of::<ArrayData>()
+    }
 
     /// returns two pointers that represent this array in the C Data Interface (FFI)
     fn to_raw(
@@ -575,6 +585,7 @@ where
 #[cfg(test)]
 mod tests {
     use super::*;
+
     #[test]
     fn test_empty_primitive() {
         let array = new_empty_array(&DataType::Int32);
@@ -661,4 +672,105 @@ mod tests {
             null_array.data().buffers()[0].len()
         );
     }
+
+    #[test]
+    fn test_memory_size_null() {
+        let null_arr = NullArray::new(32);
+
+        assert_eq!(0, null_arr.get_buffer_memory_size());
+        assert_eq!(
+            std::mem::size_of::<NullArray>(),
+            null_arr.get_array_memory_size()
+        );
+        assert_eq!(
+            std::mem::size_of::<NullArray>(),
+            std::mem::size_of::<ArrayData>(),
+        );
+    }
+
+    #[test]
+    fn test_memory_size_primitive() {
+        let arr = PrimitiveArray::<Int64Type>::from_iter_values(0..128);
+        let empty =
+            PrimitiveArray::<Int64Type>::from(ArrayData::new_empty(arr.data_type()));
+
+        // substract empty array to avoid magic numbers for the size of additional fields
+        assert_eq!(
+            arr.get_array_memory_size() - empty.get_array_memory_size(),
+            128 * std::mem::size_of::<i64>()
+        );
+    }
+
+    #[test]
+    fn test_memory_size_primitive_nullable() {
+        let arr: PrimitiveArray<Int64Type> = (0..128).map(Some).collect();
+        let empty_with_bitmap = PrimitiveArray::<Int64Type>::from(
+            ArrayData::builder(arr.data_type().clone())
+                .add_buffer(MutableBuffer::new(0).into())
+                .null_bit_buffer(MutableBuffer::new_null(0).into())
+                .build(),
+        );
+
+        // expected size is the size of the PrimitiveArray struct,
+        // which includes the optional validity buffer
+        // plus one buffer on the heap
+        assert_eq!(
+            std::mem::size_of::<PrimitiveArray<Int64Type>>()
+                + std::mem::size_of::<Buffer>(),
+            empty_with_bitmap.get_array_memory_size()
+        );
+
+        // substract empty array to avoid magic numbers for the size of additional fields
+        // the size of the validity bitmap is rounded up to 64 bytes
+        assert_eq!(
+            arr.get_array_memory_size() - empty_with_bitmap.get_array_memory_size(),
+            128 * std::mem::size_of::<i64>() + 64
+        );
+    }
+
+    #[test]
+    fn test_memory_size_dictionary() {
+        let values = PrimitiveArray::<Int64Type>::from_iter_values(0..16);
+        let keys = PrimitiveArray::<Int16Type>::from_iter_values(
+            (0..256).map(|i| (i % values.len()) as i16),
+        );
+
+        let dict_data = ArrayData::builder(DataType::Dictionary(
+            Box::new(keys.data_type().clone()),
+            Box::new(values.data_type().clone()),
+        ))
+        .len(keys.len())
+        .buffers(keys.data_ref().buffers().to_vec())
+        .child_data(vec![ArrayData::builder(DataType::Int64)
+            .len(values.len())
+            .buffers(values.data_ref().buffers().to_vec())
+            .build()])
+        .build();
+
+        let empty_data = ArrayData::new_empty(&DataType::Dictionary(
+            Box::new(DataType::Int16),
+            Box::new(DataType::Int64),
+        ));
+
+        let arr = DictionaryArray::<Int16Type>::from(dict_data);
+        let empty = DictionaryArray::<Int16Type>::from(empty_data);
+
+        let expected_keys_size = 256 * std::mem::size_of::<i16>();
+        assert_eq!(
+            arr.keys().get_array_memory_size() - empty.keys().get_array_memory_size(),
+            expected_keys_size
+        );
+
+        let expected_values_size = 16 * std::mem::size_of::<i64>();
+        assert_eq!(
+            arr.values().get_array_memory_size() - empty.values().get_array_memory_size(),
+            expected_values_size
+        );
+
+        let expected_size = expected_keys_size + expected_values_size;
+        assert_eq!(
+            arr.get_array_memory_size() - empty.get_array_memory_size(),
+            expected_size
+        );
+    }
 }
diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index e397e0d..18f7f7c 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -17,7 +17,6 @@
 
 use std::convert::{From, TryInto};
 use std::fmt;
-use std::mem;
 use std::{any::Any, iter::FromIterator};
 
 use super::{
@@ -199,16 +198,6 @@ impl<OffsetSize: BinaryOffsetSizeTrait> Array for GenericBinaryArray<OffsetSize>
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [$name].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size() + mem::size_of_val(self)
-    }
 }
 
 impl<OffsetSize: BinaryOffsetSizeTrait> From<ArrayData>
@@ -600,16 +589,6 @@ impl Array for FixedSizeBinaryArray {
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size() + mem::size_of_val(self)
-    }
 }
 
 /// A type of `DecimalArray` whose elements are binaries.
@@ -756,16 +735,6 @@ impl Array for DecimalArray {
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [DecimalArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [DecimalArray].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size() + mem::size_of_val(self)
-    }
 }
 
 #[cfg(test)]
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index de24c44..37080fe 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -18,7 +18,6 @@
 use std::borrow::Borrow;
 use std::convert::From;
 use std::iter::{FromIterator, IntoIterator};
-use std::mem;
 use std::{any::Any, fmt};
 
 use super::*;
@@ -114,16 +113,6 @@ impl Array for BooleanArray {
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [BooleanArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [BooleanArray].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size() + mem::size_of_val(self)
-    }
 }
 
 impl From<Vec<bool>> for BooleanArray {
diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs
index a2419a1..cb5f511 100644
--- a/arrow/src/array/array_dictionary.rs
+++ b/arrow/src/array/array_dictionary.rs
@@ -18,7 +18,6 @@
 use std::any::Any;
 use std::fmt;
 use std::iter::IntoIterator;
-use std::mem;
 use std::{convert::From, iter::FromIterator};
 
 use super::{
@@ -224,18 +223,6 @@ impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> {
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    fn get_buffer_memory_size(&self) -> usize {
-        // Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`.
-        self.data.get_buffer_memory_size()
-    }
-
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size()
-            + self.keys.get_array_memory_size()
-            + self.values.get_array_memory_size()
-            + mem::size_of_val(self)
-    }
 }
 
 impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs
index f794dbe..0efb65c 100644
--- a/arrow/src/array/array_list.rs
+++ b/arrow/src/array/array_list.rs
@@ -17,7 +17,6 @@
 
 use std::any::Any;
 use std::fmt;
-use std::mem;
 
 use num::Num;
 
@@ -261,16 +260,6 @@ impl<OffsetSize: 'static + OffsetSizeTrait> Array for GenericListArray<OffsetSiz
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [ListArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [ListArray].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size() + mem::size_of_val(self)
-    }
 }
 
 impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericListArray<OffsetSize> {
@@ -444,18 +433,6 @@ impl Array for FixedSizeListArray {
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeListArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [FixedSizeListArray].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size()
-            + self.values().get_array_memory_size()
-            + mem::size_of_val(self)
-    }
 }
 
 impl fmt::Debug for FixedSizeListArray {
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index 5e85260..fa950a9 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -150,16 +150,6 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [PrimitiveArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [PrimitiveArray].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size() + mem::size_of::<RawPtrBox<T::Native>>()
-    }
 }
 
 fn as_datetime<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveDateTime> {
@@ -508,9 +498,6 @@ mod tests {
             assert!(arr.is_valid(i));
             assert_eq!(i as i32, arr.value(i));
         }
-
-        assert_eq!(64, arr.get_buffer_memory_size());
-        assert_eq!(136, arr.get_array_memory_size());
     }
 
     #[test]
@@ -530,9 +517,6 @@ mod tests {
                 assert!(!arr.is_valid(i));
             }
         }
-
-        assert_eq!(128, arr.get_buffer_memory_size());
-        assert_eq!(216, arr.get_array_memory_size());
     }
 
     #[test]
diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index 8b0ad10..ba28507 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -17,7 +17,6 @@
 
 use std::convert::From;
 use std::fmt;
-use std::mem;
 use std::{any::Any, iter::FromIterator};
 
 use super::{
@@ -286,16 +285,6 @@ impl<OffsetSize: StringOffsetSizeTrait> Array for GenericStringArray<OffsetSize>
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [$name].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size() + mem::size_of_val(self)
-    }
 }
 
 impl<OffsetSize: StringOffsetSizeTrait> From<ArrayData>
diff --git a/arrow/src/array/array_struct.rs b/arrow/src/array/array_struct.rs
index f721d35..0e7304e 100644
--- a/arrow/src/array/array_struct.rs
+++ b/arrow/src/array/array_struct.rs
@@ -19,7 +19,6 @@ use std::any::Any;
 use std::convert::{From, TryFrom};
 use std::fmt;
 use std::iter::IntoIterator;
-use std::mem;
 
 use super::{make_array, Array, ArrayData, ArrayRef};
 use crate::datatypes::DataType;
@@ -178,16 +177,6 @@ impl Array for StructArray {
     fn len(&self) -> usize {
         self.data_ref().len()
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [StructArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [StructArray].
-    fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size() + mem::size_of_val(self)
-    }
 }
 
 impl From<Vec<(Field, ArrayRef)>> for StructArray {
diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs
index e4a4dfb..e369037 100644
--- a/arrow/src/array/array_union.rs
+++ b/arrow/src/array/array_union.rs
@@ -80,7 +80,6 @@ use crate::error::{ArrowError, Result};
 
 use core::fmt;
 use std::any::Any;
-use std::mem;
 use std::mem::size_of;
 
 /// An Array that can represent slots of varying types.
@@ -276,25 +275,6 @@ impl Array for UnionArray {
     fn data(&self) -> &ArrayData {
         &self.data
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [UnionArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        let mut size = self.data.get_buffer_memory_size();
-        for field in &self.boxed_fields {
-            size += field.get_buffer_memory_size();
-        }
-        size
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [UnionArray].
-    fn get_array_memory_size(&self) -> usize {
-        let mut size = self.data.get_array_memory_size();
-        size += mem::size_of_val(self) - mem::size_of_val(&self.boxed_fields);
-        for field in &self.boxed_fields {
-            size += field.get_array_memory_size();
-        }
-        size
-    }
 }
 
 impl fmt::Debug for UnionArray {
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 172bdaa..00c8e15 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -354,20 +354,18 @@ impl ArrayData {
 
     /// Returns the total number of bytes of memory occupied physically by this [ArrayData].
     pub fn get_array_memory_size(&self) -> usize {
-        let mut size = 0;
-        // Calculate size of the fields that don't have [get_array_memory_size] method internally.
-        size += mem::size_of_val(self)
-            - mem::size_of_val(&self.buffers)
-            - mem::size_of_val(&self.null_bitmap)
-            - mem::size_of_val(&self.child_data);
+        let mut size = mem::size_of_val(self);
 
         // Calculate rest of the fields top down which contain actual data
         for buffer in &self.buffers {
-            size += mem::size_of_val(&buffer);
+            size += mem::size_of::<Buffer>();
             size += buffer.capacity();
         }
         if let Some(bitmap) = &self.null_bitmap {
-            size += bitmap.get_array_memory_size()
+            // this includes the size of the bitmap struct itself, since it is stored directly in
+            // this struct we already counted those bytes in the size_of_val(self) above
+            size += bitmap.get_array_memory_size();
+            size -= mem::size_of::<Bitmap>();
         }
         for child in &self.child_data {
             size += child.get_array_memory_size();
diff --git a/arrow/src/array/null.rs b/arrow/src/array/null.rs
index da472c7..6728a40 100644
--- a/arrow/src/array/null.rs
+++ b/arrow/src/array/null.rs
@@ -19,7 +19,6 @@
 
 use std::any::Any;
 use std::fmt;
-use std::mem;
 
 use crate::array::{Array, ArrayData};
 use crate::datatypes::*;
@@ -84,16 +83,6 @@ impl Array for NullArray {
     fn null_count(&self) -> usize {
         self.data_ref().len()
     }
-
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [NullArray].
-    fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size()
-    }
-
-    /// Returns the total number of bytes of memory occupied physically by this [NullArray].
-    fn get_array_memory_size(&self) -> usize {
-        mem::size_of_val(self)
-    }
 }
 
 impl From<ArrayData> for NullArray {
@@ -133,12 +122,6 @@ mod tests {
         assert_eq!(null_arr.len(), 32);
         assert_eq!(null_arr.null_count(), 32);
         assert!(!null_arr.is_valid(0));
-
-        assert_eq!(0, null_arr.get_buffer_memory_size());
-        assert_eq!(
-            null_arr.get_buffer_memory_size() + std::mem::size_of::<NullArray>(),
-            null_arr.get_array_memory_size()
-        );
     }
 
     #[test]