You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/07/01 21:27:21 UTC
[arrow-rs] branch active_release updated: Correct array memory
usage calculation for dictionary arrays (#505) (#515)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/active_release by this push:
new c053601 Correct array memory usage calculation for dictionary arrays (#505) (#515)
c053601 is described below
commit c0536013b87b7e58b3dcfac2b14eba7f8f04ff6f
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Thu Jul 1 17:27:13 2021 -0400
Correct array memory usage calculation for dictionary arrays (#505) (#515)
* Correct array memory usage calculation for dictionary arrays
* update comments
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
* update comments
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
* Adjust memory usage calculation and move related tests to the same file
* Comment about bitmap size
* Clippy fixes
* Adjust calculation for validity bitmap
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
Co-authored-by: Jörn Horstmann <jo...@signavio.com>
---
arrow/src/array/array.rs | 118 +++++++++++++++++++++++++++++++++++-
arrow/src/array/array_binary.rs | 31 ----------
arrow/src/array/array_boolean.rs | 11 ----
arrow/src/array/array_dictionary.rs | 13 ----
arrow/src/array/array_list.rs | 23 -------
arrow/src/array/array_primitive.rs | 16 -----
arrow/src/array/array_string.rs | 11 ----
arrow/src/array/array_struct.rs | 11 ----
arrow/src/array/array_union.rs | 20 ------
arrow/src/array/data.rs | 14 ++---
arrow/src/array/null.rs | 17 ------
11 files changed, 121 insertions(+), 164 deletions(-)
diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs
index de87c3d..76b29d3 100644
--- a/arrow/src/array/array.rs
+++ b/arrow/src/array/array.rs
@@ -197,11 +197,21 @@ pub trait Array: fmt::Debug + Send + Sync + JsonEqual {
self.data_ref().null_count()
}
- /// Returns the total number of bytes of memory occupied by the buffers owned by this array.
- fn get_buffer_memory_size(&self) -> usize;
+ /// Returns the total number of bytes of memory pointed to by this array.
+ /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map.
+ fn get_buffer_memory_size(&self) -> usize {
+ self.data_ref().get_buffer_memory_size()
+ }
/// Returns the total number of bytes of memory occupied physically by this array.
- fn get_array_memory_size(&self) -> usize;
+ /// This value will always be greater than returned by `get_buffer_memory_size()` and
+ /// includes the overhead of the data structures that contain the pointers to the various buffers.
+ fn get_array_memory_size(&self) -> usize {
+ // both data.get_array_memory_size and size_of_val(self) include ArrayData fields,
+ // to only count additional fields of this array substract size_of(ArrayData)
+ self.data_ref().get_array_memory_size() + std::mem::size_of_val(self)
+ - std::mem::size_of::<ArrayData>()
+ }
/// returns two pointers that represent this array in the C Data Interface (FFI)
fn to_raw(
@@ -575,6 +585,7 @@ where
#[cfg(test)]
mod tests {
use super::*;
+
#[test]
fn test_empty_primitive() {
let array = new_empty_array(&DataType::Int32);
@@ -661,4 +672,105 @@ mod tests {
null_array.data().buffers()[0].len()
);
}
+
+ #[test]
+ fn test_memory_size_null() {
+ let null_arr = NullArray::new(32);
+
+ assert_eq!(0, null_arr.get_buffer_memory_size());
+ assert_eq!(
+ std::mem::size_of::<NullArray>(),
+ null_arr.get_array_memory_size()
+ );
+ assert_eq!(
+ std::mem::size_of::<NullArray>(),
+ std::mem::size_of::<ArrayData>(),
+ );
+ }
+
+ #[test]
+ fn test_memory_size_primitive() {
+ let arr = PrimitiveArray::<Int64Type>::from_iter_values(0..128);
+ let empty =
+ PrimitiveArray::<Int64Type>::from(ArrayData::new_empty(arr.data_type()));
+
+ // substract empty array to avoid magic numbers for the size of additional fields
+ assert_eq!(
+ arr.get_array_memory_size() - empty.get_array_memory_size(),
+ 128 * std::mem::size_of::<i64>()
+ );
+ }
+
+ #[test]
+ fn test_memory_size_primitive_nullable() {
+ let arr: PrimitiveArray<Int64Type> = (0..128).map(Some).collect();
+ let empty_with_bitmap = PrimitiveArray::<Int64Type>::from(
+ ArrayData::builder(arr.data_type().clone())
+ .add_buffer(MutableBuffer::new(0).into())
+ .null_bit_buffer(MutableBuffer::new_null(0).into())
+ .build(),
+ );
+
+ // expected size is the size of the PrimitiveArray struct,
+ // which includes the optional validity buffer
+ // plus one buffer on the heap
+ assert_eq!(
+ std::mem::size_of::<PrimitiveArray<Int64Type>>()
+ + std::mem::size_of::<Buffer>(),
+ empty_with_bitmap.get_array_memory_size()
+ );
+
+ // substract empty array to avoid magic numbers for the size of additional fields
+ // the size of the validity bitmap is rounded up to 64 bytes
+ assert_eq!(
+ arr.get_array_memory_size() - empty_with_bitmap.get_array_memory_size(),
+ 128 * std::mem::size_of::<i64>() + 64
+ );
+ }
+
+ #[test]
+ fn test_memory_size_dictionary() {
+ let values = PrimitiveArray::<Int64Type>::from_iter_values(0..16);
+ let keys = PrimitiveArray::<Int16Type>::from_iter_values(
+ (0..256).map(|i| (i % values.len()) as i16),
+ );
+
+ let dict_data = ArrayData::builder(DataType::Dictionary(
+ Box::new(keys.data_type().clone()),
+ Box::new(values.data_type().clone()),
+ ))
+ .len(keys.len())
+ .buffers(keys.data_ref().buffers().to_vec())
+ .child_data(vec![ArrayData::builder(DataType::Int64)
+ .len(values.len())
+ .buffers(values.data_ref().buffers().to_vec())
+ .build()])
+ .build();
+
+ let empty_data = ArrayData::new_empty(&DataType::Dictionary(
+ Box::new(DataType::Int16),
+ Box::new(DataType::Int64),
+ ));
+
+ let arr = DictionaryArray::<Int16Type>::from(dict_data);
+ let empty = DictionaryArray::<Int16Type>::from(empty_data);
+
+ let expected_keys_size = 256 * std::mem::size_of::<i16>();
+ assert_eq!(
+ arr.keys().get_array_memory_size() - empty.keys().get_array_memory_size(),
+ expected_keys_size
+ );
+
+ let expected_values_size = 16 * std::mem::size_of::<i64>();
+ assert_eq!(
+ arr.values().get_array_memory_size() - empty.values().get_array_memory_size(),
+ expected_values_size
+ );
+
+ let expected_size = expected_keys_size + expected_values_size;
+ assert_eq!(
+ arr.get_array_memory_size() - empty.get_array_memory_size(),
+ expected_size
+ );
+ }
}
diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index e397e0d..18f7f7c 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -17,7 +17,6 @@
use std::convert::{From, TryInto};
use std::fmt;
-use std::mem;
use std::{any::Any, iter::FromIterator};
use super::{
@@ -199,16 +198,6 @@ impl<OffsetSize: BinaryOffsetSizeTrait> Array for GenericBinaryArray<OffsetSize>
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [$name].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + mem::size_of_val(self)
- }
}
impl<OffsetSize: BinaryOffsetSizeTrait> From<ArrayData>
@@ -600,16 +589,6 @@ impl Array for FixedSizeBinaryArray {
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + mem::size_of_val(self)
- }
}
/// A type of `DecimalArray` whose elements are binaries.
@@ -756,16 +735,6 @@ impl Array for DecimalArray {
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [DecimalArray].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [DecimalArray].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + mem::size_of_val(self)
- }
}
#[cfg(test)]
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index de24c44..37080fe 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -18,7 +18,6 @@
use std::borrow::Borrow;
use std::convert::From;
use std::iter::{FromIterator, IntoIterator};
-use std::mem;
use std::{any::Any, fmt};
use super::*;
@@ -114,16 +113,6 @@ impl Array for BooleanArray {
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [BooleanArray].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [BooleanArray].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + mem::size_of_val(self)
- }
}
impl From<Vec<bool>> for BooleanArray {
diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs
index a2419a1..cb5f511 100644
--- a/arrow/src/array/array_dictionary.rs
+++ b/arrow/src/array/array_dictionary.rs
@@ -18,7 +18,6 @@
use std::any::Any;
use std::fmt;
use std::iter::IntoIterator;
-use std::mem;
use std::{convert::From, iter::FromIterator};
use super::{
@@ -224,18 +223,6 @@ impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> {
fn data(&self) -> &ArrayData {
&self.data
}
-
- fn get_buffer_memory_size(&self) -> usize {
- // Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`.
- self.data.get_buffer_memory_size()
- }
-
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size()
- + self.keys.get_array_memory_size()
- + self.values.get_array_memory_size()
- + mem::size_of_val(self)
- }
}
impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs
index f794dbe..0efb65c 100644
--- a/arrow/src/array/array_list.rs
+++ b/arrow/src/array/array_list.rs
@@ -17,7 +17,6 @@
use std::any::Any;
use std::fmt;
-use std::mem;
use num::Num;
@@ -261,16 +260,6 @@ impl<OffsetSize: 'static + OffsetSizeTrait> Array for GenericListArray<OffsetSiz
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [ListArray].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [ListArray].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + mem::size_of_val(self)
- }
}
impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericListArray<OffsetSize> {
@@ -444,18 +433,6 @@ impl Array for FixedSizeListArray {
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeListArray].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [FixedSizeListArray].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size()
- + self.values().get_array_memory_size()
- + mem::size_of_val(self)
- }
}
impl fmt::Debug for FixedSizeListArray {
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index 5e85260..fa950a9 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -150,16 +150,6 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [PrimitiveArray].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [PrimitiveArray].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + mem::size_of::<RawPtrBox<T::Native>>()
- }
}
fn as_datetime<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveDateTime> {
@@ -508,9 +498,6 @@ mod tests {
assert!(arr.is_valid(i));
assert_eq!(i as i32, arr.value(i));
}
-
- assert_eq!(64, arr.get_buffer_memory_size());
- assert_eq!(136, arr.get_array_memory_size());
}
#[test]
@@ -530,9 +517,6 @@ mod tests {
assert!(!arr.is_valid(i));
}
}
-
- assert_eq!(128, arr.get_buffer_memory_size());
- assert_eq!(216, arr.get_array_memory_size());
}
#[test]
diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index 8b0ad10..ba28507 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -17,7 +17,6 @@
use std::convert::From;
use std::fmt;
-use std::mem;
use std::{any::Any, iter::FromIterator};
use super::{
@@ -286,16 +285,6 @@ impl<OffsetSize: StringOffsetSizeTrait> Array for GenericStringArray<OffsetSize>
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [$name].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + mem::size_of_val(self)
- }
}
impl<OffsetSize: StringOffsetSizeTrait> From<ArrayData>
diff --git a/arrow/src/array/array_struct.rs b/arrow/src/array/array_struct.rs
index f721d35..0e7304e 100644
--- a/arrow/src/array/array_struct.rs
+++ b/arrow/src/array/array_struct.rs
@@ -19,7 +19,6 @@ use std::any::Any;
use std::convert::{From, TryFrom};
use std::fmt;
use std::iter::IntoIterator;
-use std::mem;
use super::{make_array, Array, ArrayData, ArrayRef};
use crate::datatypes::DataType;
@@ -178,16 +177,6 @@ impl Array for StructArray {
fn len(&self) -> usize {
self.data_ref().len()
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [StructArray].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [StructArray].
- fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + mem::size_of_val(self)
- }
}
impl From<Vec<(Field, ArrayRef)>> for StructArray {
diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs
index e4a4dfb..e369037 100644
--- a/arrow/src/array/array_union.rs
+++ b/arrow/src/array/array_union.rs
@@ -80,7 +80,6 @@ use crate::error::{ArrowError, Result};
use core::fmt;
use std::any::Any;
-use std::mem;
use std::mem::size_of;
/// An Array that can represent slots of varying types.
@@ -276,25 +275,6 @@ impl Array for UnionArray {
fn data(&self) -> &ArrayData {
&self.data
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [UnionArray].
- fn get_buffer_memory_size(&self) -> usize {
- let mut size = self.data.get_buffer_memory_size();
- for field in &self.boxed_fields {
- size += field.get_buffer_memory_size();
- }
- size
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [UnionArray].
- fn get_array_memory_size(&self) -> usize {
- let mut size = self.data.get_array_memory_size();
- size += mem::size_of_val(self) - mem::size_of_val(&self.boxed_fields);
- for field in &self.boxed_fields {
- size += field.get_array_memory_size();
- }
- size
- }
}
impl fmt::Debug for UnionArray {
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 172bdaa..00c8e15 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -354,20 +354,18 @@ impl ArrayData {
/// Returns the total number of bytes of memory occupied physically by this [ArrayData].
pub fn get_array_memory_size(&self) -> usize {
- let mut size = 0;
- // Calculate size of the fields that don't have [get_array_memory_size] method internally.
- size += mem::size_of_val(self)
- - mem::size_of_val(&self.buffers)
- - mem::size_of_val(&self.null_bitmap)
- - mem::size_of_val(&self.child_data);
+ let mut size = mem::size_of_val(self);
// Calculate rest of the fields top down which contain actual data
for buffer in &self.buffers {
- size += mem::size_of_val(&buffer);
+ size += mem::size_of::<Buffer>();
size += buffer.capacity();
}
if let Some(bitmap) = &self.null_bitmap {
- size += bitmap.get_array_memory_size()
+ // this includes the size of the bitmap struct itself, since it is stored directly in
+ // this struct we already counted those bytes in the size_of_val(self) above
+ size += bitmap.get_array_memory_size();
+ size -= mem::size_of::<Bitmap>();
}
for child in &self.child_data {
size += child.get_array_memory_size();
diff --git a/arrow/src/array/null.rs b/arrow/src/array/null.rs
index da472c7..6728a40 100644
--- a/arrow/src/array/null.rs
+++ b/arrow/src/array/null.rs
@@ -19,7 +19,6 @@
use std::any::Any;
use std::fmt;
-use std::mem;
use crate::array::{Array, ArrayData};
use crate::datatypes::*;
@@ -84,16 +83,6 @@ impl Array for NullArray {
fn null_count(&self) -> usize {
self.data_ref().len()
}
-
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [NullArray].
- fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
- }
-
- /// Returns the total number of bytes of memory occupied physically by this [NullArray].
- fn get_array_memory_size(&self) -> usize {
- mem::size_of_val(self)
- }
}
impl From<ArrayData> for NullArray {
@@ -133,12 +122,6 @@ mod tests {
assert_eq!(null_arr.len(), 32);
assert_eq!(null_arr.null_count(), 32);
assert!(!null_arr.is_valid(0));
-
- assert_eq!(0, null_arr.get_buffer_memory_size());
- assert_eq!(
- null_arr.get_buffer_memory_size() + std::mem::size_of::<NullArray>(),
- null_arr.get_array_memory_size()
- );
}
#[test]