You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/17 23:41:47 UTC
[arrow-rs] branch master updated: Minor: Add documentation about memory use for ArrayData (#3529)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 14545a42e Minor: Add documentation about memory use for ArrayData (#3529)
14545a42e is described below
commit 14545a42ec09782ec0371c05c01d112e0ca37604
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed Jan 18 00:41:40 2023 +0100
Minor: Add documentation about memory use for ArrayData (#3529)
* Minor: Add documentation about memory use for ArrayData
* Apply suggestions from code review
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
* Apply suggestions from code review
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
* Update arrow-data/src/data.rs
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
* Update arrow-data/src/data.rs
* Update arrow-data/src/bitmap.rs
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
arrow-data/src/bitmap.rs | 11 +++++-
arrow-data/src/data.rs | 100 ++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 94 insertions(+), 17 deletions(-)
diff --git a/arrow-data/src/bitmap.rs b/arrow-data/src/bitmap.rs
index 0002ef022..a356b9ff7 100644
--- a/arrow-data/src/bitmap.rs
+++ b/arrow-data/src/bitmap.rs
@@ -68,12 +68,19 @@ impl Bitmap {
self.bits
}
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [Bitmap].
+ /// Returns the total number of bytes of memory occupied by the
+ /// buffers owned by this [Bitmap].
+ ///
+ /// If multiple [`Bitmap`]s refer to the same underlying
+ /// [`Buffer`] they will each report the same size.
pub fn get_buffer_memory_size(&self) -> usize {
self.bits.capacity()
}
- /// Returns the total number of bytes of memory occupied physically by this [Bitmap].
+ /// Returns the total number of bytes of memory occupied
+ /// physically by this [Bitmap] and its [`Buffer`]s.
+ ///
+ /// Equivalent to: `size_of_val(self)` + [`Self::get_buffer_memory_size`]
pub fn get_array_memory_size(&self) -> usize {
self.bits.capacity() + mem::size_of_val(self)
}
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 14dbe9387..258ee082d 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates
+//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
//! common attributes and operations for Arrow array.
use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap};
@@ -245,6 +245,46 @@ pub(crate) fn into_buffers(
/// A generic representation of Arrow array data which encapsulates common attributes and
/// operations for Arrow array. Specific operations for different arrays types (e.g.,
/// primitive, list, struct) are implemented in `Array`.
+///
+/// # Memory Layout
+///
+/// `ArrayData` has references to one or more underlying data buffers
+/// and optional child ArrayDatas, depending on type as illustrated
+/// below. Bitmaps are not shown for simplicity but they are stored
+/// similarly to the buffers.
+///
+/// ```text
+/// offset
+/// points to
+/// ┌───────────────────┐ start of ┌───────┐ Different
+/// │ │ data │ │ ArrayData may
+/// │ArrayData { │ │.... │ also refer to
+/// │ data_type: ... │ ─ ─ ─ ─▶│1234 │ ┌ ─ the same
+/// │ offset: ... ─ ─ ─│─ ┘ │4372 │ underlying
+/// │ len: ... ─ ─ ─│─ ┐ │4888 │ │ buffer with different offset/len
+/// │ buffers: [ │ │5882 │◀─
+/// │ ... │ │ │4323 │
+/// │ ] │ ─ ─ ─ ─▶│4859 │
+/// │ child_data: [ │ │.... │
+/// │ ... │ │ │
+/// │ ] │ └───────┘
+/// │} │
+/// │ │ Shared Buffer uses
+/// │ │ │ bytes::Bytes to hold
+/// └───────────────────┘ actual data values
+/// ┌ ─ ─ ┘
+///
+/// ▼
+/// ┌───────────────────┐
+/// │ArrayData { │
+/// │ ... │
+/// │} │
+/// │ │
+/// └───────────────────┘
+///
+/// Child ArrayData may also have its own buffers and children
+/// ```
+
#[derive(Debug, Clone)]
pub struct ArrayData {
/// The data type for this array data
@@ -375,24 +415,25 @@ impl ArrayData {
Ok(new_self)
}
- /// Returns a builder to construct a `ArrayData` instance.
+ /// Returns a builder to construct an [`ArrayData`] instance of the given [`DataType`]
#[inline]
pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
ArrayDataBuilder::new(data_type)
}
- /// Returns a reference to the data type of this array data
+ /// Returns a reference to the [`DataType`] of this [`ArrayData`]
#[inline]
pub const fn data_type(&self) -> &DataType {
&self.data_type
}
- /// Returns a slice of buffers for this array data
+ /// Returns a slice of the [`Buffer`]s that hold the data.
pub fn buffers(&self) -> &[Buffer] {
&self.buffers[..]
}
- /// Returns a slice of children data arrays
+ /// Returns a slice of children [`ArrayData`]. This will be
+ /// non-empty for types such as lists and structs.
pub fn child_data(&self) -> &[ArrayData] {
&self.child_data[..]
}
@@ -405,13 +446,13 @@ impl ArrayData {
false
}
- /// Returns a reference to the null bitmap of this array data
+ /// Returns a reference to the null bitmap of this [`ArrayData`]
#[inline]
pub const fn null_bitmap(&self) -> Option<&Bitmap> {
self.null_bitmap.as_ref()
}
- /// Returns a reference to the null buffer of this array data.
+ /// Returns a reference to the null buffer of this [`ArrayData`].
pub fn null_buffer(&self) -> Option<&Buffer> {
self.null_bitmap().as_ref().map(|b| b.buffer_ref())
}
@@ -424,19 +465,19 @@ impl ArrayData {
true
}
- /// Returns the length (i.e., number of elements) of this array
+ /// Returns the length (i.e., number of elements) of this [`ArrayData`].
#[inline]
pub const fn len(&self) -> usize {
self.len
}
- // Returns whether array data is empty
+ /// Returns whether this [`ArrayData`] is empty
#[inline]
pub const fn is_empty(&self) -> bool {
self.len == 0
}
- /// Returns the offset of this array
+ /// Returns the offset of this [`ArrayData`]
#[inline]
pub const fn offset(&self) -> usize {
self.offset
@@ -448,7 +489,17 @@ impl ArrayData {
self.null_count
}
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData].
+ /// Returns the total number of bytes of memory occupied by the
+ /// buffers owned by this [`ArrayData`] and all of its
+ /// children. (See also diagram on [`ArrayData`]).
+ ///
+ /// Note that this [`ArrayData`] may only refer to a subset of the
+ /// data in the underlying [`Buffer`]s (due to `offset` and
+ /// `length`), but the size returned includes the entire size of
+ /// the buffers.
+ ///
+ /// If multiple [`ArrayData`]s refer to the same underlying
+ /// [`Buffer`]s they will each report the same size.
pub fn get_buffer_memory_size(&self) -> usize {
let mut size = 0;
for buffer in &self.buffers {
@@ -463,7 +514,18 @@ impl ArrayData {
size
}
- /// Returns the total number of the bytes of memory occupied by the buffers by this slice of [ArrayData]
+ /// Returns the total number of bytes of memory occupied by
+ /// the buffers used by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
+ ///
+ /// This is approximately the number of bytes if a new
+ /// [`ArrayData`] was formed by creating new [`Buffer`]s with
+ /// exactly the data needed.
+ ///
+ /// For example, for a [`DataType::Int64`] array with `100` elements,
+ /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
+ /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
+ /// first `20` elements, then [`Self::get_slice_memory_size`] on the
+ /// sliced [`ArrayData`] would return `20 * 8 = 160`.
pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
let mut result: usize = 0;
let layout = layout(&self.data_type);
@@ -519,7 +581,14 @@ impl ArrayData {
Ok(result)
}
- /// Returns the total number of bytes of memory occupied physically by this [ArrayData].
+ /// Returns the total number of bytes of memory occupied
+ /// physically by this [`ArrayData`] and all its [`Buffer`]s and
+ /// children. (See also diagram on [`ArrayData`]).
+ ///
+ /// Equivalent to:
+ /// `size_of_val(self)` +
+ /// [`Self::get_buffer_memory_size`] +
+ /// `size_of_val(child)` for all children
pub fn get_array_memory_size(&self) -> usize {
let mut size = mem::size_of_val(self);
@@ -541,8 +610,9 @@ impl ArrayData {
size
}
- /// Creates a zero-copy slice of itself. This creates a new [ArrayData]
- /// with a different offset, len and a shifted null bitmap.
+ /// Creates a zero-copy slice of itself. This creates a new
+ /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
+ /// different offset and len
///
/// # Panics
///