You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/05/17 11:08:30 UTC
[arrow-rs] branch master updated: Fix FFI and add support for
Struct type (#287)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c863a2c Fix FFI and add support for Struct type (#287)
c863a2c is described below
commit c863a2c44bffa5c092a49e07910d5e9225483193
Author: Roee Shlomo <ro...@gmail.com>
AuthorDate: Mon May 17 14:08:24 2021 +0300
Fix FFI and add support for Struct type (#287)
* fix: support nested types in FFI
Ported from https://github.com/jorgecarleitao/arrow2
Fix #20
Fix #251
Signed-off-by: roee88 <ro...@gmail.com>
* Removed Clone from FFI_ArrowArray
Signed-off-by: roee88 <ro...@gmail.com>
* Add nesting to FFI struct test
Signed-off-by: roee88 <ro...@gmail.com>
---
arrow/src/array/ffi.rs | 122 ++++------
arrow/src/ffi.rs | 606 ++++++++++++++++++++++++++++---------------------
2 files changed, 387 insertions(+), 341 deletions(-)
diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs
index 450685b..847649c 100644
--- a/arrow/src/array/ffi.rs
+++ b/arrow/src/array/ffi.rs
@@ -22,41 +22,16 @@ use std::convert::TryFrom;
use crate::{
error::{ArrowError, Result},
ffi,
+ ffi::ArrowArrayRef,
};
use super::ArrayData;
-use crate::datatypes::DataType;
-use crate::ffi::ArrowArray;
impl TryFrom<ffi::ArrowArray> for ArrayData {
type Error = ArrowError;
fn try_from(value: ffi::ArrowArray) -> Result<Self> {
- let child_data = value.children()?;
-
- let child_type = if !child_data.is_empty() {
- Some(child_data[0].data_type().clone())
- } else {
- None
- };
-
- let data_type = value.data_type(child_type)?;
-
- let len = value.len();
- let offset = value.offset();
- let null_count = value.null_count();
- let buffers = value.buffers()?;
- let null_bit_buffer = value.null_bit_buffer();
-
- Ok(ArrayData::new(
- data_type,
- len,
- Some(null_count),
- null_bit_buffer,
- offset,
- buffers,
- child_data,
- ))
+ value.to_data()
}
}
@@ -64,58 +39,7 @@ impl TryFrom<ArrayData> for ffi::ArrowArray {
type Error = ArrowError;
fn try_from(value: ArrayData) -> Result<Self> {
- // If parent is nullable, then children also must be nullable
- // so we pass this nullable to the creation of hte child data
- let nullable = match value.data_type() {
- DataType::List(field) => field.is_nullable(),
- DataType::LargeList(field) => field.is_nullable(),
- _ => false,
- };
-
- let len = value.len();
- let offset = value.offset() as usize;
- let null_count = value.null_count();
- let buffers = value.buffers().to_vec();
- let null_buffer = value.null_buffer().cloned();
- let child_data = value
- .child_data()
- .iter()
- .map(|arr| {
- let len = arr.len();
- let offset = arr.offset() as usize;
- let null_count = arr.null_count();
- let buffers = arr.buffers().to_vec();
- let null_buffer = arr.null_buffer().cloned();
-
- // Note: the nullable comes from the parent data.
- unsafe {
- ArrowArray::try_new(
- arr.data_type(),
- len,
- null_count,
- null_buffer,
- offset,
- buffers,
- vec![],
- nullable,
- )
- .expect("infallible")
- }
- })
- .collect::<Vec<_>>();
-
- unsafe {
- ffi::ArrowArray::try_new(
- value.data_type(),
- len,
- null_count,
- null_buffer,
- offset,
- buffers,
- child_data,
- nullable,
- )
- }
+ unsafe { ffi::ArrowArray::try_new(value) }
}
}
@@ -123,10 +47,15 @@ impl TryFrom<ArrayData> for ffi::ArrowArray {
mod tests {
use crate::error::Result;
use crate::{
- array::{Array, ArrayData, Int64Array, UInt32Array, UInt64Array},
+ array::{
+ Array, ArrayData, BooleanArray, Int64Array, StructArray, UInt32Array,
+ UInt64Array,
+ },
+ datatypes::{DataType, Field},
ffi::ArrowArray,
};
use std::convert::TryFrom;
+ use std::sync::Arc;
fn test_round_trip(expected: &ArrayData) -> Result<()> {
// create a `ArrowArray` from the data.
@@ -165,4 +94,37 @@ mod tests {
let data = array.data();
test_round_trip(data)
}
+
+ #[test]
+ fn test_struct() -> Result<()> {
+ let inner = StructArray::from(vec![
+ (
+ Field::new("a1", DataType::Boolean, false),
+ Arc::new(BooleanArray::from(vec![true, true, false, false]))
+ as Arc<dyn Array>,
+ ),
+ (
+ Field::new("a2", DataType::UInt32, false),
+ Arc::new(UInt32Array::from(vec![1, 2, 3, 4])),
+ ),
+ ]);
+
+ let array = StructArray::from(vec![
+ (
+ Field::new("a", inner.data_type().clone(), false),
+ Arc::new(inner) as Arc<dyn Array>,
+ ),
+ (
+ Field::new("b", DataType::Boolean, false),
+ Arc::new(BooleanArray::from(vec![false, false, true, true]))
+ as Arc<dyn Array>,
+ ),
+ (
+ Field::new("c", DataType::UInt32, false),
+ Arc::new(UInt32Array::from(vec![42, 28, 19, 31])),
+ ),
+ ]);
+ let data = array.data();
+ test_round_trip(data)
+ }
}
diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs
index 3a6d031..7b789f8 100644
--- a/arrow/src/ffi.rs
+++ b/arrow/src/ffi.rs
@@ -77,12 +77,10 @@ To export an array, create an `ArrowArray` using [ArrowArray::try_new].
*/
use std::{
- convert::TryFrom,
ffi::CStr,
ffi::CString,
iter,
- mem::{size_of, ManuallyDrop},
- os::raw::c_char,
+ mem::size_of,
ptr::{self, NonNull},
sync::Arc,
};
@@ -93,6 +91,12 @@ use crate::datatypes::{DataType, Field, TimeUnit};
use crate::error::{ArrowError, Result};
use crate::util::bit_util;
+#[allow(dead_code)]
+struct SchemaPrivateData {
+ field: Field,
+ children_ptr: Box<[*mut FFI_ArrowSchema]>,
+}
+
/// ABI-compatible struct for `ArrowSchema` from C Data Interface
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
/// This was created by bindgen
@@ -112,45 +116,65 @@ pub struct FFI_ArrowSchema {
// callback used to drop [FFI_ArrowSchema] when it is exported.
unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) {
+ if schema.is_null() {
+ return;
+ }
let schema = &mut *schema;
// take ownership back to release it.
CString::from_raw(schema.format as *mut std::os::raw::c_char);
+ CString::from_raw(schema.name as *mut std::os::raw::c_char);
+ let private = Box::from_raw(schema.private_data as *mut SchemaPrivateData);
+ for child in private.children_ptr.iter() {
+ let _ = Box::from_raw(*child);
+ }
schema.release = None;
}
-struct SchemaPrivateData {
- children: Box<[*mut FFI_ArrowSchema]>,
-}
-
impl FFI_ArrowSchema {
- /// create a new [FFI_ArrowSchema] from a format.
- fn new(
- format: &str,
- children: Vec<*mut FFI_ArrowSchema>,
- nullable: bool,
- ) -> FFI_ArrowSchema {
- let children = children.into_boxed_slice();
- let n_children = children.len() as i64;
- let children_ptr = children.as_ptr() as *mut *mut FFI_ArrowSchema;
-
- let flags = if nullable { 2 } else { 0 };
+ /// create a new [`Ffi_ArrowSchema`]. This fails if the fields' [`DataType`] is not supported.
+ fn try_new(field: Field) -> Result<FFI_ArrowSchema> {
+ let format = to_format(field.data_type())?;
+ let name = field.name().clone();
+
+ // allocate (and hold) the children
+ let children_vec = match field.data_type() {
+ DataType::List(field) => {
+ vec![Box::new(FFI_ArrowSchema::try_new(field.as_ref().clone())?)]
+ }
+ DataType::LargeList(field) => {
+ vec![Box::new(FFI_ArrowSchema::try_new(field.as_ref().clone())?)]
+ }
+ DataType::Struct(fields) => fields
+ .iter()
+ .map(|field| Ok(Box::new(FFI_ArrowSchema::try_new(field.clone())?)))
+ .collect::<Result<Vec<_>>>()?,
+ _ => vec![],
+ };
+ // note: this cannot be done along with the above because the above is fallible and this op leaks.
+ let mut children_ptr = children_vec
+ .into_iter()
+ .map(Box::into_raw)
+ .collect::<Box<_>>();
+ let n_children = children_ptr.len() as i64;
+ let children = children_ptr.as_mut_ptr();
- let private_data = Box::new(SchemaPrivateData { children });
// <https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema>
- FFI_ArrowSchema {
+ Ok(FFI_ArrowSchema {
format: CString::new(format).unwrap().into_raw(),
- // For child data a non null string is expected and is called item
- name: CString::new("item").unwrap().into_raw(),
+ name: CString::new(name).unwrap().into_raw(),
metadata: std::ptr::null_mut(),
- flags,
+ flags: field.is_nullable() as i64 * 2,
n_children,
- children: children_ptr,
+ children,
dictionary: std::ptr::null_mut(),
release: Some(release_schema),
- private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void,
- }
+ private_data: Box::into_raw(Box::new(SchemaPrivateData {
+ field,
+ children_ptr,
+ })) as *mut ::std::os::raw::c_void,
+ })
}
/// create an empty [FFI_ArrowSchema]
@@ -170,10 +194,29 @@ impl FFI_ArrowSchema {
/// returns the format of this schema.
pub fn format(&self) -> &str {
+ assert!(!self.format.is_null());
+ // safe because the lifetime of `self.format` equals `self`
unsafe { CStr::from_ptr(self.format) }
.to_str()
.expect("The external API has a non-utf8 as format")
}
+
+ /// returns the name of this schema.
+ pub fn name(&self) -> &str {
+ assert!(!self.name.is_null());
+ // safe because the lifetime of `self.name` equals `self`
+ unsafe { CStr::from_ptr(self.name) }.to_str().unwrap()
+ }
+
+ pub fn child(&self, index: usize) -> &Self {
+ assert!(index < self.n_children as usize);
+ assert!(!self.name.is_null());
+ unsafe { self.children.add(index).as_ref().unwrap().as_ref().unwrap() }
+ }
+
+ pub fn nullable(&self) -> bool {
+ (self.flags / 2) & 1 == 1
+ }
}
impl Drop for FFI_ArrowSchema {
@@ -185,14 +228,9 @@ impl Drop for FFI_ArrowSchema {
}
}
-/// maps a DataType `format` to a [DataType](arrow::datatypes::DataType).
/// See https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
-fn to_datatype(
- format: &str,
- child_type: Option<DataType>,
- schema: &FFI_ArrowSchema,
-) -> Result<DataType> {
- Ok(match format {
+fn to_field(schema: &FFI_ArrowSchema) -> Result<Field> {
+ let data_type = match schema.format() {
"n" => DataType::Null,
"b" => DataType::Boolean,
"c" => DataType::Int8,
@@ -216,54 +254,33 @@ fn to_datatype(
"ttm" => DataType::Time32(TimeUnit::Millisecond),
"ttu" => DataType::Time64(TimeUnit::Microsecond),
"ttn" => DataType::Time64(TimeUnit::Nanosecond),
-
- // Note: The datatype null will only be created when called from ArrowArray::buffer_len
- // at that point the child data is not yet known, but it is also not required to determine
- // the buffer length of the list arrays.
"+l" => {
- let nullable = schema.flags == 2;
- // Safety
- // Should be set as this is expected from the C FFI definition
- debug_assert!(!schema.name.is_null());
- let name = unsafe { CString::from_raw(schema.name as *mut c_char) }
- .into_string()
- .unwrap();
- // prevent a double free
- let name = ManuallyDrop::new(name);
- DataType::List(Box::new(Field::new(
- &name,
- child_type.unwrap_or(DataType::Null),
- nullable,
- )))
+ let child = schema.child(0);
+ DataType::List(Box::new(to_field(child)?))
}
"+L" => {
- let nullable = schema.flags == 2;
- // Safety
- // Should be set as this is expected from the C FFI definition
- debug_assert!(!schema.name.is_null());
- let name = unsafe { CString::from_raw(schema.name as *mut c_char) }
- .into_string()
- .unwrap();
- // prevent a double free
- let name = ManuallyDrop::new(name);
- DataType::LargeList(Box::new(Field::new(
- &name,
- child_type.unwrap_or(DataType::Null),
- nullable,
- )))
+ let child = schema.child(0);
+ DataType::LargeList(Box::new(to_field(child)?))
}
- dt => {
+ "+s" => {
+ let children = (0..schema.n_children as usize)
+ .map(|x| to_field(schema.child(x)))
+ .collect::<Result<Vec<_>>>()?;
+ DataType::Struct(children)
+ }
+ other => {
return Err(ArrowError::CDataInterface(format!(
- "The datatype \"{}\" is not supported in the Rust implementation",
- dt
+ "The datatype \"{:?}\" is still not supported in Rust implementation",
+ other
)))
}
- })
+ };
+ Ok(Field::new(schema.name(), data_type, schema.nullable()))
}
-/// the inverse of [to_datatype]
-fn from_datatype(datatype: &DataType) -> Result<String> {
- Ok(match datatype {
+/// See https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
+fn to_format(data_type: &DataType) -> Result<String> {
+ Ok(match data_type {
DataType::Null => "n",
DataType::Boolean => "b",
DataType::Int8 => "c",
@@ -289,6 +306,7 @@ fn from_datatype(datatype: &DataType) -> Result<String> {
DataType::Time64(TimeUnit::Nanosecond) => "ttn",
DataType::List(_) => "+l",
DataType::LargeList(_) => "+L",
+ DataType::Struct(_) => "+s",
z => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{:?}\" is still not supported in Rust implementation",
@@ -386,14 +404,27 @@ pub struct FFI_ArrowArray {
private_data: *mut ::std::os::raw::c_void,
}
+impl Drop for FFI_ArrowArray {
+ fn drop(&mut self) {
+ match self.release {
+ None => (),
+ Some(release) => unsafe { release(self) },
+ };
+ }
+}
+
// callback used to drop [FFI_ArrowArray] when it is exported
unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) {
if array.is_null() {
return;
}
let array = &mut *array;
- // take ownership of `private_data`, therefore dropping it
- Box::from_raw(array.private_data as *mut PrivateData);
+
+ // take ownership of `private_data`, therefore dropping it`
+ let private = Box::from_raw(array.private_data as *mut PrivateData);
+ for child in private.children.iter() {
+ let _ = Box::from_raw(*child);
+ }
array.release = None;
}
@@ -409,15 +440,15 @@ impl FFI_ArrowArray {
/// # Safety
/// This method releases `buffers`. Consumers of this struct *must* call `release` before
/// releasing this struct, or contents in `buffers` leak.
- unsafe fn new(
- length: i64,
- null_count: i64,
- offset: i64,
- n_buffers: i64,
- buffers: Vec<Option<Buffer>>,
- children: Vec<*mut FFI_ArrowArray>,
- ) -> Self {
- let buffers_ptr = buffers
+ fn new(data: &ArrayData) -> Self {
+ // * insert the null buffer at the start
+ // * make all others `Option<Buffer>`.
+ let buffers = iter::once(data.null_buffer().cloned())
+ .chain(data.buffers().iter().map(|b| Some(b.clone())))
+ .collect::<Vec<_>>();
+ let n_buffers = buffers.len() as i64;
+
+ let mut buffers_ptr = buffers
.iter()
.map(|maybe_buffer| match maybe_buffer {
// note that `raw_data` takes into account the buffer's offset
@@ -425,10 +456,14 @@ impl FFI_ArrowArray {
None => std::ptr::null(),
})
.collect::<Box<[_]>>();
- let pointer = buffers_ptr.as_ptr() as *mut *const std::ffi::c_void;
+ let pointer = buffers_ptr.as_mut_ptr();
- let children = children.into_boxed_slice();
- let children_ptr = children.as_ptr() as *mut *mut FFI_ArrowArray;
+ let mut children = data
+ .child_data()
+ .iter()
+ .map(|child| Box::into_raw(Box::new(FFI_ArrowArray::new(child))))
+ .collect::<Box<_>>();
+ let children_ptr = children.as_mut_ptr();
let n_children = children.len() as i64;
// create the private data owning everything.
@@ -440,9 +475,9 @@ impl FFI_ArrowArray {
});
Self {
- length,
- null_count,
- offset,
+ length: data.len() as i64,
+ null_count: data.null_count() as i64,
+ offset: data.offset() as i64,
n_buffers,
n_children,
buffers: pointer,
@@ -468,6 +503,26 @@ impl FFI_ArrowArray {
private_data: std::ptr::null_mut(),
}
}
+
+ /// the length of the array
+ pub fn len(&self) -> usize {
+ self.length as usize
+ }
+
+ /// whether the array is empty
+ pub fn is_empty(&self) -> bool {
+ self.length == 0
+ }
+
+ /// the offset of the array
+ pub fn offset(&self) -> usize {
+ self.offset as usize
+ }
+
+ /// the null count of the array
+ pub fn null_count(&self) -> usize {
+ self.null_count as usize
+ }
}
/// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer).
@@ -478,7 +533,8 @@ impl FFI_ArrowArray {
/// # Safety
/// This function assumes that `ceil(self.length * bits, 8)` is the size of the buffer
unsafe fn create_buffer(
- array: Arc<FFI_ArrowArray>,
+ owner: Arc<FFI_ArrowArray>,
+ array: &FFI_ArrowArray,
index: usize,
len: usize,
) -> Option<Buffer> {
@@ -490,33 +546,149 @@ unsafe fn create_buffer(
assert!(index < array.n_buffers as usize);
let ptr = *buffers.add(index);
- NonNull::new(ptr as *mut u8).map(|ptr| Buffer::from_unowned(ptr, len, array))
+ NonNull::new(ptr as *mut u8).map(|ptr| Buffer::from_unowned(ptr, len, owner))
}
-unsafe fn create_child_arrays(
- array: Arc<FFI_ArrowArray>,
- schema: Arc<FFI_ArrowSchema>,
-) -> Result<Vec<ArrayData>> {
- (0..array.n_children as usize)
- .map(|i| {
- let arr_ptr = *array.children.add(i);
- let schema_ptr = *schema.children.add(i);
- let arrow_arr = ArrowArray::try_from_raw(
- arr_ptr as *const FFI_ArrowArray,
- schema_ptr as *const FFI_ArrowSchema,
- )?;
- ArrayData::try_from(arrow_arr)
- })
- .collect()
+fn create_child(
+ owner: Arc<FFI_ArrowArray>,
+ array: &FFI_ArrowArray,
+ schema: &FFI_ArrowSchema,
+ index: usize,
+) -> ArrowArrayChild<'static> {
+ assert!(index < array.n_children as usize);
+ assert!(!array.children.is_null());
+ assert!(!array.children.is_null());
+ unsafe {
+ let arr_ptr = *array.children.add(index);
+ let schema_ptr = *schema.children.add(index);
+ assert!(!arr_ptr.is_null());
+ assert!(!schema_ptr.is_null());
+ let arr_ptr = &*arr_ptr;
+ let schema_ptr = &*schema_ptr;
+ ArrowArrayChild::from_raw(arr_ptr, schema_ptr, owner)
+ }
}
-impl Drop for FFI_ArrowArray {
- fn drop(&mut self) {
- match self.release {
- None => (),
- Some(release) => unsafe { release(self) },
- };
+pub trait ArrowArrayRef {
+ fn to_data(&self) -> Result<ArrayData> {
+ let data_type = self.data_type()?;
+ let len = self.array().len();
+ let offset = self.array().offset();
+ let null_count = self.array().null_count();
+ let buffers = self.buffers()?;
+ let null_bit_buffer = self.null_bit_buffer();
+
+ let child_data = (0..self.array().n_children as usize)
+ .map(|i| {
+ let child = self.child(i);
+ child.to_data()
+ })
+ .map(|d| d.unwrap())
+ .collect();
+
+ Ok(ArrayData::new(
+ data_type,
+ len,
+ Some(null_count),
+ null_bit_buffer,
+ offset,
+ buffers,
+ child_data,
+ ))
+ }
+
+ /// returns all buffers, as organized by Rust (i.e. null buffer is skipped)
+ fn buffers(&self) -> Result<Vec<Buffer>> {
+ (0..self.array().n_buffers - 1)
+ .map(|index| {
+ // + 1: skip null buffer
+ let index = (index + 1) as usize;
+
+ let len = self.buffer_len(index)?;
+
+ unsafe { create_buffer(self.owner().clone(), self.array(), index, len) }
+ .ok_or_else(|| {
+ ArrowError::CDataInterface(format!(
+ "The external buffer at position {} is null.",
+ index - 1
+ ))
+ })
+ })
+ .collect()
}
+
+ /// Returns the length, in bytes, of the buffer `i` (indexed according to the C data interface)
+ // Rust implementation uses fixed-sized buffers, which require knowledge of their `len`.
+ // for variable-sized buffers, such as the second buffer of a stringArray, we need
+ // to fetch offset buffer's len to build the second buffer.
+ fn buffer_len(&self, i: usize) -> Result<usize> {
+ // Inner type is not important for buffer length.
+ let data_type = &self.data_type()?;
+
+ Ok(match (data_type, i) {
+ (DataType::Utf8, 1)
+ | (DataType::LargeUtf8, 1)
+ | (DataType::Binary, 1)
+ | (DataType::LargeBinary, 1)
+ | (DataType::List(_), 1)
+ | (DataType::LargeList(_), 1) => {
+ // the len of the offset buffer (buffer 1) equals length + 1
+ let bits = bit_width(data_type, i)?;
+ debug_assert_eq!(bits % 8, 0);
+ (self.array().length as usize + 1) * (bits / 8)
+ }
+ (DataType::Utf8, 2) | (DataType::Binary, 2) | (DataType::List(_), 2) => {
+ // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
+ let len = self.buffer_len(1)?;
+ // first buffer is the null buffer => add(1)
+ // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
+ #[allow(clippy::cast_ptr_alignment)]
+ let offset_buffer = unsafe {
+ *(self.array().buffers as *mut *const u8).add(1) as *const i32
+ };
+ // get last offset
+ (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
+ }
+ (DataType::LargeUtf8, 2)
+ | (DataType::LargeBinary, 2)
+ | (DataType::LargeList(_), 2) => {
+ // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
+ let len = self.buffer_len(1)?;
+ // first buffer is the null buffer => add(1)
+ // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
+ #[allow(clippy::cast_ptr_alignment)]
+ let offset_buffer = unsafe {
+ *(self.array().buffers as *mut *const u8).add(1) as *const i64
+ };
+ // get last offset
+ (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
+ }
+ // buffer len of primitive types
+ _ => {
+ let bits = bit_width(data_type, i)?;
+ bit_util::ceil(self.array().length as usize * bits, 8)
+ }
+ })
+ }
+
+ /// returns the null bit buffer.
+ /// Rust implementation uses a buffer that is not part of the array of buffers.
+ /// The C Data interface's null buffer is part of the array of buffers.
+ fn null_bit_buffer(&self) -> Option<Buffer> {
+ // similar to `self.buffer_len(0)`, but without `Result`.
+ let buffer_len = bit_util::ceil(self.array().length as usize, 8);
+
+ unsafe { create_buffer(self.owner().clone(), self.array(), 0, buffer_len) }
+ }
+
+ fn child(&self, index: usize) -> ArrowArrayChild {
+ create_child(self.owner().clone(), self.array(), self.schema(), index)
+ }
+
+ fn owner(&self) -> &Arc<FFI_ArrowArray>;
+ fn array(&self) -> &FFI_ArrowArray;
+ fn schema(&self) -> &FFI_ArrowSchema;
+ fn data_type(&self) -> Result<DataType>;
}
/// Struct used to move an Array from and to the C Data Interface.
@@ -540,51 +712,64 @@ impl Drop for FFI_ArrowArray {
/// Furthermore, this struct assumes that the incoming data agrees with the C data interface.
#[derive(Debug)]
pub struct ArrowArray {
- // these are ref-counted because they can be shared by multiple buffers.
array: Arc<FFI_ArrowArray>,
schema: Arc<FFI_ArrowSchema>,
}
+#[derive(Debug)]
+pub struct ArrowArrayChild<'a> {
+ array: &'a FFI_ArrowArray,
+ schema: &'a FFI_ArrowSchema,
+ owner: Arc<FFI_ArrowArray>,
+}
+
+impl ArrowArrayRef for ArrowArray {
+ /// the data_type as declared in the schema
+ fn data_type(&self) -> Result<DataType> {
+ to_field(&self.schema).map(|x| x.data_type().clone())
+ }
+
+ fn array(&self) -> &FFI_ArrowArray {
+ self.array.as_ref()
+ }
+
+ fn schema(&self) -> &FFI_ArrowSchema {
+ self.schema.as_ref()
+ }
+
+ fn owner(&self) -> &Arc<FFI_ArrowArray> {
+ &self.array
+ }
+}
+
+impl<'a> ArrowArrayRef for ArrowArrayChild<'a> {
+ /// the data_type as declared in the schema
+ fn data_type(&self) -> Result<DataType> {
+ to_field(self.schema).map(|x| x.data_type().clone())
+ }
+
+ fn array(&self) -> &FFI_ArrowArray {
+ self.array
+ }
+
+ fn schema(&self) -> &FFI_ArrowSchema {
+ self.schema
+ }
+
+ fn owner(&self) -> &Arc<FFI_ArrowArray> {
+ &self.owner
+ }
+}
+
impl ArrowArray {
/// creates a new `ArrowArray`. This is used to export to the C Data Interface.
/// # Safety
/// See safety of [ArrowArray]
#[allow(clippy::too_many_arguments)]
- pub unsafe fn try_new(
- data_type: &DataType,
- len: usize,
- null_count: usize,
- null_buffer: Option<Buffer>,
- offset: usize,
- buffers: Vec<Buffer>,
- child_data: Vec<ArrowArray>,
- nullable: bool,
- ) -> Result<Self> {
- let format = from_datatype(data_type)?;
- // * insert the null buffer at the start
- // * make all others `Option<Buffer>`.
- let new_buffers = iter::once(null_buffer)
- .chain(buffers.iter().map(|b| Some(b.clone())))
- .collect::<Vec<_>>();
-
- let mut ffi_arrow_arrays = Vec::with_capacity(child_data.len());
- let mut ffi_arrow_schemas = Vec::with_capacity(child_data.len());
-
- child_data.into_iter().for_each(|arrow_arr| {
- let (arr, schema) = ArrowArray::into_raw(arrow_arr);
- ffi_arrow_arrays.push(arr as *mut FFI_ArrowArray);
- ffi_arrow_schemas.push(schema as *mut FFI_ArrowSchema);
- });
-
- let schema = Arc::new(FFI_ArrowSchema::new(&format, ffi_arrow_schemas, nullable));
- let array = Arc::new(FFI_ArrowArray::new(
- len as i64,
- null_count as i64,
- offset as i64,
- new_buffers.len() as i64,
- new_buffers,
- ffi_arrow_arrays,
- ));
+ pub unsafe fn try_new(data: ArrayData) -> Result<Self> {
+ let field = Field::new("", data.data_type().clone(), data.null_count() != 0);
+ let array = Arc::new(FFI_ArrowArray::new(&data));
+ let schema = Arc::new(FFI_ArrowSchema::try_new(field)?);
Ok(ArrowArray { array, schema })
}
@@ -623,120 +808,19 @@ impl ArrowArray {
pub fn into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema) {
(Arc::into_raw(this.array), Arc::into_raw(this.schema))
}
+}
- /// returns the null bit buffer.
- /// Rust implementation uses a buffer that is not part of the array of buffers.
- /// The C Data interface's null buffer is part of the array of buffers.
- pub fn null_bit_buffer(&self) -> Option<Buffer> {
- // similar to `self.buffer_len(0)`, but without `Result`.
- let buffer_len = bit_util::ceil(self.array.length as usize, 8);
-
- unsafe { create_buffer(self.array.clone(), 0, buffer_len) }
- }
-
- /// Returns the length, in bytes, of the buffer `i` (indexed according to the C data interface)
- // Rust implementation uses fixed-sized buffers, which require knowledge of their `len`.
- // for variable-sized buffers, such as the second buffer of a stringArray, we need
- // to fetch offset buffer's len to build the second buffer.
- fn buffer_len(&self, i: usize) -> Result<usize> {
- // Inner type is not important for buffer length.
- let data_type = &self.data_type(None)?;
-
- Ok(match (data_type, i) {
- (DataType::Utf8, 1)
- | (DataType::LargeUtf8, 1)
- | (DataType::Binary, 1)
- | (DataType::LargeBinary, 1)
- | (DataType::List(_), 1)
- | (DataType::LargeList(_), 1) => {
- // the len of the offset buffer (buffer 1) equals length + 1
- let bits = bit_width(data_type, i)?;
- debug_assert_eq!(bits % 8, 0);
- (self.array.length as usize + 1) * (bits / 8)
- }
- (DataType::Utf8, 2) | (DataType::Binary, 2) | (DataType::List(_), 2) => {
- // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
- let len = self.buffer_len(1)?;
- // first buffer is the null buffer => add(1)
- // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
- #[allow(clippy::cast_ptr_alignment)]
- let offset_buffer = unsafe {
- *(self.array.buffers as *mut *const u8).add(1) as *const i32
- };
- // get last offset
- (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
- }
- (DataType::LargeUtf8, 2)
- | (DataType::LargeBinary, 2)
- | (DataType::LargeList(_), 2) => {
- // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
- let len = self.buffer_len(1)?;
- // first buffer is the null buffer => add(1)
- // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
- #[allow(clippy::cast_ptr_alignment)]
- let offset_buffer = unsafe {
- *(self.array.buffers as *mut *const u8).add(1) as *const i64
- };
- // get last offset
- (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
- }
- // buffer len of primitive types
- _ => {
- let bits = bit_width(data_type, i)?;
- bit_util::ceil(self.array.length as usize * bits, 8)
- }
- })
- }
-
- /// returns all buffers, as organized by Rust (i.e. null buffer is skipped)
- pub fn buffers(&self) -> Result<Vec<Buffer>> {
- (0..self.array.n_buffers - 1)
- .map(|index| {
- // + 1: skip null buffer
- let index = (index + 1) as usize;
-
- let len = self.buffer_len(index)?;
-
- unsafe { create_buffer(self.array.clone(), index, len) }.ok_or_else(
- || {
- ArrowError::CDataInterface(format!(
- "The external buffer at position {} is null.",
- index - 1
- ))
- },
- )
- })
- .collect()
- }
-
- /// returns the child data of this array
- pub fn children(&self) -> Result<Vec<ArrayData>> {
- unsafe { create_child_arrays(self.array.clone(), self.schema.clone()) }
- }
-
- /// the length of the array
- pub fn len(&self) -> usize {
- self.array.length as usize
- }
-
- /// whether the array is empty
- pub fn is_empty(&self) -> bool {
- self.array.length == 0
- }
-
- /// the offset of the array
- pub fn offset(&self) -> usize {
- self.array.offset as usize
- }
-
- /// the null count of the array
- pub fn null_count(&self) -> usize {
- self.array.null_count as usize
- }
-
- /// the data_type as declared in the schema
- pub fn data_type(&self, child_type: Option<DataType>) -> Result<DataType> {
- to_datatype(self.schema.format(), child_type, self.schema.as_ref())
+impl<'a> ArrowArrayChild<'a> {
+ fn from_raw(
+ array: &'a FFI_ArrowArray,
+ schema: &'a FFI_ArrowSchema,
+ owner: Arc<FFI_ArrowArray>,
+ ) -> Self {
+ Self {
+ array,
+ schema,
+ owner,
+ }
}
}