You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by su...@apache.org on 2019/03/20 23:51:41 UTC
[arrow] branch master updated: ARROW-4853: [Rust] Array slice
doesn't work on ListArray and StructArray
This is an automated email from the ASF dual-hosted git repository.
sunchao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new fbcaf9f ARROW-4853: [Rust] Array slice doesn't work on ListArray and StructArray
fbcaf9f is described below
commit fbcaf9f920ef0664de37ac37f9924c7ca9299344
Author: Chao Sun <su...@apache.org>
AuthorDate: Wed Mar 20 16:51:30 2019 -0700
ARROW-4853: [Rust] Array slice doesn't work on ListArray and StructArray
This fixes a few issues related to Array offset and slicing:
1. The invariant check in ListArray::from() is no longer valid for the case
of offset. This removes the check.
2. Both `Array::is_valid` and `Array::is_null` should take into account of offset.
3. `StructArray::len()` should return `data.len()` instead of the length of the
first child array. In the case of slicing, we'll not modify child array data so
the previous approach will return wrong result.
Author: Chao Sun <su...@apache.org>
Closes #3972 from sunchao/ARROW-4853 and squashes the following commits:
299d6a43 <Chao Sun> Fix StructBuilder reuse
c8633467 <Chao Sun> Improve unit tests
78e8e880 <Chao Sun> Fix format
e0fed916 <Chao Sun> Fix StructArray::column
8278c786 <Chao Sun> Fix StructArray::From<Vec<(Field, ArrayRef)>>
c7814331 <Chao Sun> Fix StructBuilder length & unit tests
99af8d50 <Chao Sun> ARROW-4853: Array slice doesn't work on ListArray and StructArray
---
rust/arrow/src/array.rs | 201 +++++++++++++++++++++++++++++++++++++++-------
rust/arrow/src/builder.rs | 19 +++++
2 files changed, 190 insertions(+), 30 deletions(-)
diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs
index 7b692bb..6438cb9 100644
--- a/rust/arrow/src/array.rs
+++ b/rust/arrow/src/array.rs
@@ -113,12 +113,12 @@ pub trait Array: Send + Sync {
/// Returns whether the element at index `i` is null
fn is_null(&self, i: usize) -> bool {
- self.data().is_null(i)
+ self.data().is_null(self.data().offset() + i)
}
/// Returns whether the element at index `i` is not null
fn is_valid(&self, i: usize) -> bool {
- self.data().is_valid(i)
+ self.data().is_valid(self.data().offset() + i)
}
/// Returns the total number of nulls in this array
@@ -489,8 +489,8 @@ impl PrimitiveArray<BooleanType> {
/// Returns the boolean value at index `i`.
pub fn value(&self, i: usize) -> bool {
+ assert!(i < self.data.len());
let offset = i + self.offset();
- assert!(offset < self.data.len());
unsafe { bit_util::get_bit_raw(self.raw_values.get() as *const u8, offset) }
}
@@ -751,11 +751,6 @@ impl From<ArrayDataRef> for ListArray {
let value_offsets = raw_value_offsets as *const i32;
unsafe {
assert_eq!(*value_offsets.offset(0), 0, "offsets do not start at zero");
- assert_eq!(
- *value_offsets.offset(data.len() as isize),
- values.data().len() as i32,
- "inconsistent offsets buffer and values array"
- );
}
Self {
data: data.clone(),
@@ -954,7 +949,12 @@ impl From<ArrayDataRef> for StructArray {
fn from(data: ArrayDataRef) -> Self {
let mut boxed_fields = vec![];
for cd in data.child_data() {
- boxed_fields.push(make_array(cd.clone()));
+ let child_data = if data.offset != 0 || data.len != cd.len {
+ slice_data(cd.clone(), data.offset, data.len)
+ } else {
+ cd.clone()
+ };
+ boxed_fields.push(make_array(child_data));
}
Self { data, boxed_fields }
}
@@ -975,7 +975,7 @@ impl Array for StructArray {
/// Returns the length (i.e., number of elements) of this array
fn len(&self) -> usize {
- self.boxed_fields[0].len()
+ self.data().len()
}
}
@@ -995,6 +995,7 @@ impl From<Vec<(Field, ArrayRef)>> for StructArray {
let data = ArrayData::builder(DataType::Struct(field_types))
.child_data(field_values.into_iter().map(|a| a.data()).collect())
+ .len(length)
.build();
Self::from(data)
}
@@ -1142,7 +1143,11 @@ mod tests {
assert_eq!(5, arr2.len());
assert_eq!(2, arr2.offset());
assert_eq!(1, arr2.null_count());
- assert!(arr2.is_null(1));
+
+ for i in 0..arr2.len() {
+ assert_eq!(i == 1, arr2.is_null(i));
+ assert_eq!(i != 1, arr2.is_valid(i));
+ }
let arr3 = arr2.slice(2, 3);
assert_eq!(3, arr3.len());
@@ -1395,6 +1400,71 @@ mod tests {
}
#[test]
+
+ fn test_list_array_slice() {
+ // Construct a value array
+ let value_data = ArrayData::builder(DataType::Int32)
+ .len(10)
+ .add_buffer(Buffer::from(
+ &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9].to_byte_slice(),
+ ))
+ .build();
+
+ // Construct a buffer for value offsets, for the nested array:
+ // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]]
+ let value_offsets =
+ Buffer::from(&[0, 2, 2, 2, 4, 6, 6, 9, 9, 10].to_byte_slice());
+ // 01011001 00000001
+ let mut null_bits: [u8; 2] = [0; 2];
+ bit_util::set_bit(&mut null_bits, 0);
+ bit_util::set_bit(&mut null_bits, 3);
+ bit_util::set_bit(&mut null_bits, 4);
+ bit_util::set_bit(&mut null_bits, 6);
+ bit_util::set_bit(&mut null_bits, 8);
+
+ // Construct a list array from the above two
+ let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data = ArrayData::builder(list_data_type.clone())
+ .len(9)
+ .add_buffer(value_offsets.clone())
+ .add_child_data(value_data.clone())
+ .null_bit_buffer(Buffer::from(null_bits))
+ .build();
+ let list_array = ListArray::from(list_data);
+
+ let values = list_array.values();
+ assert_eq!(value_data, values.data());
+ assert_eq!(DataType::Int32, list_array.value_type());
+ assert_eq!(9, list_array.len());
+ assert_eq!(4, list_array.null_count());
+ assert_eq!(2, list_array.value_offset(3));
+ assert_eq!(2, list_array.value_length(3));
+
+ let sliced_array = list_array.slice(1, 6);
+ assert_eq!(6, sliced_array.len());
+ assert_eq!(1, sliced_array.offset());
+ assert_eq!(3, sliced_array.null_count());
+
+ for i in 0..sliced_array.len() {
+ if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+ assert!(sliced_array.is_valid(i));
+ } else {
+ assert!(sliced_array.is_null(i));
+ }
+ }
+
+ // Check offset and length for each non-null value.
+ let sliced_list_array =
+ sliced_array.as_any().downcast_ref::<ListArray>().unwrap();
+ assert_eq!(2, sliced_list_array.value_offset(2));
+ assert_eq!(2, sliced_list_array.value_length(2));
+ assert_eq!(4, sliced_list_array.value_offset(3));
+ assert_eq!(2, sliced_list_array.value_length(3));
+ assert_eq!(6, sliced_list_array.value_offset(5));
+ assert_eq!(3, sliced_list_array.value_length(5));
+ }
+
+ #[test]
#[should_panic(
expected = "ListArray data should contain a single buffer only (value offsets)"
)]
@@ -1445,25 +1515,6 @@ mod tests {
}
#[test]
- #[should_panic(expected = "inconsistent offsets buffer and values array")]
- fn test_list_array_invalid_value_offset_end() {
- let value_data = ArrayData::builder(DataType::Int32)
- .len(8)
- .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()))
- .build();
-
- let value_offsets = Buffer::from(&[0, 2, 5, 7].to_byte_slice());
-
- let list_data_type = DataType::List(Box::new(DataType::Int32));
- let list_data = ArrayData::builder(list_data_type.clone())
- .len(3)
- .add_buffer(value_offsets.clone())
- .add_child_data(value_data.clone())
- .build();
- ListArray::from(list_data);
- }
-
- #[test]
fn test_binary_array() {
let values: [u8; 12] = [
b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
@@ -1659,6 +1710,7 @@ mod tests {
field_types.push(Field::new("a", DataType::Boolean, false));
field_types.push(Field::new("b", DataType::Int64, false));
let struct_array_data = ArrayData::builder(DataType::Struct(field_types))
+ .len(4)
.add_child_data(boolean_data.clone())
.add_child_data(int_data.clone())
.build();
@@ -1691,6 +1743,95 @@ mod tests {
]);
assert_eq!(boolean_data, struct_array.column(0).data());
assert_eq!(int_data, struct_array.column(1).data());
+ assert_eq!(4, struct_array.len());
+ assert_eq!(0, struct_array.null_count());
+ assert_eq!(0, struct_array.offset());
+ }
+
+ #[test]
+ fn test_struct_array_slice() {
+ let boolean_data = ArrayData::builder(DataType::Boolean)
+ .len(5)
+ .add_buffer(Buffer::from([0b00010000]))
+ .null_bit_buffer(Buffer::from([0b00010001]))
+ .build();
+ let int_data = ArrayData::builder(DataType::Int32)
+ .len(5)
+ .add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice()))
+ .null_bit_buffer(Buffer::from([0b00000110]))
+ .build();
+
+ let mut field_types = vec![];
+ field_types.push(Field::new("a", DataType::Boolean, false));
+ field_types.push(Field::new("b", DataType::Int32, false));
+ let struct_array_data = ArrayData::builder(DataType::Struct(field_types))
+ .len(5)
+ .add_child_data(boolean_data.clone())
+ .add_child_data(int_data.clone())
+ .null_bit_buffer(Buffer::from([0b00010111]))
+ .build();
+ let struct_array = StructArray::from(struct_array_data);
+
+ assert_eq!(5, struct_array.len());
+ assert_eq!(1, struct_array.null_count());
+ assert!(struct_array.is_valid(0));
+ assert!(struct_array.is_valid(1));
+ assert!(struct_array.is_valid(2));
+ assert!(struct_array.is_null(3));
+ assert!(struct_array.is_valid(4));
+ assert_eq!(boolean_data, struct_array.column(0).data());
+ assert_eq!(int_data, struct_array.column(1).data());
+
+ let c0 = struct_array.column(0);
+ let c0 = c0.as_any().downcast_ref::<BooleanArray>().unwrap();
+ assert_eq!(5, c0.len());
+ assert_eq!(3, c0.null_count());
+ assert!(c0.is_valid(0));
+ assert_eq!(false, c0.value(0));
+ assert!(c0.is_null(1));
+ assert!(c0.is_null(2));
+ assert!(c0.is_null(3));
+ assert!(c0.is_valid(4));
+ assert_eq!(true, c0.value(4));
+
+ let c1 = struct_array.column(1);
+ let c1 = c1.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert_eq!(5, c1.len());
+ assert_eq!(3, c1.null_count());
+ assert!(c1.is_null(0));
+ assert!(c1.is_valid(1));
+ assert_eq!(28, c1.value(1));
+ assert!(c1.is_valid(2));
+ assert_eq!(42, c1.value(2));
+ assert!(c1.is_null(3));
+ assert!(c1.is_null(4));
+
+ let sliced_array = struct_array.slice(2, 3);
+ let sliced_array = sliced_array.as_any().downcast_ref::<StructArray>().unwrap();
+ assert_eq!(3, sliced_array.len());
+ assert_eq!(2, sliced_array.offset());
+ assert_eq!(1, sliced_array.null_count());
+ assert!(sliced_array.is_valid(0));
+ assert!(sliced_array.is_null(1));
+ assert!(sliced_array.is_valid(2));
+
+ let sliced_c0 = sliced_array.column(0);
+ let sliced_c0 = sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap();
+ assert_eq!(3, sliced_c0.len());
+ assert_eq!(2, sliced_c0.offset());
+ assert!(sliced_c0.is_null(0));
+ assert!(sliced_c0.is_null(1));
+ assert!(sliced_c0.is_valid(2));
+ assert_eq!(true, sliced_c0.value(2));
+
+ let sliced_c1 = sliced_array.column(1);
+ let sliced_c1 = sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert_eq!(3, sliced_c1.len());
+ assert_eq!(2, sliced_c1.offset());
+ assert!(sliced_c1.is_valid(0));
+ assert_eq!(42, sliced_c1.value(0));
+ assert!(sliced_c1.is_null(1));
+ assert!(sliced_c1.is_null(2));
}
#[test]
diff --git a/rust/arrow/src/builder.rs b/rust/arrow/src/builder.rs
index 0cbb5e1..b58628a 100644
--- a/rust/arrow/src/builder.rs
+++ b/rust/arrow/src/builder.rs
@@ -696,6 +696,9 @@ impl StructBuilder {
.null_count(null_count)
.null_bit_buffer(null_bit_buffer);
}
+
+ self.len = 0;
+
StructArray::from(builder.build())
}
}
@@ -1355,7 +1358,15 @@ mod tests {
])
.unwrap();
+ // Append slot values - all are valid.
+ for _ in 0..10 {
+ assert!(builder.append(true).is_ok())
+ }
+
+ assert_eq!(10, builder.len());
+
let arr = builder.finish();
+
assert_eq!(10, arr.len());
assert_eq!(0, builder.len());
@@ -1370,7 +1381,15 @@ mod tests {
.append_slice(&[false, true, false, true, false])
.unwrap();
+ // Append slot values - all are valid.
+ for _ in 0..5 {
+ assert!(builder.append(true).is_ok())
+ }
+
+ assert_eq!(5, builder.len());
+
let arr = builder.finish();
+
assert_eq!(5, arr.len());
assert_eq!(0, builder.len());
}