You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by su...@apache.org on 2019/03/20 23:51:41 UTC

[arrow] branch master updated: ARROW-4853: [Rust] Array slice doesn't work on ListArray and StructArray

This is an automated email from the ASF dual-hosted git repository.

sunchao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new fbcaf9f  ARROW-4853: [Rust] Array slice doesn't work on ListArray and StructArray
fbcaf9f is described below

commit fbcaf9f920ef0664de37ac37f9924c7ca9299344
Author: Chao Sun <su...@apache.org>
AuthorDate: Wed Mar 20 16:51:30 2019 -0700

    ARROW-4853: [Rust] Array slice doesn't work on ListArray and StructArray
    
    This fixes a few issues related to Array offset and slicing:
    
    1. The invariant check in ListArray::from() is no longer valid for the case
      of offset. This removes the check.
    2. Both `Array::is_valid` and `Array::is_null` should take into account of offset.
    3. `StructArray::len()` should return `data.len()` instead of the length of the
      first child array. In the case of slicing, we'll not modify child array data so
      the previous approach will return wrong result.
    
    Author: Chao Sun <su...@apache.org>
    
    Closes #3972 from sunchao/ARROW-4853 and squashes the following commits:
    
    299d6a43 <Chao Sun> Fix StructBuilder reuse
    c8633467 <Chao Sun> Improve unit tests
    78e8e880 <Chao Sun> Fix format
    e0fed916 <Chao Sun> Fix StructArray::column
    8278c786 <Chao Sun> Fix StructArray::From<Vec<(Field, ArrayRef)>>
    c7814331 <Chao Sun> Fix StructBuilder length & unit tests
    99af8d50 <Chao Sun> ARROW-4853:  Array slice doesn't work on ListArray and StructArray
---
 rust/arrow/src/array.rs   | 201 +++++++++++++++++++++++++++++++++++++++-------
 rust/arrow/src/builder.rs |  19 +++++
 2 files changed, 190 insertions(+), 30 deletions(-)

diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs
index 7b692bb..6438cb9 100644
--- a/rust/arrow/src/array.rs
+++ b/rust/arrow/src/array.rs
@@ -113,12 +113,12 @@ pub trait Array: Send + Sync {
 
     /// Returns whether the element at index `i` is null
     fn is_null(&self, i: usize) -> bool {
-        self.data().is_null(i)
+        self.data().is_null(self.data().offset() + i)
     }
 
     /// Returns whether the element at index `i` is not null
     fn is_valid(&self, i: usize) -> bool {
-        self.data().is_valid(i)
+        self.data().is_valid(self.data().offset() + i)
     }
 
     /// Returns the total number of nulls in this array
@@ -489,8 +489,8 @@ impl PrimitiveArray<BooleanType> {
 
     /// Returns the boolean value at index `i`.
     pub fn value(&self, i: usize) -> bool {
+        assert!(i < self.data.len());
         let offset = i + self.offset();
-        assert!(offset < self.data.len());
         unsafe { bit_util::get_bit_raw(self.raw_values.get() as *const u8, offset) }
     }
 
@@ -751,11 +751,6 @@ impl From<ArrayDataRef> for ListArray {
         let value_offsets = raw_value_offsets as *const i32;
         unsafe {
             assert_eq!(*value_offsets.offset(0), 0, "offsets do not start at zero");
-            assert_eq!(
-                *value_offsets.offset(data.len() as isize),
-                values.data().len() as i32,
-                "inconsistent offsets buffer and values array"
-            );
         }
         Self {
             data: data.clone(),
@@ -954,7 +949,12 @@ impl From<ArrayDataRef> for StructArray {
     fn from(data: ArrayDataRef) -> Self {
         let mut boxed_fields = vec![];
         for cd in data.child_data() {
-            boxed_fields.push(make_array(cd.clone()));
+            let child_data = if data.offset != 0 || data.len != cd.len {
+                slice_data(cd.clone(), data.offset, data.len)
+            } else {
+                cd.clone()
+            };
+            boxed_fields.push(make_array(child_data));
         }
         Self { data, boxed_fields }
     }
@@ -975,7 +975,7 @@ impl Array for StructArray {
 
     /// Returns the length (i.e., number of elements) of this array
     fn len(&self) -> usize {
-        self.boxed_fields[0].len()
+        self.data().len()
     }
 }
 
@@ -995,6 +995,7 @@ impl From<Vec<(Field, ArrayRef)>> for StructArray {
 
         let data = ArrayData::builder(DataType::Struct(field_types))
             .child_data(field_values.into_iter().map(|a| a.data()).collect())
+            .len(length)
             .build();
         Self::from(data)
     }
@@ -1142,7 +1143,11 @@ mod tests {
         assert_eq!(5, arr2.len());
         assert_eq!(2, arr2.offset());
         assert_eq!(1, arr2.null_count());
-        assert!(arr2.is_null(1));
+
+        for i in 0..arr2.len() {
+            assert_eq!(i == 1, arr2.is_null(i));
+            assert_eq!(i != 1, arr2.is_valid(i));
+        }
 
         let arr3 = arr2.slice(2, 3);
         assert_eq!(3, arr3.len());
@@ -1395,6 +1400,71 @@ mod tests {
     }
 
     #[test]
+
+    fn test_list_array_slice() {
+        // Construct a value array
+        let value_data = ArrayData::builder(DataType::Int32)
+            .len(10)
+            .add_buffer(Buffer::from(
+                &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9].to_byte_slice(),
+            ))
+            .build();
+
+        // Construct a buffer for value offsets, for the nested array:
+        //  [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]]
+        let value_offsets =
+            Buffer::from(&[0, 2, 2, 2, 4, 6, 6, 9, 9, 10].to_byte_slice());
+        // 01011001 00000001
+        let mut null_bits: [u8; 2] = [0; 2];
+        bit_util::set_bit(&mut null_bits, 0);
+        bit_util::set_bit(&mut null_bits, 3);
+        bit_util::set_bit(&mut null_bits, 4);
+        bit_util::set_bit(&mut null_bits, 6);
+        bit_util::set_bit(&mut null_bits, 8);
+
+        // Construct a list array from the above two
+        let list_data_type = DataType::List(Box::new(DataType::Int32));
+        let list_data = ArrayData::builder(list_data_type.clone())
+            .len(9)
+            .add_buffer(value_offsets.clone())
+            .add_child_data(value_data.clone())
+            .null_bit_buffer(Buffer::from(null_bits))
+            .build();
+        let list_array = ListArray::from(list_data);
+
+        let values = list_array.values();
+        assert_eq!(value_data, values.data());
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(9, list_array.len());
+        assert_eq!(4, list_array.null_count());
+        assert_eq!(2, list_array.value_offset(3));
+        assert_eq!(2, list_array.value_length(3));
+
+        let sliced_array = list_array.slice(1, 6);
+        assert_eq!(6, sliced_array.len());
+        assert_eq!(1, sliced_array.offset());
+        assert_eq!(3, sliced_array.null_count());
+
+        for i in 0..sliced_array.len() {
+            if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+                assert!(sliced_array.is_valid(i));
+            } else {
+                assert!(sliced_array.is_null(i));
+            }
+        }
+
+        // Check offset and length for each non-null value.
+        let sliced_list_array =
+            sliced_array.as_any().downcast_ref::<ListArray>().unwrap();
+        assert_eq!(2, sliced_list_array.value_offset(2));
+        assert_eq!(2, sliced_list_array.value_length(2));
+        assert_eq!(4, sliced_list_array.value_offset(3));
+        assert_eq!(2, sliced_list_array.value_length(3));
+        assert_eq!(6, sliced_list_array.value_offset(5));
+        assert_eq!(3, sliced_list_array.value_length(5));
+    }
+
+    #[test]
     #[should_panic(
         expected = "ListArray data should contain a single buffer only (value offsets)"
     )]
@@ -1445,25 +1515,6 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "inconsistent offsets buffer and values array")]
-    fn test_list_array_invalid_value_offset_end() {
-        let value_data = ArrayData::builder(DataType::Int32)
-            .len(8)
-            .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()))
-            .build();
-
-        let value_offsets = Buffer::from(&[0, 2, 5, 7].to_byte_slice());
-
-        let list_data_type = DataType::List(Box::new(DataType::Int32));
-        let list_data = ArrayData::builder(list_data_type.clone())
-            .len(3)
-            .add_buffer(value_offsets.clone())
-            .add_child_data(value_data.clone())
-            .build();
-        ListArray::from(list_data);
-    }
-
-    #[test]
     fn test_binary_array() {
         let values: [u8; 12] = [
             b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
@@ -1659,6 +1710,7 @@ mod tests {
         field_types.push(Field::new("a", DataType::Boolean, false));
         field_types.push(Field::new("b", DataType::Int64, false));
         let struct_array_data = ArrayData::builder(DataType::Struct(field_types))
+            .len(4)
             .add_child_data(boolean_data.clone())
             .add_child_data(int_data.clone())
             .build();
@@ -1691,6 +1743,95 @@ mod tests {
         ]);
         assert_eq!(boolean_data, struct_array.column(0).data());
         assert_eq!(int_data, struct_array.column(1).data());
+        assert_eq!(4, struct_array.len());
+        assert_eq!(0, struct_array.null_count());
+        assert_eq!(0, struct_array.offset());
+    }
+
+    #[test]
+    fn test_struct_array_slice() {
+        let boolean_data = ArrayData::builder(DataType::Boolean)
+            .len(5)
+            .add_buffer(Buffer::from([0b00010000]))
+            .null_bit_buffer(Buffer::from([0b00010001]))
+            .build();
+        let int_data = ArrayData::builder(DataType::Int32)
+            .len(5)
+            .add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice()))
+            .null_bit_buffer(Buffer::from([0b00000110]))
+            .build();
+
+        let mut field_types = vec![];
+        field_types.push(Field::new("a", DataType::Boolean, false));
+        field_types.push(Field::new("b", DataType::Int32, false));
+        let struct_array_data = ArrayData::builder(DataType::Struct(field_types))
+            .len(5)
+            .add_child_data(boolean_data.clone())
+            .add_child_data(int_data.clone())
+            .null_bit_buffer(Buffer::from([0b00010111]))
+            .build();
+        let struct_array = StructArray::from(struct_array_data);
+
+        assert_eq!(5, struct_array.len());
+        assert_eq!(1, struct_array.null_count());
+        assert!(struct_array.is_valid(0));
+        assert!(struct_array.is_valid(1));
+        assert!(struct_array.is_valid(2));
+        assert!(struct_array.is_null(3));
+        assert!(struct_array.is_valid(4));
+        assert_eq!(boolean_data, struct_array.column(0).data());
+        assert_eq!(int_data, struct_array.column(1).data());
+
+        let c0 = struct_array.column(0);
+        let c0 = c0.as_any().downcast_ref::<BooleanArray>().unwrap();
+        assert_eq!(5, c0.len());
+        assert_eq!(3, c0.null_count());
+        assert!(c0.is_valid(0));
+        assert_eq!(false, c0.value(0));
+        assert!(c0.is_null(1));
+        assert!(c0.is_null(2));
+        assert!(c0.is_null(3));
+        assert!(c0.is_valid(4));
+        assert_eq!(true, c0.value(4));
+
+        let c1 = struct_array.column(1);
+        let c1 = c1.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(5, c1.len());
+        assert_eq!(3, c1.null_count());
+        assert!(c1.is_null(0));
+        assert!(c1.is_valid(1));
+        assert_eq!(28, c1.value(1));
+        assert!(c1.is_valid(2));
+        assert_eq!(42, c1.value(2));
+        assert!(c1.is_null(3));
+        assert!(c1.is_null(4));
+
+        let sliced_array = struct_array.slice(2, 3);
+        let sliced_array = sliced_array.as_any().downcast_ref::<StructArray>().unwrap();
+        assert_eq!(3, sliced_array.len());
+        assert_eq!(2, sliced_array.offset());
+        assert_eq!(1, sliced_array.null_count());
+        assert!(sliced_array.is_valid(0));
+        assert!(sliced_array.is_null(1));
+        assert!(sliced_array.is_valid(2));
+
+        let sliced_c0 = sliced_array.column(0);
+        let sliced_c0 = sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap();
+        assert_eq!(3, sliced_c0.len());
+        assert_eq!(2, sliced_c0.offset());
+        assert!(sliced_c0.is_null(0));
+        assert!(sliced_c0.is_null(1));
+        assert!(sliced_c0.is_valid(2));
+        assert_eq!(true, sliced_c0.value(2));
+
+        let sliced_c1 = sliced_array.column(1);
+        let sliced_c1 = sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(3, sliced_c1.len());
+        assert_eq!(2, sliced_c1.offset());
+        assert!(sliced_c1.is_valid(0));
+        assert_eq!(42, sliced_c1.value(0));
+        assert!(sliced_c1.is_null(1));
+        assert!(sliced_c1.is_null(2));
     }
 
     #[test]
diff --git a/rust/arrow/src/builder.rs b/rust/arrow/src/builder.rs
index 0cbb5e1..b58628a 100644
--- a/rust/arrow/src/builder.rs
+++ b/rust/arrow/src/builder.rs
@@ -696,6 +696,9 @@ impl StructBuilder {
                 .null_count(null_count)
                 .null_bit_buffer(null_bit_buffer);
         }
+
+        self.len = 0;
+
         StructArray::from(builder.build())
     }
 }
@@ -1355,7 +1358,15 @@ mod tests {
             ])
             .unwrap();
 
+        // Append slot values - all are valid.
+        for _ in 0..10 {
+            assert!(builder.append(true).is_ok())
+        }
+
+        assert_eq!(10, builder.len());
+
         let arr = builder.finish();
+
         assert_eq!(10, arr.len());
         assert_eq!(0, builder.len());
 
@@ -1370,7 +1381,15 @@ mod tests {
             .append_slice(&[false, true, false, true, false])
             .unwrap();
 
+        // Append slot values - all are valid.
+        for _ in 0..5 {
+            assert!(builder.append(true).is_ok())
+        }
+
+        assert_eq!(5, builder.len());
+
         let arr = builder.finish();
+
         assert_eq!(5, arr.len());
         assert_eq!(0, builder.len());
     }