You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/05 12:22:48 UTC
[GitHub] [arrow] alamb commented on a change in pull request #9093: ARROW-11125: [Rust] Logical equality for list arrays

alamb commented on a change in pull request #9093:
URL: https://github.com/apache/arrow/pull/9093#discussion_r551885031



##########
File path: rust/datafusion/tests/sql.rs
##########
@@ -132,6 +132,7 @@ async fn parquet_single_nan_schema() {
 }
 
 #[tokio::test]
+#[ignore = "Test ignored, will be enabled as part of the nested Parquet reader"]

Review comment:
       What happened to this test?  It looks like it used to pass and now it doesn't?

##########
File path: rust/arrow/src/array/equal/boolean.rs
##########
@@ -15,35 +15,60 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::array::ArrayData;
+use crate::array::{data::count_nulls, ArrayData};
+use crate::buffer::Buffer;
+use crate::util::bit_util::get_bit;
 
 use super::utils::equal_bits;
 
 pub(super) fn boolean_equal(
     lhs: &ArrayData,
     rhs: &ArrayData,
+    lhs_nulls: Option<&Buffer>,
+    rhs_nulls: Option<&Buffer>,
     lhs_start: usize,
     rhs_start: usize,
     len: usize,
 ) -> bool {
     let lhs_values = lhs.buffers()[0].as_slice();
     let rhs_values = rhs.buffers()[0].as_slice();
 
-    // TODO: we can do this more efficiently if all values are not-null
-    (0..len).all(|i| {
-        let lhs_pos = lhs_start + i;
-        let rhs_pos = rhs_start + i;
-        let lhs_is_null = lhs.is_null(lhs_pos);
-        let rhs_is_null = rhs.is_null(rhs_pos);
-
-        lhs_is_null
-            || (lhs_is_null == rhs_is_null)
-                && equal_bits(
-                    lhs_values,
-                    rhs_values,
-                    lhs_pos + lhs.offset(),
-                    rhs_pos + rhs.offset(),
-                    1,
-                )
-    })
+    let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len);
+    let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len);
+
+    if lhs_null_count == 0 && rhs_null_count == 0 {
+        (0..len).all(|i| {
+            let lhs_pos = lhs_start + i;
+            let rhs_pos = rhs_start + i;
+
+            equal_bits(
+                lhs_values,
+                rhs_values,
+                lhs_pos + lhs.offset(),
+                rhs_pos + rhs.offset(),
+                1,
+            )
+        })
+    } else {
+        // get a ref of the null buffer bytes, to use in testing for nullness
+        let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice();

Review comment:
       Is it possible for `lhs_nulls == Some(..)` but `rhs_nulls == None` (and visa versa?) Given they are optional arguments I wasn't sure if they would always both be either `None` or `Some`

##########
File path: rust/arrow/src/array/equal/structure.rs
##########
@@ -37,39 +37,20 @@ fn equal_values(
     rhs_start: usize,
     len: usize,
 ) -> bool {
-    let mut temp_lhs: Option<Buffer> = None;
-    let mut temp_rhs: Option<Buffer> = None;
-
     lhs.child_data()
         .iter()
         .zip(rhs.child_data())
         .all(|(lhs_values, rhs_values)| {
             // merge the null data
-            let lhs_merged_nulls = match (lhs_nulls, lhs_values.null_buffer()) {

Review comment:
       I think the use of `temp_lhs` and `temp_rhs` here is to avoid the `lhs_nulls.cloned()` and `rhs_nulls.cloned()` calls below. 

##########
File path: rust/datafusion/tests/sql.rs
##########
@@ -132,6 +132,7 @@ async fn parquet_single_nan_schema() {
 }
 
 #[tokio::test]
+#[ignore = "Test ignored, will be enabled as part of the nested Parquet reader"]

Review comment:
       When I ran this test locally, it fails with a seemingly non-sensical error
   
   ```
   failures:
   
   ---- parquet_list_columns stdout ----
   thread 'parquet_list_columns' panicked at 'assertion failed: `(left == right)`
     left: `PrimitiveArray<Int64>
   [
     null,
     1,
   ]`,
    right: `PrimitiveArray<Int64>
   [
     null,
     1,
   ]`', datafusion/tests/sql.rs:204:5
   ```

##########
File path: rust/arrow/src/array/equal/utils.rs
##########
@@ -76,3 +80,185 @@ pub(super) fn equal_len(
 ) -> bool {
     lhs_values[lhs_start..(lhs_start + len)] == rhs_values[rhs_start..(rhs_start + len)]
 }
+
+/// Computes the logical validity bitmap of the array data using the
+/// parent's array data. The parent should be a list or struct, else
+/// the logical bitmap of the array is returned unaltered.
+///
+/// Parent data is passed along with the parent's logical bitmap, as
+/// nested arrays could have a logical bitmap different to the physical
+/// one on the `ArrayData`.
+pub(super) fn child_logical_null_buffer(
+    parent_data: &ArrayData,
+    logical_null_buffer: Option<Buffer>,

Review comment:
       I think if you changed 
   
   ```
       let parent_bitmap = logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| {
   ```
   
   to 
   
   ```
       let parent_bitmap = logical_null_buffer.cloned().map(Bitmap::from).unwrap_or_else(|| {
   
   ```
   
   Then the signature could take an `Option<&Buffer>` and the code is cleaner (fewer calls to `.cloned()` outside this function). 
   
   But I don't think it has any runtime effect

##########
File path: rust/arrow/src/array/equal/list.rs
##########
@@ -71,45 +79,94 @@ fn offset_value_equal<T: OffsetSizeTrait>(
 pub(super) fn list_equal<T: OffsetSizeTrait>(
     lhs: &ArrayData,
     rhs: &ArrayData,
+    lhs_nulls: Option<&Buffer>,
+    rhs_nulls: Option<&Buffer>,
     lhs_start: usize,
     rhs_start: usize,
     len: usize,
 ) -> bool {
     let lhs_offsets = lhs.buffer::<T>(0);
     let rhs_offsets = rhs.buffer::<T>(0);
 
+    // There is an edge-case where a n-length list that has 0 children, results in panics.
+    // For example; an array with offsets [0, 0, 0, 0, 0] has 4 slots, but will have

Review comment:
       I probably am mis understanding but `[0, 0, 0, 0, 0]` has 5 entries but this comment says "4 slots"

##########
File path: rust/arrow/src/array/equal/utils.rs
##########
@@ -76,3 +80,185 @@ pub(super) fn equal_len(
 ) -> bool {
     lhs_values[lhs_start..(lhs_start + len)] == rhs_values[rhs_start..(rhs_start + len)]
 }
+
+/// Computes the logical validity bitmap of the array data using the
+/// parent's array data. The parent should be a list or struct, else
+/// the logical bitmap of the array is returned unaltered.
+///
+/// Parent data is passed along with the parent's logical bitmap, as
+/// nested arrays could have a logical bitmap different to the physical
+/// one on the `ArrayData`.
+pub(super) fn child_logical_null_buffer(
+    parent_data: &ArrayData,
+    logical_null_buffer: Option<Buffer>,
+    child_data: &ArrayData,
+) -> Option<Buffer> {
+    let parent_len = parent_data.len();
+    let parent_bitmap = logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| {
+        let ceil = bit_util::ceil(parent_len, 8);
+        Bitmap::from(Buffer::from(vec![0b11111111; ceil]))
+    });
+    let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| {
+        let ceil = bit_util::ceil(child_data.len(), 8);
+        Bitmap::from(Buffer::from(vec![0b11111111; ceil]))
+    });
+    match parent_data.data_type() {
+        DataType::List(_) => Some(logical_list_bitmap::<i32>(
+            parent_data,
+            parent_bitmap,
+            self_null_bitmap,
+        )),
+        DataType::LargeList(_) => Some(logical_list_bitmap::<i64>(
+            parent_data,
+            parent_bitmap,
+            self_null_bitmap,
+        )),
+        DataType::FixedSizeList(_, len) => {
+            let len = *len as usize;
+            let array_offset = parent_data.offset();
+            let bitmap_len = bit_util::ceil(parent_len * len, 8);
+            let mut buffer =
+                MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false);
+            let mut null_slice = buffer.as_slice_mut();
+            (array_offset..parent_len + array_offset).for_each(|index| {
+                let start = index * len;
+                let end = start + len;
+                let mask = parent_bitmap.is_set(index);
+                (start..end).for_each(|child_index| {
+                    if mask && self_null_bitmap.is_set(child_index) {
+                        bit_util::set_bit(&mut null_slice, child_index);
+                    }
+                });
+            });
+            Some(buffer.into())
+        }
+        DataType::Struct(_) => {
+            // Arrow implementations are free to pad data, which can result in null buffers not
+            // having the same length.
+            // Rust bitwise comparisons will return an error if left AND right is performed on
+            // buffers of different length.
+            // This might be a valid case during integration testing, where we read Arrow arrays
+            // from IPC data, which has padding.
+            //
+            // We first perform a bitwise comparison, and if there is an error, we revert to a
+            // slower method that indexes into the buffers one-by-one.
+            let result = &parent_bitmap & &self_null_bitmap;
+            if let Ok(bitmap) = result {
+                return Some(bitmap.bits);
+            }
+            // slow path
+            let array_offset = parent_data.offset();
+            let mut buffer = MutableBuffer::new_null(parent_len);
+            let mut null_slice = buffer.as_slice_mut();
+            (0..parent_len).for_each(|index| {
+                if parent_bitmap.is_set(index + array_offset)
+                    && self_null_bitmap.is_set(index + array_offset)
+                {
+                    bit_util::set_bit(&mut null_slice, index);
+                }
+            });
+            Some(buffer.into())
+        }
+        DataType::Union(_) => {
+            unimplemented!("Logical equality not yet implemented for union arrays")
+        }
+        DataType::Dictionary(_, _) => {
+            unimplemented!("Logical equality not yet implemented for nested dictionaries")
+        }
+        data_type => {
+            panic!("Data type {:?} is not a supported nested type", data_type)
+        }
+    }
+}
+
+// Calculate a list child's logical bitmap/buffer

Review comment:
       I don't fully understand how lists work  in arrow, but I will take your word for it that it does the right thing and that the tests are accurate. 

##########
File path: rust/arrow/src/array/equal/list.rs
##########
@@ -71,45 +79,94 @@ fn offset_value_equal<T: OffsetSizeTrait>(
 pub(super) fn list_equal<T: OffsetSizeTrait>(
     lhs: &ArrayData,
     rhs: &ArrayData,
+    lhs_nulls: Option<&Buffer>,
+    rhs_nulls: Option<&Buffer>,
     lhs_start: usize,
     rhs_start: usize,
     len: usize,
 ) -> bool {
     let lhs_offsets = lhs.buffer::<T>(0);
     let rhs_offsets = rhs.buffer::<T>(0);
 
+    // There is an edge-case where a n-length list that has 0 children, results in panics.
+    // For example; an array with offsets [0, 0, 0, 0, 0] has 4 slots, but will have
+    // no valid children.
+    // Under logical equality, the child null bitmap will be an empty buffer, as there are
+    // no child values. This causes panics when trying to count set bits.
+    //
+    // We caught this by chance from an accidental test-case, but due to the nature of this
+    // crash only occuring on list equality checks, we are adding a check here, instead of
+    // on the buffer/bitmap utilities, as a length check would incur a penalty for almost all
+    // other use-cases.
+    //
+    // The solution is to check the number of child values from offsets, and return `true` if
+    // they = 0. Empty arrays are equal, so this is correct.
+    //
+    // It's unlikely that one would create a n-length list array with no values, where n > 0,
+    // however, one is more likely to slice into a list array and get a region that has 0
+    // child values.
+    // The test that triggered this behaviour had [4, 4] as a slice of 1 value slot.
+    let lhs_child_length = lhs_offsets.get(len).unwrap().to_usize().unwrap()

Review comment:
       I don't fully understand the need for this check given that `count_nulls` seems to handle a buffer of `None` by returning zero.  
   
   When this code is commented out, however, I see the panic of
   
   ```
   ---- array::equal::tests::test_list_offsets stdout ----
   thread 'array::equal::tests::test_list_offsets' panicked at 'assertion failed: ceil(offset + len, 8) <= buffer.len() * 8', arrow/src/util/bit_chunk_iterator.rs:33:9
   ```
   
   So given that it is covered by tests, 👍 

##########
File path: rust/arrow/src/array/equal/utils.rs
##########
@@ -76,3 +80,185 @@ pub(super) fn equal_len(
 ) -> bool {
     lhs_values[lhs_start..(lhs_start + len)] == rhs_values[rhs_start..(rhs_start + len)]
 }
+
+/// Computes the logical validity bitmap of the array data using the
+/// parent's array data. The parent should be a list or struct, else
+/// the logical bitmap of the array is returned unaltered.
+///
+/// Parent data is passed along with the parent's logical bitmap, as
+/// nested arrays could have a logical bitmap different to the physical
+/// one on the `ArrayData`.
+pub(super) fn child_logical_null_buffer(
+    parent_data: &ArrayData,
+    logical_null_buffer: Option<Buffer>,
+    child_data: &ArrayData,
+) -> Option<Buffer> {
+    let parent_len = parent_data.len();
+    let parent_bitmap = logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| {
+        let ceil = bit_util::ceil(parent_len, 8);
+        Bitmap::from(Buffer::from(vec![0b11111111; ceil]))
+    });
+    let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| {
+        let ceil = bit_util::ceil(child_data.len(), 8);
+        Bitmap::from(Buffer::from(vec![0b11111111; ceil]))
+    });
+    match parent_data.data_type() {
+        DataType::List(_) => Some(logical_list_bitmap::<i32>(
+            parent_data,
+            parent_bitmap,
+            self_null_bitmap,
+        )),
+        DataType::LargeList(_) => Some(logical_list_bitmap::<i64>(
+            parent_data,
+            parent_bitmap,
+            self_null_bitmap,
+        )),
+        DataType::FixedSizeList(_, len) => {
+            let len = *len as usize;
+            let array_offset = parent_data.offset();
+            let bitmap_len = bit_util::ceil(parent_len * len, 8);
+            let mut buffer =
+                MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false);
+            let mut null_slice = buffer.as_slice_mut();
+            (array_offset..parent_len + array_offset).for_each(|index| {
+                let start = index * len;
+                let end = start + len;
+                let mask = parent_bitmap.is_set(index);
+                (start..end).for_each(|child_index| {
+                    if mask && self_null_bitmap.is_set(child_index) {
+                        bit_util::set_bit(&mut null_slice, child_index);
+                    }
+                });
+            });
+            Some(buffer.into())
+        }
+        DataType::Struct(_) => {
+            // Arrow implementations are free to pad data, which can result in null buffers not
+            // having the same length.
+            // Rust bitwise comparisons will return an error if left AND right is performed on
+            // buffers of different length.
+            // This might be a valid case during integration testing, where we read Arrow arrays
+            // from IPC data, which has padding.
+            //
+            // We first perform a bitwise comparison, and if there is an error, we revert to a

Review comment:
       👍  for comments.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org