You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/07 10:45:58 UTC

[GitHub] [arrow] alamb commented on a change in pull request #8364: ARROW-5350: [Rust] Allow filtering on simple lists

alamb commented on a change in pull request #8364:
URL: https://github.com/apache/arrow/pull/8364#discussion_r500910829



##########
File path: rust/arrow/src/compute/kernels/filter.rs
##########
@@ -230,6 +231,86 @@ macro_rules! filter_dictionary_array {
     }};
 }
 
+macro_rules! filter_primitive_item_list_array {
+    ($context:expr, $array:expr, $item_type:ident) => {{
+        let input_array = $array.as_any().downcast_ref::<ListArray>().unwrap();
+        let values_builder = PrimitiveBuilder::<$item_type>::new($context.filtered_count);
+        let mut builder = ListBuilder::new(values_builder);
+        for i in 0..$context.filter_u64.len() {
+            // foreach u64 batch
+            let filter_batch = $context.filter_u64[i];
+            if filter_batch == 0 {
+                // if batch == 0: skip

Review comment:
       I think this comment would be improved if it explained "why" batch zero is being skipped (the code is pretty clear that batch 0 *is* being skipped). Same comment for `filter_non_primitive_item_list_array`

##########
File path: rust/arrow/src/compute/kernels/filter.rs
##########
@@ -230,6 +231,86 @@ macro_rules! filter_dictionary_array {
     }};
 }
 
+macro_rules! filter_primitive_item_list_array {
+    ($context:expr, $array:expr, $item_type:ident) => {{
+        let input_array = $array.as_any().downcast_ref::<ListArray>().unwrap();
+        let values_builder = PrimitiveBuilder::<$item_type>::new($context.filtered_count);
+        let mut builder = ListBuilder::new(values_builder);
+        for i in 0..$context.filter_u64.len() {
+            // foreach u64 batch
+            let filter_batch = $context.filter_u64[i];
+            if filter_batch == 0 {
+                // if batch == 0: skip
+                continue;
+            }
+            for j in 0..64 {
+                // foreach bit in batch:
+                if (filter_batch & $context.filter_mask[j]) != 0 {
+                    let data_index = (i * 64) + j;
+                    if input_array.is_null(data_index) {
+                        builder.append(false)?;
+                    } else {
+                        let this_inner_list = input_array.value(data_index);
+                        let inner_list = this_inner_list
+                            .as_any()
+                            .downcast_ref::<PrimitiveArray<$item_type>>()
+                            .unwrap();
+                        for k in 0..inner_list.len() {
+                            if inner_list.is_null(k) {
+                                builder.values().append_null()?;
+                            } else {
+                                builder.values().append_value(inner_list.value(k))?;
+                            }
+                        }
+                        builder.append(true)?;
+                    }
+                }
+            }
+        }
+        Ok(Arc::new(builder.finish()))
+    }};
+}
+
+macro_rules! filter_non_primitive_item_list_array {
+    ($context:expr, $array:expr, $item_array_type:ident, $item_builder:ident) => {{

Review comment:
       I like this "item_builder" pattern for non-primative list arrays

##########
File path: rust/arrow/src/compute/kernels/filter.rs
##########
@@ -733,4 +909,53 @@ mod tests {
         assert_eq!("hello", d.value(0));
         assert_eq!("world", d.value(1));
     }
+
+    #[test]
+    fn test_filter_list_array() {
+        let value_data = ArrayData::builder(DataType::Int32)
+            .len(8)
+            .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()))
+            .build();
+
+        let value_offsets = Buffer::from(&[0, 3, 6, 8, 8].to_byte_slice());
+
+        let list_data_type = DataType::List(Box::new(DataType::Int32));
+        let list_data = ArrayData::builder(list_data_type)
+            .len(4)
+            .add_buffer(value_offsets)
+            .add_child_data(value_data)
+            .null_bit_buffer(Buffer::from([0b00000111]))
+            .build();
+
+        //  a = [[0, 1, 2], [3, 4, 5], [6, 7], null]
+        let a = ListArray::from(list_data);
+        let b = BooleanArray::from(vec![false, true, false, true]);
+        let c = filter(&a, &b).unwrap();
+        let d = c.as_ref().as_any().downcast_ref::<ListArray>().unwrap();
+
+        assert_eq!(DataType::Int32, d.value_type());
+
+        // result should be [[3, 4, 5], null]
+        assert_eq!(2, d.len());
+        assert_eq!(1, d.null_count());
+        assert_eq!(true, d.is_null(1));
+
+        assert_eq!(0, d.value_offset(0));
+        assert_eq!(3, d.value_length(0));
+        assert_eq!(3, d.value_offset(1));

Review comment:
       I wonder if it is necessary to check `value_offset` and `value_length` for an element in the list that is marked as null?

##########
File path: rust/arrow/src/compute/kernels/filter.rs
##########
@@ -733,4 +909,53 @@ mod tests {
         assert_eq!("hello", d.value(0));
         assert_eq!("world", d.value(1));
     }
+
+    #[test]
+    fn test_filter_list_array() {
+        let value_data = ArrayData::builder(DataType::Int32)
+            .len(8)
+            .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()))
+            .build();
+
+        let value_offsets = Buffer::from(&[0, 3, 6, 8, 8].to_byte_slice());
+
+        let list_data_type = DataType::List(Box::new(DataType::Int32));
+        let list_data = ArrayData::builder(list_data_type)
+            .len(4)
+            .add_buffer(value_offsets)
+            .add_child_data(value_data)
+            .null_bit_buffer(Buffer::from([0b00000111]))
+            .build();
+
+        //  a = [[0, 1, 2], [3, 4, 5], [6, 7], null]
+        let a = ListArray::from(list_data);
+        let b = BooleanArray::from(vec![false, true, false, true]);
+        let c = filter(&a, &b).unwrap();
+        let d = c.as_ref().as_any().downcast_ref::<ListArray>().unwrap();
+
+        assert_eq!(DataType::Int32, d.value_type());
+
+        // result should be [[3, 4, 5], null]
+        assert_eq!(2, d.len());
+        assert_eq!(1, d.null_count());
+        assert_eq!(true, d.is_null(1));
+
+        assert_eq!(0, d.value_offset(0));
+        assert_eq!(3, d.value_length(0));
+        assert_eq!(3, d.value_offset(1));
+        assert_eq!(0, d.value_length(1));
+        assert_eq!(
+            Buffer::from(&[3, 4, 5].to_byte_slice()),
+            d.values().data().buffers()[0].clone()
+        );
+        assert_eq!(

Review comment:
       I am not sure what the significance of checking the data buffer is here. It seems like the contents are covered in the other checks.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org