You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/09/26 03:00:20 UTC

[GitHub] [arrow] jorgecarleitao commented on a change in pull request #8280: ARROW-10103: [Rust] Add contains kernel

jorgecarleitao commented on a change in pull request #8280:
URL: https://github.com/apache/arrow/pull/8280#discussion_r495394022



##########
File path: rust/arrow/src/compute/util.rs
##########
@@ -71,6 +71,47 @@ pub(super) fn combine_option_bitmap(
     }
 }
 
+/// Compares the null bitmaps of two arrays using a bitwise `or` operation.
+///
+/// This function is useful when implementing operations on higher level arrays.
+pub(super) fn compare_option_bitmap(
+    left_data: &ArrayDataRef,
+    right_data: &ArrayDataRef,
+    len_in_bits: usize,
+) -> Result<Option<Buffer>> {
+    let left_offset_in_bits = left_data.offset();
+    let right_offset_in_bits = right_data.offset();
+
+    let left = left_data.null_buffer();
+    let right = right_data.null_buffer();
+
+    if (left.is_some() && left_offset_in_bits % 8 != 0)

Review comment:
       Out of curiosity: is this true in general, or is for these (this and above) implementation, that uses this assumption?

##########
File path: rust/arrow/src/compute/kernels/comparison.rs
##########
@@ -555,11 +557,159 @@ where
     compare_op_scalar!(left, right, |a, b| a >= b)
 }
 
+/// Checks if a `GenericListArray` contains a value in the `PrimitiveArray`
+pub fn contains<T, OffsetSize>(
+    left: &PrimitiveArray<T>,
+    right: &GenericListArray<OffsetSize>,
+) -> Result<BooleanArray>
+where
+    T: ArrowNumericType,
+    OffsetSize: OffsetSizeTrait,
+{
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform comparison operation on arrays of different length"
+                .to_string(),
+        ));
+    }
+
+    let not_both_null_bit_buffer =
+        match compare_option_bitmap(left.data_ref(), right.data_ref(), left.len())? {
+            Some(buff) => buff,
+            None => new_all_set_buffer(left.len()),
+        };
+    let not_both_null_bitmap = not_both_null_bit_buffer.data();
+
+    let left_data = left.data();
+    let left_null_bitmap = match left_data.null_bitmap() {
+        Some(bitmap) => bitmap.clone().into_buffer(),
+        _ => new_all_set_buffer(left.len()),
+    };
+    let left_null_bitmap = left_null_bitmap.data();
+
+    let mut result = BooleanBufferBuilder::new(left.len());
+
+    for i in 0..left.len() {
+        let mut is_in = false;
+
+        // contains(null, null) = false
+        if bit_util::get_bit(not_both_null_bitmap, i) {
+            let list = right.value(i);
+
+            // contains(null, [null]) = true
+            if !bit_util::get_bit(left_null_bitmap, i) {
+                if list.null_count() > 0 {
+                    is_in = true;
+                }
+            } else {
+                let list = list.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
+
+                for j in 0..list.len() {
+                    if list.is_valid(j) && (left.value(i) == list.value(j)) {
+                        is_in = true;
+                    }
+                }
+            }
+        }
+        result.append(is_in)?;
+    }
+
+    let data = ArrayData::new(
+        DataType::Boolean,
+        left.len(),
+        None,
+        None,
+        left.offset(),
+        vec![result.finish()],
+        vec![],
+    );
+    Ok(PrimitiveArray::<BooleanType>::from(Arc::new(data)))
+}
+
+/// Checks if a `GenericListArray` contains a value in the `GenericStringArray`
+pub fn contains_utf8<OffsetSize>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &ListArray,
+) -> Result<BooleanArray>
+where
+    OffsetSize: OffsetSizeTrait,
+{
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform comparison operation on arrays of different length"
+                .to_string(),
+        ));
+    }
+
+    let not_both_null_bit_buffer =
+        match compare_option_bitmap(left.data_ref(), right.data_ref(), left.len())? {
+            Some(buff) => buff,
+            None => new_all_set_buffer(left.len()),
+        };
+    let not_both_null_bitmap = not_both_null_bit_buffer.data();
+
+    let left_data = left.data();
+    let left_null_bitmap = match left_data.null_bitmap() {
+        Some(bitmap) => bitmap.clone().into_buffer(),
+        _ => new_all_set_buffer(left.len()),
+    };
+    let left_null_bitmap = left_null_bitmap.data();
+
+    let mut result = BooleanBufferBuilder::new(left.len());
+
+    for i in 0..left.len() {
+        let mut is_in = false;
+
+        // contains(null, null) = false
+        if bit_util::get_bit(not_both_null_bitmap, i) {
+            let list = right.value(i);
+
+            // contains(null, [null]) = true
+            if !bit_util::get_bit(left_null_bitmap, i) {
+                if list.null_count() > 0 {
+                    is_in = true;
+                }
+            } else {
+                let list = list
+                    .as_any()
+                    .downcast_ref::<GenericStringArray<OffsetSize>>()
+                    .unwrap();
+
+                for j in 0..list.len() {
+                    if list.is_valid(j) && (left.value(i) == list.value(j)) {
+                        is_in = true;
+                    }
+                }
+            }
+        }
+        result.append(is_in)?;
+    }
+
+    let data = ArrayData::new(
+        DataType::Boolean,
+        left.len(),
+        None,
+        None,
+        left.offset(),

Review comment:
       I also think that this should be zero: we are building a new buffer (`result.finish()`) and thus there is no need to start from an offset. I would expect that a test with `left.offset() != 0` to not pass with this code, as we read `result`'s buffer from left's offset.

##########
File path: rust/arrow/src/compute/util.rs
##########
@@ -71,6 +71,47 @@ pub(super) fn combine_option_bitmap(
     }
 }
 
+/// Compares the null bitmaps of two arrays using a bitwise `or` operation.
+///
+/// This function is useful when implementing operations on higher level arrays.
+pub(super) fn compare_option_bitmap(

Review comment:
       tests missing, I think.

##########
File path: rust/arrow/src/compute/kernels/comparison.rs
##########
@@ -555,11 +555,159 @@ where
     compare_op_scalar!(left, right, |a, b| a >= b)
 }
 
+/// Checks if a `GenericListArray` contains a value in the `PrimitiveArray`
+pub fn contains<T, OffsetSize>(
+    left: &PrimitiveArray<T>,
+    right: &GenericListArray<OffsetSize>,
+) -> Result<BooleanArray>
+where
+    T: ArrowNumericType,
+    OffsetSize: OffsetSizeTrait,
+{
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform comparison operation on arrays of different length"
+                .to_string(),
+        ));
+    }
+
+    let not_both_null_bit_buffer =
+        match compare_option_bitmap(left.data_ref(), right.data_ref(), left.len())? {
+            Some(buff) => buff,
+            None => new_all_set_buffer(left.len()),
+        };
+    let not_both_null_bitmap = not_both_null_bit_buffer.data();
+
+    let left_data = left.data();
+    let left_null_bitmap = match left_data.null_bitmap() {
+        Some(bitmap) => bitmap.clone().into_buffer(),
+        _ => new_all_set_buffer(left.len()),
+    };
+    let left_null_bitmap = left_null_bitmap.data();
+
+    let mut result = BooleanBufferBuilder::new(left.len());
+
+    for i in 0..left.len() {
+        let mut is_in = false;
+
+        // contains(null, null) = false
+        if bit_util::get_bit(not_both_null_bitmap, i) {
+            let list = right.value(i);
+
+            // contains(null, [null]) = true
+            if !bit_util::get_bit(left_null_bitmap, i) {
+                if list.null_count() > 0 {
+                    is_in = true;

Review comment:
       I think that we can short-circuit here (`result.append(true)?; break`) (and equivalent instances). In rust's notation, I think that we are using the [`Iterator::any`](https://doc.rust-lang.org/stable/rust-by-example/fn/closures/closure_examples/iter_any.html) with some null logic sprinkled on top.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org