You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "Dandandan (via GitHub)" <gi...@apache.org> on 2023/06/13 13:16:17 UTC

[GitHub] [arrow-rs] Dandandan commented on a diff in pull request #4405: Improve take primitive performance (#4404)

Dandandan commented on code in PR #4405:
URL: https://github.com/apache/arrow-rs/pull/4405#discussion_r1228115449


##########
arrow-select/src/take.rs:
##########
@@ -374,148 +243,91 @@ fn take_primitive<T, I>(
 where
     T: ArrowPrimitiveType,
     I: ArrowPrimitiveType,
-    I::Native: ToPrimitive,
 {
-    let indices_nulls = indices.nulls().filter(|x| x.null_count() > 0);
-    let values_nulls = values.nulls().filter(|x| x.null_count() > 0);
-
-    // note: this function should only panic when "an index is not null and out of bounds".
-    // if the index is null, its value is undefined and therefore we should not read from it.
-    let (buffer, nulls) = match (values_nulls, indices_nulls) {
-        (None, None) => {
-            // * no nulls
-            // * all `indices.values()` are valid
-            take_no_nulls(values.values(), indices.values())?
-        }
-        (Some(values_nulls), None) => {
-            // * nulls come from `values` alone
-            // * all `indices.values()` are valid
-            take_values_nulls(values.values(), values_nulls, indices.values())?
-        }
-        (None, Some(indices_nulls)) => {
-            // in this branch it is unsound to read and use `index.values()`,
-            // as doing so is UB when they come from a null slot.
-            take_indices_nulls(values.values(), indices.values(), indices_nulls)?
-        }
-        (Some(values_nulls), Some(indices_nulls)) => {
-            // in this branch it is unsound to read and use `index.values()`,
-            // as doing so is UB when they come from a null slot.
-            take_values_indices_nulls(
-                values.values(),
-                values_nulls,
-                indices.values(),
-                indices_nulls,
-            )?
-        }
-    };
-
-    let data = unsafe {
-        ArrayData::new_unchecked(
-            values.data_type().clone(),
-            indices.len(),
-            None,
-            nulls,
-            0,
-            vec![buffer],
-            vec![],
-        )
-    };
-    Ok(PrimitiveArray::<T>::from(data))
+    let values_buf = take_native(values.values(), indices);
+    let nulls = take_nulls(values.nulls(), indices);
+    Ok(PrimitiveArray::new(values_buf, nulls).with_data_type(values.data_type().clone()))
 }
 
-fn take_bits<IndexType>(
-    values: &Buffer,
-    values_offset: usize,
-    indices: &PrimitiveArray<IndexType>,
-) -> Result<Buffer, ArrowError>
-where
-    IndexType: ArrowPrimitiveType,
-    IndexType::Native: ToPrimitive,
-{
-    let len = indices.len();
-    let values_slice = values.as_slice();
-    let mut output_buffer = MutableBuffer::new_null(len);
-    let output_slice = output_buffer.as_slice_mut();
-
-    let indices_has_nulls = indices.null_count() > 0;
+#[inline(never)]
+fn take_nulls<I: ArrowPrimitiveType>(
+    values: Option<&NullBuffer>,
+    indices: &PrimitiveArray<I>,
+) -> Option<NullBuffer> {
+    match values.filter(|n| n.null_count() > 0) {
+        Some(n) => {
+            let buffer = take_bits(n.inner(), indices);
+            Some(NullBuffer::new(buffer)).filter(|n| n.null_count() > 0)
+        }
+        None => indices.nulls().cloned(),
+    }
+}
 
-    if indices_has_nulls {
-        indices
+#[inline(never)]
+fn take_native<T: ArrowNativeType, I: ArrowPrimitiveType>(
+    values: &[T],
+    indices: &PrimitiveArray<I>,
+) -> ScalarBuffer<T> {
+    match indices.nulls().filter(|n| n.null_count() > 0) {
+        Some(n) => indices
+            .values()
             .iter()
             .enumerate()
-            .try_for_each::<_, Result<(), ArrowError>>(|(i, index)| {
-                if let Some(index) = index {
-                    let index = ToPrimitive::to_usize(&index).ok_or_else(|| {
-                        ArrowError::ComputeError("Cast to usize failed".to_string())
-                    })?;
-
-                    if bit_util::get_bit(values_slice, values_offset + index) {
-                        bit_util::set_bit(output_slice, i);
-                    }
-                }
-
-                Ok(())
-            })?;
-    } else {
-        indices
+            .map(|(idx, index)| match values.get(index.as_usize()) {
+                Some(v) => *v,
+                None => match n.is_null(idx) {
+                    true => T::default(),
+                    false => panic!("Out-of-bounds index {index:?}"),
+                },
+            })
+            .collect(),
+        None => indices
             .values()
             .iter()
-            .enumerate()
-            .try_for_each::<_, Result<(), ArrowError>>(|(i, index)| {
-                let index = ToPrimitive::to_usize(index).ok_or_else(|| {
-                    ArrowError::ComputeError("Cast to usize failed".to_string())
-                })?;
+            .map(|index| values[index.as_usize()])
+            .collect(),
+    }
+}
 
-                if bit_util::get_bit(values_slice, values_offset + index) {
-                    bit_util::set_bit(output_slice, i);
-                }
-                Ok(())
-            })?;
+#[inline(never)]
+fn take_bits<I: ArrowPrimitiveType>(
+    values: &BooleanBuffer,
+    indices: &PrimitiveArray<I>,
+) -> BooleanBuffer {
+    let len = indices.len();
+    let mut output_buffer = MutableBuffer::new_null(len);
+    let output_slice = output_buffer.as_slice_mut();
+
+    match indices.nulls().filter(|n| n.null_count() > 0) {
+        Some(nulls) => nulls.valid_indices().for_each(|idx| {
+            if values.value(indices.value(idx).as_usize()) {
+                bit_util::set_bit(output_slice, idx);
+            }
+        }),
+        None => indices.values().iter().enumerate().for_each(|(i, index)| {
+            if values.value(index.as_usize()) {

Review Comment:
   Wasn't there a faster method to create a bitmap rather than doing `set_bit` in sequence?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org