You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/07 20:31:35 UTC
[arrow-rs] branch master updated: Speed up the `substring` kernel by about 2x (#1512)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 497521f3d Speed up the `substring` kernel by about 2x (#1512)
497521f3d is described below
commit 497521f3da0eec3d1f9d922307af7d3dbcbbef76
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Fri Apr 8 04:31:30 2022 +0800
Speed up the `substring` kernel by about 2x (#1512)
* speed up substring
Signed-off-by: remzi <13...@gmail.com>
* add comments
Signed-off-by: remzi <13...@gmail.com>
* reformat code
Signed-off-by: remzi <13...@gmail.com>
* use trait opject to simplify the code
Signed-off-by: remzi <13...@gmail.com>
* reformat code
Signed-off-by: remzi <13...@gmail.com>
* fmt code
Signed-off-by: remzi <13...@gmail.com>
---
arrow/src/compute/kernels/substring.rs | 79 ++++++++++++++++++----------------
1 file changed, 42 insertions(+), 37 deletions(-)
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index 9b468f5d4..1be829f72 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -24,56 +24,61 @@ use crate::{
error::{ArrowError, Result},
};
-#[allow(clippy::unnecessary_wraps)]
fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
start: OffsetSize,
length: &Option<OffsetSize>,
) -> Result<ArrayRef> {
- // compute current offsets
- let offsets = array.data_ref().clone().buffers()[0].clone();
- let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };
-
- // compute null bitmap (copy)
+ let offsets = array.value_offsets();
let null_bit_buffer = array.data_ref().null_buffer().cloned();
-
- // compute values
- let values = &array.data_ref().buffers()[1];
+ let values = array.value_data();
let data = values.as_slice();
+ let zero = OffsetSize::zero();
- let mut new_values = MutableBuffer::new(0); // we have no way to estimate how much this will be.
- let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
-
- let mut length_so_far = OffsetSize::zero();
- new_offsets.push(length_so_far);
- (0..array.len()).for_each(|i| {
- // the length of this entry
- let length_i: OffsetSize = offsets[i + 1] - offsets[i];
- // compute where we should start slicing this entry
- let start = offsets[i]
- + if start >= OffsetSize::zero() {
- start
- } else {
- length_i + start
- };
+ let cal_new_start: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> = if start
+ >= zero
+ {
+ // count from the start of string
+ Box::new(|old_start: OffsetSize, end: OffsetSize| (old_start + start).min(end))
+ } else {
+ // count from the end of string
+ Box::new(|old_start: OffsetSize, end: OffsetSize| (end + start).max(old_start))
+ };
- let start = start.max(offsets[i]).min(offsets[i + 1]);
- // compute the length of the slice
- let length: OffsetSize = length
- .unwrap_or(length_i)
- // .max(0) is not needed as it is guaranteed
- .min(offsets[i + 1] - start); // so we do not go beyond this entry
+ let cal_new_length: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> =
+ if let Some(length) = length {
+ Box::new(|start: OffsetSize, end: OffsetSize| (*length).min(end - start))
+ } else {
+ Box::new(|start: OffsetSize, end: OffsetSize| end - start)
+ };
- length_so_far += length;
+ // start and end offsets for each substring
+ let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> =
+ Vec::with_capacity(array.len());
+ let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
+ let mut len_so_far = zero;
+ new_offsets.push(zero);
- new_offsets.push(length_so_far);
+ offsets.windows(2).for_each(|pair| {
+ let new_start = cal_new_start(pair[0], pair[1]);
+ let new_length = cal_new_length(new_start, pair[1]);
+ len_so_far += new_length;
+ new_starts_ends.push((new_start, new_start + new_length));
+ new_offsets.push(len_so_far);
+ });
- // we need usize for ranges
- let start = start.to_usize().unwrap();
- let length = length.to_usize().unwrap();
+ // concatenate substrings into a buffer
+ let mut new_values =
+ MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap());
- new_values.extend_from_slice(&data[start..start + length]);
- });
+ new_starts_ends
+ .iter()
+ .map(|(start, end)| {
+ let start = start.to_usize().unwrap();
+ let end = end.to_usize().unwrap();
+ &data[start..end]
+ })
+ .for_each(|slice| new_values.extend_from_slice(slice));
let data = unsafe {
ArrayData::new_unchecked(