You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/11 09:18:09 UTC
[arrow-rs] branch master updated: speed up `substring_by_char` by about 2.5x (#1832)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c08e5324d speed up `substring_by_char` by about 2.5x (#1832)
c08e5324d is described below
commit c08e5324d11a8a2b9998d0d625876815f4954968
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Sat Jun 11 17:18:04 2022 +0800
speed up `substring_by_char` by about 2.5x (#1832)
* speed up substring_by_char
Signed-off-by: remzi <13...@gmail.com>
* better estimate the length of value buffer
Signed-off-by: remzi <13...@gmail.com>
---
arrow/src/compute/kernels/substring.rs | 79 ++++++++++++++++++++++++++--------
1 file changed, 62 insertions(+), 17 deletions(-)
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index 1954307e9..625a37514 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -182,24 +182,69 @@ pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
start: i64,
length: Option<u64>,
) -> Result<GenericStringArray<OffsetSize>> {
- Ok(array
- .iter()
- .map(|val| {
- val.map(|val| {
- let char_count = val.chars().count();
- let start = if start >= 0 {
- start.to_usize().unwrap().min(char_count)
- } else {
- char_count - (-start).to_usize().unwrap().min(char_count)
- };
- let length = length.map_or(char_count - start, |length| {
- length.to_usize().unwrap().min(char_count - start)
- });
+ let mut vals = BufferBuilder::<u8>::new({
+ let offsets = array.value_offsets();
+ (offsets[array.len()] - offsets[0]).to_usize().unwrap()
+ });
+ let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
+ new_offsets.append(OffsetSize::zero());
+ let length = length.map(|len| len.to_usize().unwrap());
+
+ array.iter().for_each(|val| {
+ if let Some(val) = val {
+ let char_count = val.chars().count();
+ let start = if start >= 0 {
+ start.to_usize().unwrap()
+ } else {
+ char_count - (-start).to_usize().unwrap().min(char_count)
+ };
+ let (start_offset, end_offset) = get_start_end_offset(val, start, length);
+ vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
+ }
+ new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
+ });
+ let data = unsafe {
+ ArrayData::new_unchecked(
+ GenericStringArray::<OffsetSize>::get_data_type(),
+ array.len(),
+ None,
+ array
+ .data_ref()
+ .null_buffer()
+ .map(|b| b.bit_slice(array.offset(), array.len())),
+ 0,
+ vec![new_offsets.finish(), vals.finish()],
+ vec![],
+ )
+ };
+ Ok(GenericStringArray::<OffsetSize>::from(data))
+}
- val.chars().skip(start).take(length).collect::<String>()
- })
- })
- .collect::<GenericStringArray<OffsetSize>>())
+/// * `val` - string
+/// * `start` - the start char index of the substring
+/// * `length` - the char length of the substring
+///
+/// Return the `start` and `end` offset (by byte) of the substring
+fn get_start_end_offset(
+ val: &str,
+ start: usize,
+ length: Option<usize>,
+) -> (usize, usize) {
+ let len = val.len();
+ let mut offset_char_iter = val.char_indices();
+ let start_offset = offset_char_iter
+ .nth(start)
+ .map_or(len, |(offset, _)| offset);
+ let end_offset = length.map_or(len, |length| {
+ if length > 0 {
+ offset_char_iter
+ .nth(length - 1)
+ .map_or(len, |(offset, _)| offset)
+ } else {
+ start_offset
+ }
+ });
+ (start_offset, end_offset)
}
fn binary_substring<OffsetSize: OffsetSizeTrait>(