You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/11 09:18:09 UTC

[arrow-rs] branch master updated: speed up `substring_by_char` by about 2.5x (#1832)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new c08e5324d speed up `substring_by_char` by about 2.5x (#1832)
c08e5324d is described below

commit c08e5324d11a8a2b9998d0d625876815f4954968
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Sat Jun 11 17:18:04 2022 +0800

    speed up `substring_by_char` by about 2.5x (#1832)
    
    * speed up substring_by_char
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * better estimate the length of value buffer
    
    Signed-off-by: remzi <13...@gmail.com>
---
 arrow/src/compute/kernels/substring.rs | 79 ++++++++++++++++++++++++++--------
 1 file changed, 62 insertions(+), 17 deletions(-)

diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index 1954307e9..625a37514 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -182,24 +182,69 @@ pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
     start: i64,
     length: Option<u64>,
 ) -> Result<GenericStringArray<OffsetSize>> {
-    Ok(array
-        .iter()
-        .map(|val| {
-            val.map(|val| {
-                let char_count = val.chars().count();
-                let start = if start >= 0 {
-                    start.to_usize().unwrap().min(char_count)
-                } else {
-                    char_count - (-start).to_usize().unwrap().min(char_count)
-                };
-                let length = length.map_or(char_count - start, |length| {
-                    length.to_usize().unwrap().min(char_count - start)
-                });
+    let mut vals = BufferBuilder::<u8>::new({
+        let offsets = array.value_offsets();
+        (offsets[array.len()] - offsets[0]).to_usize().unwrap()
+    });
+    let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
+    new_offsets.append(OffsetSize::zero());
+    let length = length.map(|len| len.to_usize().unwrap());
+
+    array.iter().for_each(|val| {
+        if let Some(val) = val {
+            let char_count = val.chars().count();
+            let start = if start >= 0 {
+                start.to_usize().unwrap()
+            } else {
+                char_count - (-start).to_usize().unwrap().min(char_count)
+            };
+            let (start_offset, end_offset) = get_start_end_offset(val, start, length);
+            vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
+        }
+        new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
+    });
+    let data = unsafe {
+        ArrayData::new_unchecked(
+            GenericStringArray::<OffsetSize>::get_data_type(),
+            array.len(),
+            None,
+            array
+                .data_ref()
+                .null_buffer()
+                .map(|b| b.bit_slice(array.offset(), array.len())),
+            0,
+            vec![new_offsets.finish(), vals.finish()],
+            vec![],
+        )
+    };
+    Ok(GenericStringArray::<OffsetSize>::from(data))
+}
 
-                val.chars().skip(start).take(length).collect::<String>()
-            })
-        })
-        .collect::<GenericStringArray<OffsetSize>>())
+/// * `val` - string
+/// * `start` - the start char index of the substring
+/// * `length` - the char length of the substring
+///
+/// Return the `start` and `end` offset (by byte) of the substring
+fn get_start_end_offset(
+    val: &str,
+    start: usize,
+    length: Option<usize>,
+) -> (usize, usize) {
+    let len = val.len();
+    let mut offset_char_iter = val.char_indices();
+    let start_offset = offset_char_iter
+        .nth(start)
+        .map_or(len, |(offset, _)| offset);
+    let end_offset = length.map_or(len, |length| {
+        if length > 0 {
+            offset_char_iter
+                .nth(length - 1)
+                .map_or(len, |(offset, _)| offset)
+        } else {
+            start_offset
+        }
+    });
+    (start_offset, end_offset)
 }
 
 fn binary_substring<OffsetSize: OffsetSizeTrait>(