Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/07 20:31:35 UTC

[arrow-rs] branch master updated: Speed up the `substring` kernel by about 2x (#1512)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 497521f3d Speed up the `substring` kernel by about 2x (#1512)
497521f3d is described below

commit 497521f3da0eec3d1f9d922307af7d3dbcbbef76
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Fri Apr 8 04:31:30 2022 +0800

    Speed up the `substring` kernel by about 2x (#1512)
    
    * speed up substring
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * add comments
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * reformat code
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * use trait object to simplify the code
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * reformat code
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * fmt code
    
    Signed-off-by: remzi <13...@gmail.com>
---
 arrow/src/compute/kernels/substring.rs | 79 ++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 37 deletions(-)
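
For context, the rewritten kernel hoists the sign check on `start` out of the
per-row loop (a boxed closure picks the "count from the start" or "count from
the end" behaviour once), precomputes the (start, end) byte range of every
substring in a first pass, and only then allocates the output `MutableBuffer`
at its exact final size before copying the slices in a second pass, instead of
growing an empty buffer row by row. Below is a minimal usage sketch of the
kernel, not part of the patch; the exact public `substring` signature (here
taking `length: &Option<u64>`) is assumed from the arrow-rs API around this
release and may differ in other versions.

use std::sync::Arc;

use arrow::array::{Array, ArrayRef, StringArray};
use arrow::compute::kernels::substring::substring;
use arrow::error::Result;

fn main() -> Result<()> {
    let input: ArrayRef = Arc::new(StringArray::from(vec![
        Some("arrow"),
        None,
        Some("substring"),
    ]));

    // Keep everything from byte offset 3 onwards; a negative `start`
    // would count from the end of each value, as handled by
    // `cal_new_start` in the kernel below.
    let sliced = substring(input.as_ref(), 3, &None)?;
    let sliced = sliced.as_any().downcast_ref::<StringArray>().unwrap();

    assert_eq!(sliced.value(0), "ow");
    assert!(sliced.is_null(1)); // nulls are carried over unchanged
    assert_eq!(sliced.value(2), "string");
    Ok(())
}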

diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index 9b468f5d4..1be829f72 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -24,56 +24,61 @@ use crate::{
     error::{ArrowError, Result},
 };
 
-#[allow(clippy::unnecessary_wraps)]
 fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
     array: &GenericStringArray<OffsetSize>,
     start: OffsetSize,
     length: &Option<OffsetSize>,
 ) -> Result<ArrayRef> {
-    // compute current offsets
-    let offsets = array.data_ref().clone().buffers()[0].clone();
-    let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };
-
-    // compute null bitmap (copy)
+    let offsets = array.value_offsets();
     let null_bit_buffer = array.data_ref().null_buffer().cloned();
-
-    // compute values
-    let values = &array.data_ref().buffers()[1];
+    let values = array.value_data();
     let data = values.as_slice();
+    let zero = OffsetSize::zero();
 
-    let mut new_values = MutableBuffer::new(0); // we have no way to estimate how much this will be.
-    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
-
-    let mut length_so_far = OffsetSize::zero();
-    new_offsets.push(length_so_far);
-    (0..array.len()).for_each(|i| {
-        // the length of this entry
-        let length_i: OffsetSize = offsets[i + 1] - offsets[i];
-        // compute where we should start slicing this entry
-        let start = offsets[i]
-            + if start >= OffsetSize::zero() {
-                start
-            } else {
-                length_i + start
-            };
+    let cal_new_start: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> = if start
+        >= zero
+    {
+        // count from the start of string
+        Box::new(|old_start: OffsetSize, end: OffsetSize| (old_start + start).min(end))
+    } else {
+        // count from the end of string
+        Box::new(|old_start: OffsetSize, end: OffsetSize| (end + start).max(old_start))
+    };
 
-        let start = start.max(offsets[i]).min(offsets[i + 1]);
-        // compute the length of the slice
-        let length: OffsetSize = length
-            .unwrap_or(length_i)
-            // .max(0) is not needed as it is guaranteed
-            .min(offsets[i + 1] - start); // so we do not go beyond this entry
+    let cal_new_length: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> =
+        if let Some(length) = length {
+            Box::new(|start: OffsetSize, end: OffsetSize| (*length).min(end - start))
+        } else {
+            Box::new(|start: OffsetSize, end: OffsetSize| end - start)
+        };
 
-        length_so_far += length;
+    // start and end offsets for each substring
+    let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> =
+        Vec::with_capacity(array.len());
+    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
+    let mut len_so_far = zero;
+    new_offsets.push(zero);
 
-        new_offsets.push(length_so_far);
+    offsets.windows(2).for_each(|pair| {
+        let new_start = cal_new_start(pair[0], pair[1]);
+        let new_length = cal_new_length(new_start, pair[1]);
+        len_so_far += new_length;
+        new_starts_ends.push((new_start, new_start + new_length));
+        new_offsets.push(len_so_far);
+    });
 
-        // we need usize for ranges
-        let start = start.to_usize().unwrap();
-        let length = length.to_usize().unwrap();
+    // concatenate substrings into a buffer
+    let mut new_values =
+        MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap());
 
-        new_values.extend_from_slice(&data[start..start + length]);
-    });
+    new_starts_ends
+        .iter()
+        .map(|(start, end)| {
+            let start = start.to_usize().unwrap();
+            let end = end.to_usize().unwrap();
+            &data[start..end]
+        })
+        .for_each(|slice| new_values.extend_from_slice(slice));
 
     let data = unsafe {
         ArrayData::new_unchecked(