You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/01 00:29:03 UTC

[GitHub] [arrow-rs] HaoYang670 commented on a change in pull request #1512: Speed up the `substring` kernel

HaoYang670 commented on a change in pull request #1512:
URL: https://github.com/apache/arrow-rs/pull/1512#discussion_r840126475



##########
File path: arrow/src/compute/kernels/substring.rs
##########
@@ -24,56 +24,74 @@ use crate::{
     error::{ArrowError, Result},
 };
 
-#[allow(clippy::unnecessary_wraps)]
 fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
     array: &GenericStringArray<OffsetSize>,
     start: OffsetSize,
     length: &Option<OffsetSize>,
 ) -> Result<ArrayRef> {
-    // compute current offsets
-    let offsets = array.data_ref().clone().buffers()[0].clone();
-    let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };
-
-    // compute null bitmap (copy)
+    let offsets = array.value_offsets();
     let null_bit_buffer = array.data_ref().null_buffer().cloned();
-
-    // compute values
-    let values = &array.data_ref().buffers()[1];
+    let values = array.value_data();
     let data = values.as_slice();
+    let zero = OffsetSize::zero();
 
-    let mut new_values = MutableBuffer::new(0); // we have no way to estimate how much this will be.
-    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
-
-    let mut length_so_far = OffsetSize::zero();
-    new_offsets.push(length_so_far);
-    (0..array.len()).for_each(|i| {
-        // the length of this entry
-        let length_i: OffsetSize = offsets[i + 1] - offsets[i];
-        // compute where we should start slicing this entry
-        let start = offsets[i]
-            + if start >= OffsetSize::zero() {
-                start
-            } else {
-                length_i + start
-            };
-
-        let start = start.max(offsets[i]).min(offsets[i + 1]);
-        // compute the length of the slice
-        let length: OffsetSize = length
-            .unwrap_or(length_i)
-            // .max(0) is not needed as it is guaranteed
-            .min(offsets[i + 1] - start); // so we do not go beyond this entry
-
-        length_so_far += length;
+    // calculate the start offset for each substring
+    // if `start` >= 0
+    // then, count from the start of each string
+    // else, count from the end of each string
+    let new_starts: Vec<OffsetSize> = if start >= zero {
+        offsets
+            .windows(2)
+            .map(|pair| (pair[0] + start).min(pair[1]))
+            .collect()
+    } else {
+        offsets
+            .windows(2)
+            .map(|pair| (pair[1] + start).max(pair[0]))
+            .collect()
+    };
 
-        new_offsets.push(length_so_far);
+    // count the length of each substring
+    // if `length` is given
+    // then, use it
+    // else, length is `string[new_start..].len()`
+    let new_length: Vec<OffsetSize> = if let Some(length) = length {
+        offsets[1..]
+            .iter()
+            .zip(new_starts.iter())
+            .map(|(end, start)| *(length.min(&(*end - *start))))
+            .collect()
+    } else {
+        offsets[1..]
+            .iter()
+            .zip(new_starts.iter())
+            .map(|(end, start)| *end - *start)
+            .collect()
+    };
 
-        // we need usize for ranges
-        let start = start.to_usize().unwrap();
-        let length = length.to_usize().unwrap();
+    let new_offsets: Vec<OffsetSize> = [zero]
+        .iter()
+        .copied()
+        .chain(new_length.iter().scan(zero, |len_so_far, &len| {
+            *len_so_far += len;
+            Some(*len_so_far)
+        }))
+        .collect();
 
-        new_values.extend_from_slice(&data[start..start + length]);
-    });
+    // concatenate substrings into a buffer
+    let new_values = {
+        let mut new_values =
+            MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap());
+        new_starts
+            .iter()
+            .zip(new_length.iter())
+            .map(|(start, length)| {
+                (start.to_usize().unwrap(), length.to_usize().unwrap())
+            })
+            .map(|(start, length)| &data[start..start + length])

Review comment:
       Done!




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org