You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/10/12 16:04:08 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #3804: Optimize `regexp_replace` when the input is a sparse array

alamb commented on code in PR #3804:
URL: https://github.com/apache/arrow-datafusion/pull/3804#discussion_r993650161


##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -254,13 +255,38 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
     // with rust ones.
     let replacement = regex_replace_posix_groups(replacement);
 
-    let result = string_array
-        .iter()
-        .map(|string| {
-            string.map(|string| re.replacen(string, limit, replacement.as_str()))
-        })
-        .collect::<GenericStringArray<T>>();
-    Ok(Arc::new(result) as ArrayRef)
+    // We are going to create the underlying string buffer from its parts
+    // to be able to re-use the existing null buffer for sparse arrays.
+    let mut vals = BufferBuilder::<u8>::new({
+        let offsets = string_array.value_offsets();
+        (offsets[string_array.len()] - offsets[0])
+            .to_usize()
+            .unwrap()
+    });
+    let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
+    new_offsets.append(T::zero());
+
+    string_array.iter().for_each(|val| {
+        if let Some(val) = val {
+            let result = re.replacen(val, limit, replacement.as_str());
+            vals.append_slice(result.as_bytes());
+        }
+        new_offsets.append(T::from_usize(vals.len()).unwrap());
+    });
+
+    let data = ArrayData::try_new(
+        GenericStringArray::<T>::DATA_TYPE,
+        string_array.len(),
+        string_array
+            .data_ref()
+            .null_buffer()
+            .map(|b| b.bit_slice(string_array.offset(), string_array.len())),

Review Comment:
   👍  for handling the offset



##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -513,4 +539,40 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn test_static_pattern_regexp_replace_with_null_buffers() {
+        let values = StringArray::from(vec![
+            Some("a"),
+            None,
+            Some("b"),
+            None,
+            Some("a"),
+            None,
+            None,
+            Some("c"),
+        ]);
+        let patterns = StringArray::from(vec!["a"; 1]);
+        let replacements = StringArray::from(vec!["foo"; 1]);
+        let expected = StringArray::from(vec![
+            Some("foo"),
+            None,
+            Some("b"),
+            None,
+            Some("foo"),
+            None,
+            None,
+            Some("c"),
+        ]);
+
+        let re = _regexp_replace_static_pattern_replace::<i32>(&[
+            Arc::new(values),
+            Arc::new(patterns),
+            Arc::new(replacements),
+        ])
+        .unwrap();
+
+        assert_eq!(re.as_ref(), &expected);
+        assert_eq!(re.null_count(), 4);

Review Comment:
   To test the offset slicing, I also recommend you run the same test with sliced inputs. Something like
   
   ```rust
   let values = values.slice(2, 5);
   let expected = StringArray::from(vec![
               Some("b"),
               None,
               Some("foo"),
               None,
           ]);
   
   
           assert_eq!(re.null_count(), 2);
           let re = _regexp_replace_static_pattern_replace::<i32>(&[
               Arc::new(values),
               Arc::new(patterns),
               Arc::new(replacements),
           ])
           .unwrap();
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org