You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/05/26 18:53:08 UTC
[arrow-rs] branch master updated: support min max binary (#1725)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 3e9e9df7f support min max binary (#1725)
3e9e9df7f is described below
commit 3e9e9df7fb10f68607e34f1612b799d0bee3e3d6
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Fri May 27 02:53:04 2022 +0800
support min max binary (#1725)
Signed-off-by: remzi <13...@gmail.com>
---
arrow/src/compute/kernels/aggregate.rs | 118 ++++++++++++++++++++++-----------
1 file changed, 80 insertions(+), 38 deletions(-)
diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow/src/compute/kernels/aggregate.rs
index 83fce79c8..12ead669f 100644
--- a/arrow/src/compute/kernels/aggregate.rs
+++ b/arrow/src/compute/kernels/aggregate.rs
@@ -21,7 +21,8 @@ use multiversion::multiversion;
use std::ops::Add;
use crate::array::{
- Array, BooleanArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
+ Array, BooleanArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait,
+ PrimitiveArray,
};
use crate::datatypes::{ArrowNativeType, ArrowNumericType};
@@ -32,31 +33,6 @@ fn is_nan<T: ArrowNativeType + PartialOrd + Copy>(a: T) -> bool {
!(a == a)
}
-/// Helper function to perform min/max of strings
-fn min_max_string<T, F>(array: &GenericStringArray<T>, cmp: F) -> Option<&str>
-where
- T: OffsetSizeTrait,
- F: Fn(&str, &str) -> bool,
-{
- let null_count = array.null_count();
-
- if null_count == array.len() {
- None
- } else if null_count == 0 {
- // JUSTIFICATION
- // Benefit: ~8% speedup
- // Soundness: `i` is always within the array bounds
- (0..array.len())
- .map(|i| unsafe { array.value_unchecked(i) })
- .reduce(|acc, item| if cmp(acc, item) { item } else { acc })
- } else {
- array
- .iter()
- .flatten()
- .reduce(|acc, item| if cmp(acc, item) { item } else { acc })
- }
-}
-
/// Returns the minimum value in the array, according to the natural order.
/// For floating point arrays any NaN values are considered to be greater than any other non-null value
#[cfg(not(feature = "simd"))]
@@ -79,16 +55,6 @@ where
min_max_helper(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b)
}
-/// Returns the maximum value in the string array, according to the natural order.
-pub fn max_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
- min_max_string(array, |a, b| a < b)
-}
-
-/// Returns the minimum value in the string array, according to the natural order.
-pub fn min_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
- min_max_string(array, |a, b| a > b)
-}
-
/// Helper function to perform min/max lambda function on values from a numeric array.
#[multiversion]
#[clone(target = "x86_64+avx")]
@@ -176,6 +142,48 @@ pub fn max_boolean(array: &BooleanArray) -> Option<bool> {
.or(Some(false))
}
+/// Helper to compute min/max of [`GenericStringArray`] and [`GenericBinaryArray`]; `$cmp(acc, item)` must return `true` when `item` should replace `acc`
+macro_rules! min_max_binary_string {
+ ($array: expr, $cmp: expr) => {{
+ let null_count = $array.null_count();
+ if null_count == $array.len() {
+ None
+ } else if null_count == 0 {
+ // JUSTIFICATION
+ // Benefit: ~8% speedup
+ // Soundness: `i` is drawn from `0..$array.len()`, so it is always within the array bounds
+ (0..$array.len())
+ .map(|i| unsafe { $array.value_unchecked(i) })
+ .reduce(|acc, item| if $cmp(acc, item) { item } else { acc })
+ } else {
+ $array
+ .iter()
+ .flatten()
+ .reduce(|acc, item| if $cmp(acc, item) { item } else { acc })
+ }
+ }};
+}
+
+/// Returns the maximum value in the binary array, according to the natural (lexicographic, byte-wise) order, or `None` if the array is empty or contains only nulls.
+pub fn max_binary<T: OffsetSizeTrait>(array: &GenericBinaryArray<T>) -> Option<&[u8]> {
+ min_max_binary_string!(array, |a, b| a < b)
+}
+
+/// Returns the minimum value in the binary array, according to the natural (lexicographic, byte-wise) order, or `None` if the array is empty or contains only nulls.
+pub fn min_binary<T: OffsetSizeTrait>(array: &GenericBinaryArray<T>) -> Option<&[u8]> {
+ min_max_binary_string!(array, |a, b| a > b)
+}
+
+/// Returns the maximum value in the string array, according to the natural (lexicographic, byte-wise) order, or `None` if the array is empty or contains only nulls.
+pub fn max_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
+ min_max_binary_string!(array, |a, b| a < b)
+}
+
+/// Returns the minimum value in the string array, according to the natural (lexicographic, byte-wise) order, or `None` if the array is empty or contains only nulls.
+pub fn min_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
+ min_max_binary_string!(array, |a, b| a > b)
+}
+
/// Returns the sum of values in the array.
///
/// Returns `None` if the array is empty or only contains null values.
@@ -885,11 +893,45 @@ mod tests {
assert!(max(&a).unwrap().is_nan());
}
+ #[test]
+ fn test_binary_min_max_with_nulls() {
+ let a = BinaryArray::from(vec![
+ Some("b".as_bytes()),
+ None,
+ None,
+ Some(b"a"),
+ Some(b"c"),
+ ]);
+ assert_eq!(Some("a".as_bytes()), min_binary(&a));
+ assert_eq!(Some("c".as_bytes()), max_binary(&a));
+ }
+
+ #[test]
+ fn test_binary_min_max_no_null() {
+ let a = BinaryArray::from(vec![Some("b".as_bytes()), Some(b"a"), Some(b"c")]);
+ assert_eq!(Some("a".as_bytes()), min_binary(&a));
+ assert_eq!(Some("c".as_bytes()), max_binary(&a));
+ }
+
+ #[test]
+ fn test_binary_min_max_all_nulls() {
+ let a = BinaryArray::from(vec![None, None]);
+ assert_eq!(None, min_binary(&a));
+ assert_eq!(None, max_binary(&a));
+ }
+
+ #[test]
+ fn test_binary_min_max_1() {
+ let a = BinaryArray::from(vec![None, None, Some("b".as_bytes()), Some(b"a")]);
+ assert_eq!(Some("a".as_bytes()), min_binary(&a));
+ assert_eq!(Some("b".as_bytes()), max_binary(&a));
+ }
+
#[test]
fn test_string_min_max_with_nulls() {
let a = StringArray::from(vec![Some("b"), None, None, Some("a"), Some("c")]);
- assert_eq!("a", min_string(&a).unwrap());
- assert_eq!("c", max_string(&a).unwrap());
+ assert_eq!(Some("a"), min_string(&a));
+ assert_eq!(Some("c"), max_string(&a));
}
#[test]