You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/05/26 18:53:08 UTC

[arrow-rs] branch master updated: support min max binary (#1725)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 3e9e9df7f support min max binary (#1725)
3e9e9df7f is described below

commit 3e9e9df7fb10f68607e34f1612b799d0bee3e3d6
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Fri May 27 02:53:04 2022 +0800

    support min max binary (#1725)
    
    Signed-off-by: remzi <13...@gmail.com>
---
 arrow/src/compute/kernels/aggregate.rs | 118 ++++++++++++++++++++++-----------
 1 file changed, 80 insertions(+), 38 deletions(-)

diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow/src/compute/kernels/aggregate.rs
index 83fce79c8..12ead669f 100644
--- a/arrow/src/compute/kernels/aggregate.rs
+++ b/arrow/src/compute/kernels/aggregate.rs
@@ -21,7 +21,8 @@ use multiversion::multiversion;
 use std::ops::Add;
 
 use crate::array::{
-    Array, BooleanArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
+    Array, BooleanArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait,
+    PrimitiveArray,
 };
 use crate::datatypes::{ArrowNativeType, ArrowNumericType};
 
@@ -32,31 +33,6 @@ fn is_nan<T: ArrowNativeType + PartialOrd + Copy>(a: T) -> bool {
     !(a == a)
 }
 
-/// Helper function to perform min/max of strings
-fn min_max_string<T, F>(array: &GenericStringArray<T>, cmp: F) -> Option<&str>
-where
-    T: OffsetSizeTrait,
-    F: Fn(&str, &str) -> bool,
-{
-    let null_count = array.null_count();
-
-    if null_count == array.len() {
-        None
-    } else if null_count == 0 {
-        // JUSTIFICATION
-        //  Benefit:  ~8% speedup
-        //  Soundness: `i` is always within the array bounds
-        (0..array.len())
-            .map(|i| unsafe { array.value_unchecked(i) })
-            .reduce(|acc, item| if cmp(acc, item) { item } else { acc })
-    } else {
-        array
-            .iter()
-            .flatten()
-            .reduce(|acc, item| if cmp(acc, item) { item } else { acc })
-    }
-}
-
 /// Returns the minimum value in the array, according to the natural order.
 /// For floating point arrays any NaN values are considered to be greater than any other non-null value
 #[cfg(not(feature = "simd"))]
@@ -79,16 +55,6 @@ where
     min_max_helper(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b)
 }
 
-/// Returns the maximum value in the string array, according to the natural order.
-pub fn max_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
-    min_max_string(array, |a, b| a < b)
-}
-
-/// Returns the minimum value in the string array, according to the natural order.
-pub fn min_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
-    min_max_string(array, |a, b| a > b)
-}
-
 /// Helper function to perform min/max lambda function on values from a numeric array.
 #[multiversion]
 #[clone(target = "x86_64+avx")]
@@ -176,6 +142,48 @@ pub fn max_boolean(array: &BooleanArray) -> Option<bool> {
         .or(Some(false))
 }
 
+/// Helper to compute min/max of [`GenericStringArray`] and [`GenericBinaryArray`]
+macro_rules! min_max_binary_string {
+    ($array: expr, $cmp: expr) => {{
+        let null_count = $array.null_count();
+        if null_count == $array.len() {
+            None
+        } else if null_count == 0 {
+            // JUSTIFICATION
+            //  Benefit:  ~8% speedup
+            //  Soundness: `i` is always within the array bounds
+            (0..$array.len())
+                .map(|i| unsafe { $array.value_unchecked(i) })
+                .reduce(|acc, item| if $cmp(acc, item) { item } else { acc })
+        } else {
+            $array
+                .iter()
+                .flatten()
+                .reduce(|acc, item| if $cmp(acc, item) { item } else { acc })
+        }
+    }};
+}
+
+/// Returns the maximum value in the binary array, according to the natural order.
+pub fn max_binary<T: OffsetSizeTrait>(array: &GenericBinaryArray<T>) -> Option<&[u8]> {
+    min_max_binary_string!(array, |a, b| a < b)
+}
+
+/// Returns the minimum value in the binary array, according to the natural order.
+pub fn min_binary<T: OffsetSizeTrait>(array: &GenericBinaryArray<T>) -> Option<&[u8]> {
+    min_max_binary_string!(array, |a, b| a > b)
+}
+
+/// Returns the maximum value in the string array, according to the natural order.
+pub fn max_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
+    min_max_binary_string!(array, |a, b| a < b)
+}
+
+/// Returns the minimum value in the string array, according to the natural order.
+pub fn min_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
+    min_max_binary_string!(array, |a, b| a > b)
+}
+
 /// Returns the sum of values in the array.
 ///
 /// Returns `None` if the array is empty or only contains null values.
@@ -885,11 +893,45 @@ mod tests {
         assert!(max(&a).unwrap().is_nan());
     }
 
+    #[test]
+    fn test_binary_min_max_with_nulls() {
+        let a = BinaryArray::from(vec![
+            Some("b".as_bytes()),
+            None,
+            None,
+            Some(b"a"),
+            Some(b"c"),
+        ]);
+        assert_eq!(Some("a".as_bytes()), min_binary(&a));
+        assert_eq!(Some("c".as_bytes()), max_binary(&a));
+    }
+
+    #[test]
+    fn test_binary_min_max_no_null() {
+        let a = BinaryArray::from(vec![Some("b".as_bytes()), Some(b"a"), Some(b"c")]);
+        assert_eq!(Some("a".as_bytes()), min_binary(&a));
+        assert_eq!(Some("c".as_bytes()), max_binary(&a));
+    }
+
+    #[test]
+    fn test_binary_min_max_all_nulls() {
+        let a = BinaryArray::from(vec![None, None]);
+        assert_eq!(None, min_binary(&a));
+        assert_eq!(None, max_binary(&a));
+    }
+
+    #[test]
+    fn test_binary_min_max_1() {
+        let a = BinaryArray::from(vec![None, None, Some("b".as_bytes()), Some(b"a")]);
+        assert_eq!(Some("a".as_bytes()), min_binary(&a));
+        assert_eq!(Some("b".as_bytes()), max_binary(&a));
+    }
+
     #[test]
     fn test_string_min_max_with_nulls() {
         let a = StringArray::from(vec![Some("b"), None, None, Some("a"), Some("c")]);
-        assert_eq!("a", min_string(&a).unwrap());
-        assert_eq!("c", max_string(&a).unwrap());
+        assert_eq!(Some("a"), min_string(&a));
+        assert_eq!(Some("c"), max_string(&a));
     }
 
     #[test]