You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/09/15 09:10:01 UTC
[arrow] branch master updated: ARROW-10010: [Rust] Speedup arithmetic (1.3-1.9x)

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 49e5b46  ARROW-10010: [Rust] Speedup arithmetic (1.3-1.9x)
49e5b46 is described below

commit 49e5b465346e10a3d89e47dfb9ed5dc4e57548d3
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Tue Sep 15 11:09:09 2020 +0200

    ARROW-10010: [Rust] Speedup arithmetic (1.3-1.9x)
    
    This PR speeds-up arithmetic ops by leveraging vectorization of non-divide operations (in non-SIMD), as well as removing an un-needed operation in SIMD division.
    
    For non-SIMD, this yields about `[-30%,-45%]` for all operations (`+-*/`)
    For SIMD, this yields about `-30%` on division.
    
    The culprit in non-SIMD was that we required the operation to return `Result<T::Native>`, which was not allowing the compiler to vectorize the operation. Only the division requires `Result`. For divide, removing the operator further speed up the operation (I do not know the reason).
    
    The culprit in SIMD was primarily a `simd_load` too many that was not doing anything.
    
    ## Benchmarks
    
    The benchmark used:
    
    ```
    set -e
    git checkout 0852869d1a9b7da4a1b91fa7cb7d4ef48e99cdec
    cargo bench --bench arithmetic_kernels
    git checkout divide_simd_faster
    cargo bench --bench arithmetic_kernels
    echo "##################################"
    git checkout 0852869d1a9b7da4a1b91fa7cb7d4ef48e99cdec
    cargo bench --bench arithmetic_kernels --features simd
    git checkout divide_simd_faster
    cargo bench --bench arithmetic_kernels --features simd
    ```
    
    and below are the results for the execution of the second `bench`, which is the one that gives the differential, in my machine:
    
    ### Non-SIMD
    
    ```
    Previous HEAD position was 0852869d1 Improved benches for arithmetic.
    Switched to branch 'divide_simd_faster'
       Compiling arrow v2.0.0-SNAPSHOT (/Users/jorgecarleitao/projects/arrow/rust/arrow)
        Finished bench [optimized] target(s) in 37.24s
         Running /Users/jorgecarleitao/projects/arrow/rust/target/release/deps/arithmetic_kernels-d281862a43faaf38
    Gnuplot not found, using plotters backend
    add 512                 time:   [1.4714 us 1.4758 us 1.4803 us]
                            change: [-44.446% -43.969% -43.522%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 5 outliers among 100 measurements (5.00%)
      5 (5.00%) high severe
    
    subtract 512            time:   [1.4825 us 1.4844 us 1.4866 us]
                            change: [-45.351% -45.018% -44.686%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 9 outliers among 100 measurements (9.00%)
      5 (5.00%) high mild
      4 (4.00%) high severe
    
    multiply 512            time:   [1.4895 us 1.4936 us 1.4990 us]
                            change: [-44.822% -44.135% -43.479%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 9 outliers among 100 measurements (9.00%)
      4 (4.00%) high mild
      5 (5.00%) high severe
    
    divide 512              time:   [1.9742 us 1.9773 us 1.9810 us]
                            change: [-33.273% -32.688% -32.052%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 14 outliers among 100 measurements (14.00%)
      7 (7.00%) high mild
      7 (7.00%) high severe
    
    limit 512, 512          time:   [374.66 ns 375.64 ns 376.53 ns]
                            change: [-0.1000% +0.4442% +0.9503%] (p = 0.10 > 0.05)
                            No change in performance detected.
    Found 8 outliers among 100 measurements (8.00%)
      2 (2.00%) low severe
      2 (2.00%) low mild
      2 (2.00%) high mild
      2 (2.00%) high severe
    
    add_nulls_512           time:   [1.4880 us 1.4982 us 1.5115 us]
                            change: [-44.084% -43.116% -42.111%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 16 outliers among 100 measurements (16.00%)
      3 (3.00%) high mild
      13 (13.00%) high severe
    
    divide_nulls_512        time:   [1.9731 us 1.9758 us 1.9790 us]
                            change: [-33.404% -32.570% -31.416%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 8 outliers among 100 measurements (8.00%)
      2 (2.00%) high mild
      6 (6.00%) high severe
    ```
    
    ### SIMD
    
    divide is the only relevant
    
    ```
    Previous HEAD position was 0852869d1 Improved benches for arithmetic.
    Switched to branch 'divide_simd_faster'
       Compiling arrow v2.0.0-SNAPSHOT (/Users/jorgecarleitao/projects/arrow/rust/arrow)
        Finished bench [optimized] target(s) in 38.63s
         Running /Users/jorgecarleitao/projects/arrow/rust/target/release/deps/arithmetic_kernels-b8dc1739cfb5ae36
    Gnuplot not found, using plotters backend
    add 512                 time:   [879.31 ns 883.95 ns 889.17 ns]
                            change: [-0.2041% +0.6502% +1.5484%] (p = 0.15 > 0.05)
                            No change in performance detected.
    Found 16 outliers among 100 measurements (16.00%)
      5 (5.00%) high mild
      11 (11.00%) high severe
    
    subtract 512            time:   [864.99 ns 866.95 ns 868.95 ns]
                            change: [-4.8531% -4.1561% -3.5163%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 7 outliers among 100 measurements (7.00%)
      2 (2.00%) high mild
      5 (5.00%) high severe
    
    multiply 512            time:   [862.85 ns 864.87 ns 867.71 ns]
                            change: [-3.8532% -3.1774% -2.4459%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 5 outliers among 100 measurements (5.00%)
      5 (5.00%) high severe
    
    divide 512              time:   [1.9703 us 1.9771 us 1.9843 us]
                            change: [-30.046% -29.457% -28.903%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 1 outliers among 100 measurements (1.00%)
      1 (1.00%) high severe
    
    limit 512, 512          time:   [368.89 ns 369.96 ns 370.96 ns]
                            change: [-1.9574% -1.0063% -0.0347%] (p = 0.04 < 0.05)
                            Change within noise threshold.
    Found 26 outliers among 100 measurements (26.00%)
      5 (5.00%) low severe
      6 (6.00%) low mild
      9 (9.00%) high mild
      6 (6.00%) high severe
    
    add_nulls_512           time:   [871.97 ns 876.99 ns 883.57 ns]
                            change: [-5.1106% -3.6889% -2.3080%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 8 outliers among 100 measurements (8.00%)
      2 (2.00%) high mild
      6 (6.00%) high severe
    
    divide_nulls_512        time:   [1.9582 us 1.9625 us 1.9678 us]
                            change: [-34.188% -33.161% -32.136%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 8 outliers among 100 measurements (8.00%)
      2 (2.00%) high mild
      6 (6.00%) high severe
    ```
    
    Closes #8191 from jorgecarleitao/divide_simd_faster
    
    Authored-by: Jorge C. Leitao <jo...@gmail.com>
    Signed-off-by: Neville Dipale <ne...@gmail.com>
---
 rust/arrow/benches/arithmetic_kernels.rs     | 27 ++++++--
 rust/arrow/src/compute/kernels/arithmetic.rs | 93 +++++++++++++++++++++-------
 2 files changed, 91 insertions(+), 29 deletions(-)

diff --git a/rust/arrow/benches/arithmetic_kernels.rs b/rust/arrow/benches/arithmetic_kernels.rs
index c71a666..93ad32b 100644
--- a/rust/arrow/benches/arithmetic_kernels.rs
+++ b/rust/arrow/benches/arithmetic_kernels.rs
@@ -19,6 +19,7 @@
 extern crate criterion;
 use criterion::Criterion;
 
+use rand::Rng;
 use std::sync::Arc;
 
 extern crate arrow;
@@ -27,10 +28,17 @@ use arrow::array::*;
 use arrow::compute::kernels::arithmetic::*;
 use arrow::compute::kernels::limit::*;
 
-fn create_array(size: usize) -> ArrayRef {
+fn create_array(size: usize, with_nulls: bool) -> ArrayRef {
+    // use random numbers to avoid spurious compiler optimizations wrt to branching
+    let mut rng = rand::thread_rng();
     let mut builder = Float32Builder::new(size);
-    for i in 0..size {
-        builder.append_value(1.0 + 1.0 * i as f32).unwrap();
+
+    for _ in 0..size {
+        if with_nulls && rng.gen::<f32>() > 0.5 {
+            builder.append_null().unwrap();
+        } else {
+            builder.append_value(rng.gen()).unwrap();
+        }
     }
     Arc::new(builder.finish())
 }
@@ -64,8 +72,8 @@ fn bench_limit(arr_a: &ArrayRef, max: usize) {
 }
 
 fn add_benchmark(c: &mut Criterion) {
-    let arr_a = create_array(512);
-    let arr_b = create_array(512);
+    let arr_a = create_array(512, false);
+    let arr_b = create_array(512, false);
 
     c.bench_function("add 512", |b| b.iter(|| bench_add(&arr_a, &arr_b)));
     c.bench_function("subtract 512", |b| {
@@ -76,6 +84,15 @@ fn add_benchmark(c: &mut Criterion) {
     });
     c.bench_function("divide 512", |b| b.iter(|| bench_divide(&arr_a, &arr_b)));
     c.bench_function("limit 512, 512", |b| b.iter(|| bench_limit(&arr_a, 512)));
+
+    let arr_a_nulls = create_array(512, false);
+    let arr_b_nulls = create_array(512, false);
+    c.bench_function("add_nulls_512", |b| {
+        b.iter(|| bench_add(&arr_a_nulls, &arr_b_nulls))
+    });
+    c.bench_function("divide_nulls_512", |b| {
+        b.iter(|| bench_divide(&arr_a_nulls, &arr_b_nulls))
+    });
 }
 
 criterion_group!(benches, add_benchmark);
diff --git a/rust/arrow/src/compute/kernels/arithmetic.rs b/rust/arrow/src/compute/kernels/arithmetic.rs
index 03dc169..9b28762 100644
--- a/rust/arrow/src/compute/kernels/arithmetic.rs
+++ b/rust/arrow/src/compute/kernels/arithmetic.rs
@@ -31,7 +31,6 @@ use std::sync::Arc;
 
 use num::{One, Zero};
 
-use crate::array::*;
 #[cfg(feature = "simd")]
 use crate::bitmap::Bitmap;
 use crate::buffer::Buffer;
@@ -43,11 +42,15 @@ use crate::compute::util::simd_load_set_invalid;
 use crate::datatypes;
 use crate::datatypes::ToByteSlice;
 use crate::error::{ArrowError, Result};
-use crate::util::bit_util;
+use crate::{array::*, util::bit_util};
 
 /// Helper function to perform math lambda function on values from two arrays. If either
 /// left or right value is null then the output value is also null, so `1 + null` is
 /// `null`.
+///
+/// # Errors
+///
+/// This function errors if the arrays have different lengths
 pub fn math_op<T, F>(
     left: &PrimitiveArray<T>,
     right: &PrimitiveArray<T>,
@@ -55,7 +58,47 @@ pub fn math_op<T, F>(
 ) -> Result<PrimitiveArray<T>>
 where
     T: datatypes::ArrowNumericType,
-    F: Fn(T::Native, T::Native) -> Result<T::Native>,
+    F: Fn(T::Native, T::Native) -> T::Native,
+{
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform math operation on arrays of different length".to_string(),
+        ));
+    }
+
+    let null_bit_buffer =
+        combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
+
+    let values = (0..left.len())
+        .map(|i| op(left.value(i), right.value(i)))
+        .collect::<Vec<T::Native>>();
+
+    let data = ArrayData::new(
+        T::get_data_type(),
+        left.len(),
+        None,
+        null_bit_buffer,
+        0,
+        vec![Buffer::from(values.to_byte_slice())],
+        vec![],
+    );
+    Ok(PrimitiveArray::<T>::from(Arc::new(data)))
+}
+
+/// Helper function to divide two arrays.
+///
+/// # Errors
+///
+/// This function errors if:
+/// * the arrays have different lengths
+/// * a division by zero is found
+fn math_divide<T>(
+    left: &PrimitiveArray<T>,
+    right: &PrimitiveArray<T>,
+) -> Result<PrimitiveArray<T>>
+where
+    T: datatypes::ArrowNumericType,
+    T::Native: Div<Output = T::Native> + Zero,
 {
     if left.len() != right.len() {
         return Err(ArrowError::ComputeError(
@@ -68,20 +111,32 @@ where
 
     let mut values = Vec::with_capacity(left.len());
     if let Some(b) = &null_bit_buffer {
+        // some value is null
         for i in 0..left.len() {
-            unsafe {
+            values.push(unsafe {
                 if bit_util::get_bit_raw(b.raw_data(), i) {
-                    values.push(op(left.value(i), right.value(i))?);
+                    let right_value = right.value(i);
+                    if right_value.is_zero() {
+                        return Err(ArrowError::DivideByZero);
+                    } else {
+                        left.value(i) / right_value
+                    }
                 } else {
-                    values.push(T::default_value())
+                    T::default_value()
                 }
-            }
+            });
         }
     } else {
+        // no value is null
         for i in 0..left.len() {
-            values.push(op(left.value(i), right.value(i))?);
+            let right_value = right.value(i);
+            values.push(if right_value.is_zero() {
+                return Err(ArrowError::DivideByZero);
+            } else {
+                left.value(i) / right_value
+            });
         }
-    }
+    };
 
     let data = ArrayData::new(
         T::get_data_type(),
@@ -170,7 +225,7 @@ where
     // Create the combined `Bitmap`
     let null_bit_buffer =
         combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
-    let bitmap = null_bit_buffer.map(Bitmap::from);
+    let bitmap = null_bit_buffer.clone().map(Bitmap::from);
 
     let lanes = T::lanes();
     let buffer_size = left.len() * mem::size_of::<T::Native>();
@@ -183,8 +238,6 @@ where
         if T::mask_any(is_zero) {
             return Err(ArrowError::DivideByZero);
         }
-        let right_no_invalid_zeros =
-            unsafe { simd_load_set_invalid(right, &bitmap, i, lanes, T::Native::one()) };
         let simd_left = T::load(left.value_slice(i, lanes));
         let simd_result = T::bin_op(simd_left, right_no_invalid_zeros, |a, b| a / b);
 
@@ -197,8 +250,6 @@ where
         T::write(simd_result, result_slice);
     }
 
-    let null_bit_buffer = bitmap.map(|b| b.bits);
-
     let data = ArrayData::new(
         T::get_data_type(),
         left.len(),
@@ -229,7 +280,7 @@ where
     return simd_math_op(&left, &right, |a, b| a + b);
 
     #[allow(unreachable_code)]
-    math_op(left, right, |a, b| Ok(a + b))
+    math_op(left, right, |a, b| a + b)
 }
 
 /// Perform `left - right` operation on two arrays. If either left or right value is null
@@ -250,7 +301,7 @@ where
     return simd_math_op(&left, &right, |a, b| a - b);
 
     #[allow(unreachable_code)]
-    math_op(left, right, |a, b| Ok(a - b))
+    math_op(left, right, |a, b| a - b)
 }
 
 /// Perform `left * right` operation on two arrays. If either left or right value is null
@@ -271,7 +322,7 @@ where
     return simd_math_op(&left, &right, |a, b| a * b);
 
     #[allow(unreachable_code)]
-    math_op(left, right, |a, b| Ok(a * b))
+    math_op(left, right, |a, b| a * b)
 }
 
 /// Perform `left / right` operation on two arrays. If either left or right value is null
@@ -294,13 +345,7 @@ where
     return simd_divide(&left, &right);
 
     #[allow(unreachable_code)]
-    math_op(left, right, |a, b| {
-        if b.is_zero() {
-            Err(ArrowError::DivideByZero)
-        } else {
-            Ok(a / b)
-        }
-    })
+    math_divide(&left, &right)
 }
 
 #[cfg(test)]