You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/09/15 09:10:01 UTC
[arrow] branch master updated: ARROW-10010: [Rust] Speedup
arithmetic (1.3-1.9x)
This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 49e5b46 ARROW-10010: [Rust] Speedup arithmetic (1.3-1.9x)
49e5b46 is described below
commit 49e5b465346e10a3d89e47dfb9ed5dc4e57548d3
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Tue Sep 15 11:09:09 2020 +0200
ARROW-10010: [Rust] Speedup arithmetic (1.3-1.9x)
This PR speeds-up arithmetic ops by leveraging vectorization of non-divide operations (in non-SIMD), as well as removing an un-needed operation in SIMD division.
For non-SIMD, this yields about `[-30%,-45%]` for all operations (`+-*/`)
For SIMD, this yields about `-30%` on division.
The culprit in non-SIMD was that we required the operation to return `Result<T::Native>`, which was not allowing the compiler to vectorize the operation. Only the division requires `Result`. For divide, removing the operator further speed up the operation (I do not know the reason).
The culprit in SIMD was primarily a `simd_load` too many that was not doing anything.
## Benchmarks
The benchmark used:
```
set -e
git checkout 0852869d1a9b7da4a1b91fa7cb7d4ef48e99cdec
cargo bench --bench arithmetic_kernels
git checkout divide_simd_faster
cargo bench --bench arithmetic_kernels
echo "##################################"
git checkout 0852869d1a9b7da4a1b91fa7cb7d4ef48e99cdec
cargo bench --bench arithmetic_kernels --features simd
git checkout divide_simd_faster
cargo bench --bench arithmetic_kernels --features simd
```
and below are the results for the execution of the second `bench`, which is the one that gives the differential, in my machine:
### Non-SIMD
```
Previous HEAD position was 0852869d1 Improved benches for arithmetic.
Switched to branch 'divide_simd_faster'
Compiling arrow v2.0.0-SNAPSHOT (/Users/jorgecarleitao/projects/arrow/rust/arrow)
Finished bench [optimized] target(s) in 37.24s
Running /Users/jorgecarleitao/projects/arrow/rust/target/release/deps/arithmetic_kernels-d281862a43faaf38
Gnuplot not found, using plotters backend
add 512 time: [1.4714 us 1.4758 us 1.4803 us]
change: [-44.446% -43.969% -43.522%] (p = 0.00 < 0.05)
Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
5 (5.00%) high severe
subtract 512 time: [1.4825 us 1.4844 us 1.4866 us]
change: [-45.351% -45.018% -44.686%] (p = 0.00 < 0.05)
Performance has improved.
Found 9 outliers among 100 measurements (9.00%)
5 (5.00%) high mild
4 (4.00%) high severe
multiply 512 time: [1.4895 us 1.4936 us 1.4990 us]
change: [-44.822% -44.135% -43.479%] (p = 0.00 < 0.05)
Performance has improved.
Found 9 outliers among 100 measurements (9.00%)
4 (4.00%) high mild
5 (5.00%) high severe
divide 512 time: [1.9742 us 1.9773 us 1.9810 us]
change: [-33.273% -32.688% -32.052%] (p = 0.00 < 0.05)
Performance has improved.
Found 14 outliers among 100 measurements (14.00%)
7 (7.00%) high mild
7 (7.00%) high severe
limit 512, 512 time: [374.66 ns 375.64 ns 376.53 ns]
change: [-0.1000% +0.4442% +0.9503%] (p = 0.10 > 0.05)
No change in performance detected.
Found 8 outliers among 100 measurements (8.00%)
2 (2.00%) low severe
2 (2.00%) low mild
2 (2.00%) high mild
2 (2.00%) high severe
add_nulls_512 time: [1.4880 us 1.4982 us 1.5115 us]
change: [-44.084% -43.116% -42.111%] (p = 0.00 < 0.05)
Performance has improved.
Found 16 outliers among 100 measurements (16.00%)
3 (3.00%) high mild
13 (13.00%) high severe
divide_nulls_512 time: [1.9731 us 1.9758 us 1.9790 us]
change: [-33.404% -32.570% -31.416%] (p = 0.00 < 0.05)
Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
2 (2.00%) high mild
6 (6.00%) high severe
```
### SIMD
divide is the only relevant
```
Previous HEAD position was 0852869d1 Improved benches for arithmetic.
Switched to branch 'divide_simd_faster'
Compiling arrow v2.0.0-SNAPSHOT (/Users/jorgecarleitao/projects/arrow/rust/arrow)
Finished bench [optimized] target(s) in 38.63s
Running /Users/jorgecarleitao/projects/arrow/rust/target/release/deps/arithmetic_kernels-b8dc1739cfb5ae36
Gnuplot not found, using plotters backend
add 512 time: [879.31 ns 883.95 ns 889.17 ns]
change: [-0.2041% +0.6502% +1.5484%] (p = 0.15 > 0.05)
No change in performance detected.
Found 16 outliers among 100 measurements (16.00%)
5 (5.00%) high mild
11 (11.00%) high severe
subtract 512 time: [864.99 ns 866.95 ns 868.95 ns]
change: [-4.8531% -4.1561% -3.5163%] (p = 0.00 < 0.05)
Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
2 (2.00%) high mild
5 (5.00%) high severe
multiply 512 time: [862.85 ns 864.87 ns 867.71 ns]
change: [-3.8532% -3.1774% -2.4459%] (p = 0.00 < 0.05)
Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
5 (5.00%) high severe
divide 512 time: [1.9703 us 1.9771 us 1.9843 us]
change: [-30.046% -29.457% -28.903%] (p = 0.00 < 0.05)
Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
1 (1.00%) high severe
limit 512, 512 time: [368.89 ns 369.96 ns 370.96 ns]
change: [-1.9574% -1.0063% -0.0347%] (p = 0.04 < 0.05)
Change within noise threshold.
Found 26 outliers among 100 measurements (26.00%)
5 (5.00%) low severe
6 (6.00%) low mild
9 (9.00%) high mild
6 (6.00%) high severe
add_nulls_512 time: [871.97 ns 876.99 ns 883.57 ns]
change: [-5.1106% -3.6889% -2.3080%] (p = 0.00 < 0.05)
Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
2 (2.00%) high mild
6 (6.00%) high severe
divide_nulls_512 time: [1.9582 us 1.9625 us 1.9678 us]
change: [-34.188% -33.161% -32.136%] (p = 0.00 < 0.05)
Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
2 (2.00%) high mild
6 (6.00%) high severe
```
Closes #8191 from jorgecarleitao/divide_simd_faster
Authored-by: Jorge C. Leitao <jo...@gmail.com>
Signed-off-by: Neville Dipale <ne...@gmail.com>
---
rust/arrow/benches/arithmetic_kernels.rs | 27 ++++++--
rust/arrow/src/compute/kernels/arithmetic.rs | 93 +++++++++++++++++++++-------
2 files changed, 91 insertions(+), 29 deletions(-)
diff --git a/rust/arrow/benches/arithmetic_kernels.rs b/rust/arrow/benches/arithmetic_kernels.rs
index c71a666..93ad32b 100644
--- a/rust/arrow/benches/arithmetic_kernels.rs
+++ b/rust/arrow/benches/arithmetic_kernels.rs
@@ -19,6 +19,7 @@
extern crate criterion;
use criterion::Criterion;
+use rand::Rng;
use std::sync::Arc;
extern crate arrow;
@@ -27,10 +28,17 @@ use arrow::array::*;
use arrow::compute::kernels::arithmetic::*;
use arrow::compute::kernels::limit::*;
-fn create_array(size: usize) -> ArrayRef {
+fn create_array(size: usize, with_nulls: bool) -> ArrayRef {
+ // use random numbers to avoid spurious compiler optimizations wrt to branching
+ let mut rng = rand::thread_rng();
let mut builder = Float32Builder::new(size);
- for i in 0..size {
- builder.append_value(1.0 + 1.0 * i as f32).unwrap();
+
+ for _ in 0..size {
+ if with_nulls && rng.gen::<f32>() > 0.5 {
+ builder.append_null().unwrap();
+ } else {
+ builder.append_value(rng.gen()).unwrap();
+ }
}
Arc::new(builder.finish())
}
@@ -64,8 +72,8 @@ fn bench_limit(arr_a: &ArrayRef, max: usize) {
}
fn add_benchmark(c: &mut Criterion) {
- let arr_a = create_array(512);
- let arr_b = create_array(512);
+ let arr_a = create_array(512, false);
+ let arr_b = create_array(512, false);
c.bench_function("add 512", |b| b.iter(|| bench_add(&arr_a, &arr_b)));
c.bench_function("subtract 512", |b| {
@@ -76,6 +84,15 @@ fn add_benchmark(c: &mut Criterion) {
});
c.bench_function("divide 512", |b| b.iter(|| bench_divide(&arr_a, &arr_b)));
c.bench_function("limit 512, 512", |b| b.iter(|| bench_limit(&arr_a, 512)));
+
+ let arr_a_nulls = create_array(512, false);
+ let arr_b_nulls = create_array(512, false);
+ c.bench_function("add_nulls_512", |b| {
+ b.iter(|| bench_add(&arr_a_nulls, &arr_b_nulls))
+ });
+ c.bench_function("divide_nulls_512", |b| {
+ b.iter(|| bench_divide(&arr_a_nulls, &arr_b_nulls))
+ });
}
criterion_group!(benches, add_benchmark);
diff --git a/rust/arrow/src/compute/kernels/arithmetic.rs b/rust/arrow/src/compute/kernels/arithmetic.rs
index 03dc169..9b28762 100644
--- a/rust/arrow/src/compute/kernels/arithmetic.rs
+++ b/rust/arrow/src/compute/kernels/arithmetic.rs
@@ -31,7 +31,6 @@ use std::sync::Arc;
use num::{One, Zero};
-use crate::array::*;
#[cfg(feature = "simd")]
use crate::bitmap::Bitmap;
use crate::buffer::Buffer;
@@ -43,11 +42,15 @@ use crate::compute::util::simd_load_set_invalid;
use crate::datatypes;
use crate::datatypes::ToByteSlice;
use crate::error::{ArrowError, Result};
-use crate::util::bit_util;
+use crate::{array::*, util::bit_util};
/// Helper function to perform math lambda function on values from two arrays. If either
/// left or right value is null then the output value is also null, so `1 + null` is
/// `null`.
+///
+/// # Errors
+///
+/// This function errors if the arrays have different lengths
pub fn math_op<T, F>(
left: &PrimitiveArray<T>,
right: &PrimitiveArray<T>,
@@ -55,7 +58,47 @@ pub fn math_op<T, F>(
) -> Result<PrimitiveArray<T>>
where
T: datatypes::ArrowNumericType,
- F: Fn(T::Native, T::Native) -> Result<T::Native>,
+ F: Fn(T::Native, T::Native) -> T::Native,
+{
+ if left.len() != right.len() {
+ return Err(ArrowError::ComputeError(
+ "Cannot perform math operation on arrays of different length".to_string(),
+ ));
+ }
+
+ let null_bit_buffer =
+ combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
+
+ let values = (0..left.len())
+ .map(|i| op(left.value(i), right.value(i)))
+ .collect::<Vec<T::Native>>();
+
+ let data = ArrayData::new(
+ T::get_data_type(),
+ left.len(),
+ None,
+ null_bit_buffer,
+ 0,
+ vec![Buffer::from(values.to_byte_slice())],
+ vec![],
+ );
+ Ok(PrimitiveArray::<T>::from(Arc::new(data)))
+}
+
+/// Helper function to divide two arrays.
+///
+/// # Errors
+///
+/// This function errors if:
+/// * the arrays have different lengths
+/// * a division by zero is found
+fn math_divide<T>(
+ left: &PrimitiveArray<T>,
+ right: &PrimitiveArray<T>,
+) -> Result<PrimitiveArray<T>>
+where
+ T: datatypes::ArrowNumericType,
+ T::Native: Div<Output = T::Native> + Zero,
{
if left.len() != right.len() {
return Err(ArrowError::ComputeError(
@@ -68,20 +111,32 @@ where
let mut values = Vec::with_capacity(left.len());
if let Some(b) = &null_bit_buffer {
+ // some value is null
for i in 0..left.len() {
- unsafe {
+ values.push(unsafe {
if bit_util::get_bit_raw(b.raw_data(), i) {
- values.push(op(left.value(i), right.value(i))?);
+ let right_value = right.value(i);
+ if right_value.is_zero() {
+ return Err(ArrowError::DivideByZero);
+ } else {
+ left.value(i) / right_value
+ }
} else {
- values.push(T::default_value())
+ T::default_value()
}
- }
+ });
}
} else {
+ // no value is null
for i in 0..left.len() {
- values.push(op(left.value(i), right.value(i))?);
+ let right_value = right.value(i);
+ values.push(if right_value.is_zero() {
+ return Err(ArrowError::DivideByZero);
+ } else {
+ left.value(i) / right_value
+ });
}
- }
+ };
let data = ArrayData::new(
T::get_data_type(),
@@ -170,7 +225,7 @@ where
// Create the combined `Bitmap`
let null_bit_buffer =
combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
- let bitmap = null_bit_buffer.map(Bitmap::from);
+ let bitmap = null_bit_buffer.clone().map(Bitmap::from);
let lanes = T::lanes();
let buffer_size = left.len() * mem::size_of::<T::Native>();
@@ -183,8 +238,6 @@ where
if T::mask_any(is_zero) {
return Err(ArrowError::DivideByZero);
}
- let right_no_invalid_zeros =
- unsafe { simd_load_set_invalid(right, &bitmap, i, lanes, T::Native::one()) };
let simd_left = T::load(left.value_slice(i, lanes));
let simd_result = T::bin_op(simd_left, right_no_invalid_zeros, |a, b| a / b);
@@ -197,8 +250,6 @@ where
T::write(simd_result, result_slice);
}
- let null_bit_buffer = bitmap.map(|b| b.bits);
-
let data = ArrayData::new(
T::get_data_type(),
left.len(),
@@ -229,7 +280,7 @@ where
return simd_math_op(&left, &right, |a, b| a + b);
#[allow(unreachable_code)]
- math_op(left, right, |a, b| Ok(a + b))
+ math_op(left, right, |a, b| a + b)
}
/// Perform `left - right` operation on two arrays. If either left or right value is null
@@ -250,7 +301,7 @@ where
return simd_math_op(&left, &right, |a, b| a - b);
#[allow(unreachable_code)]
- math_op(left, right, |a, b| Ok(a - b))
+ math_op(left, right, |a, b| a - b)
}
/// Perform `left * right` operation on two arrays. If either left or right value is null
@@ -271,7 +322,7 @@ where
return simd_math_op(&left, &right, |a, b| a * b);
#[allow(unreachable_code)]
- math_op(left, right, |a, b| Ok(a * b))
+ math_op(left, right, |a, b| a * b)
}
/// Perform `left / right` operation on two arrays. If either left or right value is null
@@ -294,13 +345,7 @@ where
return simd_divide(&left, &right);
#[allow(unreachable_code)]
- math_op(left, right, |a, b| {
- if b.is_zero() {
- Err(ArrowError::DivideByZero)
- } else {
- Ok(a / b)
- }
- })
+ math_divide(&left, &right)
}
#[cfg(test)]