You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2019/09/01 15:25:20 UTC
[arrow] branch master updated: ARROW-4752: [Rust] Add explicit SIMD vectorization for the divide kernel

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a985483  ARROW-4752: [Rust] Add explicit SIMD vectorization for the divide kernel
a985483 is described below

commit a985483eb96fb9352c7ced9d08fb35d46b630bfc
Author: Paddy Horan <pa...@hotmail.com>
AuthorDate: Sun Sep 1 09:25:01 2019 -0600

    ARROW-4752: [Rust] Add explicit SIMD vectorization for the divide kernel
    
    Adds SIMD support for divide binary op.  There are probably a few more optimizations to be made but the performance of divide is now very close to the other binary ops on my computer which is pretty good considering all the extra checking involved to allow for divide by zero correctly.
    
    PTAL
    
    Closes #5143 from paddyhoran/compute-divide and squashes the following commits:
    
    71a5a31aa <Paddy Horan> Renames utility function and makes it more flexible.
    f63a41039 <Paddy Horan> Adds testing for `simd_load_without_invalid_zeros`.
    98189212d <Paddy Horan> Adds test for `is_valid` utility function.
    0d221edad <Paddy Horan> Cleans up docs and trait bounds.
    66580c21a <Paddy Horan> Minor fixes.
    baee6d5de <Paddy Horan> WIP for SIMD support for divide op.
    
    Authored-by: Paddy Horan <pa...@hotmail.com>
    Signed-off-by: Andy Grove <an...@gmail.com>
---
 rust/arrow/benches/arithmetic_kernels.rs     |  20 +++++
 rust/arrow/src/compute/kernels/arithmetic.rs |  90 +++++++++++++++++++-
 rust/arrow/src/compute/util.rs               | 119 ++++++++++++++++++++++++++-
 rust/arrow/src/datatypes.rs                  |  40 +++++++++
 4 files changed, 262 insertions(+), 7 deletions(-)

diff --git a/rust/arrow/benches/arithmetic_kernels.rs b/rust/arrow/benches/arithmetic_kernels.rs
index e985168..bff97e4 100644
--- a/rust/arrow/benches/arithmetic_kernels.rs
+++ b/rust/arrow/benches/arithmetic_kernels.rs
@@ -17,7 +17,9 @@
 
 #[macro_use]
 extern crate criterion;
+use arrow::error::ArrowError;
 use criterion::Criterion;
+use num::Zero;
 
 use std::sync::Arc;
 
@@ -63,6 +65,12 @@ fn multiply_simd(size: usize) {
     criterion::black_box(multiply(&arr_a, &arr_b).unwrap());
 }
 
+fn divide_simd(size: usize) {
+    let arr_a = create_array(size);
+    let arr_b = create_array(size);
+    criterion::black_box(divide(&arr_a, &arr_b).unwrap());
+}
+
 fn sum_no_simd(size: usize) {
     let arr_a = create_array(size);
     criterion::black_box(sum(&arr_a).unwrap());
@@ -86,6 +94,18 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bin_op_no_simd(512, |a, b| Ok(a * b)))
     });
     c.bench_function("multiply 512 simd", |b| b.iter(|| multiply_simd(512)));
+    c.bench_function("divide 512", |b| {
+        b.iter(|| {
+            bin_op_no_simd(512, |a, b| {
+                if b.is_zero() {
+                    Err(ArrowError::DivideByZero)
+                } else {
+                    Ok(a / b)
+                }
+            })
+        })
+    });
+    c.bench_function("divide 512 simd", |b| b.iter(|| divide_simd(512)));
     c.bench_function("sum 512 no simd", |b| b.iter(|| sum_no_simd(512)));
     c.bench_function("limit 512, 256 no simd", |b| {
         b.iter(|| limit_no_simd(512, 256))
diff --git a/rust/arrow/src/compute/kernels/arithmetic.rs b/rust/arrow/src/compute/kernels/arithmetic.rs
index abfdc1d..6e99280 100644
--- a/rust/arrow/src/compute/kernels/arithmetic.rs
+++ b/rust/arrow/src/compute/kernels/arithmetic.rs
@@ -27,11 +27,12 @@ use std::ops::{Add, Div, Mul, Sub};
 use std::slice::from_raw_parts_mut;
 use std::sync::Arc;
 
-use num::Zero;
+use num::{One, Zero};
 
 use crate::array::*;
+use crate::bitmap::Bitmap;
 use crate::buffer::MutableBuffer;
-use crate::compute::util::apply_bin_op_to_option_bitmap;
+use crate::compute::util::{apply_bin_op_to_option_bitmap, simd_load_set_invalid};
 use crate::datatypes;
 use crate::error::{ArrowError, Result};
 
@@ -121,6 +122,71 @@ where
     Ok(PrimitiveArray::<T>::from(Arc::new(data)))
 }
 
+/// SIMD vectorized version of `divide`, the divide kernel needs it's own implementation as there
+/// is a need to handle situations where a divide by `0` occurs.  This is complicated by `NULL`
+/// slots and padding.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn simd_divide<T>(
+    left: &PrimitiveArray<T>,
+    right: &PrimitiveArray<T>,
+) -> Result<PrimitiveArray<T>>
+where
+    T: datatypes::ArrowNumericType,
+    T::Native: One + Zero,
+{
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform math operation on arrays of different length".to_string(),
+        ));
+    }
+
+    // Create the combined `Bitmap`
+    let null_bit_buffer = apply_bin_op_to_option_bitmap(
+        left.data().null_bitmap(),
+        right.data().null_bitmap(),
+        |a, b| a & b,
+    )?;
+    let bitmap = null_bit_buffer.map(Bitmap::from);
+
+    let lanes = T::lanes();
+    let buffer_size = left.len() * mem::size_of::<T::Native>();
+    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);
+
+    for i in (0..left.len()).step_by(lanes) {
+        let right_no_invalid_zeros =
+            unsafe { simd_load_set_invalid(right, &bitmap, i, lanes, T::Native::one()) };
+        let is_zero = T::eq(T::init(T::Native::zero()), right_no_invalid_zeros);
+        if T::mask_any(is_zero) {
+            return Err(ArrowError::DivideByZero);
+        }
+        let right_no_invalid_zeros =
+            unsafe { simd_load_set_invalid(right, &bitmap, i, lanes, T::Native::one()) };
+        let simd_left = T::load(left.value_slice(i, lanes));
+        let simd_result = T::bin_op(simd_left, right_no_invalid_zeros, |a, b| a / b);
+
+        let result_slice: &mut [T::Native] = unsafe {
+            from_raw_parts_mut(
+                (result.data_mut().as_mut_ptr() as *mut T::Native).offset(i as isize),
+                lanes,
+            )
+        };
+        T::write(simd_result, result_slice);
+    }
+
+    let null_bit_buffer = bitmap.and_then(|b| Some(b.bits));
+
+    let data = ArrayData::new(
+        T::get_data_type(),
+        left.len(),
+        None,
+        null_bit_buffer,
+        left.offset(),
+        vec![result.freeze()],
+        vec![],
+    );
+    Ok(PrimitiveArray::<T>::from(Arc::new(data)))
+}
+
 /// Perform `left + right` operation on two arrays. If either left or right value is null
 /// then the result is also null.
 pub fn add<T>(
@@ -197,8 +263,13 @@ where
         + Sub<Output = T::Native>
         + Mul<Output = T::Native>
         + Div<Output = T::Native>
-        + Zero,
+        + Zero
+        + One,
 {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    return simd_divide(&left, &right);
+
+    #[allow(unreachable_code)]
     math_op(left, right, |a, b| {
         if b.is_zero() {
             Err(ArrowError::DivideByZero)
@@ -275,6 +346,19 @@ mod tests {
     }
 
     #[test]
+    fn test_primitive_array_divide_with_nulls() {
+        let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9), None]);
+        let b = Int32Array::from(vec![Some(5), Some(6), Some(8), Some(9), None, None]);
+        let c = divide(&a, &b).unwrap();
+        assert_eq!(3, c.value(0));
+        assert_eq!(true, c.is_null(1));
+        assert_eq!(1, c.value(2));
+        assert_eq!(0, c.value(3));
+        assert_eq!(true, c.is_null(4));
+        assert_eq!(true, c.is_null(5));
+    }
+
+    #[test]
     fn test_primitive_array_divide_by_zero() {
         let a = Int32Array::from(vec![15]);
         let b = Int32Array::from(vec![0]);
diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs
index dc1f54f..8cd5a11 100644
--- a/rust/arrow/src/compute/util.rs
+++ b/rust/arrow/src/compute/util.rs
@@ -20,12 +20,15 @@
 use crate::array::*;
 use crate::bitmap::Bitmap;
 use crate::buffer::Buffer;
+use crate::datatypes::*;
 use crate::error::Result;
+use num::One;
+use std::cmp::min;
 
 /// Applies a given binary operation, `op`, to two references to `Option<Bitmap>`'s.
 ///
 /// This function is useful when implementing operations on higher level arrays.
-pub(crate) fn apply_bin_op_to_option_bitmap<F>(
+pub(super) fn apply_bin_op_to_option_bitmap<F>(
     left: &Option<Bitmap>,
     right: &Option<Bitmap>,
     op: F,
@@ -87,6 +90,68 @@ pub(super) fn take_value_indices_from_list(
     (UInt32Array::from(values), new_offsets)
 }
 
+/// Creates a new SIMD mask, i.e. `packed_simd::m32x16` or similar. that indicates if the
+/// corresponding array slots represented by the mask are 'valid'.  
+///
+/// Lanes of the SIMD mask can be set to 'valid' (`true`) if the corresponding array slot is not
+/// `NULL`, as indicated by it's `Bitmap`, and is within the length of the array.  Lanes outside the
+/// length represent padding and are set to 'invalid' (`false`).
+unsafe fn is_valid<T>(
+    bitmap: &Option<Bitmap>,
+    i: usize,
+    simd_width: usize,
+    array_len: usize,
+) -> T::SimdMask
+where
+    T: ArrowNumericType,
+{
+    let simd_upper_bound = i + simd_width;
+    let mut validity = T::mask_init(true);
+
+    // Validity based on `Bitmap`
+    if let &Some(b) = &bitmap {
+        for j in i..min(array_len, simd_upper_bound) {
+            if !b.is_set(j) {
+                validity = T::mask_set(validity, j - i, false);
+            }
+        }
+    }
+
+    // Validity based on the length of the Array
+    for j in array_len..simd_upper_bound {
+        validity = T::mask_set(validity, j - i, false);
+    }
+
+    validity
+}
+
+/// Performs a SIMD load but sets all 'invalid' lanes to a constant value.
+///
+/// 'invalid' lanes are lanes where the corresponding array slots are either `NULL` or between the
+/// length and capacity of the array, i.e. in the padded region.
+///
+/// Note that `array` below has it's own `Bitmap` separate from the `bitmap` argument.  This
+/// function is used to prepare `array`'s for binary operations.  The `bitmap` argument is the
+/// `Bitmap` after the binary operation.
+pub(super) unsafe fn simd_load_set_invalid<T>(
+    array: &PrimitiveArray<T>,
+    bitmap: &Option<Bitmap>,
+    i: usize,
+    simd_width: usize,
+    fill_value: T::Native,
+) -> T::Simd
+where
+    T: ArrowNumericType,
+    T::Native: One,
+{
+    let simd_with_zeros = T::load(array.value_slice(i, simd_width));
+    T::mask_select(
+        is_valid::<T>(bitmap, i, simd_width, array.len()),
+        simd_with_zeros,
+        T::init(fill_value),
+    )
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -107,7 +172,7 @@ mod tests {
             apply_bin_op_to_option_bitmap(
                 &Some(Bitmap::from(Buffer::from([0b01101010]))),
                 &None,
-                |a, b| a & b
+                |a, b| a & b,
             )
         );
         assert_eq!(
@@ -115,7 +180,7 @@ mod tests {
             apply_bin_op_to_option_bitmap(
                 &None,
                 &Some(Bitmap::from(Buffer::from([0b01001110]))),
-                |a, b| a & b
+                |a, b| a & b,
             )
         );
         assert_eq!(
@@ -123,7 +188,7 @@ mod tests {
             apply_bin_op_to_option_bitmap(
                 &Some(Bitmap::from(Buffer::from([0b01101010]))),
                 &Some(Bitmap::from(Buffer::from([0b01001110]))),
-                |a, b| a & b
+                |a, b| a & b,
             )
         );
     }
@@ -154,4 +219,50 @@ mod tests {
         .data();
         assert_eq!(data, indexed.data());
     }
+
+    #[test]
+    fn test_is_valid() {
+        let a = Int32Array::from(vec![
+            Some(15),
+            None,
+            None,
+            Some(1),
+            None,
+            None,
+            Some(5),
+            None,
+            None,
+            Some(4),
+        ]);
+        let simd_lanes = 16;
+        let data = a.data();
+        let bitmap = data.null_bitmap();
+        let result = unsafe { is_valid::<Int32Type>(&bitmap, 0, simd_lanes, a.len()) };
+        for i in 0..simd_lanes {
+            if i % 3 != 0 || i > 9 {
+                assert_eq!(false, result.extract(i));
+            } else {
+                assert_eq!(true, result.extract(i));
+            }
+        }
+    }
+
+    #[test]
+    fn test_simd_load_set_invalid() {
+        let a = Int64Array::from(vec![None, Some(15), Some(5), Some(0)]);
+        let new_bitmap = &Some(Bitmap::from(Buffer::from([0b00001010])));
+        let simd_lanes = 8;
+        let result = unsafe {
+            simd_load_set_invalid::<Int64Type>(&a, &new_bitmap, 0, simd_lanes, 1)
+        };
+        for i in 0..simd_lanes {
+            if i == 1 {
+                assert_eq!(15_i64, result.extract(i));
+            } else if i == 3 {
+                assert_eq!(0_i64, result.extract(i));
+            } else {
+                assert_eq!(1_i64, result.extract(i));
+            }
+        }
+    }
 }
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index f167e4e..822b953 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -320,12 +320,27 @@ where
     /// The number of SIMD lanes available
     fn lanes() -> usize;
 
+    /// Initializes a SIMD register to a constant value
+    fn init(value: Self::Native) -> Self::Simd;
+
     /// Loads a slice into a SIMD register
     fn load(slice: &[Self::Native]) -> Self::Simd;
 
+    /// Creates a new SIMD mask for this SIMD type filling it with `value`
+    fn mask_init(value: bool) -> Self::SimdMask;
+
     /// Gets the value of a single lane in a SIMD mask
     fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool;
 
+    /// Sets the value of a single lane of a SIMD mask
+    fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask;
+
+    /// Selects elements of `a` and `b` using `mask`
+    fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd;
+
+    /// Returns `true` if any of the lanes in the mask are `true`
+    fn mask_any(mask: Self::SimdMask) -> bool;
+
     /// Performs a SIMD binary operation
     fn bin_op<F: Fn(Self::Simd, Self::Simd) -> Self::Simd>(
         left: Self::Simd,
@@ -370,14 +385,39 @@ macro_rules! make_numeric_type {
                 Self::Simd::lanes()
             }
 
+            fn init(value: Self::Native) -> Self::Simd {
+                Self::Simd::splat(value)
+            }
+
             fn load(slice: &[Self::Native]) -> Self::Simd {
                 unsafe { Self::Simd::from_slice_unaligned_unchecked(slice) }
             }
 
+            fn mask_init(value: bool) -> Self::SimdMask {
+                Self::SimdMask::splat(value)
+            }
+
             fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool {
                 unsafe { mask.extract_unchecked(idx) }
             }
 
+            fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask {
+                unsafe { mask.replace_unchecked(idx, value) }
+            }
+
+            /// Selects elements of `a` and `b` using `mask`
+            fn mask_select(
+                mask: Self::SimdMask,
+                a: Self::Simd,
+                b: Self::Simd,
+            ) -> Self::Simd {
+                mask.select(a, b)
+            }
+
+            fn mask_any(mask: Self::SimdMask) -> bool {
+                mask.any()
+            }
+
             fn bin_op<F: Fn(Self::Simd, Self::Simd) -> Self::Simd>(
                 left: Self::Simd,
                 right: Self::Simd,