You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/05/19 20:40:38 UTC

[arrow] branch master updated: ARROW-8831: [Rust] change simd_compare_op in comparison kernel to use bitmask SIMD operation to significantly improve performance

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 897be76  ARROW-8831: [Rust] change simd_compare_op in comparison kernel to use bitmask SIMD operation to significantly improve performance
897be76 is described below

commit 897be76a19fc9992731aa0db0d9af9376a090e27
Author: Yordan Pavlov <yo...@outlook.com>
AuthorDate: Tue May 19 22:39:43 2020 +0200

    ARROW-8831: [Rust] change simd_compare_op in comparison kernel to use bitmask SIMD operation to significantly improve performance
    
    This pull request completes the SIMD implementation of the comparison kernel in simd_compare_op by using a bitmask SIMD operation instead of a for loop to copy the results of comparison.
    
    Previously simd_compare_op was only about 10% faster compared to the non-SIMD implementation and was taking approximately the same time for types of different lengths (which indicates that the SIMD implementation was not complete). Below are results from benchmarks of the old implementation with Int8 and Float32 types:
    
    eq Int8                 time:   [947.53 us 947.81 us 948.05 us]
    eq Int8 simd            time:   [855.02 us 858.26 us 862.48 us]
    neq Int8                time:   [904.09 us 907.34 us 911.44 us]
    neq Int8 simd           time:   [848.49 us 849.28 us 850.28 us]
    lt Int8                 time:   [900.87 us 902.65 us 904.86 us]
    lt Int8 simd            time:   [850.32 us 850.96 us 851.90 us]
    lt_eq Int8              time:   [974.68 us 983.03 us 991.98 us]
    lt_eq Int8 simd         time:   [851.83 us 852.22 us 852.74 us]
    gt Int8                 time:   [908.48 us 911.76 us 914.72 us]
    gt Int8 simd            time:   [851.93 us 852.43 us 853.04 us]
    gt_eq Int8              time:   [981.53 us 983.37 us 986.31 us]
    gt_eq Int8 simd         time:   [855.59 us 856.83 us 858.61 us]
    
    eq Float32              time:   [911.46 us 911.70 us 912.01 us]
    eq Float32 simd         time:   [884.74 us 885.97 us 887.74 us]
    neq Float32             time:   [904.26 us 904.73 us 905.27 us]
    neq Float32 simd        time:   [884.40 us 892.32 us 901.98 us]
    lt Float32              time:   [907.90 us 908.54 us 909.34 us]
    lt Float32 simd         time:   [883.23 us 886.05 us 889.31 us]
    lt_eq Float32           time:   [911.44 us 911.62 us 911.82 us]
    lt_eq Float32 simd      time:   [882.78 us 886.78 us 891.05 us]
    gt Float32              time:   [906.88 us 907.96 us 909.32 us]
    gt Float32 simd         time:   [879.78 us 883.03 us 886.63 us]
    gt_eq Float32           time:   [924.72 us 926.03 us 928.29 us]
    gt_eq Float32 simd      time:   [884.80 us 885.93 us 887.35 us]
    
    In the benchmark results above, notice how both the SIMD and non-SIMD operations take a similar amount of time for types of different sizes (Int8 and Float32). This is normal for a non-SIMD implementation but is not normal for a SIMD implementation, as SIMD instructions can process more values at once when the values are of a smaller size.
    
    After the change proposed in this pull request, performance is about 10 times better for Float32 and about 40 times better for Int8. The results below indicate the SIMD implementation is now complete, as operations take approximately 4 times less time for a 4 times smaller type. Here are the benchmark results:
    
    eq Int8                 time:   [949.64 us 951.38 us 953.43 us]
    eq Int8 simd            time:   [20.569 us 20.576 us 20.583 us]
    neq Int8                time:   [903.97 us 908.96 us 914.34 us]
    neq Int8 simd           time:   [20.675 us 20.741 us 20.840 us]
    lt Int8                 time:   [894.93 us 895.99 us 897.55 us]
    lt Int8 simd            time:   [21.564 us 22.105 us 22.802 us]
    lt_eq Int8              time:   [950.65 us 952.05 us 954.11 us]
    lt_eq Int8 simd         time:   [21.072 us 21.220 us 21.399 us]
    gt Int8                 time:   [894.76 us 895.47 us 896.61 us]
    gt Int8 simd            time:   [21.680 us 22.546 us 23.465 us]
    gt_eq Int8              time:   [948.65 us 949.53 us 950.71 us]
    gt_eq Int8 simd         time:   [21.245 us 21.329 us 21.473 us]
    
    eq Float32              time:   [920.17 us 923.43 us 927.43 us]
    eq Float32 simd         time:   [79.024 us 79.179 us 79.372 us]
    neq Float32             time:   [910.46 us 912.84 us 915.68 us]
    neq Float32 simd        time:   [78.683 us 78.992 us 79.452 us]
    lt Float32              time:   [905.08 us 907.03 us 909.38 us]
    lt Float32 simd         time:   [79.296 us 79.545 us 79.873 us]
    lt_eq Float32           time:   [920.25 us 923.87 us 928.94 us]
    lt_eq Float32 simd      time:   [80.974 us 81.584 us 82.365 us]
    gt Float32              time:   [911.43 us 916.43 us 923.39 us]
    gt Float32 simd         time:   [80.079 us 80.336 us 80.692 us]
    gt_eq Float32           time:   [923.22 us 923.76 us 924.37 us]
    gt_eq Float32 simd      time:   [81.175 us 81.402 us 81.683 us]
    
    @paddyhoran let me know what you think
    
    Closes #7204 from yordan-pavlov/simd_compare_op_bitmask
    
    Authored-by: Yordan Pavlov <yo...@outlook.com>
    Signed-off-by: Neville Dipale <ne...@gmail.com>
---
 rust/arrow/src/compute/kernels/comparison.rs | 21 +++++++++++++--------
 rust/arrow/src/datatypes.rs                  | 12 ++++++++++++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs
index 390c276..9d184c0 100644
--- a/rust/arrow/src/compute/kernels/comparison.rs
+++ b/rust/arrow/src/compute/kernels/comparison.rs
@@ -210,6 +210,10 @@ where
     T: ArrowNumericType,
     F: Fn(T::Simd, T::Simd) -> T::SimdMask,
 {
+    use crate::buffer::MutableBuffer;
+    use std::io::Write;
+    use std::mem;
+
     let len = left.len();
     if len != right.len() {
         return Err(ArrowError::ComputeError(
@@ -225,7 +229,7 @@ where
     )?;
 
     let lanes = T::lanes();
-    let mut result = BooleanBufferBuilder::new(len);
+    let mut result = MutableBuffer::new(left.len() * mem::size_of::<bool>());
 
     let rem = len % lanes;
 
@@ -233,18 +237,19 @@ where
         let simd_left = T::load(left.value_slice(i, lanes));
         let simd_right = T::load(right.value_slice(i, lanes));
         let simd_result = op(simd_left, simd_right);
-        for i in 0..lanes {
-            result.append(T::mask_get(&simd_result, i))?;
-        }
+        T::bitmask(&simd_result, |b| {
+            result.write(b).unwrap();
+        });
     }
 
     if rem > 0 {
         let simd_left = T::load(left.value_slice(len - rem, lanes));
         let simd_right = T::load(right.value_slice(len - rem, lanes));
         let simd_result = op(simd_left, simd_right);
-        for i in 0..rem {
-            result.append(T::mask_get(&simd_result, i))?;
-        }
+        let rem_buffer_size = (rem as f32 / 8f32).ceil() as usize;
+        T::bitmask(&simd_result, |b| {
+            result.write(&b[0..rem_buffer_size]).unwrap();
+        });
     }
 
     let data = ArrayData::new(
@@ -253,7 +258,7 @@ where
         None,
         null_bit_buffer,
         left.offset(),
-        vec![result.finish()],
+        vec![result.freeze()],
         vec![],
     );
     Ok(PrimitiveArray::<BooleanType>::from(Arc::new(data)))
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index adb8048..138d551 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -522,6 +522,11 @@ where
     /// Gets the value of a single lane in a SIMD mask
     fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool;
 
+    /// Gets the bitmask for a SimdMask as a byte slice and passes it to the closure used as the action parameter
+    fn bitmask<T>(mask: &Self::SimdMask, action: T)
+    where
+        T: FnMut(&[u8]);
+
     /// Sets the value of a single lane of a SIMD mask
     fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask;
 
@@ -594,6 +599,13 @@ macro_rules! make_numeric_type {
                 unsafe { mask.extract_unchecked(idx) }
             }
 
+            fn bitmask<T>(mask: &Self::SimdMask, mut action: T)
+            where
+                T: FnMut(&[u8]),
+            {
+                action(mask.bitmask().to_byte_slice());
+            }
+
             fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask {
                 unsafe { mask.replace_unchecked(idx, value) }
             }