You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2020/04/06 18:19:05 UTC

[kudu] 01/02: rowblock: use BMI instruction set when available for GetSelectedRows

This is an automated email from the ASF dual-hosted git repository.

todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit e524334ff04253f38eda7ab3bc589ce76c0934ae
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Wed Apr 1 15:11:58 2020 -0700

    rowblock: use BMI instruction set when available for GetSelectedRows
    
    This enables a BMI variant of SelectionVector::GetSelectedRows which has
    a higher throughput. I disassembled the resulting hot loop as follows:
    
    BMI:
      L:
        tzcnt  %rsi,%rbx
        or     %r11d,%ebx
        mov    %bx,(%rdx)
        blsr   %rsi,%rsi
        tzcnt  %rsi,%rbx
        or     %r11d,%ebx
        mov    %bx,0x2(%rdx)
        blsr   %rsi,%rsi
        tzcnt  %rsi,%rbx
        or     %r11d,%ebx
        mov    %bx,0x4(%rdx)
        add    $0x6,%rdx
        blsr   %rsi,%rsi
        add    $0xfffffffd,%ecx
        jne    L
    
    non-BMI:
      L:
        bsf    %rsi,%rax
        or     %r12d,%eax
        mov    %ax,(%rdx)
        lea    -0x1(%rsi),%rax
        and    %rsi,%rax
        bsf    %rax,%rsi
        or     %r12d,%esi
        mov    %si,0x2(%rdx)
        lea    -0x1(%rax),%rbx
        and    %rax,%rbx
        bsf    %rbx,%rax
        or     %r12d,%eax
        mov    %ax,0x4(%rdx)
        add    $0x6,%rdx
        lea    -0x1(%rbx),%rsi
        and    %rbx,%rsi
        add    $0xfffffffd,%ecx
        jne    L
    
    ... and then used llvm-mca on these assembly files across a few common
    architectures to see how many cycles were required for 100 iterations of
    the loop. Results are as follows:
    
    haswell non-bmi.s: Total Cycles:      606
    haswell bmi.s: Total Cycles:          382
    
    broadwell non-bmi.s: Total Cycles     606
    broadwell bmi.s: Total Cycles:        382
    
    skylake non-bmi.s: Total Cycles:      606
    skylake bmi.s: Total Cycles:          307
    
    So, on the most recent chips, this should be about a 2x improvement in
    this function. This function made up a few percent of overall CPU
    consumption in some TSBS workloads, so this patch had some small but
    measurable improvement on end-to-end throughput.
    
    Change-Id: I8ec74bc5db07c18d0e36de14a2343f49fc5c2859
    Reviewed-on: http://gerrit.cloudera.org:8080/15635
    Tested-by: Kudu Jenkins
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
 src/kudu/common/rowblock.cc | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/kudu/common/rowblock.cc b/src/kudu/common/rowblock.cc
index 33fb220..85bd4ca 100644
--- a/src/kudu/common/rowblock.cc
+++ b/src/kudu/common/rowblock.cc
@@ -23,6 +23,7 @@
 #include <glog/logging.h>
 
 #include "kudu/gutil/bits.h"
+#include "kudu/gutil/cpu.h"
 #include "kudu/gutil/port.h"
 #include "kudu/util/bitmap.h"
 
@@ -76,8 +77,7 @@ void SelectionVector::ClearToSelectAtMost(size_t max_rows) {
 }
 
 
-// TODO(todd) this is a bit faster when implemented with target "bmi" enabled.
-// Consider duplicating it and doing runtime switching.
+template<bool BMI>
 static void GetSelectedRowsInternal(const uint8_t* __restrict__ bitmap,
                                     int n_bytes,
                                     uint16_t* __restrict__ dst) {
@@ -87,6 +87,18 @@ static void GetSelectedRowsInternal(const uint8_t* __restrict__ bitmap,
                 });
 }
 
+static const bool kHasBmi = base::CPU().has_bmi();
+
+#ifdef __x86_64__
+// Explicit instantiation with the BMI instruction set enabled, which
+// makes this slightly faster.
+template
+__attribute__((target("bmi")))
+void GetSelectedRowsInternal<true>(const uint8_t* __restrict__ bitmap,
+                                   int n_bytes,
+                                   uint16_t* __restrict__ dst);
+#endif
+
 SelectedRows SelectionVector::GetSelectedRows() const {
   CHECK_LE(n_rows_, std::numeric_limits<uint16_t>::max());
   int n_selected = CountSelected();
@@ -97,7 +109,11 @@ SelectedRows SelectionVector::GetSelectedRows() const {
   vector<uint16_t> selected;
   if (n_selected > 0) {
     selected.resize(n_selected);
-    GetSelectedRowsInternal(&bitmap_[0], n_bytes_, selected.data());
+    if (kHasBmi) {
+      GetSelectedRowsInternal<true>(&bitmap_[0], n_bytes_, selected.data());
+    } else {
+      GetSelectedRowsInternal<false>(&bitmap_[0], n_bytes_, selected.data());
+    }
   }
   return SelectedRows(this, std::move(selected));
 }