You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2020/04/06 18:19:05 UTC
[kudu] 01/02: rowblock: use BMI instruction set when available for
GetSelectedRows
This is an automated email from the ASF dual-hosted git repository.
todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit e524334ff04253f38eda7ab3bc589ce76c0934ae
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Wed Apr 1 15:11:58 2020 -0700
rowblock: use BMI instruction set when available for GetSelectedRows
This enables a BMI variant of SelectionVector::GetSelectedRows which has
a higher throughput. I disassembled the resulting hot loop as follows:
BMI:
L:
tzcnt %rsi,%rbx
or %r11d,%ebx
mov %bx,(%rdx)
blsr %rsi,%rsi
tzcnt %rsi,%rbx
or %r11d,%ebx
mov %bx,0x2(%rdx)
blsr %rsi,%rsi
tzcnt %rsi,%rbx
or %r11d,%ebx
mov %bx,0x4(%rdx)
add $0x6,%rdx
blsr %rsi,%rsi
add $0xfffffffd,%ecx
jne L
non-BMI:
L:
bsf %rsi,%rax
or %r12d,%eax
mov %ax,(%rdx)
lea -0x1(%rsi),%rax
and %rsi,%rax
bsf %rax,%rsi
or %r12d,%esi
mov %si,0x2(%rdx)
lea -0x1(%rax),%rbx
and %rax,%rbx
bsf %rbx,%rax
or %r12d,%eax
mov %ax,0x4(%rdx)
add $0x6,%rdx
lea -0x1(%rbx),%rsi
and %rbx,%rsi
add $0xfffffffd,%ecx
jne L
... and then used llvm-mca on these assembly files across a few common
architectures to see how many cycles were required for 100 iterations of
the loop. Results are as follows:
haswell non-bmi.s: Total Cycles: 606
haswell bmi.s: Total Cycles: 382
broadwell non-bmi.s: Total Cycles 606
broadwell bmi.s: Total Cycles: 382
skylake non-bmi.s: Total Cycles: 606
skylake bmi.s: Total Cycles: 307
So, on the most recent chips, this should be about a 2x improvement in
this function. This function made up a few percent of overall CPU
consumption in some TSBS workloads, so this patch had some small but
measurable improvement on end-to-end throughput.
Change-Id: I8ec74bc5db07c18d0e36de14a2343f49fc5c2859
Reviewed-on: http://gerrit.cloudera.org:8080/15635
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
src/kudu/common/rowblock.cc | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/src/kudu/common/rowblock.cc b/src/kudu/common/rowblock.cc
index 33fb220..85bd4ca 100644
--- a/src/kudu/common/rowblock.cc
+++ b/src/kudu/common/rowblock.cc
@@ -23,6 +23,7 @@
#include <glog/logging.h>
#include "kudu/gutil/bits.h"
+#include "kudu/gutil/cpu.h"
#include "kudu/gutil/port.h"
#include "kudu/util/bitmap.h"
@@ -76,8 +77,7 @@ void SelectionVector::ClearToSelectAtMost(size_t max_rows) {
}
-// TODO(todd) this is a bit faster when implemented with target "bmi" enabled.
-// Consider duplicating it and doing runtime switching.
+template<bool BMI>
static void GetSelectedRowsInternal(const uint8_t* __restrict__ bitmap,
int n_bytes,
uint16_t* __restrict__ dst) {
@@ -87,6 +87,18 @@ static void GetSelectedRowsInternal(const uint8_t* __restrict__ bitmap,
});
}
+static const bool kHasBmi = base::CPU().has_bmi();
+
+#ifdef __x86_64__
+// Explicit instantiation with the BMI instruction set enabled, which
+// makes this slightly faster.
+template
+__attribute__((target("bmi")))
+void GetSelectedRowsInternal<true>(const uint8_t* __restrict__ bitmap,
+ int n_bytes,
+ uint16_t* __restrict__ dst);
+#endif
+
SelectedRows SelectionVector::GetSelectedRows() const {
CHECK_LE(n_rows_, std::numeric_limits<uint16_t>::max());
int n_selected = CountSelected();
@@ -97,7 +109,11 @@ SelectedRows SelectionVector::GetSelectedRows() const {
vector<uint16_t> selected;
if (n_selected > 0) {
selected.resize(n_selected);
- GetSelectedRowsInternal(&bitmap_[0], n_bytes_, selected.data());
+ if (kHasBmi) {
+ GetSelectedRowsInternal<true>(&bitmap_[0], n_bytes_, selected.data());
+ } else {
+ GetSelectedRowsInternal<false>(&bitmap_[0], n_bytes_, selected.data());
+ }
}
return SelectedRows(this, std::move(selected));
}