You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2020/03/25 03:28:24 UTC

[kudu] 01/02: Use popcnt instruction for Bits::Count

This is an automated email from the ASF dual-hosted git repository.

todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 9945954a3f329a808d39bddad4c816de064f8f75
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Tue Mar 24 13:10:18 2020 -0700

    Use popcnt instruction for Bits::Count
    
    I found Bits::Count to be a noticeable contributor to profiles in my
    TSDB benchmarking (a couple percent, if I recall correctly). This
    switches to using popcnt instead of a lookup table.
    
    This is technically a new CPU requirement, since the popcnt flag is
    separate from the SSE4.2 flag. In practice, however, it doesn't appear
    that there are any SSE4.2-capable machines that aren't also capable of
    POPCNT. So, this shouldn't actually change our hardware requirements.
    Another bit of evidence here is that '-msse4.2' (which we use) in clang
    also enables -mpopcnt by default, so it was already possible for clang
    to emit popcnt instructions for its own optimizations.
    
    In any case, this instruction was introduced about 10 years ago, so even
    if I missed one case of a 2010-era server, it's unlikely to still be in
    use for Kudu.
    
    Change-Id: Iad045e8b77e7baf65c42366eea3e107900eb4a64
    Reviewed-on: http://gerrit.cloudera.org:8080/15549
    Reviewed-by: Bankim Bhavsar <ba...@cloudera.com>
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Tested-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/gutil/bits.cc | 21 +++++++++++++++++++--
 src/kudu/gutil/cpu.cc  |  2 ++
 src/kudu/gutil/cpu.h   |  2 ++
 src/kudu/util/init.cc  |  7 +++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/kudu/gutil/bits.cc b/src/kudu/gutil/bits.cc
index 333e464..437df3f 100644
--- a/src/kudu/gutil/bits.cc
+++ b/src/kudu/gutil/bits.cc
@@ -5,6 +5,9 @@
 #include "kudu/gutil/bits.h"
 
 #include <assert.h>
+#include <stdint.h>
+
+#include "kudu/gutil/port.h"
 
 // this array gives the number of bits for any number from 0 to 255
 // (We could make these ints.  The tradeoff is size (eg does it overwhelm
@@ -29,9 +32,23 @@ const char Bits::num_bits[] = {
 
 int Bits::Count(const void *m, int num_bytes) {
   int nbits = 0;
-  const uint8 *s = (const uint8 *) m;
-  for (int i = 0; i < num_bytes; i++)
+  const uint8 *s = static_cast<const uint8*>(m);
+#ifdef __x86_64__
+  // POPCNT is assumed (Kudu checks for it at startup): 8 bytes at a time, then the tail.
+  while (num_bytes >= 8) {
+    nbits += __builtin_popcountll(UnalignedLoad<uint64_t>(s));
+    s += 8;
+    num_bytes -= 8;
+  }
+  while (num_bytes-- > 0) {  // '> 0' so a negative count yields 0, as on the table path
+    nbits += __builtin_popcount(*s++);
+  }
+#else
+  // Use the lookup table on everything other than x86_64.
+  for (int i = 0; i < num_bytes; i++) {
     nbits += num_bits[*s++];
+  }
+#endif
   return nbits;
 }
 
diff --git a/src/kudu/gutil/cpu.cc b/src/kudu/gutil/cpu.cc
index b3ea105..e108304 100644
--- a/src/kudu/gutil/cpu.cc
+++ b/src/kudu/gutil/cpu.cc
@@ -34,6 +34,7 @@ CPU::CPU()
     has_ssse3_(false),
     has_sse41_(false),
     has_sse42_(false),
+    has_popcnt_(false),
     has_avx_(false),
     has_avx2_(false),
     has_aesni_(false),
@@ -226,6 +227,7 @@ void CPU::Initialize() {
     has_ssse3_ = (cpu_info[2] & 0x00000200) != 0;
     has_sse41_ = (cpu_info[2] & 0x00080000) != 0;
     has_sse42_ = (cpu_info[2] & 0x00100000) != 0;
+    has_popcnt_ = (cpu_info[2] & 0x00800000) != 0;
     // AVX instructions will generate an illegal instruction exception unless
     //   a) they are supported by the CPU,
     //   b) XSAVE is supported by the CPU and
diff --git a/src/kudu/gutil/cpu.h b/src/kudu/gutil/cpu.h
index b3cf2e5..6462642 100644
--- a/src/kudu/gutil/cpu.h
+++ b/src/kudu/gutil/cpu.h
@@ -45,6 +45,7 @@ class CPU {
   bool has_ssse3() const { return has_ssse3_; }
   bool has_sse41() const { return has_sse41_; }
   bool has_sse42() const { return has_sse42_; }
+  bool has_popcnt() const { return has_popcnt_; }
   bool has_avx() const { return has_avx_; }
   bool has_avx2() const { return has_avx2_; }
   bool has_aesni() const { return has_aesni_; }
@@ -80,6 +81,7 @@ class CPU {
   bool has_ssse3_;
   bool has_sse41_;
   bool has_sse42_;
+  bool has_popcnt_;
   bool has_avx_;
   bool has_avx2_;
   bool has_aesni_;
diff --git a/src/kudu/util/init.cc b/src/kudu/util/init.cc
index d06ea21..5267730 100644
--- a/src/kudu/util/init.cc
+++ b/src/kudu/util/init.cc
@@ -74,6 +74,13 @@ Status CheckCPUFlags() {
     return BadCPUStatus(cpu, "SSSE3");
   }
 
+  // POPCNT should always be present on machines with SSE4.2 support, but just in case
+  // there's some sort of weird missing support in virtualized environments, we'll check
+  // it explicitly.
+  if (!cpu.has_popcnt()) {
+    return BadCPUStatus(cpu, "POPCNT");
+  }
+
   return Status::OK();
 }