You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/04/12 23:19:15 UTC
[41/50] incubator-impala git commit: Use more efficient gutil implementation of Log2Ceiling

Use more efficient gutil implementation of Log2Ceiling

This was used in FreePool::Allocate, which is called a lot in some
workloads, so may have some perf impact. I was looking at using this
function for an allocator for the new buffer pool and it makes the
allocator code simpler if I can call this function freely without
perf concerns.

Modified gutil's implementation slightly so that gcc optimises it to
the following branch-free code:

.LHOTB8:
        .align 2
        .p2align 4,,15
        .globl  _ZN4Bits20Log2CeilingNonZero64Ey
        .type   _ZN4Bits20Log2CeilingNonZero64Ey, @function
_ZN4Bits20Log2CeilingNonZero64Ey:
.LFB1797:
        .loc 1 90 0
        .cfi_startproc
.LVL40:
.LBB33:
        .loc 1 92 0
        leaq    -1(%rdi), %rdx  #, D.40305
.LBB34:
.LBB35:
        .loc 2 155 0
        bsrq    %rdi, %rax      # n, tmp89
.LVL41:
.LBE35:
.LBE34:
        .loc 1 92 0
        andq    %rdx, %rdi      # D.40305, D.40305
.LVL42:
        .loc 1 95 0
        cmpq    $1, %rdi        #, D.40305
        sbbl    $-1, %eax       #, D.40304
.LVL43:
.LBE33:
        .loc 1 96 0
        ret
        .cfi_endproc

Change-Id: Ifbffb49b5c86394e5a9ccd711132543856321be0
Reviewed-on: http://gerrit.cloudera.org:8080/2735
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/92fafa1c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/92fafa1c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/92fafa1c

Branch: refs/heads/master
Commit: 92fafa1c0687141481e56ea6f09a67987d571d6e
Parents: 344fbc0
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Fri Apr 8 10:23:03 2016 -0700
Committer: Tim Armstrong <ta...@cloudera.com>
Committed: Tue Apr 12 14:03:44 2016 -0700

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc |  3 ++-
 be/src/gutil/bits.cc                | 16 ---------------
 be/src/gutil/bits.h                 | 34 ++++++++++++++++++++++++++++++++
 be/src/runtime/disk-io-mgr.cc       | 12 ++++++-----
 be/src/runtime/free-pool.h          |  4 ++--
 be/src/runtime/runtime-filter.cc    |  6 +++---
 be/src/util/bit-util-test.cc        | 22 ++++++++++++---------
 be/src/util/bit-util.h              | 16 ---------------
 be/src/util/bloom-filter.h          |  4 ++--
 be/src/util/dict-encoding.h         |  3 ++-
 10 files changed, 65 insertions(+), 55 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 9e18c66..ead3628 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -27,6 +27,7 @@
 #include "exec/scanner-context.inline.h"
 #include "exec/read-write-util.h"
 #include "exprs/expr.h"
+#include "gutil/bits.h"
 #include "runtime/collection-value-builder.h"
 #include "runtime/descriptors.h"
 #include "runtime/runtime-state.h"
@@ -1231,7 +1232,7 @@ Status HdfsParquetScanner::LevelDecoder::Init(const string& filename,
       if (num_bytes < 0) {
         return Status(TErrorCode::PARQUET_CORRUPT_VALUE, filename, num_bytes);
       }
-      int bit_width = BitUtil::Log2(max_level + 1);
+      int bit_width = Bits::Log2Ceiling64(max_level + 1);
       Reset(*data, num_bytes, bit_width);
       break;
     }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/gutil/bits.cc
----------------------------------------------------------------------
diff --git a/be/src/gutil/bits.cc b/be/src/gutil/bits.cc
index bc4e451..dec6f51 100644
--- a/be/src/gutil/bits.cc
+++ b/be/src/gutil/bits.cc
@@ -71,22 +71,6 @@ int Bits::Log2Floor_Portable(uint32 n) {
   return log;
 }
 
-int Bits::Log2Ceiling(uint32 n) {
-  int floor = Log2Floor(n);
-  if (n == (n &~ (n - 1)))              // zero or a power of two
-    return floor;
-  else
-    return floor + 1;
-}
-
-int Bits::Log2Ceiling64(uint64 n) {
-  int floor = Log2Floor64(n);
-  if (n == (n &~ (n - 1)))              // zero or a power of two
-    return floor;
-  else
-    return floor + 1;
-}
-
 int Bits::FindLSBSetNonZero_Portable(uint32 n) {
   int rc = 31;
   for (int i = 4, shift = 1 << 4; i >= 0; --i) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/gutil/bits.h
----------------------------------------------------------------------
diff --git a/be/src/gutil/bits.h b/be/src/gutil/bits.h
index bacd655..579d954 100644
--- a/be/src/gutil/bits.h
+++ b/be/src/gutil/bits.h
@@ -79,6 +79,10 @@ class Bits {
   static int Log2Ceiling(uint32 n);
   static int Log2Ceiling64(uint64 n);
 
+  // Potentially faster version of Log2Ceiling() that returns an
+  // undefined value if n == 0
+  static int Log2CeilingNonZero64(uint64 n);
+
   // Return the first set least / most significant bit, 0-indexed.  Returns an
   // undefined value if n == 0.  FindLSBSetNonZero() is similar to ffs() except
   // that it's 0-indexed, while FindMSBSetNonZero() is the same as
@@ -139,6 +143,36 @@ inline int Bits::Log2FloorNonZero(uint32 n) {
   return 31 ^ __builtin_clz(n);
 }
 
+inline int Bits::Log2Ceiling(uint32 n) {
+  int floor = Log2Floor(n);
+  // Check if zero or a power of two. This pattern is recognised by gcc and optimised
+  // into branch-free code.
+  if (0 == (n & (n - 1)))
+    return floor;
+  else
+    return floor + 1;
+}
+
+inline int Bits::Log2Ceiling64(uint64 n) {
+  int floor = Log2Floor64(n);
+  // Check if zero or a power of two. This pattern is recognised by gcc and optimised
+  // into branch-free code.
+  if (0 == (n & (n - 1)))
+    return floor;
+  else
+    return floor + 1;
+}
+
+inline int Bits::Log2CeilingNonZero64(uint64 n) {
+  int floor = Log2FloorNonZero64(n);
+  // Check if zero or a power of two. This pattern is recognised by gcc and optimised
+  // into branch-free code.
+  if (0 == (n & (n - 1)))
+    return floor;
+  else
+    return floor + 1;
+}
+
 inline int Bits::FindLSBSetNonZero(uint32 n) {
   return __builtin_ctz(n);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/runtime/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr.cc b/be/src/runtime/disk-io-mgr.cc
index 80675d7..4788739 100644
--- a/be/src/runtime/disk-io-mgr.cc
+++ b/be/src/runtime/disk-io-mgr.cc
@@ -14,11 +14,13 @@
 
 #include "runtime/disk-io-mgr.h"
 #include "runtime/disk-io-mgr-internal.h"
-#include "util/hdfs-util.h"
 
-#include <gutil/strings/substitute.h>
 #include <boost/algorithm/string.hpp>
 
+#include "gutil/bits.h"
+#include "gutil/strings/substitute.h"
+#include "util/hdfs-util.h"
+
 DECLARE_bool(disable_mem_pools);
 
 #include "common/names.h"
@@ -270,7 +272,7 @@ DiskIoMgr::DiskIoMgr() :
         FileSystemUtil::MaxNumFileHandles()),
         &HdfsCachedFileHandle::Release) {
   int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
-  free_buffers_.resize(BitUtil::Log2(max_buffer_size_scaled) + 1);
+  free_buffers_.resize(Bits::Log2Ceiling64(max_buffer_size_scaled) + 1);
   int num_local_disks = FLAGS_num_disks == 0 ? DiskInfo::num_disks() : FLAGS_num_disks;
   disk_queues_.resize(num_local_disks + REMOTE_NUM_DISKS);
   CheckSseSupport();
@@ -288,7 +290,7 @@ DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_disk, int min_buffer_s
     file_handle_cache_(min(FLAGS_max_cached_file_handles,
             FileSystemUtil::MaxNumFileHandles()), &HdfsCachedFileHandle::Release) {
   int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
-  free_buffers_.resize(BitUtil::Log2(max_buffer_size_scaled) + 1);
+  free_buffers_.resize(Bits::Log2Ceiling64(max_buffer_size_scaled) + 1);
   if (num_local_disks == 0) num_local_disks = DiskInfo::num_disks();
   disk_queues_.resize(num_local_disks + REMOTE_NUM_DISKS);
   CheckSseSupport();
@@ -1129,7 +1131,7 @@ Status DiskIoMgr::WriteRangeHelper(FILE* file_handle, WriteRange* write_range) {
 
 int DiskIoMgr::free_buffers_idx(int64_t buffer_size) {
   int64_t buffer_size_scaled = BitUtil::Ceil(buffer_size, min_buffer_size_);
-  int idx = BitUtil::Log2(buffer_size_scaled);
+  int idx = Bits::Log2Ceiling64(buffer_size_scaled);
   DCHECK_GE(idx, 0);
   DCHECK_LT(idx, free_buffers_.size());
   return idx;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/runtime/free-pool.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/free-pool.h b/be/src/runtime/free-pool.h
index 2985ecb..dfabaf0 100644
--- a/be/src/runtime/free-pool.h
+++ b/be/src/runtime/free-pool.h
@@ -20,6 +20,7 @@
 #include <string.h>
 #include <string>
 #include "common/logging.h"
+#include "gutil/bits.h"
 #include "runtime/mem-pool.h"
 #include "util/bit-util.h"
 
@@ -66,8 +67,7 @@ class FreePool {
     /// This is the typical malloc behavior. NULL is reserved for failures.
     if (size == 0) return reinterpret_cast<uint8_t*>(0x1);
 
-    /// Do ceil(log_2(size))
-    int free_list_idx = BitUtil::Log2(size);
+    int free_list_idx = Bits::Log2Ceiling64(size);
     DCHECK_LT(free_list_idx, NUM_LISTS);
 
     FreeListNode* allocation = lists_[free_list_idx].next;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/runtime/runtime-filter.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter.cc b/be/src/runtime/runtime-filter.cc
index 675e0f3..a68eb69 100644
--- a/be/src/runtime/runtime-filter.cc
+++ b/be/src/runtime/runtime-filter.cc
@@ -14,9 +14,9 @@
 
 #include "runtime/runtime-filter.inline.h"
 
-#include <gutil/strings/substitute.h>
-
 #include "common/names.h"
+#include "gutil/bits.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/client-cache.h"
 #include "runtime/exec-env.h"
 #include "service/impala-server.h"
@@ -43,7 +43,7 @@ RuntimeFilterBank::RuntimeFilterBank(const TQueryCtx& query_ctx, RuntimeState* s
   int32_t bloom_filter_size = query_ctx_.request.query_options.runtime_bloom_filter_size;
   bloom_filter_size = std::max(bloom_filter_size, MIN_BLOOM_FILTER_SIZE);
   bloom_filter_size = std::min(bloom_filter_size, MAX_BLOOM_FILTER_SIZE);
-  log_filter_size_ = BitUtil::Log2(bloom_filter_size);
+  log_filter_size_ = Bits::Log2Ceiling64(bloom_filter_size);
 }
 
 RuntimeFilter* RuntimeFilterBank::RegisterFilter(const TRuntimeFilterDesc& filter_desc,

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util-test.cc b/be/src/util/bit-util-test.cc
index 9886429..f376737 100644
--- a/be/src/util/bit-util-test.cc
+++ b/be/src/util/bit-util-test.cc
@@ -19,6 +19,8 @@
 
 #include <boost/utility.hpp>
 #include <gtest/gtest.h>
+
+#include "gutil/bits.h"
 #include "util/bit-util.h"
 #include "util/cpu-info.h"
 
@@ -99,7 +101,7 @@ TEST(BitUtil, ByteSwap) {
   EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint64_t>(0)), 0);
   EXPECT_EQ(BitUtil::ByteSwap(
       static_cast<uint64_t>(0x1122334455667788)), 0x8877665544332211);
-  
+
   EXPECT_EQ(BitUtil::ByteSwap(static_cast<int64_t>(0)), 0);
   EXPECT_EQ(BitUtil::ByteSwap(
       static_cast<int64_t>(0x1122334455667788)), 0x8877665544332211);
@@ -112,14 +114,16 @@ TEST(BitUtil, ByteSwap) {
 }
 
 TEST(BitUtil, Log2) {
-  EXPECT_EQ(BitUtil::Log2(1), 0);
-  EXPECT_EQ(BitUtil::Log2(2), 1);
-  EXPECT_EQ(BitUtil::Log2(3), 2);
-  EXPECT_EQ(BitUtil::Log2(4), 2);
-  EXPECT_EQ(BitUtil::Log2(5), 3);
-  EXPECT_EQ(BitUtil::Log2(INT_MAX), 31);
-  EXPECT_EQ(BitUtil::Log2(UINT_MAX), 32);
-  EXPECT_EQ(BitUtil::Log2(ULLONG_MAX), 64);
+  // We use gutil's implementation in place of an older custom implementation in BitUtil.
+  // We leave this test here to ensure no test coverage is lost.
+  EXPECT_EQ(Bits::Log2CeilingNonZero64(1), 0);
+  EXPECT_EQ(Bits::Log2CeilingNonZero64(2), 1);
+  EXPECT_EQ(Bits::Log2CeilingNonZero64(3), 2);
+  EXPECT_EQ(Bits::Log2CeilingNonZero64(4), 2);
+  EXPECT_EQ(Bits::Log2CeilingNonZero64(5), 3);
+  EXPECT_EQ(Bits::Log2CeilingNonZero64(INT_MAX), 31);
+  EXPECT_EQ(Bits::Log2CeilingNonZero64(UINT_MAX), 32);
+  EXPECT_EQ(Bits::Log2CeilingNonZero64(ULLONG_MAX), 64);
 }
 
 TEST(BitUtil, RoundUpToPowerOf2) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index 0031558..e255f0c 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -144,22 +144,6 @@ class BitUtil {
     return (v << n) >> n;
   }
 
-  /// Returns ceil(log2(x)).
-  /// TODO: this could be faster if we use __builtin_clz.  Fix this if this ever shows up
-  /// in a hot path.
-  static inline int Log2(uint64_t x) {
-    DCHECK_GT(x, 0);
-    if (x == 1) return 0;
-    // Compute result = ceil(log2(x))
-    //                = floor(log2(x - 1)) + 1, for x > 1
-    // by finding the position of the most significant bit (1-indexed) of x - 1
-    // (floor(log2(n)) = MSB(n) (0-indexed))
-    --x;
-    int result = 1;
-    while (x >>= 1) ++result;
-    return result;
-  }
-
   /// Swaps the byte order (i.e. endianess)
   static inline int64_t ByteSwap(int64_t value) {
     return __builtin_bswap64(value);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/util/bloom-filter.h
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.h b/be/src/util/bloom-filter.h
index 32f6ee6..6fdd140 100644
--- a/be/src/util/bloom-filter.h
+++ b/be/src/util/bloom-filter.h
@@ -109,8 +109,8 @@ class BloomFilter {
   typedef uint64_t BucketWord;
 
   // log2(number of bits in a BucketWord)
-  // TODO: Use BitUtil::Log2(numeric_limits<BucketWord>::digits) once we enable C++14 for
-  // codegen.
+  // TODO: Use Bits::Log2Ceiling64(numeric_limits<BucketWord>::digits) once we enable
+  // C++14 for codegen.
   static const int LOG_BUCKET_WORD_BITS = 6;
   static const BucketWord BUCKET_WORD_MASK = 63; // 2^LOG_BUCKET_WORD_BITS - 1
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/92fafa1c/be/src/util/dict-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/dict-encoding.h b/be/src/util/dict-encoding.h
index e629f33..75e89ac 100644
--- a/be/src/util/dict-encoding.h
+++ b/be/src/util/dict-encoding.h
@@ -20,6 +20,7 @@
 #include <boost/foreach.hpp>
 #include <boost/unordered_map.hpp>
 
+#include "gutil/bits.h"
 #include "exec/parquet-common.h"
 #include "runtime/mem-pool.h"
 #include "runtime/string-value.h"
@@ -70,7 +71,7 @@ class DictEncoderBase {
   int bit_width() const {
     if (UNLIKELY(num_entries() == 0)) return 0;
     if (UNLIKELY(num_entries() == 1)) return 1;
-    return BitUtil::Log2(num_entries());
+    return Bits::Log2Ceiling64(num_entries());
   }
 
   /// Writes out any buffered indices to buffer preceded by the bit width of this data.