You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2022/02/01 21:57:55 UTC
[kudu] 02/03: [util] optimized version of BitUtil::Ceil()
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit bac76fcf9d496995591e968dffa822fe0fb21d5d
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Mon Jan 31 15:42:09 2022 -0800
[util] optimized version of BitUtil::Ceil()
This patch introduces an optimized version of BitUtil::Ceil() for the
case when the divisor is a power of two. As it turns out, all usages
of the BitUtil::Ceil() in Kudu are for divisor of 8, so I updated all
the call sites correspondingly.
This patch also contains the updated RLE benchmark. The comparison
results are the following for a RELEASE configuration built with
c++ (GCC) 8.3.1 20190311 (Red Hat 8.3.1-3)):
Non-optimized implementation:
Time spent BooleanBitStream: real 0.487s user 0.486s sys 0.001s
Wrote 1048576 bytes
Time spent BooleanRLE: real 2.302s user 2.304s sys 0.001s
Wrote 46080 bytes
Optimized implementation:
Time spent BooleanBitStream: real 0.017s user 0.016s sys 0.000s
Wrote 1048576 bytes
Time spent BooleanRLE: real 2.055s user 2.056s sys 0.001s
Wrote 46080 bytes
As for benmarking direct calls of these functions:
Time spent BitUtil::Ceil(..., 8): real 0.720s user 0.721s sys 0.000s
Time spent BitUtil::Ceil<3>(...): real 0.402s user 0.402s sys 0.000s
Change-Id: Ia383856aa9a189681f6ee2a0d317476fe3c847bd
Reviewed-on: http://gerrit.cloudera.org:8080/18187
Tested-by: Kudu Jenkins
Reviewed-by: Attila Bukor <ab...@apache.org>
---
src/kudu/benchmarks/rle.cc | 33 +++++++++++++++++++++++++++++++++
src/kudu/util/bit-stream-utils.h | 4 ++--
src/kudu/util/bit-stream-utils.inline.h | 4 ++--
src/kudu/util/bit-util.h | 10 ++++++++++
src/kudu/util/rle-encoding.h | 8 ++++----
src/kudu/util/rle-test.cc | 8 ++++----
6 files changed, 55 insertions(+), 12 deletions(-)
diff --git a/src/kudu/benchmarks/rle.cc b/src/kudu/benchmarks/rle.cc
index a1b1c0c..89520df 100644
--- a/src/kudu/benchmarks/rle.cc
+++ b/src/kudu/benchmarks/rle.cc
@@ -31,6 +31,7 @@
#include "kudu/gutil/mathlimits.h"
#include "kudu/util/bit-stream-utils.h"
#include "kudu/util/bit-stream-utils.inline.h"
+#include "kudu/util/bit-util.h"
#include "kudu/util/faststring.h"
#include "kudu/util/logging.h"
#include "kudu/util/rle-encoding.h"
@@ -92,6 +93,22 @@ int BooleanRLE(faststring* buffer) {
return bytes_written;
}
+int BitUtilCeil(int num_iter) {
+ volatile int res = 0;
+ for (int i = 0; i < num_iter; ++i) {
+ res = BitUtil::Ceil(i, 8);
+ }
+ return res;
+}
+
+int BitUtilCeilLog2Div(int num_iter) {
+ volatile int res;
+ for (int i = 0; i < num_iter; ++i) {
+ res = BitUtil::Ceil<3>(i);
+ }
+ return res;
+}
+
} // namespace kudu
int main(int argc, char** argv) {
@@ -121,5 +138,21 @@ int main(int argc, char** argv) {
LOG(INFO) << "Wrote " << bytes_written << " bytes";
}
+ {
+ int res = 0;
+ LOG_TIMING(INFO, "BitUtil::Ceil(..., 8)") {
+ res = kudu::BitUtilCeil(1000000000);
+ }
+ LOG(INFO) << "Result: " << res;
+ }
+
+ {
+ int res = 0;
+ LOG_TIMING(INFO, "BitUtil::Ceil<3>(...)") {
+ res = kudu::BitUtilCeilLog2Div(1000000000);
+ }
+ LOG(INFO) << "Result: " << res;
+ }
+
return 0;
}
diff --git a/src/kudu/util/bit-stream-utils.h b/src/kudu/util/bit-stream-utils.h
index a772c7c..61a74bc 100644
--- a/src/kudu/util/bit-stream-utils.h
+++ b/src/kudu/util/bit-stream-utils.h
@@ -44,7 +44,7 @@ class BitWriter {
// The number of current bytes written, including the current byte (i.e. may include a
// fraction of a byte). Includes buffered values.
- int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); }
+ int bytes_written() const { return byte_offset_ + BitUtil::Ceil<3>(bit_offset_); }
// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
// packed. num_bits must be <= 32. If 'v' is larger than 'num_bits' bits, the higher
@@ -117,7 +117,7 @@ class BitReader {
// Returns the number of bytes left in the stream, not including the current byte (i.e.,
// there may be an additional fraction of a byte).
- int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); }
+ int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil<3>(bit_offset_)); }
// Current position in the stream, by bit.
int position() const { return byte_offset_ * 8 + bit_offset_; }
diff --git a/src/kudu/util/bit-stream-utils.inline.h b/src/kudu/util/bit-stream-utils.inline.h
index f9186f5..41fa632 100644
--- a/src/kudu/util/bit-stream-utils.inline.h
+++ b/src/kudu/util/bit-stream-utils.inline.h
@@ -49,7 +49,7 @@ inline void BitWriter::PutValue(uint64_t v, int num_bits) {
}
inline void BitWriter::Flush(bool align) {
- int num_bytes = BitUtil::Ceil(bit_offset_, 8);
+ int num_bytes = BitUtil::Ceil<3>(bit_offset_);
buffer_->reserve(KUDU_ALIGN_UP(byte_offset_ + num_bytes, 8));
buffer_->resize(byte_offset_ + num_bytes);
DCHECK_LE(byte_offset_ + num_bytes, buffer_->capacity());
@@ -172,7 +172,7 @@ inline void BitReader::SeekToBit(uint stream_position) {
template<typename T>
inline bool BitReader::GetAligned(int num_bytes, T* v) {
DCHECK_LE(num_bytes, sizeof(T));
- int bytes_read = BitUtil::Ceil(bit_offset_, 8);
+ int bytes_read = BitUtil::Ceil<3>(bit_offset_);
if (PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false;
// Advance byte_offset to next unread byte and read num_bytes
diff --git a/src/kudu/util/bit-util.h b/src/kudu/util/bit-util.h
index 25bf41c..9c0ae23 100644
--- a/src/kudu/util/bit-util.h
+++ b/src/kudu/util/bit-util.h
@@ -32,6 +32,16 @@ class BitUtil {
return value / divisor + (value % divisor != 0);
}
+ // Similar to the above, but a bit optimized for the case when the divisor
+ // is a power of two: LOG2_DIV is log2(divisor), e.g. 3 for the divisor of 8.
+ template <int LOG2_DIV>
+ static inline int Ceil(int value) {
+ constexpr int kDivisor = 1 << LOG2_DIV;
+ constexpr int kComplement = kDivisor - 1;
+ constexpr int kComplementMask = -kDivisor;
+ return ((value + kComplement) & kComplementMask) >> LOG2_DIV;
+ }
+
// Returns the 'num_bits' least-significant bits of 'v'.
static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
if (PREDICT_FALSE(num_bits == 0)) return 0;
diff --git a/src/kudu/util/rle-encoding.h b/src/kudu/util/rle-encoding.h
index 4a00148..58ca87d 100644
--- a/src/kudu/util/rle-encoding.h
+++ b/src/kudu/util/rle-encoding.h
@@ -247,7 +247,7 @@ inline bool RleDecoder<T>::ReadHeader() {
repeat_count_ = indicator_value >> 1;
DCHECK_GT(repeat_count_, 0);
bool result = bit_reader_.GetAligned<T>(
- BitUtil::Ceil(bit_width_, 8), reinterpret_cast<T*>(¤t_value_));
+ BitUtil::Ceil<3>(bit_width_), reinterpret_cast<T*>(¤t_value_));
DCHECK(result);
}
}
@@ -440,7 +440,7 @@ inline void RleEncoder<T>::FlushLiteralRun(bool update_indicator_byte) {
// We only reserve one byte, to allow for streaming writes of literal values.
// The logic makes sure we flush literal runs often enough to not overrun
// the 1 byte.
- int num_groups = BitUtil::Ceil(literal_count_, 8);
+ int num_groups = BitUtil::Ceil<3>(literal_count_);
int32_t indicator_value = (num_groups << 1) | 1;
DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);
bit_writer_.buffer()->data()[literal_indicator_byte_idx_] = indicator_value;
@@ -455,7 +455,7 @@ inline void RleEncoder<T>::FlushRepeatedRun() {
// The lsb of 0 indicates this is a repeated run
int32_t indicator_value = repeat_count_ << 1 | 0;
bit_writer_.PutVlqInt(indicator_value);
- bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 8));
+ bit_writer_.PutAligned(current_value_, BitUtil::Ceil<3>(bit_width_));
num_buffered_values_ = 0;
repeat_count_ = 0;
}
@@ -480,7 +480,7 @@ inline void RleEncoder<T>::FlushBufferedValues(bool done) {
}
literal_count_ += num_buffered_values_;
- int num_groups = BitUtil::Ceil(literal_count_, 8);
+ int num_groups = BitUtil::Ceil<3>(literal_count_);
if (num_groups + 1 >= (1 << 6)) {
// We need to start a new literal run because the indicator byte we've reserved
// cannot store more values.
diff --git a/src/kudu/util/rle-test.cc b/src/kudu/util/rle-test.cc
index e443b6d..c8467c3 100644
--- a/src/kudu/util/rle-test.cc
+++ b/src/kudu/util/rle-test.cc
@@ -123,7 +123,7 @@ TEST(BitArray, TestBool) {
// Writes 'num_vals' values with width 'bit_width' and reads them back.
void TestBitArrayValues(int bit_width, int num_vals) {
- const int kTestLen = BitUtil::Ceil(bit_width * num_vals, 8);
+ const int kTestLen = BitUtil::Ceil<3>(bit_width * num_vals);
const uint64_t mod = bit_width == 64? 1 : 1LL << bit_width;
faststring buffer(kTestLen);
@@ -248,14 +248,14 @@ TEST(Rle, SpecificSequences) {
}
for (int width = 9; width <= kMaxWidth; ++width) {
- ValidateRle(values, width, nullptr, 2 * (1 + BitUtil::Ceil(width, 8)));
+ ValidateRle(values, width, nullptr, 2 * (1 + BitUtil::Ceil<3>(width)));
}
// Test 100 0's and 1's alternating
for (int i = 0; i < 100; ++i) {
values[i] = i % 2;
}
- int num_groups = BitUtil::Ceil(100, 8);
+ int num_groups = BitUtil::Ceil<3>(100);
expected_buffer[0] = (num_groups << 1) | 1;
for (int i = 0; i < 100/8; ++i) {
expected_buffer[i + 1] = BOOST_BINARY(1 0 1 0 1 0 1 0); // 0xaa
@@ -266,7 +266,7 @@ TEST(Rle, SpecificSequences) {
// num_groups and expected_buffer only valid for bit width = 1
ValidateRle(values, 1, expected_buffer, 1 + num_groups);
for (int width = 2; width <= kMaxWidth; ++width) {
- ValidateRle(values, width, nullptr, 1 + BitUtil::Ceil(width * 100, 8));
+ ValidateRle(values, width, nullptr, 1 + BitUtil::Ceil<3>(width * 100));
}
}