You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/05/28 13:26:33 UTC
[arrow] branch master updated: ARROW-8793: [C++] Do not inline
BitUtil::SetBitsTo
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e5a0893 ARROW-8793: [C++] Do not inline BitUtil::SetBitsTo
e5a0893 is described below
commit e5a08930862e0d4b497d5bdb8f95523eb17537ad
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu May 28 08:26:07 2020 -0500
ARROW-8793: [C++] Do not inline BitUtil::SetBitsTo
Local benchmarks suggest that there is 3-4 ns of overhead for manipulating small bitmaps when it is not inline.
Inline version (the benchmark argument is the bitmap size in bytes):
```
--------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------
SetBitsTo/2 3 ns 3 ns 271439203 733.294MB/s
SetBitsTo/16 2 ns 2 ns 308813758 6.49485GB/s
SetBitsTo/1024 9 ns 9 ns 79821710 109.078GB/s
SetBitsTo/131072 2029 ns 2029 ns 325563 60.1566GB/s
```
non-inline:
```
--------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------
SetBitsTo/2 6 ns 6 ns 129335891 334.62MB/s
SetBitsTo/16 6 ns 6 ns 122741527 2.53134GB/s
SetBitsTo/1024 11 ns 11 ns 64547137 87.1395GB/s
SetBitsTo/131072 2010 ns 2010 ns 332558 60.7215GB/s
```
If it can be demonstrated that inlining can meaningfully improve the macro-level performance of some function, then we could lift the implementation into a `SetBitsToInline`, but until then I think it makes sense to keep it in bit_util.cc
Closes #7296 from wesm/ARROW-8793
Authored-by: Wes McKinney <we...@apache.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/arrow/util/bit_util.cc | 44 ++++++++++++++++++++++++++++++++
cpp/src/arrow/util/bit_util.h | 42 ++----------------------------
cpp/src/arrow/util/bit_util_benchmark.cc | 11 ++++++++
3 files changed, 57 insertions(+), 40 deletions(-)
diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index 395801f..3cd3f37 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -58,6 +58,50 @@ void FillBitsFromBytes(const std::vector<uint8_t>& bytes, uint8_t* bits) {
} // namespace
+void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set) {
+ if (length == 0) {
+ return;
+ }
+
+ const int64_t i_begin = start_offset;
+ const int64_t i_end = start_offset + length;
+ const uint8_t fill_byte = static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
+
+ const int64_t bytes_begin = i_begin / 8;
+ const int64_t bytes_end = i_end / 8 + 1;
+
+ const uint8_t first_byte_mask = kPrecedingBitmask[i_begin % 8];
+ const uint8_t last_byte_mask = kTrailingBitmask[i_end % 8];
+
+ if (bytes_end == bytes_begin + 1) {
+ // set bits within a single byte
+ const uint8_t only_byte_mask =
+ i_end % 8 == 0 ? first_byte_mask
+ : static_cast<uint8_t>(first_byte_mask | last_byte_mask);
+ bits[bytes_begin] &= only_byte_mask;
+ bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
+ return;
+ }
+
+ // set/clear trailing bits of first byte
+ bits[bytes_begin] &= first_byte_mask;
+ bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
+
+ if (bytes_end - bytes_begin > 2) {
+ // set/clear whole bytes
+ std::memset(bits + bytes_begin + 1, fill_byte,
+ static_cast<size_t>(bytes_end - bytes_begin - 2));
+ }
+
+ if (i_end % 8 == 0) {
+ return;
+ }
+
+ // set/clear leading bits of last byte
+ bits[bytes_end - 1] &= last_byte_mask;
+ bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
+}
+
Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>& bytes,
MemoryPool* pool) {
int64_t bit_length = BytesForBits(bytes.size());
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index e708c37..67d3528 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -460,46 +460,8 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
}
/// \brief set or clear a range of bits quickly
-static inline void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length,
- bool bits_are_set) {
- if (length == 0) return;
-
- const auto i_begin = start_offset;
- const auto i_end = start_offset + length;
- const uint8_t fill_byte = static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
-
- const auto bytes_begin = i_begin / 8;
- const auto bytes_end = i_end / 8 + 1;
-
- const auto first_byte_mask = kPrecedingBitmask[i_begin % 8];
- const auto last_byte_mask = kTrailingBitmask[i_end % 8];
-
- if (bytes_end == bytes_begin + 1) {
- // set bits within a single byte
- const auto only_byte_mask =
- i_end % 8 == 0 ? first_byte_mask
- : static_cast<uint8_t>(first_byte_mask | last_byte_mask);
- bits[bytes_begin] &= only_byte_mask;
- bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
- return;
- }
-
- // set/clear trailing bits of first byte
- bits[bytes_begin] &= first_byte_mask;
- bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
-
- if (bytes_end - bytes_begin > 2) {
- // set/clear whole bytes
- std::memset(bits + bytes_begin + 1, fill_byte,
- static_cast<size_t>(bytes_end - bytes_begin - 2));
- }
-
- if (i_end % 8 == 0) return;
-
- // set/clear leading bits of last byte
- bits[bytes_end - 1] &= last_byte_mask;
- bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
-}
+ARROW_EXPORT
+void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
/// \brief Convert vector of bytes to bitmap buffer
ARROW_EXPORT
diff --git a/cpp/src/arrow/util/bit_util_benchmark.cc b/cpp/src/arrow/util/bit_util_benchmark.cc
index fd8855c..9844c08 100644
--- a/cpp/src/arrow/util/bit_util_benchmark.cc
+++ b/cpp/src/arrow/util/bit_util_benchmark.cc
@@ -322,6 +322,16 @@ static void VisitBitsUnrolled(benchmark::State& state) {
BenchmarkVisitBits<VisitBitsUnrolledFunctor>(state, state.range(0));
}
+static void SetBitsTo(benchmark::State& state) {
+ int64_t nbytes = state.range(0);
+ std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
+
+ for (auto _ : state) {
+ BitUtil::SetBitsTo(buffer->mutable_data(), /*offset=*/0, nbytes * 8, true);
+ }
+ state.SetBytesProcessed(state.iterations() * nbytes);
+}
+
constexpr int64_t kBufferSize = 1024 * 8;
template <int64_t Offset = 0>
@@ -364,6 +374,7 @@ BENCHMARK(ReferenceNaiveBitmapReader)->Arg(kBufferSize);
BENCHMARK(BitmapReader)->Arg(kBufferSize);
BENCHMARK(VisitBits)->Arg(kBufferSize);
BENCHMARK(VisitBitsUnrolled)->Arg(kBufferSize);
+BENCHMARK(SetBitsTo)->Arg(2)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
static void ReferenceNaiveBitmapWriter(benchmark::State& state) {