Posted to commits@arrow.apache.org by we...@apache.org on 2020/05/28 13:26:33 UTC

[arrow] branch master updated: ARROW-8793: [C++] Do not inline BitUtil::SetBitsTo

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e5a0893  ARROW-8793: [C++] Do not inline BitUtil::SetBitsTo
e5a0893 is described below

commit e5a08930862e0d4b497d5bdb8f95523eb17537ad
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu May 28 08:26:07 2020 -0500

    ARROW-8793: [C++] Do not inline BitUtil::SetBitsTo
    
    Local benchmarks suggest that there is 3-4 ns of per-call overhead for manipulating small bitmaps when `SetBitsTo` is not inlined; for larger bitmaps the difference is within measurement noise.
    
    inline (argument is byte size of bitmap):
    
    ```
    --------------------------------------------------------
    Benchmark                 Time           CPU Iterations
    --------------------------------------------------------
    SetBitsTo/2               3 ns          3 ns  271439203   733.294MB/s
    SetBitsTo/16              2 ns          2 ns  308813758   6.49485GB/s
    SetBitsTo/1024            9 ns          9 ns   79821710   109.078GB/s
    SetBitsTo/131072       2029 ns       2029 ns     325563   60.1566GB/s
    ```
    
    non-inline:
    
    ```
    --------------------------------------------------------
    Benchmark                 Time           CPU Iterations
    --------------------------------------------------------
    SetBitsTo/2               6 ns          6 ns  129335891    334.62MB/s
    SetBitsTo/16              6 ns          6 ns  122741527   2.53134GB/s
    SetBitsTo/1024           11 ns         11 ns   64547137   87.1395GB/s
    SetBitsTo/131072       2010 ns       2010 ns     332558   60.7215GB/s
    ```
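    
    For context, a minimal usage sketch of the function being moved out of line (the `Example` wrapper and buffer contents are hypothetical; the call signature and semantics follow the implementation in bit_util.cc below):
    
    ```
    #include <cstdint>
    #include "arrow/util/bit_util.h"
    
    void Example() {
      // Set bits [2, 10) of a 16-bit bitmap; bits outside the range keep their values.
      uint8_t bitmap[2] = {0x00, 0x00};
      arrow::BitUtil::SetBitsTo(bitmap, /*start_offset=*/2, /*length=*/8, true);
      // bitmap[0] == 0xFC (bits 2..7 set), bitmap[1] == 0x03 (bits 8..9 set, 10..15 still clear)
    }
    ```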
    
    If it can be demonstrated that inlining meaningfully improves the macro-level performance of some function, then we could lift the implementation into a `SetBitsToInline`, but until then I think it makes sense to keep it in bit_util.cc.
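    
    For illustration only, such a split might look roughly like the sketch below (hypothetical; `SetBitsToInline` is not part of this change), keeping a tiny single-byte fast path in the header and deferring the general case to the out-of-line function:
    
    ```
    // bit_util.h (sketch): inline fast path for ranges that fit in one byte,
    // falling back to the out-of-line SetBitsTo for everything else.
    static inline void SetBitsToInline(uint8_t* bits, int64_t start_offset,
                                       int64_t length, bool bits_are_set) {
      const int64_t i_end = start_offset + length;
      if (length > 0 && start_offset / 8 == (i_end - 1) / 8) {
        // Preserve bits below start_offset and at/above i_end, fill the rest.
        const uint8_t fill_byte =
            static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
        const uint8_t preserve_mask = static_cast<uint8_t>(
            kPrecedingBitmask[start_offset % 8] |
            (i_end % 8 == 0 ? 0 : kTrailingBitmask[i_end % 8]));
        bits[start_offset / 8] = static_cast<uint8_t>(
            (bits[start_offset / 8] & preserve_mask) | (fill_byte & ~preserve_mask));
        return;
      }
      SetBitsTo(bits, start_offset, length, bits_are_set);
    }
    ```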
    
    Closes #7296 from wesm/ARROW-8793
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/arrow/util/bit_util.cc           | 44 ++++++++++++++++++++++++++++++++
 cpp/src/arrow/util/bit_util.h            | 42 ++----------------------------
 cpp/src/arrow/util/bit_util_benchmark.cc | 11 ++++++++
 3 files changed, 57 insertions(+), 40 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index 395801f..3cd3f37 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -58,6 +58,50 @@ void FillBitsFromBytes(const std::vector<uint8_t>& bytes, uint8_t* bits) {
 
 }  // namespace
 
+void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set) {
+  if (length == 0) {
+    return;
+  }
+
+  const int64_t i_begin = start_offset;
+  const int64_t i_end = start_offset + length;
+  const uint8_t fill_byte = static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
+
+  const int64_t bytes_begin = i_begin / 8;
+  const int64_t bytes_end = i_end / 8 + 1;
+
+  const uint8_t first_byte_mask = kPrecedingBitmask[i_begin % 8];
+  const uint8_t last_byte_mask = kTrailingBitmask[i_end % 8];
+
+  if (bytes_end == bytes_begin + 1) {
+    // set bits within a single byte
+    const uint8_t only_byte_mask =
+        i_end % 8 == 0 ? first_byte_mask
+                       : static_cast<uint8_t>(first_byte_mask | last_byte_mask);
+    bits[bytes_begin] &= only_byte_mask;
+    bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
+    return;
+  }
+
+  // set/clear trailing bits of first byte
+  bits[bytes_begin] &= first_byte_mask;
+  bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
+
+  if (bytes_end - bytes_begin > 2) {
+    // set/clear whole bytes
+    std::memset(bits + bytes_begin + 1, fill_byte,
+                static_cast<size_t>(bytes_end - bytes_begin - 2));
+  }
+
+  if (i_end % 8 == 0) {
+    return;
+  }
+
+  // set/clear leading bits of last byte
+  bits[bytes_end - 1] &= last_byte_mask;
+  bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
+}
+
 Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>& bytes,
                                             MemoryPool* pool) {
   int64_t bit_length = BytesForBits(bytes.size());
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index e708c37..67d3528 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -460,46 +460,8 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
 }
 
 /// \brief set or clear a range of bits quickly
-static inline void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length,
-                             bool bits_are_set) {
-  if (length == 0) return;
-
-  const auto i_begin = start_offset;
-  const auto i_end = start_offset + length;
-  const uint8_t fill_byte = static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
-
-  const auto bytes_begin = i_begin / 8;
-  const auto bytes_end = i_end / 8 + 1;
-
-  const auto first_byte_mask = kPrecedingBitmask[i_begin % 8];
-  const auto last_byte_mask = kTrailingBitmask[i_end % 8];
-
-  if (bytes_end == bytes_begin + 1) {
-    // set bits within a single byte
-    const auto only_byte_mask =
-        i_end % 8 == 0 ? first_byte_mask
-                       : static_cast<uint8_t>(first_byte_mask | last_byte_mask);
-    bits[bytes_begin] &= only_byte_mask;
-    bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
-    return;
-  }
-
-  // set/clear trailing bits of first byte
-  bits[bytes_begin] &= first_byte_mask;
-  bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
-
-  if (bytes_end - bytes_begin > 2) {
-    // set/clear whole bytes
-    std::memset(bits + bytes_begin + 1, fill_byte,
-                static_cast<size_t>(bytes_end - bytes_begin - 2));
-  }
-
-  if (i_end % 8 == 0) return;
-
-  // set/clear leading bits of last byte
-  bits[bytes_end - 1] &= last_byte_mask;
-  bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
-}
+ARROW_EXPORT
+void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
 
 /// \brief Convert vector of bytes to bitmap buffer
 ARROW_EXPORT
diff --git a/cpp/src/arrow/util/bit_util_benchmark.cc b/cpp/src/arrow/util/bit_util_benchmark.cc
index fd8855c..9844c08 100644
--- a/cpp/src/arrow/util/bit_util_benchmark.cc
+++ b/cpp/src/arrow/util/bit_util_benchmark.cc
@@ -322,6 +322,16 @@ static void VisitBitsUnrolled(benchmark::State& state) {
   BenchmarkVisitBits<VisitBitsUnrolledFunctor>(state, state.range(0));
 }
 
+static void SetBitsTo(benchmark::State& state) {
+  int64_t nbytes = state.range(0);
+  std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
+
+  for (auto _ : state) {
+    BitUtil::SetBitsTo(buffer->mutable_data(), /*offset=*/0, nbytes * 8, true);
+  }
+  state.SetBytesProcessed(state.iterations() * nbytes);
+}
+
 constexpr int64_t kBufferSize = 1024 * 8;
 
 template <int64_t Offset = 0>
@@ -364,6 +374,7 @@ BENCHMARK(ReferenceNaiveBitmapReader)->Arg(kBufferSize);
 BENCHMARK(BitmapReader)->Arg(kBufferSize);
 BENCHMARK(VisitBits)->Arg(kBufferSize);
 BENCHMARK(VisitBitsUnrolled)->Arg(kBufferSize);
+BENCHMARK(SetBitsTo)->Arg(2)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
 
 #ifdef ARROW_WITH_BENCHMARKS_REFERENCE
 static void ReferenceNaiveBitmapWriter(benchmark::State& state) {