You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/01/10 22:10:48 UTC

[arrow] branch master updated: ARROW-764: [C++] Improves performance of CopyBitmap and adds benchmarks

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9eae508  ARROW-764: [C++] Improves performance of CopyBitmap and adds benchmarks
9eae508 is described below

commit 9eae5089b5099548f67bbeca9927ca81df9d5de8
Author: Adam Seibert <se...@users.noreply.github.com>
AuthorDate: Wed Jan 10 17:10:39 2018 -0500

    ARROW-764: [C++] Improves performance of CopyBitmap and adds benchmarks
    
    I took a swing at improving the CopyBitmap performance (benchmarks below).  I'm a C/C++ novice, so I thought I'd get some feedback before I went too much further.
    
    **Starting Point**
    ```
    Run on (4 X 2208 MHz CPU s)
    12/13/17 21:15:18
    Benchmark                                        Time           CPU Iterations
    ------------------------------------------------------------------------------
    BM_CopyBitmap/97.6563k/0/min_time:1.000       4779 us       4758 us        289   20.0445MB/s
    BM_CopyBitmap/976.563k/0/min_time:1.000      47740 us      47476 us         26   20.0875MB/s
    BM_CopyBitmap/97.6563k/4/min_time:1.000       4858 us       4866 us        289   19.5991MB/s
    BM_CopyBitmap/976.563k/4/min_time:1.000      48117 us      47953 us         29   19.8879MB/s
    ```
    
    **Using stanford bithacks for SetBitTo**
    ```
    Run on (4 X 2208 MHz CPU s)
    12/13/17 21:22:05
    Benchmark                                        Time           CPU Iterations
    ------------------------------------------------------------------------------
    BM_CopyBitmap/97.6563k/0/min_time:1.000       1647 us       1649 us        815   57.8415MB/s
    BM_CopyBitmap/976.563k/0/min_time:1.000      16368 us      16397 us         81   58.1629MB/s
    BM_CopyBitmap/97.6563k/4/min_time:1.000       1599 us       1610 us        815   59.2186MB/s
    BM_CopyBitmap/976.563k/4/min_time:1.000      16026 us      16011 us         81   59.5644MB/s
    ```
    
    **memcpy + shifting**
    *This solution provides varying performance depending on whether or not the bit offset is a multiple of 8*
    ```
    Run on (4 X 2208 MHz CPU s)
    12/13/17 21:23:44
    Benchmark                                        Time           CPU Iterations
    ------------------------------------------------------------------------------
    BM_CopyBitmap/97.6563k/0/min_time:1.000          5 us          5 us     280000   18.9651GB/s
    BM_CopyBitmap/976.563k/0/min_time:1.000         62 us         61 us      22400   15.1721GB/s
    BM_CopyBitmap/97.6563k/4/min_time:1.000        171 us        170 us       6892   560.872MB/s
    BM_CopyBitmap/976.563k/4/min_time:1.000       1639 us       1639 us        896   581.782MB/s
    ```
    
    Author: Adam Seibert <se...@users.noreply.github.com>
    
    Closes #1422 from seibs/ARROW-764 and squashes the following commits:
    
    c813c8e2 [Adam Seibert] ARROW-764: [C++] Improves performance of CopyBitmap and adds benchmarks
---
 cpp/src/arrow/util/CMakeLists.txt        |  6 ++--
 cpp/src/arrow/util/bit-util-benchmark.cc | 58 ++++++++++++++++++++++++++++++++
 cpp/src/arrow/util/bit-util-test.cc      | 17 +++++-----
 cpp/src/arrow/util/bit-util.cc           | 32 ++++++++++++++++--
 cpp/src/arrow/util/bit-util.h            |  9 ++---
 5 files changed, 104 insertions(+), 18 deletions(-)

diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index a36dffb..8b61a3a 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -63,10 +63,10 @@ if (ARROW_BUILD_BENCHMARKS)
       Shlwapi.lib
   )
   else()
-	  target_link_libraries(arrow_benchmark_main
+    target_link_libraries(arrow_benchmark_main
       benchmark
       pthread
-	  )
+    )
   endif()
 
   # TODO(wesm): Some benchmarks include gtest.h
@@ -80,4 +80,6 @@ ADD_ARROW_TEST(key-value-metadata-test)
 ADD_ARROW_TEST(rle-encoding-test)
 ADD_ARROW_TEST(stl-util-test)
 
+ADD_ARROW_BENCHMARK(bit-util-benchmark)
+
 add_subdirectory(variant)
diff --git a/cpp/src/arrow/util/bit-util-benchmark.cc b/cpp/src/arrow/util/bit-util-benchmark.cc
new file mode 100644
index 0000000..8969dd8
--- /dev/null
+++ b/cpp/src/arrow/util/bit-util-benchmark.cc
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/memory_pool.h"
+#include "arrow/test-util.h"
+#include "arrow/util/bit-util.h"
+
+namespace arrow {
+namespace BitUtil {
+
+static void BM_CopyBitmap(benchmark::State& state) {  // NOLINT non-const reference
+  const int kBufferSize = state.range(0);
+
+  std::shared_ptr<Buffer> buffer;
+  ASSERT_OK(AllocateBuffer(default_memory_pool(), kBufferSize, &buffer));
+  memset(buffer->mutable_data(), 0, kBufferSize);
+  test::random_bytes(kBufferSize, 0, buffer->mutable_data());
+
+  const int num_bits = kBufferSize * 8;
+  const uint8_t* src = buffer->data();
+
+  std::shared_ptr<Buffer> copy;
+  while (state.KeepRunning()) {
+    ABORT_NOT_OK(CopyBitmap(default_memory_pool(), src, state.range(1), num_bits, &copy));
+  }
+  state.SetBytesProcessed(state.iterations() * kBufferSize * sizeof(int8_t));
+}
+
+BENCHMARK(BM_CopyBitmap)
+    ->Args({100000, 0})
+    ->Args({1000000, 0})
+    ->Args({100000, 4})
+    ->Args({1000000, 4})
+    ->MinTime(1.0)
+    ->Unit(benchmark::kMicrosecond);
+
+}  // namespace BitUtil
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc
index 92bdcb5..4c64dea 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -165,19 +165,20 @@ TEST(BitUtilTests, TestCopyBitmap) {
   memset(buffer->mutable_data(), 0, kBufferSize);
   test::random_bytes(kBufferSize, 0, buffer->mutable_data());
 
-  const int num_bits = kBufferSize * 8;
-
   const uint8_t* src = buffer->data();
 
+  std::vector<int64_t> lengths = {kBufferSize * 8 - 4, kBufferSize * 8};
   std::vector<int64_t> offsets = {0, 12, 16, 32, 37, 63, 64, 128};
-  for (int64_t offset : offsets) {
-    const int64_t copy_length = num_bits - offset;
+  for (int64_t num_bits : lengths) {
+    for (int64_t offset : offsets) {
+      const int64_t copy_length = num_bits - offset;
 
-    std::shared_ptr<Buffer> copy;
-    ASSERT_OK(CopyBitmap(default_memory_pool(), src, offset, copy_length, &copy));
+      std::shared_ptr<Buffer> copy;
+      ASSERT_OK(CopyBitmap(default_memory_pool(), src, offset, copy_length, &copy));
 
-    for (int64_t i = 0; i < copy_length; ++i) {
-      ASSERT_EQ(BitUtil::GetBit(src, i + offset), BitUtil::GetBit(copy->data(), i));
+      for (int64_t i = 0; i < copy_length; ++i) {
+        ASSERT_EQ(BitUtil::GetBit(src, i + offset), BitUtil::GetBit(copy->data(), i));
+      }
     }
   }
 }
diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc
index 4dd91e9..c77f0d0 100644
--- a/cpp/src/arrow/util/bit-util.cc
+++ b/cpp/src/arrow/util/bit-util.cc
@@ -109,9 +109,37 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t
   std::shared_ptr<Buffer> buffer;
   RETURN_NOT_OK(GetEmptyBitmap(pool, length, &buffer));
   uint8_t* dest = buffer->mutable_data();
-  for (int64_t i = 0; i < length; ++i) {
-    BitUtil::SetBitTo(dest, i, BitUtil::GetBit(data, i + offset));
+
+  int64_t byte_offset = offset / 8;
+  int64_t bit_offset = offset % 8;
+  int64_t num_bytes = BitUtil::BytesForBits(length);
+  int64_t bits_to_zero = num_bytes * 8 - length;
+
+  if (bit_offset > 0) {
+    uint32_t carry_mask = BitUtil::kBitmask[bit_offset] - 1U;
+    uint32_t carry_shift = 8U - static_cast<uint32_t>(bit_offset);
+
+    uint32_t carry = 0U;
+    if (BitUtil::BytesForBits(length + bit_offset) > num_bytes) {
+      carry = (data[byte_offset + num_bytes] & carry_mask) << carry_shift;
+    }
+
+    int64_t i = num_bytes - 1;
+    while (i + 1 > 0) {
+      uint8_t cur_byte = data[byte_offset + i];
+      dest[i] = static_cast<uint8_t>((cur_byte >> bit_offset) | carry);
+      carry = (cur_byte & carry_mask) << carry_shift;
+      --i;
+    }
+  } else {
+    std::memcpy(dest, data + byte_offset, static_cast<size_t>(num_bytes));
+  }
+
+  for (int64_t i = length; i < length + bits_to_zero; ++i) {
+    // Both branches may copy extra bits - unsetting to match specification.
+    BitUtil::SetBitTo(dest, i, false);
   }
+
   *out = buffer;
   return Status::OK();
 }
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index cab3c9e..86c17d1 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -139,13 +139,10 @@ static inline void SetArrayBit(uint8_t* bits, int i, bool is_set) {
 }
 
 static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
-  // TODO: speed up. See https://graphics.stanford.edu/~seander/bithacks.html
+  // https://graphics.stanford.edu/~seander/bithacks.html
   // "Conditionally set or clear bits without branching"
-  if (bit_is_set) {
-    SetBit(bits, i);
-  } else {
-    ClearBit(bits, i);
-  }
+  bits[i / 8] ^= static_cast<uint8_t>(-static_cast<uint8_t>(bit_is_set) ^ bits[i / 8]) &
+                 kBitmask[i % 8];
 }
 
 // Returns the minimum number of bits needed to represent the value of 'x'

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <co...@arrow.apache.org>'].