You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/05 13:29:53 UTC

[arrow] branch master updated: ARROW-9032: [C++] Split up arrow/util/bit_util.h into multiple header files

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new efb707a  ARROW-9032: [C++] Split up arrow/util/bit_util.h into multiple header files
efb707a is described below

commit efb707a5438380dcef78418668b57c3f60592a23
Author: Wes McKinney <we...@apache.org>
AuthorDate: Fri Jun 5 08:29:17 2020 -0500

    ARROW-9032: [C++] Split up arrow/util/bit_util.h into multiple header files
    
    There is a lot of code in bit_util.h that is seldom used compared to certain common utilities like `BitUtil::BytesForBits`. This moves everything outside of the `BitUtil` namespace to different headers. You can see by the frequency of includes that this makes sense so that compilation units that only need some simple bit utilities are not including a lot of header code that they never use
    
    ```
    $ grep -R bit_util.h ../src/ | wc -l
    68
    $ grep -R bitmap.h ../src/ | wc -l
    7
    $ grep -R bitmap_ops.h ../src/ | wc -l
    15
    $ grep -R bitmap_reader.h ../src/ | wc -l
    9
    $ grep -R bitmap_writer.h ../src/ | wc -l
    6
    ```
    
    This doesn't seem to affect aggregate compilation time very much but at minimum makes the code easier to navigate (in my opinion, at least).
    
    All the unit tests are still in bit_util_test.cc. Maybe we can improve that in a follow up patch.
    
    Closes #7352 from wesm/ARROW-9032
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/build-support/iwyu/iwyu.sh                     |   2 +-
 cpp/src/arrow/CMakeLists.txt                       |   4 +
 cpp/src/arrow/array/array_binary_test.cc           |   3 +-
 cpp/src/arrow/array/array_dict.cc                  |   1 +
 cpp/src/arrow/array/array_list_test.cc             |   3 +-
 cpp/src/arrow/array/array_nested.cc                |   1 +
 cpp/src/arrow/array/array_test.cc                  |  13 +-
 cpp/src/arrow/array/builder_union.h                |   2 +-
 cpp/src/arrow/array/concatenate.cc                 |   1 +
 cpp/src/arrow/array/data.cc                        |   2 +-
 cpp/src/arrow/array/data.h                         |   1 -
 cpp/src/arrow/array/diff.h                         |   1 -
 cpp/src/arrow/buffer.cc                            |   1 +
 cpp/src/arrow/buffer_builder.h                     |   1 +
 cpp/src/arrow/compare.cc                           |   1 +
 cpp/src/arrow/compute/api_scalar.h                 |   1 -
 cpp/src/arrow/compute/exec.cc                      |   1 +
 cpp/src/arrow/compute/exec_test.cc                 |   1 +
 cpp/src/arrow/compute/kernels/aggregate_basic.cc   |   2 +
 cpp/src/arrow/compute/kernels/codegen_internal.h   |   3 +
 cpp/src/arrow/compute/kernels/common.h             |   5 -
 cpp/src/arrow/compute/kernels/scalar_boolean.cc    |   6 +
 .../arrow/compute/kernels/scalar_cast_string.cc    |   4 +
 .../arrow/compute/kernels/scalar_cast_temporal.cc  |   3 +-
 .../arrow/compute/kernels/scalar_compare_test.cc   |   8 +-
 cpp/src/arrow/compute/kernels/scalar_set_lookup.cc |   3 +
 cpp/src/arrow/compute/kernels/vector_filter.cc     |   1 +
 cpp/src/arrow/ipc/json_internal.cc                 |  11 +-
 cpp/src/arrow/ipc/json_simple_test.cc              |   4 +-
 cpp/src/arrow/ipc/test_common.cc                   |  11 +-
 cpp/src/arrow/ipc/writer.cc                        |   1 +
 cpp/src/arrow/json/parser.cc                       |   1 +
 cpp/src/arrow/python/numpy_to_arrow.cc             |   2 +
 cpp/src/arrow/sparse_tensor_test.cc                |   1 +
 cpp/src/arrow/util/bit_block_counter.cc            |  75 ++
 cpp/src/arrow/util/bit_block_counter.h             |  55 ++
 cpp/src/arrow/util/bit_util.cc                     | 571 +--------------
 cpp/src/arrow/util/bit_util.h                      | 806 ---------------------
 cpp/src/arrow/util/bit_util_benchmark.cc           |   7 +
 cpp/src/arrow/util/bit_util_test.cc                |   8 +
 cpp/src/arrow/util/bitmap.cc                       |  65 ++
 cpp/src/arrow/util/bitmap.h                        | 296 ++++++++
 cpp/src/arrow/util/bitmap_builders.cc              |  72 ++
 cpp/src/arrow/util/bitmap_builders.h               |  42 ++
 cpp/src/arrow/util/bitmap_generate.h               | 112 +++
 cpp/src/arrow/util/{bit_util.cc => bitmap_ops.cc}  | 169 +----
 cpp/src/arrow/util/bitmap_ops.h                    | 155 ++++
 cpp/src/arrow/util/bitmap_reader.h                 |  70 ++
 cpp/src/arrow/util/bitmap_visit.h                  |  88 +++
 cpp/src/arrow/util/bitmap_writer.h                 | 183 +++++
 cpp/src/arrow/util/bitset_stack.h                  |  89 +++
 cpp/src/arrow/util/hashing.h                       |   1 +
 cpp/src/arrow/util/rle_encoding.h                  |   1 +
 cpp/src/arrow/util/thread_pool.h                   |   3 +-
 cpp/src/arrow/util/thread_pool_test.cc             |   2 -
 cpp/src/arrow/util/trie.h                          |   1 +
 cpp/src/arrow/visitor_inline.h                     |   1 +
 cpp/src/gandiva/bitmap_accumulator.cc              |   2 +
 cpp/src/gandiva/bitmap_accumulator_test.cc         |   1 +
 cpp/src/parquet/arrow/path_internal.cc             |   1 +
 cpp/src/parquet/column_writer_test.cc              |   5 +-
 cpp/src/parquet/encoding.cc                        |   1 +
 cpp/src/parquet/level_conversion_test.cc           |   2 +
 cpp/src/parquet/platform.h                         |  19 +-
 cpp/src/parquet/statistics.cc                      |   1 +
 cpp/src/parquet/statistics_test.cc                 |   1 +
 r/src/array.cpp                                    |   1 +
 r/src/array_from_vector.cpp                        |   1 +
 r/src/array_to_vector.cpp                          |   2 +
 69 files changed, 1427 insertions(+), 1588 deletions(-)

diff --git a/cpp/build-support/iwyu/iwyu.sh b/cpp/build-support/iwyu/iwyu.sh
index 27c62dc..813f64d 100755
--- a/cpp/build-support/iwyu/iwyu.sh
+++ b/cpp/build-support/iwyu/iwyu.sh
@@ -25,7 +25,7 @@ IWYU_LOG=$(mktemp -t arrow-cpp-iwyu.XXXXXX)
 trap "rm -f $IWYU_LOG" EXIT
 
 IWYU_MAPPINGS_PATH="$ROOT/cpp/build-support/iwyu/mappings"
-IWYU_ARGS="--no_fwd_decls \
+IWYU_ARGS="\
     --mapping_file=$IWYU_MAPPINGS_PATH/boost-all.imp \
     --mapping_file=$IWYU_MAPPINGS_PATH/boost-all-private.imp \
     --mapping_file=$IWYU_MAPPINGS_PATH/boost-extra.imp \
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index f622cd8..27cdd02 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -173,7 +173,11 @@ set(ARROW_SRCS
     io/slow.cc
     testing/util.cc
     util/basic_decimal.cc
+    util/bit_block_counter.cc
     util/bit_util.cc
+    util/bitmap.cc
+    util/bitmap_builders.cc
+    util/bitmap_ops.cc
     util/compression.cc
     util/cpu_info.cc
     util/decimal.cc
diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index 1521230..d044baf 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -34,6 +34,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_builders.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/string_view.h"
 #include "arrow/visitor_inline.h"
@@ -93,7 +94,7 @@ class TestStringArray : public ::testing::Test {
     length_ = static_cast<int64_t>(offsets_.size()) - 1;
     value_buf_ = Buffer::Wrap(chars_);
     offsets_buf_ = Buffer::Wrap(offsets_);
-    ASSERT_OK_AND_ASSIGN(null_bitmap_, BitUtil::BytesToBits(valid_bytes_));
+    ASSERT_OK_AND_ASSIGN(null_bitmap_, internal::BytesToBits(valid_bytes_));
     null_count_ = CountNulls(valid_bytes_);
 
     strings_ = std::make_shared<ArrayType>(length_, offsets_buf_, value_buf_,
diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc
index 1c5abf6..48e98bd 100644
--- a/cpp/src/arrow/array/array_dict.cc
+++ b/cpp/src/arrow/array/array_dict.cc
@@ -33,6 +33,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/int_util.h"
 #include "arrow/util/logging.h"
diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc
index e51b73f..3057e0d 100644
--- a/cpp/src/arrow/array/array_list_test.cc
+++ b/cpp/src/arrow/array/array_list_test.cc
@@ -30,6 +30,7 @@
 #include "arrow/testing/gtest_util.h"
 #include "arrow/type.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_builders.h"
 #include "arrow/util/checked_cast.h"
 
 namespace arrow {
@@ -616,7 +617,7 @@ TEST_F(TestMapArray, BuildingStringToInt) {
   std::vector<int32_t> offsets = {0, 2, 2, 3, 3};
   auto expected_keys = ArrayFromJSON(utf8(), R"(["joe", "mark", "cap"])");
   auto expected_values = ArrayFromJSON(int32(), "[0, null, 8]");
-  ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, BitUtil::BytesToBits({1, 0, 1, 1}));
+  ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, internal::BytesToBits({1, 0, 1, 1}));
   MapArray expected(type, 4, Buffer::Wrap(offsets), expected_keys, expected_values,
                     expected_null_bitmap, 1);
 
diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc
index fa07694..7e4a71a 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -34,6 +34,7 @@
 #include "arrow/type_traits.h"
 #include "arrow/util/atomic_shared_ptr.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index 13cf91e..12e37bd 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -57,6 +57,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_builders.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/macros.h"
@@ -116,7 +117,7 @@ Status MakeArrayFromValidBytes(const std::vector<uint8_t>& v, MemoryPool* pool,
                                std::shared_ptr<Array>* out) {
   int64_t null_count = v.size() - std::accumulate(v.begin(), v.end(), 0);
 
-  ARROW_ASSIGN_OR_RAISE(auto null_buf, BitUtil::BytesToBits(v));
+  ARROW_ASSIGN_OR_RAISE(auto null_buf, internal::BytesToBits(v));
 
   TypedBufferBuilder<int32_t> value_builder(pool);
   for (size_t i = 0; i < v.size(); ++i) {
@@ -219,7 +220,7 @@ TEST_F(TestArray, TestIsNullIsValid) {
     }
   }
 
-  ASSERT_OK_AND_ASSIGN(auto null_buf, BitUtil::BytesToBits(null_bitmap));
+  ASSERT_OK_AND_ASSIGN(auto null_buf, internal::BytesToBits(null_bitmap));
 
   std::unique_ptr<Array> arr;
   arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_buf, null_count));
@@ -468,7 +469,7 @@ class TestPrimitiveBuilder : public TestBuilder {
     int64_t ex_null_count = 0;
 
     if (nullable) {
-      ASSERT_OK_AND_ASSIGN(ex_null_bitmap, BitUtil::BytesToBits(valid_bytes_));
+      ASSERT_OK_AND_ASSIGN(ex_null_bitmap, internal::BytesToBits(valid_bytes_));
       ex_null_count = CountNulls(valid_bytes_);
     } else {
       ex_null_bitmap = nullptr;
@@ -590,9 +591,9 @@ void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr<BooleanBuilder>
   std::shared_ptr<Buffer> ex_null_bitmap;
   int64_t ex_null_count = 0;
 
-  ASSERT_OK_AND_ASSIGN(ex_data, BitUtil::BytesToBits(draws_));
+  ASSERT_OK_AND_ASSIGN(ex_data, internal::BytesToBits(draws_));
   if (nullable) {
-    ASSERT_OK_AND_ASSIGN(ex_null_bitmap, BitUtil::BytesToBits(valid_bytes_));
+    ASSERT_OK_AND_ASSIGN(ex_null_bitmap, internal::BytesToBits(valid_bytes_));
     ex_null_count = CountNulls(valid_bytes_);
   } else {
     ex_null_bitmap = nullptr;
@@ -2089,7 +2090,7 @@ class DecimalTest : public ::testing::TestWithParam<int> {
 
     auto expected_data = std::make_shared<Buffer>(raw_bytes.data(), BYTE_WIDTH);
     std::shared_ptr<Buffer> expected_null_bitmap;
-    ASSERT_OK_AND_ASSIGN(expected_null_bitmap, BitUtil::BytesToBits(valid_bytes));
+    ASSERT_OK_AND_ASSIGN(expected_null_bitmap, internal::BytesToBits(valid_bytes));
 
     int64_t expected_null_count = CountNulls(valid_bytes);
     auto expected = std::make_shared<Decimal128Array>(
diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h
index 0f8f258..aba707e 100644
--- a/cpp/src/arrow/array/builder_union.h
+++ b/cpp/src/arrow/array/builder_union.h
@@ -20,13 +20,13 @@
 #include <cstdint>
 #include <memory>
 #include <string>
-#include <utility>
 #include <vector>
 
 #include "arrow/array/array_nested.h"
 #include "arrow/array/builder_base.h"
 #include "arrow/array/data.h"
 #include "arrow/buffer_builder.h"
+#include "arrow/memory_pool.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/util/visibility.h"
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index e83bc47..e1f6d71 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -34,6 +34,7 @@
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 #include "arrow/visitor_inline.h"
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 3aa6db6..6bd7f1e 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -28,7 +28,7 @@
 #include "arrow/buffer.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
-#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 
diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index afe6694..23f46df 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -25,7 +25,6 @@
 
 #include "arrow/buffer.h"
 #include "arrow/result.h"
-#include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
diff --git a/cpp/src/arrow/array/diff.h b/cpp/src/arrow/array/diff.h
index 00565a1..a405164 100644
--- a/cpp/src/arrow/array/diff.h
+++ b/cpp/src/arrow/array/diff.h
@@ -27,7 +27,6 @@
 #include "arrow/result.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
-#include "arrow/util/bit_util.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc
index 2f7917a..6dbbac3 100644
--- a/cpp/src/arrow/buffer.cc
+++ b/cpp/src/arrow/buffer.cc
@@ -22,6 +22,7 @@
 #include <utility>
 
 #include "arrow/memory_pool.h"
+#include "arrow/result.h"
 #include "arrow/status.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/logging.h"
diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h
index 1870ee6..41a47c9 100644
--- a/cpp/src/arrow/buffer_builder.h
+++ b/cpp/src/arrow/buffer_builder.h
@@ -27,6 +27,7 @@
 #include "arrow/buffer.h"
 #include "arrow/status.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_generate.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/ubsan.h"
 #include "arrow/util/visibility.h"
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index f1af418..db40338 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -39,6 +39,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h
index 66c29f4..e3210d3 100644
--- a/cpp/src/arrow/compute/api_scalar.h
+++ b/cpp/src/arrow/compute/api_scalar.h
@@ -20,7 +20,6 @@
 
 #pragma once
 
-#include <memory>
 #include <string>
 #include <utility>
 
diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc
index b182857..c5581bf 100644
--- a/cpp/src/arrow/compute/exec.cc
+++ b/cpp/src/arrow/compute/exec.cc
@@ -40,6 +40,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/cpu_info.h"
 #include "arrow/util/logging.h"
diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc
index f938a45..d861507 100644
--- a/cpp/src/arrow/compute/exec_test.cc
+++ b/cpp/src/arrow/compute/exec_test.cc
@@ -38,6 +38,7 @@
 #include "arrow/table.h"
 #include "arrow/type.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/cpu_info.h"
 #include "arrow/util/logging.h"
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
index 2f9f3d7..bdec999 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <cmath>
+
 #include "arrow/compute/api_aggregate.h"
 #include "arrow/compute/kernels/aggregate_internal.h"
 #include "arrow/compute/kernels/common.h"
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h
index b14381b..de42001 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -34,6 +34,9 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_generate.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/optional.h"
diff --git a/cpp/src/arrow/compute/kernels/common.h b/cpp/src/arrow/compute/kernels/common.h
index 2355787..b2e02cb 100644
--- a/cpp/src/arrow/compute/kernels/common.h
+++ b/cpp/src/arrow/compute/kernels/common.h
@@ -39,21 +39,16 @@
 #include "arrow/table.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
-#include "arrow/util/bit_util.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/string_view.h"
-#include "arrow/visitor_inline.h"
 
 // IWYU pragma: end_exports
 
 namespace arrow {
 
-using internal::Bitmap;
-using internal::BitmapReader;
 using internal::checked_cast;
 using internal::checked_pointer_cast;
-using internal::FirstTimeBitmapWriter;
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index 15ff8cb..89f4de0 100644
--- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -18,8 +18,14 @@
 #include <array>
 
 #include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap.h"
+#include "arrow/util/bitmap_ops.h"
 
 namespace arrow {
+
+using internal::Bitmap;
+
 namespace compute {
 
 namespace {
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
index 6618260..9830102 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -17,10 +17,14 @@
 
 // Implementation of casting to integer or floating point types
 
+#include "arrow/array/array_base.h"
 #include "arrow/compute/kernels/common.h"
 #include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/result.h"
 #include "arrow/util/formatting.h"
+#include "arrow/util/optional.h"
 #include "arrow/util/utf8.h"
+#include "arrow/visitor_inline.h"
 
 namespace arrow {
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
index a793504..b2addd0 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
@@ -18,11 +18,10 @@
 // Implementation of casting to (or between) temporal types
 
 #include <limits>
-#include <utility>
-#include <vector>
 
 #include "arrow/compute/kernels/common.h"
 #include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/util/bitmap_reader.h"
 #include "arrow/util/time.h"
 #include "arrow/util/value_parsing.h"
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_compare_test.cc b/cpp/src/arrow/compute/kernels/scalar_compare_test.cc
index df4306d..758e10b 100644
--- a/cpp/src/arrow/compute/kernels/scalar_compare_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_compare_test.cc
@@ -27,13 +27,13 @@
 #include "arrow/array.h"
 #include "arrow/compute/api.h"
 #include "arrow/compute/kernels/test_util.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/checked_cast.h"
-
 #include "arrow/testing/gtest_common.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/random.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/checked_cast.h"
 
 namespace arrow {
 namespace compute {
diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
index bdc815a..fc14bf5 100644
--- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
@@ -19,8 +19,11 @@
 #include "arrow/array/builder_primitive.h"
 #include "arrow/compute/api_scalar.h"
 #include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_writer.h"
 #include "arrow/util/hashing.h"
 #include "arrow/util/optional.h"
+#include "arrow/visitor_inline.h"
 
 namespace arrow {
 
diff --git a/cpp/src/arrow/compute/kernels/vector_filter.cc b/cpp/src/arrow/compute/kernels/vector_filter.cc
index 03a725d..db21d40 100644
--- a/cpp/src/arrow/compute/kernels/vector_filter.cc
+++ b/cpp/src/arrow/compute/kernels/vector_filter.cc
@@ -22,6 +22,7 @@
 #include "arrow/compute/kernels/vector_selection_internal.h"
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
+#include "arrow/visitor_inline.h"
 
 namespace arrow {
 namespace compute {
diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc
index 757d53f..f658c33 100644
--- a/cpp/src/arrow/ipc/json_internal.cc
+++ b/cpp/src/arrow/ipc/json_internal.cc
@@ -451,7 +451,8 @@ class ArrayWriter {
       if (arr.IsValid(i)) {
         writer_->Int64(arr.Value(i));
       } else {
-        writer_->RawNumber(null_string.data(), null_string.size());
+        writer_->RawNumber(null_string.data(),
+                           static_cast<rj::SizeType>(null_string.size()));
       }
     }
   }
@@ -466,10 +467,12 @@ class ArrayWriter {
     static const std::string null_string = "0";
     for (int64_t i = 0; i < arr.length(); ++i) {
       if (arr.IsValid(i)) {
-        fmt(arr.Value(i),
-            [&](util::string_view repr) { writer_->String(repr.data(), repr.size()); });
+        fmt(arr.Value(i), [&](util::string_view repr) {
+          writer_->String(repr.data(), static_cast<rj::SizeType>(repr.size()));
+        });
       } else {
-        writer_->String(null_string.data(), null_string.size());
+        writer_->String(null_string.data(),
+                        static_cast<rj::SizeType>(null_string.size()));
       }
     }
   }
diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc
index 2bd7321..5b21fd5 100644
--- a/cpp/src/arrow/ipc/json_simple_test.cc
+++ b/cpp/src/arrow/ipc/json_simple_test.cc
@@ -34,6 +34,7 @@
 #include "arrow/testing/gtest_util.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
+#include "arrow/util/bitmap_builders.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
 
@@ -47,6 +48,7 @@ namespace ipc {
 namespace internal {
 namespace json {
 
+using ::arrow::internal::BytesToBits;
 using ::arrow::internal::checked_cast;
 using ::arrow::internal::checked_pointer_cast;
 
@@ -668,7 +670,7 @@ TEST(TestMap, StringToInteger) {
   std::vector<int32_t> offsets = {0, 2, 2, 3, 3};
   auto expected_keys = ArrayFromJSON(utf8(), R"(["joe", "mark", "cap"])");
   auto expected_values = ArrayFromJSON(int32(), "[0, null, 8]");
-  ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, BitUtil::BytesToBits({1, 0, 1, 1}));
+  ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, BytesToBits({1, 0, 1, 1}));
   auto expected =
       std::make_shared<MapArray>(type, 4, Buffer::Wrap(offsets), expected_keys,
                                  expected_values, expected_null_bitmap, 1);
diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc
index de6fa54..00c3942 100644
--- a/cpp/src/arrow/ipc/test_common.cc
+++ b/cpp/src/arrow/ipc/test_common.cc
@@ -37,6 +37,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_builders.h"
 #include "arrow/util/checked_cast.h"
 
 namespace arrow {
@@ -160,11 +161,11 @@ Status MakeRandomBooleanArray(const int length, bool include_nulls,
                               std::shared_ptr<Array>* out) {
   std::vector<uint8_t> values(length);
   random_null_bytes(length, 0.5, values.data());
-  ARROW_ASSIGN_OR_RAISE(auto data, BitUtil::BytesToBits(values));
+  ARROW_ASSIGN_OR_RAISE(auto data, internal::BytesToBits(values));
 
   if (include_nulls) {
     std::vector<uint8_t> valid_bytes(length);
-    ARROW_ASSIGN_OR_RAISE(auto null_bitmap, BitUtil::BytesToBits(valid_bytes));
+    ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(valid_bytes));
     random_null_bytes(length, 0.1, valid_bytes.data());
     *out = std::make_shared<BooleanArray>(length, data, null_bitmap, -1);
   } else {
@@ -422,7 +423,7 @@ Status MakeStruct(std::shared_ptr<RecordBatch>* out) {
   std::shared_ptr<Array> no_nulls(new StructArray(type, list_batch->num_rows(), columns));
   std::vector<uint8_t> null_bytes(list_batch->num_rows(), 1);
   null_bytes[0] = 0;
-  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, BitUtil::BytesToBits(null_bytes));
+  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes));
   std::shared_ptr<Array> with_nulls(
       new StructArray(type, list_batch->num_rows(), columns, null_bitmap, 1));
 
@@ -479,7 +480,7 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
 
   std::vector<uint8_t> null_bytes(length, 1);
   null_bytes[2] = 0;
-  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, BitUtil::BytesToBits(null_bytes));
+  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes));
 
   // construct individual nullable/non-nullable struct arrays
   auto sparse_no_nulls =
@@ -757,7 +758,7 @@ Status MakeDecimal(std::shared_ptr<RecordBatch>* out) {
   random_null_bytes(length, 0.1, is_valid_bytes.data());
 
   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> is_valid,
-                        BitUtil::BytesToBits(is_valid_bytes));
+                        internal::BytesToBits(is_valid_bytes));
 
   auto a1 = std::make_shared<Decimal128Array>(f0->type(), length, data, is_valid,
                                               kUnknownNullCount);
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index c8bb9a3..ae0af10 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -45,6 +45,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/compression.h"
 #include "arrow/util/key_value_metadata.h"
diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc
index 334af09..0ff04f0 100644
--- a/cpp/src/arrow/json/parser.cc
+++ b/cpp/src/arrow/json/parser.cc
@@ -33,6 +33,7 @@
 #include "arrow/builder.h"
 #include "arrow/memory_pool.h"
 #include "arrow/type.h"
+#include "arrow/util/bitset_stack.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/make_unique.h"
 #include "arrow/util/string_view.h"
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 329fb31..76a0ac8 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -36,6 +36,8 @@
 #include "arrow/type_fwd.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_generate.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
index 5602c6f..0e28b2c 100644
--- a/cpp/src/arrow/sparse_tensor_test.cc
+++ b/cpp/src/arrow/sparse_tensor_test.cc
@@ -17,6 +17,7 @@
 
 // Unit tests for DataType (and subclasses), Field, and Schema
 
+#include <cmath>
 #include <cstdint>
 #include <memory>
 #include <string>
diff --git a/cpp/src/arrow/util/bit_block_counter.cc b/cpp/src/arrow/util/bit_block_counter.cc
new file mode 100644
index 0000000..a596c07
--- /dev/null
+++ b/cpp/src/arrow/util/bit_block_counter.cc
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bit_block_counter.h"
+
+#include <cstdint>
+#include <type_traits>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace internal {
+
+BitBlockCounter::Block BitBlockCounter::NextBlock() {
+  auto load_word = [](const uint8_t* bytes) -> uint64_t {
+    return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
+  };
+  auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t {
+    return (current >> shift) | (next << (64 - shift));
+  };
+
+  // When the offset is > 0, we need there to be a word beyond the last aligned
+  // word in the bitmap for the bit shifting logic.
+  const int64_t bits_required_to_scan_words = offset_ == 0 ? 256 : 256 + (64 - offset_);
+  if (bits_remaining_ < bits_required_to_scan_words) {
+    // End of the bitmap, leave it to the caller to decide how to best check
+    // these bits, no need to do redundant computation here.
+    const int16_t run_length = static_cast<int16_t>(bits_remaining_);
+    bits_remaining_ -= run_length;
+    return {run_length, static_cast<int16_t>(CountSetBits(bitmap_, offset_, run_length))};
+  }
+
+  int64_t total_popcount = 0;
+  if (offset_ == 0) {
+    total_popcount += BitUtil::PopCount(load_word(bitmap_));
+    total_popcount += BitUtil::PopCount(load_word(bitmap_ + 8));
+    total_popcount += BitUtil::PopCount(load_word(bitmap_ + 16));
+    total_popcount += BitUtil::PopCount(load_word(bitmap_ + 24));
+  } else {
+    auto current = load_word(bitmap_);
+    auto next = load_word(bitmap_ + 8);
+    total_popcount += BitUtil::PopCount(shift_word(current, next, offset_));
+    current = next;
+    next = load_word(bitmap_ + 16);
+    total_popcount += BitUtil::PopCount(shift_word(current, next, offset_));
+    current = next;
+    next = load_word(bitmap_ + 24);
+    total_popcount += BitUtil::PopCount(shift_word(current, next, offset_));
+    current = next;
+    next = load_word(bitmap_ + 32);
+    total_popcount += BitUtil::PopCount(shift_word(current, next, offset_));
+  }
+  bitmap_ += BitUtil::BytesForBits(kTargetBlockLength);
+  bits_remaining_ -= 256;
+  return {256, static_cast<int16_t>(total_popcount)};
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bit_block_counter.h b/cpp/src/arrow/util/bit_block_counter.h
new file mode 100644
index 0000000..92e2b9d
--- /dev/null
+++ b/cpp/src/arrow/util/bit_block_counter.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief A class that scans through a true/false bitmap to yield blocks of up
+/// to 256 bits at a time along with their popcount. This is used to accelerate
+/// processing of mostly-not-null array data.
+class ARROW_EXPORT BitBlockCounter {
+ public:
+  struct Block {
+    int16_t length;
+    int16_t popcount;
+  };
+
+  static constexpr int16_t kTargetBlockLength = 256;
+
+  BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap + start_offset / 8),
+        bits_remaining_(length),
+        offset_(start_offset % 8) {}
+
+  /// \brief Return the next run of available bits, up to 256. The returned
+  /// pair contains the size of run and the number of true values
+  Block NextBlock();
+
+ private:
+  const uint8_t* bitmap_;
+  int64_t bits_remaining_;
+  int64_t offset_;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index 8b63da6..6e23678 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -15,35 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <algorithm>
+#include "arrow/util/bit_util.h"
+
 #include <cstdint>
 #include <cstring>
-#include <functional>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/array/array_primitive.h"
-#include "arrow/buffer.h"
-#include "arrow/status.h"
-#include "arrow/util/align_util.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/ubsan.h"
 
 namespace arrow {
 namespace BitUtil {
-namespace {
-
-void FillBitsFromBytes(const std::vector<uint8_t>& bytes, uint8_t* bits) {
-  for (size_t i = 0; i < bytes.size(); ++i) {
-    if (bytes[i] > 0) {
-      SetBit(bits, i);
-    }
-  }
-}
-
-}  // namespace
 
 void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set) {
   if (length == 0) {
@@ -89,550 +67,5 @@ void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_ar
   bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
 }
 
-Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>& bytes,
-                                            MemoryPool* pool) {
-  int64_t bit_length = BytesForBits(bytes.size());
-
-  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(bit_length, pool));
-  uint8_t* out_buf = buffer->mutable_data();
-  memset(out_buf, 0, static_cast<size_t>(buffer->capacity()));
-  FillBitsFromBytes(bytes, out_buf);
-  return std::move(buffer);
-}
-
 }  // namespace BitUtil
-
-namespace internal {
-
-int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) {
-  constexpr int64_t pop_len = sizeof(uint64_t) * 8;
-  DCHECK_GE(bit_offset, 0);
-  int64_t count = 0;
-
-  const auto p = BitmapWordAlign<pop_len / 8>(data, bit_offset, length);
-  for (int64_t i = bit_offset; i < bit_offset + p.leading_bits; ++i) {
-    if (BitUtil::GetBit(data, i)) {
-      ++count;
-    }
-  }
-
-  if (p.aligned_words > 0) {
-    // popcount as much as possible with the widest possible count
-    const uint64_t* u64_data = reinterpret_cast<const uint64_t*>(p.aligned_start);
-    DCHECK_EQ(reinterpret_cast<size_t>(u64_data) & 7, 0);
-    const uint64_t* end = u64_data + p.aligned_words;
-
-    for (auto iter = u64_data; iter < end; ++iter) {
-      count += BitUtil::PopCount(*iter);
-    }
-  }
-
-  // Account for left over bits (in theory we could fall back to smaller
-  // versions of popcount but the code complexity is likely not worth it)
-  for (int64_t i = p.trailing_bit_offset; i < bit_offset + length; ++i) {
-    if (BitUtil::GetBit(data, i)) {
-      ++count;
-    }
-  }
-
-  return count;
-}
-
-template <bool invert_bits, bool restore_trailing_bits>
-void TransferBitmap(const uint8_t* data, int64_t offset, int64_t length,
-                    int64_t dest_offset, uint8_t* dest) {
-  int64_t byte_offset = offset / 8;
-  int64_t bit_offset = offset % 8;
-  int64_t dest_byte_offset = dest_offset / 8;
-  int64_t dest_bit_offset = dest_offset % 8;
-  int64_t num_bytes = BitUtil::BytesForBits(length);
-  // Shift dest by its byte offset
-  dest += dest_byte_offset;
-
-  if (bit_offset || dest_bit_offset) {
-    data += byte_offset;
-
-    const int64_t n_words = length / 64;
-    if (n_words > 1) {
-      auto load_word = [](const uint8_t* bytes) -> uint64_t {
-        return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
-      };
-      auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t {
-        if (shift == 0) return current;
-        return (current >> shift) | (next << (64 - shift));
-      };
-      auto write_word = [](uint8_t* bytes, uint64_t word) {
-        util::SafeStore(bytes, BitUtil::FromLittleEndian(word));
-      };
-
-      const uint64_t dest_mask = (1U << dest_bit_offset) - 1;
-      auto data_current = load_word(data);
-      auto dest_current = load_word(dest);
-
-      for (int64_t i = 0; i < n_words - 1; ++i) {
-        data += 8;
-        const auto data_next = load_word(data);
-        auto word = shift_word(data_current, data_next, bit_offset);
-        data_current = data_next;
-        if (invert_bits) {
-          word = ~word;
-        }
-
-        if (dest_bit_offset) {
-          word = (word << dest_bit_offset) | (word >> (64 - dest_bit_offset));
-          auto dest_next = load_word(dest + 8);
-          dest_current = (dest_current & dest_mask) | (word & ~dest_mask);
-          dest_next = (dest_next & ~dest_mask) | (word & dest_mask);
-          write_word(dest, dest_current);
-          write_word(dest + 8, dest_next);
-          dest_current = dest_next;
-        } else {
-          write_word(dest, word);
-        }
-        dest += 8;
-      }
-
-      length -= (n_words - 1) * 64;
-    }
-
-    internal::BitmapReader valid_reader(data, bit_offset, length);
-    internal::BitmapWriter valid_writer(dest, dest_bit_offset, length);
-
-    for (int64_t i = 0; i < length; i++) {
-      if (invert_bits ^ valid_reader.IsSet()) {
-        valid_writer.Set();
-      } else {
-        valid_writer.Clear();
-      }
-      valid_reader.Next();
-      valid_writer.Next();
-    }
-    valid_writer.Finish();
-  } else {
-    // Take care of the trailing bits in the last byte
-    int64_t trailing_bits = num_bytes * 8 - length;
-    uint8_t trail = 0;
-    if (trailing_bits && restore_trailing_bits) {
-      trail = dest[num_bytes - 1];
-    }
-
-    if (invert_bits) {
-      for (int64_t i = 0; i < num_bytes; i++) {
-        dest[i] = static_cast<uint8_t>(~(data[byte_offset + i]));
-      }
-    } else {
-      std::memcpy(dest, data + byte_offset, static_cast<size_t>(num_bytes));
-    }
-
-    if (restore_trailing_bits) {
-      for (int i = 0; i < trailing_bits; i++) {
-        if (BitUtil::GetBit(&trail, i + 8 - trailing_bits)) {
-          BitUtil::SetBit(dest, length + i);
-        } else {
-          BitUtil::ClearBit(dest, length + i);
-        }
-      }
-    }
-  }
-}
-
-template <bool invert_bits>
-Result<std::shared_ptr<Buffer>> TransferBitmap(MemoryPool* pool, const uint8_t* data,
-                                               int64_t offset, int64_t length) {
-  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateEmptyBitmap(length, pool));
-  uint8_t* dest = buffer->mutable_data();
-
-  TransferBitmap<invert_bits, false>(data, offset, length, 0, dest);
-
-  // As we have freshly allocated this bitmap, we should take care of zeroing the
-  // remaining bits.
-  int64_t num_bytes = BitUtil::BytesForBits(length);
-  int64_t bits_to_zero = num_bytes * 8 - length;
-  for (int64_t i = length; i < length + bits_to_zero; ++i) {
-    // Both branches may copy extra bits - unsetting to match specification.
-    BitUtil::ClearBit(dest, i);
-  }
-  return buffer;
-}
-
-void CopyBitmap(const uint8_t* data, int64_t offset, int64_t length, uint8_t* dest,
-                int64_t dest_offset, bool restore_trailing_bits) {
-  if (restore_trailing_bits) {
-    TransferBitmap<false, true>(data, offset, length, dest_offset, dest);
-  } else {
-    TransferBitmap<false, false>(data, offset, length, dest_offset, dest);
-  }
-}
-
-void InvertBitmap(const uint8_t* data, int64_t offset, int64_t length, uint8_t* dest,
-                  int64_t dest_offset) {
-  TransferBitmap<true, true>(data, offset, length, dest_offset, dest);
-}
-
-Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* data,
-                                           int64_t offset, int64_t length) {
-  return TransferBitmap<false>(pool, data, offset, length);
-}
-
-Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* data,
-                                             int64_t offset, int64_t length,
-                                             std::shared_ptr<Buffer>* out) {
-  return TransferBitmap<true>(pool, data, offset, length);
-}
-
-bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-                  int64_t right_offset, int64_t bit_length) {
-  if (left_offset % 8 == 0 && right_offset % 8 == 0) {
-    // byte aligned, can use memcmp
-    bool bytes_equal = std::memcmp(left + left_offset / 8, right + right_offset / 8,
-                                   bit_length / 8) == 0;
-    if (!bytes_equal) {
-      return false;
-    }
-    for (int64_t i = (bit_length / 8) * 8; i < bit_length; ++i) {
-      if (BitUtil::GetBit(left, left_offset + i) !=
-          BitUtil::GetBit(right, right_offset + i)) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  // Unaligned slow case
-  left += left_offset / 8;
-  right += right_offset / 8;
-  left_offset %= 8;
-  right_offset %= 8;
-
-  // process in 64 bits, may touch two adjacent words in one iteration
-  const int64_t n_words = bit_length / 64;
-  if (n_words > 1) {
-    auto load_word = [](const uint8_t* bytes) -> uint64_t {
-      return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
-    };
-    auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t {
-      if (shift == 0) return current;
-      return (current >> shift) | (next << (64 - shift));
-    };
-
-    auto left_current = load_word(left);
-    auto right_current = load_word(right);
-
-    for (int64_t i = 0; i < n_words - 1; ++i) {
-      left += 8;
-      auto left_next = load_word(left);
-      auto left_word = shift_word(left_current, left_next, left_offset);
-      left_current = left_next;
-
-      right += 8;
-      auto right_next = load_word(right);
-      auto right_word = shift_word(right_current, right_next, right_offset);
-      right_current = right_next;
-
-      if (left_word != right_word) {
-        return false;
-      }
-    }
-
-    bit_length -= (n_words - 1) * 64;
-  }
-
-  // process in bit
-  for (int64_t i = 0; i < bit_length; ++i) {
-    if (BitUtil::GetBit(left, left_offset + i) !=
-        BitUtil::GetBit(right, right_offset + i)) {
-      return false;
-    }
-  }
-  return true;
-}
-
-namespace {
-
-template <template <typename> class BitOp>
-void AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-                     int64_t right_offset, uint8_t* out, int64_t out_offset,
-                     int64_t length) {
-  BitOp<uint8_t> op;
-  DCHECK_EQ(left_offset % 8, right_offset % 8);
-  DCHECK_EQ(left_offset % 8, out_offset % 8);
-
-  const int64_t nbytes = BitUtil::BytesForBits(length + left_offset % 8);
-  left += left_offset / 8;
-  right += right_offset / 8;
-  out += out_offset / 8;
-  for (int64_t i = 0; i < nbytes; ++i) {
-    out[i] = op(left[i], right[i]);
-  }
-}
-
-template <template <typename> class BitOp, typename LogicalOp>
-void UnalignedBitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-                       int64_t right_offset, uint8_t* out, int64_t out_offset,
-                       int64_t length) {
-  using Word = uint64_t;
-
-  left += left_offset / 8;
-  right += right_offset / 8;
-  out += out_offset / 8;
-
-  left_offset %= 8;
-  right_offset %= 8;
-  out_offset %= 8;
-
-  const int64_t min_offset = std::min({left_offset, right_offset, out_offset});
-  const int64_t min_nbytes = BitUtil::BytesForBits(length + min_offset);
-  int64_t nwords = min_nbytes / sizeof(Word);
-
-  // process in words, we may touch two words in each iteration
-  if (nwords > 1) {
-    BitOp<Word> op;
-    constexpr int64_t bits_per_word = sizeof(Word) * 8;
-    const Word out_mask = (1U << out_offset) - 1;
-
-    length -= (nwords - 1) * bits_per_word;
-    Word left_word0 = BitUtil::ToLittleEndian(util::SafeLoadAs<Word>(left));
-    Word right_word0 = BitUtil::ToLittleEndian(util::SafeLoadAs<Word>(right));
-    Word out_word0 = BitUtil::ToLittleEndian(util::SafeLoadAs<Word>(out));
-
-    do {
-      left += sizeof(Word);
-      const Word left_word1 = BitUtil::ToLittleEndian(util::SafeLoadAs<Word>(left));
-      Word left_word = left_word0;
-      if (left_offset) {
-        // combine two adjacent words into one word
-        // |<-- left_word1 --->|<-- left_word0 --->|
-        // +-------------+-----+-------------+-----+
-        // |     ---     |  A  |      B      | --- |
-        // +-------------+-----+-------------+-----+
-        //                  |         |       offset
-        //                  v         v
-        //               +-----+-------------+
-        //               |  A  |      B      |
-        //               +-----+-------------+
-        //               |<--- left_word --->|
-        left_word >>= left_offset;
-        left_word |= left_word1 << (bits_per_word - left_offset);
-      }
-      left_word0 = left_word1;
-
-      right += sizeof(Word);
-      const Word right_word1 = BitUtil::ToLittleEndian(util::SafeLoadAs<Word>(right));
-      Word right_word = right_word0;
-      if (right_offset) {
-        right_word >>= right_offset;
-        right_word |= right_word1 << (bits_per_word - right_offset);
-      }
-      right_word0 = right_word1;
-
-      Word out_word = op(left_word, right_word);
-      if (out_offset) {
-        // break one word into two adjacent words, don't touch unused bits
-        //               |<---- out_word --->|
-        //               +-----+-------------+
-        //               |  A  |      B      |
-        //               +-----+-------------+
-        //                  |         |
-        //                  v         v       offset
-        // +-------------+-----+-------------+-----+
-        // |     ---     |  A  |      B      | --- |
-        // +-------------+-----+-------------+-----+
-        // |<--- out_word1 --->|<--- out_word0 --->|
-        out_word = (out_word << out_offset) | (out_word >> (bits_per_word - out_offset));
-        Word out_word1 = util::SafeLoadAs<Word>(out + sizeof(Word));
-        out_word1 = BitUtil::ToLittleEndian(out_word1);
-        out_word0 = (out_word0 & out_mask) | (out_word & ~out_mask);
-        out_word1 = (out_word1 & ~out_mask) | (out_word & out_mask);
-        util::SafeStore(out, BitUtil::FromLittleEndian(out_word0));
-        util::SafeStore(out + sizeof(Word), BitUtil::FromLittleEndian(out_word1));
-        out_word0 = out_word1;
-      } else {
-        util::SafeStore(out, BitUtil::FromLittleEndian(out_word));
-      }
-      out += sizeof(Word);
-
-      --nwords;
-    } while (nwords > 1);
-  }
-
-  // process in bits
-  if (length) {
-    auto left_reader = internal::BitmapReader(left, left_offset, length);
-    auto right_reader = internal::BitmapReader(right, right_offset, length);
-    auto writer = internal::BitmapWriter(out, out_offset, length);
-    LogicalOp op;
-    while (length--) {
-      if (op(left_reader.IsSet(), right_reader.IsSet())) {
-        writer.Set();
-      } else {
-        writer.Clear();
-      }
-      left_reader.Next();
-      right_reader.Next();
-      writer.Next();
-    }
-    writer.Finish();
-  }
-}
-
-template <template <typename> class BitOp, typename LogicalOp>
-void BitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-              int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* dest) {
-  if ((out_offset % 8 == left_offset % 8) && (out_offset % 8 == right_offset % 8)) {
-    // Fast case: can use bytewise AND
-    AlignedBitmapOp<BitOp>(left, left_offset, right, right_offset, dest, out_offset,
-                           length);
-  } else {
-    // Unaligned
-    UnalignedBitmapOp<BitOp, LogicalOp>(left, left_offset, right, right_offset, dest,
-                                        out_offset, length);
-  }
-}
-
-template <template <typename> class BitOp, typename LogicalOp>
-Result<std::shared_ptr<Buffer>> BitmapOp(MemoryPool* pool, const uint8_t* left,
-                                         int64_t left_offset, const uint8_t* right,
-                                         int64_t right_offset, int64_t length,
-                                         int64_t out_offset) {
-  const int64_t phys_bits = length + out_offset;
-  ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateEmptyBitmap(phys_bits, pool));
-  BitmapOp<BitOp, LogicalOp>(left, left_offset, right, right_offset, length, out_offset,
-                             out_buffer->mutable_data());
-  return out_buffer;
-}
-
-}  // namespace
-
-std::string Bitmap::ToString() const {
-  std::string out(length_ + ((length_ - 1) / 8), ' ');
-  for (int64_t i = 0; i < length_; ++i) {
-    out[i + (i / 8)] = GetBit(i) ? '1' : '0';
-  }
-  return out;
-}
-
-std::shared_ptr<BooleanArray> Bitmap::ToArray() const {
-  return std::make_shared<BooleanArray>(length_, buffer_, nullptr, 0, offset_);
-}
-
-std::string Bitmap::Diff(const Bitmap& other) const {
-  return ToArray()->Diff(*other.ToArray());
-}
-
-bool Bitmap::Equals(const Bitmap& other) const {
-  if (length_ != other.length_) {
-    return false;
-  }
-  return BitmapEquals(buffer_->data(), offset_, other.buffer_->data(), other.offset(),
-                      length_);
-}
-
-int64_t Bitmap::BitLength(const Bitmap* bitmaps, size_t N) {
-  for (size_t i = 1; i < N; ++i) {
-    DCHECK_EQ(bitmaps[i].length(), bitmaps[0].length());
-  }
-  return bitmaps[0].length();
-}
-
-Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool, const uint8_t* left,
-                                          int64_t left_offset, const uint8_t* right,
-                                          int64_t right_offset, int64_t length,
-                                          int64_t out_offset) {
-  return BitmapOp<std::bit_and, std::logical_and<bool>>(pool, left, left_offset, right,
-                                                        right_offset, length, out_offset);
-}
-
-void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
-  BitmapOp<std::bit_and, std::logical_and<bool>>(left, left_offset, right, right_offset,
-                                                 length, out_offset, out);
-}
-
-Result<std::shared_ptr<Buffer>> BitmapOr(MemoryPool* pool, const uint8_t* left,
-                                         int64_t left_offset, const uint8_t* right,
-                                         int64_t right_offset, int64_t length,
-                                         int64_t out_offset) {
-  return BitmapOp<std::bit_or, std::logical_or<bool>>(pool, left, left_offset, right,
-                                                      right_offset, length, out_offset);
-}
-
-void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-              int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
-  BitmapOp<std::bit_or, std::logical_or<bool>>(left, left_offset, right, right_offset,
-                                               length, out_offset, out);
-}
-
-Result<std::shared_ptr<Buffer>> BitmapXor(MemoryPool* pool, const uint8_t* left,
-                                          int64_t left_offset, const uint8_t* right,
-                                          int64_t right_offset, int64_t length,
-                                          int64_t out_offset) {
-  return BitmapOp<std::bit_xor, std::bit_xor<bool>>(pool, left, left_offset, right,
-                                                    right_offset, length, out_offset);
-}
-
-void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
-  BitmapOp<std::bit_xor, std::bit_xor<bool>>(left, left_offset, right, right_offset,
-                                             length, out_offset, out);
-}
-
-Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
-                                                int64_t straggler_pos, bool value) {
-  if (straggler_pos < 0 || straggler_pos >= length) {
-    return Status::Invalid("invalid straggler_pos ", straggler_pos);
-  }
-
-  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(BitUtil::BytesForBits(length), pool));
-
-  auto bitmap_data = buffer->mutable_data();
-  BitUtil::SetBitsTo(bitmap_data, 0, length, value);
-  BitUtil::SetBitTo(bitmap_data, straggler_pos, !value);
-  return std::move(buffer);
-}
-
-BitBlockCounter::Block BitBlockCounter::NextBlock() {
-  auto load_word = [](const uint8_t* bytes) -> uint64_t {
-    return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
-  };
-  auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t {
-    return (current >> shift) | (next << (64 - shift));
-  };
-
-  // When the offset is > 0, we need there to be a word beyond the last aligned
-  // word in the bitmap for the bit shifting logic.
-  const int64_t bits_required_to_scan_words = offset_ == 0 ? 256 : 256 + (64 - offset_);
-  if (bits_remaining_ < bits_required_to_scan_words) {
-    // End of the bitmap, leave it to the caller to decide how to best check
-    // these bits, no need to do redundant computation here.
-    const int16_t run_length = static_cast<int16_t>(bits_remaining_);
-    bits_remaining_ -= run_length;
-    return {run_length, static_cast<int16_t>(CountSetBits(bitmap_, offset_, run_length))};
-  }
-
-  int64_t total_popcount = 0;
-  if (offset_ == 0) {
-    total_popcount += __builtin_popcountll(load_word(bitmap_));
-    total_popcount += __builtin_popcountll(load_word(bitmap_ + 8));
-    total_popcount += __builtin_popcountll(load_word(bitmap_ + 16));
-    total_popcount += __builtin_popcountll(load_word(bitmap_ + 24));
-  } else {
-    auto current = load_word(bitmap_);
-    auto next = load_word(bitmap_ + 8);
-    total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
-    current = next;
-    next = load_word(bitmap_ + 16);
-    total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
-    current = next;
-    next = load_word(bitmap_ + 24);
-    total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
-    current = next;
-    next = load_word(bitmap_ + 32);
-    total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
-  }
-  bitmap_ += BitUtil::BytesForBits(kTargetBlockLength);
-  bits_remaining_ -= 256;
-  return {256, static_cast<int16_t>(total_popcount)};
-}
-
-}  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index e7e9804..1ca293b 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -57,29 +57,10 @@
 #define ARROW_POPCOUNT32 __builtin_popcount
 #endif
 
-#include <algorithm>
-#include <array>
-#include <bitset>
-#include <cassert>
-#include <cmath>  // IWYU pragma: keep
 #include <cstdint>
-#include <cstring>
-#include <limits>  // IWYU pragma: keep
-#include <memory>
-#include <string>
 #include <type_traits>
-#include <utility>
-#include <vector>
 
-#include "arrow/buffer.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/compare.h"
-#include "arrow/util/functional.h"
 #include "arrow/util/macros.h"
-#include "arrow/util/string_builder.h"
-#include "arrow/util/string_view.h"
 #include "arrow/util/type_traits.h"
 #include "arrow/util/visibility.h"
 
@@ -468,792 +449,5 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
 ARROW_EXPORT
 void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
 
-/// \brief Convert vector of bytes to bitmap buffer
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>&,
-                                            MemoryPool* pool = default_memory_pool());
-
 }  // namespace BitUtil
-
-namespace internal {
-
-class BitmapReader {
- public:
-  BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
-      : bitmap_(bitmap), position_(0), length_(length) {
-    current_byte_ = 0;
-    byte_offset_ = start_offset / 8;
-    bit_offset_ = start_offset % 8;
-    if (length > 0) {
-      current_byte_ = bitmap[byte_offset_];
-    }
-  }
-
-  bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
-
-  bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
-
-  void Next() {
-    ++bit_offset_;
-    ++position_;
-    if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
-      bit_offset_ = 0;
-      ++byte_offset_;
-      if (ARROW_PREDICT_TRUE(position_ < length_)) {
-        current_byte_ = bitmap_[byte_offset_];
-      }
-    }
-  }
-
-  int64_t position() const { return position_; }
-
- private:
-  const uint8_t* bitmap_;
-  int64_t position_;
-  int64_t length_;
-
-  uint8_t current_byte_;
-  int64_t byte_offset_;
-  int64_t bit_offset_;
-};
-
-class BitmapWriter {
-  // A sequential bitwise writer that preserves surrounding bit values.
-
- public:
-  BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
-      : bitmap_(bitmap), position_(0), length_(length) {
-    byte_offset_ = start_offset / 8;
-    bit_mask_ = BitUtil::kBitmask[start_offset % 8];
-    if (length > 0) {
-      current_byte_ = bitmap[byte_offset_];
-    } else {
-      current_byte_ = 0;
-    }
-  }
-
-  void Set() { current_byte_ |= bit_mask_; }
-
-  void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; }
-
-  void Next() {
-    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
-    ++position_;
-    if (bit_mask_ == 0) {
-      // Finished this byte, need advancing
-      bit_mask_ = 0x01;
-      bitmap_[byte_offset_++] = current_byte_;
-      if (ARROW_PREDICT_TRUE(position_ < length_)) {
-        current_byte_ = bitmap_[byte_offset_];
-      }
-    }
-  }
-
-  void Finish() {
-    // Store current byte if we didn't went past bitmap storage
-    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
-      bitmap_[byte_offset_] = current_byte_;
-    }
-  }
-
-  int64_t position() const { return position_; }
-
- private:
-  uint8_t* bitmap_;
-  int64_t position_;
-  int64_t length_;
-
-  uint8_t current_byte_;
-  uint8_t bit_mask_;
-  int64_t byte_offset_;
-};
-
-class FirstTimeBitmapWriter {
-  // Like BitmapWriter, but any bit values *following* the bits written
-  // might be clobbered.  It is hence faster than BitmapWriter, and can
-  // also avoid false positives with Valgrind.
-
- public:
-  FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
-      : bitmap_(bitmap), position_(0), length_(length) {
-    current_byte_ = 0;
-    byte_offset_ = start_offset / 8;
-    bit_mask_ = BitUtil::kBitmask[start_offset % 8];
-    if (length > 0) {
-      current_byte_ = bitmap[byte_offset_] & BitUtil::kPrecedingBitmask[start_offset % 8];
-    } else {
-      current_byte_ = 0;
-    }
-  }
-
-  /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
-  ///
-  /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed
-  ///            to be unset (i.e. 0).
-  /// \param[in] number_of_bits The number of bits to append from word.
-  void AppendWord(uint64_t word, int64_t number_of_bits) {
-    if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
-      return;
-    }
-
-    // Location that the first byte needs to be written to.
-    uint8_t* append_position = bitmap_ + byte_offset_;
-
-    // Update state variables except for current_byte_ here.
-    position_ += number_of_bits;
-    int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
-    bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8];
-    byte_offset_ += (bit_offset + number_of_bits) / 8;
-
-    if (bit_offset != 0) {
-      // We are in the middle of the byte. This code updates the byte and shifts
-      // bits appropriately within word so it can be memcpy'd below.
-      int64_t bits_to_carry = 8 - bit_offset;
-      // Carry over bits from word to current_byte_. We assume any extra bits in word
-      // unset so no additional accounting is needed for when number_of_bits <
-      // bits_to_carry.
-      current_byte_ |= (word & BitUtil::kPrecedingBitmask[bits_to_carry]) << bit_offset;
-      // Check if everything is transfered into current_byte_.
-      if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
-        return;
-      }
-      *append_position = current_byte_;
-      append_position++;
-      // Move the carry bits off of word.
-      word = word >> bits_to_carry;
-      number_of_bits -= bits_to_carry;
-    }
-    word = BitUtil::ToLittleEndian(word);
-    int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits);
-    std::memcpy(append_position, &word, bytes_for_word);
-    // At this point, the previous current_byte_ has been written to bitmap_.
-    // The new current_byte_ is either the last relevant byte in 'word'
-    // or cleared if the new position is byte aligned (i.e. a fresh byte).
-    if (bit_mask_ == 0x1) {
-      current_byte_ = 0;
-    } else {
-      current_byte_ = *(append_position + bytes_for_word - 1);
-    }
-  }
-
-  void Set() { current_byte_ |= bit_mask_; }
-
-  void Clear() {}
-
-  void Next() {
-    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
-    ++position_;
-    if (bit_mask_ == 0) {
-      // Finished this byte, need advancing
-      bit_mask_ = 0x01;
-      bitmap_[byte_offset_++] = current_byte_;
-      current_byte_ = 0;
-    }
-  }
-
-  void Finish() {
-    // Store current byte if we didn't go past bitmap storage
-    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
-      bitmap_[byte_offset_] = current_byte_;
-    }
-  }
-
-  int64_t position() const { return position_; }
-
- private:
-  uint8_t* bitmap_;
-  int64_t position_;
-  int64_t length_;
-
-  uint8_t current_byte_;
-  uint8_t bit_mask_;
-  int64_t byte_offset_;
-};
-
-// A std::generate() like function to write sequential bits into a bitmap area.
-// Bits preceding the bitmap area are preserved, bits following the bitmap
-// area may be clobbered.
-
-template <class Generator>
-void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) {
-  if (length == 0) {
-    return;
-  }
-  uint8_t* cur = bitmap + start_offset / 8;
-  uint8_t bit_mask = BitUtil::kBitmask[start_offset % 8];
-  uint8_t current_byte = *cur & BitUtil::kPrecedingBitmask[start_offset % 8];
-
-  for (int64_t index = 0; index < length; ++index) {
-    const bool bit = g();
-    current_byte = bit ? (current_byte | bit_mask) : current_byte;
-    bit_mask = static_cast<uint8_t>(bit_mask << 1);
-    if (bit_mask == 0) {
-      bit_mask = 1;
-      *cur++ = current_byte;
-      current_byte = 0;
-    }
-  }
-  if (bit_mask != 1) {
-    *cur++ = current_byte;
-  }
-}
-
-// Like GenerateBits(), but unrolls its main loop for higher performance.
-
-template <class Generator>
-void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
-                          Generator&& g) {
-  if (length == 0) {
-    return;
-  }
-  uint8_t current_byte;
-  uint8_t* cur = bitmap + start_offset / 8;
-  const uint64_t start_bit_offset = start_offset % 8;
-  uint8_t bit_mask = BitUtil::kBitmask[start_bit_offset];
-  int64_t remaining = length;
-
-  if (bit_mask != 0x01) {
-    current_byte = *cur & BitUtil::kPrecedingBitmask[start_bit_offset];
-    while (bit_mask != 0 && remaining > 0) {
-      current_byte = g() ? (current_byte | bit_mask) : current_byte;
-      bit_mask = static_cast<uint8_t>(bit_mask << 1);
-      --remaining;
-    }
-    *cur++ = current_byte;
-  }
-
-  int64_t remaining_bytes = remaining / 8;
-  while (remaining_bytes-- > 0) {
-    current_byte = 0;
-    current_byte = g() ? current_byte | 0x01 : current_byte;
-    current_byte = g() ? current_byte | 0x02 : current_byte;
-    current_byte = g() ? current_byte | 0x04 : current_byte;
-    current_byte = g() ? current_byte | 0x08 : current_byte;
-    current_byte = g() ? current_byte | 0x10 : current_byte;
-    current_byte = g() ? current_byte | 0x20 : current_byte;
-    current_byte = g() ? current_byte | 0x40 : current_byte;
-    current_byte = g() ? current_byte | 0x80 : current_byte;
-    *cur++ = current_byte;
-  }
-
-  int64_t remaining_bits = remaining % 8;
-  if (remaining_bits) {
-    current_byte = 0;
-    bit_mask = 0x01;
-    while (remaining_bits-- > 0) {
-      current_byte = g() ? (current_byte | bit_mask) : current_byte;
-      bit_mask = static_cast<uint8_t>(bit_mask << 1);
-    }
-    *cur++ = current_byte;
-  }
-}
-
-// A function that visits each bit in a bitmap and calls a visitor function with a
-// boolean representation of that bit. This is intended to be analogous to
-// GenerateBits.
-template <class Visitor>
-void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
-               Visitor&& visit) {
-  BitmapReader reader(bitmap, start_offset, length);
-  for (int64_t index = 0; index < length; ++index) {
-    visit(reader.IsSet());
-    reader.Next();
-  }
-}
-
-// Like VisitBits(), but unrolls its main loop for better performance.
-template <class Visitor>
-void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
-                       Visitor&& visit) {
-  if (length == 0) {
-    return;
-  }
-
-  // Start by visiting any bits preceding the first full byte.
-  int64_t num_bits_before_full_bytes =
-      BitUtil::RoundUpToMultipleOf8(start_offset) - start_offset;
-  // Truncate num_bits_before_full_bytes if it is greater than length.
-  if (num_bits_before_full_bytes > length) {
-    num_bits_before_full_bytes = length;
-  }
-  // Use the non loop-unrolled VisitBits since we don't want to add branches
-  VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);
-
-  // Shift the start pointer to the first full byte and compute the
-  // number of full bytes to be read.
-  const uint8_t* first_full_byte = bitmap + BitUtil::CeilDiv(start_offset, 8);
-  const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;
-
-  // Iterate over each full byte of the input bitmap and call the visitor in
-  // a loop-unrolled manner.
-  for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
-    // Get the current bit-packed byte value from the bitmap.
-    const uint8_t byte = *(first_full_byte + byte_index);
-
-    // Execute the visitor function on each bit of the current byte.
-    visit(BitUtil::GetBitFromByte(byte, 0));
-    visit(BitUtil::GetBitFromByte(byte, 1));
-    visit(BitUtil::GetBitFromByte(byte, 2));
-    visit(BitUtil::GetBitFromByte(byte, 3));
-    visit(BitUtil::GetBitFromByte(byte, 4));
-    visit(BitUtil::GetBitFromByte(byte, 5));
-    visit(BitUtil::GetBitFromByte(byte, 6));
-    visit(BitUtil::GetBitFromByte(byte, 7));
-  }
-
-  // Write any leftover bits in the last byte.
-  const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
-  VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
-                     visit);
-}
-
-// ----------------------------------------------------------------------
-// Bitmap utilities
-
-/// Copy a bit range of an existing bitmap
-///
-/// \param[in] pool memory pool to allocate memory from
-/// \param[in] bitmap source data
-/// \param[in] offset bit offset into the source data
-/// \param[in] length number of bits to copy
-///
-/// \return Status message
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* bitmap,
-                                           int64_t offset, int64_t length);
-
-/// Copy a bit range of an existing bitmap into an existing bitmap
-///
-/// \param[in] bitmap source data
-/// \param[in] offset bit offset into the source data
-/// \param[in] length number of bits to copy
-/// \param[in] dest_offset bit offset into the destination
-/// \param[in] restore_trailing_bits don't clobber bits outside the destination range
-/// \param[out] dest the destination buffer, must have at least space for
-/// (offset + length) bits
-ARROW_EXPORT
-void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
-                int64_t dest_offset, bool restore_trailing_bits = true);
-
-/// Invert a bit range of an existing bitmap into an existing bitmap
-///
-/// \param[in] bitmap source data
-/// \param[in] offset bit offset into the source data
-/// \param[in] length number of bits to copy
-/// \param[in] dest_offset bit offset into the destination
-/// \param[out] dest the destination buffer, must have at least space for
-/// (offset + length) bits
-ARROW_EXPORT
-void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
-                  int64_t dest_offset);
-
-/// Invert a bit range of an existing bitmap
-///
-/// \param[in] pool memory pool to allocate memory from
-/// \param[in] bitmap source data
-/// \param[in] offset bit offset into the source data
-/// \param[in] length number of bits to copy
-///
-/// \return Status message
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* bitmap,
-                                             int64_t offset, int64_t length);
-
-/// Compute the number of 1's in the given data array
-///
-/// \param[in] data a packed LSB-ordered bitmap as a byte array
-/// \param[in] bit_offset a bitwise offset into the bitmap
-/// \param[in] length the number of bits to inspect in the bitmap relative to
-/// the offset
-///
-/// \return The number of set (1) bits in the range
-ARROW_EXPORT
-int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length);
-
-class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
-                            public util::EqualityComparable<Bitmap> {
- public:
-  template <typename Word>
-  using View = util::basic_string_view<Word>;
-
-  Bitmap() = default;
-
-  Bitmap(std::shared_ptr<Buffer> buffer, int64_t offset, int64_t length)
-      : buffer_(std::move(buffer)), offset_(offset), length_(length) {}
-
-  Bitmap(const void* data, int64_t offset, int64_t length)
-      : buffer_(std::make_shared<Buffer>(static_cast<const uint8_t*>(data),
-                                         BitUtil::BytesForBits(offset + length))),
-        offset_(offset),
-        length_(length) {}
-
-  Bitmap(void* data, int64_t offset, int64_t length)
-      : buffer_(std::make_shared<MutableBuffer>(static_cast<uint8_t*>(data),
-                                                BitUtil::BytesForBits(offset + length))),
-        offset_(offset),
-        length_(length) {}
-
-  Bitmap Slice(int64_t offset) const {
-    return Bitmap(buffer_, offset_ + offset, length_ - offset);
-  }
-
-  Bitmap Slice(int64_t offset, int64_t length) const {
-    return Bitmap(buffer_, offset_ + offset, length);
-  }
-
-  std::string ToString() const;
-
-  bool Equals(const Bitmap& other) const;
-
-  std::string Diff(const Bitmap& other) const;
-
-  bool GetBit(int64_t i) const { return BitUtil::GetBit(buffer_->data(), i + offset_); }
-
-  bool operator[](int64_t i) const { return GetBit(i); }
-
-  void SetBitTo(int64_t i, bool v) const {
-    BitUtil::SetBitTo(buffer_->mutable_data(), i + offset_, v);
-  }
-
-  /// \brief Visit bits from each bitmap as bitset<N>
-  ///
-  /// All bitmaps must have identical length.
-  template <size_t N, typename Visitor>
-  static void VisitBits(const Bitmap (&bitmaps)[N], Visitor&& visitor) {
-    int64_t bit_length = BitLength(bitmaps, N);
-    std::bitset<N> bits;
-    for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
-      for (size_t i = 0; i < N; ++i) {
-        bits[i] = bitmaps[i].GetBit(bit_i);
-      }
-      visitor(bits);
-    }
-  }
-
-  /// \brief Visit words of bits from each bitmap as array<Word, N>
-  ///
-  /// All bitmaps must have identical length. The first bit in a visited bitmap
-  /// may be offset within the first visited word, but words will otherwise contain
-  /// densely packed bits loaded from the bitmap. That offset within the first word is
-  /// returned.
-  ///
-  /// TODO(bkietz) allow for early termination
-  template <size_t N, typename Visitor,
-            typename Word =
-                typename internal::call_traits::argument_type<0, Visitor&&>::value_type>
-  static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
-    constexpr int64_t kBitWidth = sizeof(Word) * 8;
-
-    // local, mutable variables which will be sliced/decremented to represent consumption:
-    Bitmap bitmaps[N];
-    int64_t offsets[N];
-    int64_t bit_length = BitLength(bitmaps_arg, N);
-    View<Word> words[N];
-    for (size_t i = 0; i < N; ++i) {
-      bitmaps[i] = bitmaps_arg[i];
-      offsets[i] = bitmaps[i].template word_offset<Word>();
-      assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
-      words[i] = bitmaps[i].template words<Word>();
-    }
-
-    auto consume = [&](int64_t consumed_bits) {
-      for (size_t i = 0; i < N; ++i) {
-        bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
-        offsets[i] = bitmaps[i].template word_offset<Word>();
-        assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
-        words[i] = bitmaps[i].template words<Word>();
-      }
-      bit_length -= consumed_bits;
-    };
-
-    std::array<Word, N> visited_words;
-    visited_words.fill(0);
-
-    if (bit_length <= kBitWidth * 2) {
-      // bitmaps fit into one or two words so don't bother with optimization
-      while (bit_length > 0) {
-        auto leading_bits = std::min(bit_length, kBitWidth);
-        SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
-        visitor(visited_words);
-        consume(leading_bits);
-      }
-      return 0;
-    }
-
-    int64_t max_offset = *std::max_element(offsets, offsets + N);
-    int64_t min_offset = *std::min_element(offsets, offsets + N);
-    if (max_offset > 0) {
-      // consume leading bits
-      auto leading_bits = kBitWidth - min_offset;
-      SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
-      visitor(visited_words);
-      consume(leading_bits);
-    }
-    assert(*std::min_element(offsets, offsets + N) == 0);
-
-    int64_t whole_word_count = bit_length / kBitWidth;
-    assert(whole_word_count >= 1);
-
-    if (min_offset == max_offset) {
-      // all offsets were identical, all leading bits have been consumed
-      assert(
-          std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));
-
-      for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
-        for (size_t i = 0; i < N; ++i) {
-          visited_words[i] = words[i][word_i];
-        }
-        visitor(visited_words);
-      }
-      consume(whole_word_count * kBitWidth);
-    } else {
-      // leading bits from potentially incomplete words have been consumed
-
-      // word_i such that words[i][word_i] and words[i][word_i + 1] are lie entirely
-      // within the bitmap for all i
-      for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
-        for (size_t i = 0; i < N; ++i) {
-          if (offsets[i] == 0) {
-            visited_words[i] = words[i][word_i];
-          } else {
-            auto words0 = BitUtil::ToLittleEndian(words[i][word_i]);
-            auto words1 = BitUtil::ToLittleEndian(words[i][word_i + 1]);
-            visited_words[i] = BitUtil::FromLittleEndian(
-                (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
-          }
-        }
-        visitor(visited_words);
-      }
-      consume((whole_word_count - 1) * kBitWidth);
-
-      SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
-
-      visitor(visited_words);
-      consume(kBitWidth);
-    }
-
-    // load remaining bits
-    if (bit_length > 0) {
-      SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
-      visitor(visited_words);
-    }
-
-    return min_offset;
-  }
-
-  const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
-
-  /// offset of first bit relative to buffer().data()
-  int64_t offset() const { return offset_; }
-
-  /// number of bits in this Bitmap
-  int64_t length() const { return length_; }
-
-  /// string_view of all bytes which contain any bit in this Bitmap
-  util::bytes_view bytes() const {
-    auto byte_offset = offset_ / 8;
-    auto byte_count = BitUtil::CeilDiv(offset_ + length_, 8) - byte_offset;
-    return util::bytes_view(buffer_->data() + byte_offset, byte_count);
-  }
-
- private:
-  /// string_view of all Words which contain any bit in this Bitmap
-  ///
-  /// For example, given Word=uint16_t and a bitmap spanning bits [20, 36)
-  /// words() would span bits [16, 48).
-  ///
-  /// 0       16      32     48     64
-  /// |-------|-------|------|------| (buffer)
-  ///           [       ]             (bitmap)
-  ///         |-------|------|        (returned words)
-  ///
-  /// \warning The words may contain bytes which lie outside the buffer or are
-  /// uninitialized.
-  template <typename Word>
-  View<Word> words() const {
-    auto bytes_addr = reinterpret_cast<intptr_t>(bytes().data());
-    auto words_addr = bytes_addr - bytes_addr % sizeof(Word);
-    auto word_byte_count =
-        BitUtil::RoundUpToPowerOf2(static_cast<int64_t>(bytes_addr + bytes().size()),
-                                   static_cast<int64_t>(sizeof(Word))) -
-        words_addr;
-    return View<Word>(reinterpret_cast<const Word*>(words_addr),
-                      word_byte_count / sizeof(Word));
-  }
-
-  /// offset of first bit relative to words<Word>().data()
-  template <typename Word>
-  int64_t word_offset() const {
-    return offset_ + 8 * (reinterpret_cast<intptr_t>(buffer_->data()) -
-                          reinterpret_cast<intptr_t>(words<Word>().data()));
-  }
-
-  /// load words from bitmaps bitwise
-  template <size_t N, typename Word>
-  static void SafeLoadWords(const Bitmap (&bitmaps)[N], int64_t offset,
-                            int64_t out_length, bool set_trailing_bits,
-                            std::array<Word, N>* out) {
-    out->fill(0);
-
-    int64_t out_offset = set_trailing_bits ? sizeof(Word) * 8 - out_length : 0;
-
-    Bitmap slices[N], out_bitmaps[N];
-    for (size_t i = 0; i < N; ++i) {
-      slices[i] = bitmaps[i].Slice(offset, out_length);
-      out_bitmaps[i] = Bitmap(&out->at(i), out_offset, out_length);
-    }
-
-    int64_t bit_i = 0;
-    Bitmap::VisitBits(slices, [&](std::bitset<N> bits) {
-      for (size_t i = 0; i < N; ++i) {
-        out_bitmaps[i].SetBitTo(bit_i, bits[i]);
-      }
-      ++bit_i;
-    });
-  }
-
-  std::shared_ptr<BooleanArray> ToArray() const;
-
-  /// assert bitmaps have identical length and return that length
-  static int64_t BitLength(const Bitmap* bitmaps, size_t N);
-
-  std::shared_ptr<Buffer> buffer_;
-  int64_t offset_ = 0, length_ = 0;
-};
-
-ARROW_EXPORT
-bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-                  int64_t right_offset, int64_t bit_length);
-
-/// \brief Do a "bitmap and" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out_buffer starting at the given bit-offset.
-///
-/// out_buffer will be allocated and initialized to zeros using pool before
-/// the operation.
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool, const uint8_t* left,
-                                          int64_t left_offset, const uint8_t* right,
-                                          int64_t right_offset, int64_t length,
-                                          int64_t out_offset);
-
-/// \brief Do a "bitmap and" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out starting at the given bit-offset.
-ARROW_EXPORT
-void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-
-/// \brief Do a "bitmap or" for the given bit length on right and left buffers
-/// starting at their respective bit-offsets and put the results in out_buffer
-/// starting at the given bit-offset.
-///
-/// out_buffer will be allocated and initialized to zeros using pool before
-/// the operation.
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BitmapOr(MemoryPool* pool, const uint8_t* left,
-                                         int64_t left_offset, const uint8_t* right,
-                                         int64_t right_offset, int64_t length,
-                                         int64_t out_offset);
-
-/// \brief Do a "bitmap or" for the given bit length on right and left buffers
-/// starting at their respective bit-offsets and put the results in out
-/// starting at the given bit-offset.
-ARROW_EXPORT
-void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-              int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-
-/// \brief Do a "bitmap xor" for the given bit-length on right and left
-/// buffers starting at their respective bit-offsets and put the results in
-/// out_buffer starting at the given bit offset.
-///
-/// out_buffer will be allocated and initialized to zeros using pool before
-/// the operation.
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BitmapXor(MemoryPool* pool, const uint8_t* left,
-                                          int64_t left_offset, const uint8_t* right,
-                                          int64_t right_offset, int64_t length,
-                                          int64_t out_offset);
-
-/// \brief Do a "bitmap xor" for the given bit-length on right and left
-/// buffers starting at their respective bit-offsets and put the results in
-/// out starting at the given bit offset.
-ARROW_EXPORT
-void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
-               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-
-/// \brief Generate Bitmap with all position to `value` except for one found
-/// at `straggler_pos`.
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
-                                                int64_t straggler_pos, bool value = true);
-
-/// \brief Store a stack of bitsets efficiently. The top bitset may be
-/// accessed and its bits may be modified, but it may not be resized.
-class BitsetStack {
- public:
-  using reference = typename std::vector<bool>::reference;
-
-  /// \brief push a bitset onto the stack
-  /// \param size number of bits in the next bitset
-  /// \param value initial value for bits in the pushed bitset
-  void Push(int size, bool value) {
-    offsets_.push_back(bit_count());
-    bits_.resize(bit_count() + size, value);
-  }
-
-  /// \brief number of bits in the bitset at the top of the stack
-  int TopSize() const {
-    if (offsets_.size() == 0) return 0;
-    return bit_count() - offsets_.back();
-  }
-
-  /// \brief pop a bitset off the stack
-  void Pop() {
-    bits_.resize(offsets_.back());
-    offsets_.pop_back();
-  }
-
-  /// \brief get the value of a bit in the top bitset
-  /// \param i index of the bit to access
-  bool operator[](int i) const { return bits_[offsets_.back() + i]; }
-
-  /// \brief get a mutable reference to a bit in the top bitset
-  /// \param i index of the bit to access
-  reference operator[](int i) { return bits_[offsets_.back() + i]; }
-
- private:
-  int bit_count() const { return static_cast<int>(bits_.size()); }
-  std::vector<bool> bits_;
-  std::vector<int> offsets_;
-};
-
-/// \brief A class that scans through a true/false bitmap to yield blocks of up
-/// to 256 bits at a time along with their popcount. This is used to accelerate
-/// processing of mostly-not-null array data.
-class ARROW_EXPORT BitBlockCounter {
- public:
-  struct Block {
-    int16_t length;
-    int16_t popcount;
-  };
-
-  static constexpr int16_t kTargetBlockLength = 256;
-
-  BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
-      : bitmap_(bitmap + start_offset / 8),
-        bits_remaining_(length),
-        offset_(start_offset % 8) {}
-
-  /// \brief Return the next run of available bits, up to 256. The returned
-  /// pair contains the size of run and the number of true values
-  Block NextBlock();
-
- private:
-  const uint8_t* bitmap_;
-  int64_t bits_remaining_;
-  int64_t offset_;
-};
-
-}  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bit_util_benchmark.cc b/cpp/src/arrow/util/bit_util_benchmark.cc
index ae4ddc1..ab8f91e 100644
--- a/cpp/src/arrow/util/bit_util_benchmark.cc
+++ b/cpp/src/arrow/util/bit_util_benchmark.cc
@@ -29,7 +29,14 @@
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/random.h"
 #include "arrow/testing/util.h"
+#include "arrow/util/bit_block_counter.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap.h"
+#include "arrow/util/bitmap_generate.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_visit.h"
+#include "arrow/util/bitmap_writer.h"
 
 namespace arrow {
 
diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index a501c7f..2eff46f 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -33,8 +33,16 @@
 #include "arrow/memory_pool.h"
 #include "arrow/testing/gtest_common.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/bit_block_counter.h"
 #include "arrow/util/bit_stream_utils.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap.h"
+#include "arrow/util/bitmap_builders.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_visit.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/bitset_stack.h"
 #include "arrow/util/cpu_info.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/util/bitmap.cc b/cpp/src/arrow/util/bitmap.cc
new file mode 100644
index 0000000..0cd364d
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap.cc
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bitmap.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "arrow/array/array_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace internal {
+
+std::string Bitmap::ToString() const {
+  std::string out(length_ + ((length_ - 1) / 8), ' ');
+  for (int64_t i = 0; i < length_; ++i) {
+    out[i + (i / 8)] = GetBit(i) ? '1' : '0';
+  }
+  return out;
+}
+
+std::shared_ptr<BooleanArray> Bitmap::ToArray() const {
+  return std::make_shared<BooleanArray>(length_, buffer_, nullptr, 0, offset_);
+}
+
+std::string Bitmap::Diff(const Bitmap& other) const {
+  return ToArray()->Diff(*other.ToArray());
+}
+
+bool Bitmap::Equals(const Bitmap& other) const {
+  if (length_ != other.length_) {
+    return false;
+  }
+  return BitmapEquals(buffer_->data(), offset_, other.buffer_->data(), other.offset(),
+                      length_);
+}
+
+int64_t Bitmap::BitLength(const Bitmap* bitmaps, size_t N) {
+  for (size_t i = 1; i < N; ++i) {
+    DCHECK_EQ(bitmaps[i].length(), bitmaps[0].length());
+  }
+  return bitmaps[0].length();
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
new file mode 100644
index 0000000..c98b0c6
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap.h
@@ -0,0 +1,296 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/string_builder.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class BooleanArray;
+
+namespace internal {
+
+class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
+                            public util::EqualityComparable<Bitmap> {
+ public:
+  template <typename Word>
+  using View = util::basic_string_view<Word>;
+
+  Bitmap() = default;
+
+  Bitmap(std::shared_ptr<Buffer> buffer, int64_t offset, int64_t length)
+      : buffer_(std::move(buffer)), offset_(offset), length_(length) {}
+
+  Bitmap(const void* data, int64_t offset, int64_t length)
+      : buffer_(std::make_shared<Buffer>(static_cast<const uint8_t*>(data),
+                                         BitUtil::BytesForBits(offset + length))),
+        offset_(offset),
+        length_(length) {}
+
+  Bitmap(void* data, int64_t offset, int64_t length)
+      : buffer_(std::make_shared<MutableBuffer>(static_cast<uint8_t*>(data),
+                                                BitUtil::BytesForBits(offset + length))),
+        offset_(offset),
+        length_(length) {}
+
+  Bitmap Slice(int64_t offset) const {
+    return Bitmap(buffer_, offset_ + offset, length_ - offset);
+  }
+
+  Bitmap Slice(int64_t offset, int64_t length) const {
+    return Bitmap(buffer_, offset_ + offset, length);
+  }
+
+  std::string ToString() const;
+
+  bool Equals(const Bitmap& other) const;
+
+  std::string Diff(const Bitmap& other) const;
+
+  bool GetBit(int64_t i) const { return BitUtil::GetBit(buffer_->data(), i + offset_); }
+
+  bool operator[](int64_t i) const { return GetBit(i); }
+
+  void SetBitTo(int64_t i, bool v) const {
+    BitUtil::SetBitTo(buffer_->mutable_data(), i + offset_, v);
+  }
+
+  /// \brief Visit bits from each bitmap as bitset<N>
+  ///
+  /// All bitmaps must have identical length.
+  template <size_t N, typename Visitor>
+  static void VisitBits(const Bitmap (&bitmaps)[N], Visitor&& visitor) {
+    int64_t bit_length = BitLength(bitmaps, N);
+    std::bitset<N> bits;
+    for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
+      for (size_t i = 0; i < N; ++i) {
+        bits[i] = bitmaps[i].GetBit(bit_i);
+      }
+      visitor(bits);
+    }
+  }
+
+  /// \brief Visit words of bits from each bitmap as array<Word, N>
+  ///
+  /// All bitmaps must have identical length. The first bit in a visited bitmap
+  /// may be offset within the first visited word, but words will otherwise contain
+  /// densely packed bits loaded from the bitmap. That offset within the first word is
+  /// returned.
+  ///
+  /// TODO(bkietz) allow for early termination
+  template <size_t N, typename Visitor,
+            typename Word =
+                typename internal::call_traits::argument_type<0, Visitor&&>::value_type>
+  static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
+    constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+    // local, mutable variables which will be sliced/decremented to represent consumption:
+    Bitmap bitmaps[N];
+    int64_t offsets[N];
+    int64_t bit_length = BitLength(bitmaps_arg, N);
+    View<Word> words[N];
+    for (size_t i = 0; i < N; ++i) {
+      bitmaps[i] = bitmaps_arg[i];
+      offsets[i] = bitmaps[i].template word_offset<Word>();
+      assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+      words[i] = bitmaps[i].template words<Word>();
+    }
+
+    auto consume = [&](int64_t consumed_bits) {
+      for (size_t i = 0; i < N; ++i) {
+        bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
+        offsets[i] = bitmaps[i].template word_offset<Word>();
+        assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+        words[i] = bitmaps[i].template words<Word>();
+      }
+      bit_length -= consumed_bits;
+    };
+
+    std::array<Word, N> visited_words;
+    visited_words.fill(0);
+
+    if (bit_length <= kBitWidth * 2) {
+      // bitmaps fit into one or two words so don't bother with optimization
+      while (bit_length > 0) {
+        auto leading_bits = std::min(bit_length, kBitWidth);
+        SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
+        visitor(visited_words);
+        consume(leading_bits);
+      }
+      return 0;
+    }
+
+    int64_t max_offset = *std::max_element(offsets, offsets + N);
+    int64_t min_offset = *std::min_element(offsets, offsets + N);
+    if (max_offset > 0) {
+      // consume leading bits
+      auto leading_bits = kBitWidth - min_offset;
+      SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
+      visitor(visited_words);
+      consume(leading_bits);
+    }
+    assert(*std::min_element(offsets, offsets + N) == 0);
+
+    int64_t whole_word_count = bit_length / kBitWidth;
+    assert(whole_word_count >= 1);
+
+    if (min_offset == max_offset) {
+      // all offsets were identical, all leading bits have been consumed
+      assert(
+          std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));
+
+      for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
+        for (size_t i = 0; i < N; ++i) {
+          visited_words[i] = words[i][word_i];
+        }
+        visitor(visited_words);
+      }
+      consume(whole_word_count * kBitWidth);
+    } else {
+      // leading bits from potentially incomplete words have been consumed
+
+      // word_i such that words[i][word_i] and words[i][word_i + 1] are lie entirely
+      // within the bitmap for all i
+      for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
+        for (size_t i = 0; i < N; ++i) {
+          if (offsets[i] == 0) {
+            visited_words[i] = words[i][word_i];
+          } else {
+            auto words0 = BitUtil::ToLittleEndian(words[i][word_i]);
+            auto words1 = BitUtil::ToLittleEndian(words[i][word_i + 1]);
+            visited_words[i] = BitUtil::FromLittleEndian(
+                (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
+          }
+        }
+        visitor(visited_words);
+      }
+      consume((whole_word_count - 1) * kBitWidth);
+
+      SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
+
+      visitor(visited_words);
+      consume(kBitWidth);
+    }
+
+    // load remaining bits
+    if (bit_length > 0) {
+      SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
+      visitor(visited_words);
+    }
+
+    return min_offset;
+  }
+
+  const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
+
+  /// offset of first bit relative to buffer().data()
+  int64_t offset() const { return offset_; }
+
+  /// number of bits in this Bitmap
+  int64_t length() const { return length_; }
+
+  /// string_view of all bytes which contain any bit in this Bitmap
+  util::bytes_view bytes() const {
+    auto byte_offset = offset_ / 8;
+    auto byte_count = BitUtil::CeilDiv(offset_ + length_, 8) - byte_offset;
+    return util::bytes_view(buffer_->data() + byte_offset, byte_count);
+  }
+
+ private:
+  /// string_view of all Words which contain any bit in this Bitmap
+  ///
+  /// For example, given Word=uint16_t and a bitmap spanning bits [20, 36)
+  /// words() would span bits [16, 48).
+  ///
+  /// 0       16      32     48     64
+  /// |-------|-------|------|------| (buffer)
+  ///           [       ]             (bitmap)
+  ///         |-------|------|        (returned words)
+  ///
+  /// \warning The words may contain bytes which lie outside the buffer or are
+  /// uninitialized.
+  template <typename Word>
+  View<Word> words() const {
+    auto bytes_addr = reinterpret_cast<intptr_t>(bytes().data());
+    auto words_addr = bytes_addr - bytes_addr % sizeof(Word);
+    auto word_byte_count =
+        BitUtil::RoundUpToPowerOf2(static_cast<int64_t>(bytes_addr + bytes().size()),
+                                   static_cast<int64_t>(sizeof(Word))) -
+        words_addr;
+    return View<Word>(reinterpret_cast<const Word*>(words_addr),
+                      word_byte_count / sizeof(Word));
+  }
+
+  /// offset of first bit relative to words<Word>().data()
+  template <typename Word>
+  int64_t word_offset() const {
+    return offset_ + 8 * (reinterpret_cast<intptr_t>(buffer_->data()) -
+                          reinterpret_cast<intptr_t>(words<Word>().data()));
+  }
+
+  /// load words from bitmaps bitwise
+  template <size_t N, typename Word>
+  static void SafeLoadWords(const Bitmap (&bitmaps)[N], int64_t offset,
+                            int64_t out_length, bool set_trailing_bits,
+                            std::array<Word, N>* out) {
+    out->fill(0);
+
+    int64_t out_offset = set_trailing_bits ? sizeof(Word) * 8 - out_length : 0;
+
+    Bitmap slices[N], out_bitmaps[N];
+    for (size_t i = 0; i < N; ++i) {
+      slices[i] = bitmaps[i].Slice(offset, out_length);
+      out_bitmaps[i] = Bitmap(&out->at(i), out_offset, out_length);
+    }
+
+    int64_t bit_i = 0;
+    Bitmap::VisitBits(slices, [&](std::bitset<N> bits) {
+      for (size_t i = 0; i < N; ++i) {
+        out_bitmaps[i].SetBitTo(bit_i, bits[i]);
+      }
+      ++bit_i;
+    });
+  }
+
+  std::shared_ptr<BooleanArray> ToArray() const;
+
+  /// assert bitmaps have identical length and return that length
+  static int64_t BitLength(const Bitmap* bitmaps, size_t N);
+
+  std::shared_ptr<Buffer> buffer_;
+  int64_t offset_ = 0, length_ = 0;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_builders.cc b/cpp/src/arrow/util/bitmap_builders.cc
new file mode 100644
index 0000000..9a91b7a
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_builders.cc
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bitmap_builders.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+void FillBitsFromBytes(const std::vector<uint8_t>& bytes, uint8_t* bits) {
+  for (size_t i = 0; i < bytes.size(); ++i) {
+    if (bytes[i] > 0) {
+      BitUtil::SetBit(bits, i);
+    }
+  }
+}
+
+}  // namespace
+
+Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>& bytes,
+                                            MemoryPool* pool) {
+  int64_t bit_length = BitUtil::BytesForBits(bytes.size());
+
+  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(bit_length, pool));
+  uint8_t* out_buf = buffer->mutable_data();
+  memset(out_buf, 0, static_cast<size_t>(buffer->capacity()));
+  FillBitsFromBytes(bytes, out_buf);
+  return std::move(buffer);
+}
+
+Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
+                                                int64_t straggler_pos, bool value) {
+  if (straggler_pos < 0 || straggler_pos >= length) {
+    return Status::Invalid("invalid straggler_pos ", straggler_pos);
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(BitUtil::BytesForBits(length), pool));
+
+  auto bitmap_data = buffer->mutable_data();
+  BitUtil::SetBitsTo(bitmap_data, 0, length, value);
+  BitUtil::SetBitTo(bitmap_data, straggler_pos, !value);
+  return std::move(buffer);
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_builders.h b/cpp/src/arrow/util/bitmap_builders.h
new file mode 100644
index 0000000..d11e8bc
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_builders.h
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief Generate Bitmap with all position to `value` except for one found
+/// at `straggler_pos`.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
+                                                int64_t straggler_pos, bool value = true);
+
+/// \brief Convert vector of bytes to bitmap buffer
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>&,
+                                            MemoryPool* pool = default_memory_pool());
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_generate.h b/cpp/src/arrow/util/bitmap_generate.h
new file mode 100644
index 0000000..adde43d
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_generate.h
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+// A std::generate() like function to write sequential bits into a bitmap area.
+// Bits preceding the bitmap area are preserved, bits following the bitmap
+// area may be clobbered.
+
+template <class Generator>
+void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) {
+  if (length == 0) {
+    return;
+  }
+  uint8_t* cur = bitmap + start_offset / 8;
+  uint8_t bit_mask = BitUtil::kBitmask[start_offset % 8];
+  uint8_t current_byte = *cur & BitUtil::kPrecedingBitmask[start_offset % 8];
+
+  for (int64_t index = 0; index < length; ++index) {
+    const bool bit = g();
+    current_byte = bit ? (current_byte | bit_mask) : current_byte;
+    bit_mask = static_cast<uint8_t>(bit_mask << 1);
+    if (bit_mask == 0) {
+      bit_mask = 1;
+      *cur++ = current_byte;
+      current_byte = 0;
+    }
+  }
+  if (bit_mask != 1) {
+    *cur++ = current_byte;
+  }
+}
+
+// Like GenerateBits(), but unrolls its main loop for higher performance.
+
+template <class Generator>
+void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
+                          Generator&& g) {
+  if (length == 0) {
+    return;
+  }
+  uint8_t current_byte;
+  uint8_t* cur = bitmap + start_offset / 8;
+  const uint64_t start_bit_offset = start_offset % 8;
+  uint8_t bit_mask = BitUtil::kBitmask[start_bit_offset];
+  int64_t remaining = length;
+
+  if (bit_mask != 0x01) {
+    current_byte = *cur & BitUtil::kPrecedingBitmask[start_bit_offset];
+    while (bit_mask != 0 && remaining > 0) {
+      current_byte = g() ? (current_byte | bit_mask) : current_byte;
+      bit_mask = static_cast<uint8_t>(bit_mask << 1);
+      --remaining;
+    }
+    *cur++ = current_byte;
+  }
+
+  int64_t remaining_bytes = remaining / 8;
+  while (remaining_bytes-- > 0) {
+    current_byte = 0;
+    current_byte = g() ? current_byte | 0x01 : current_byte;
+    current_byte = g() ? current_byte | 0x02 : current_byte;
+    current_byte = g() ? current_byte | 0x04 : current_byte;
+    current_byte = g() ? current_byte | 0x08 : current_byte;
+    current_byte = g() ? current_byte | 0x10 : current_byte;
+    current_byte = g() ? current_byte | 0x20 : current_byte;
+    current_byte = g() ? current_byte | 0x40 : current_byte;
+    current_byte = g() ? current_byte | 0x80 : current_byte;
+    *cur++ = current_byte;
+  }
+
+  int64_t remaining_bits = remaining % 8;
+  if (remaining_bits) {
+    current_byte = 0;
+    bit_mask = 0x01;
+    while (remaining_bits-- > 0) {
+      current_byte = g() ? (current_byte | bit_mask) : current_byte;
+      bit_mask = static_cast<uint8_t>(bit_mask << 1);
+    }
+    *cur++ = current_byte;
+  }
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bitmap_ops.cc
similarity index 76%
copy from cpp/src/arrow/util/bit_util.cc
copy to cpp/src/arrow/util/bitmap_ops.cc
index 8b63da6..0c34930 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -15,93 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "arrow/util/bitmap_ops.h"
+
 #include <algorithm>
 #include <cstdint>
 #include <cstring>
 #include <functional>
 #include <memory>
-#include <string>
-#include <vector>
+#include <type_traits>
 
-#include "arrow/array/array_primitive.h"
 #include "arrow/buffer.h"
-#include "arrow/status.h"
+#include "arrow/result.h"
 #include "arrow/util/align_util.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/ubsan.h"
 
 namespace arrow {
-namespace BitUtil {
-namespace {
-
-void FillBitsFromBytes(const std::vector<uint8_t>& bytes, uint8_t* bits) {
-  for (size_t i = 0; i < bytes.size(); ++i) {
-    if (bytes[i] > 0) {
-      SetBit(bits, i);
-    }
-  }
-}
-
-}  // namespace
-
-void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set) {
-  if (length == 0) {
-    return;
-  }
-
-  const int64_t i_begin = start_offset;
-  const int64_t i_end = start_offset + length;
-  const uint8_t fill_byte = static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
-
-  const int64_t bytes_begin = i_begin / 8;
-  const int64_t bytes_end = i_end / 8 + 1;
-
-  const uint8_t first_byte_mask = kPrecedingBitmask[i_begin % 8];
-  const uint8_t last_byte_mask = kTrailingBitmask[i_end % 8];
-
-  if (bytes_end == bytes_begin + 1) {
-    // set bits within a single byte
-    const uint8_t only_byte_mask =
-        i_end % 8 == 0 ? first_byte_mask
-                       : static_cast<uint8_t>(first_byte_mask | last_byte_mask);
-    bits[bytes_begin] &= only_byte_mask;
-    bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
-    return;
-  }
-
-  // set/clear trailing bits of first byte
-  bits[bytes_begin] &= first_byte_mask;
-  bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
-
-  if (bytes_end - bytes_begin > 2) {
-    // set/clear whole bytes
-    std::memset(bits + bytes_begin + 1, fill_byte,
-                static_cast<size_t>(bytes_end - bytes_begin - 2));
-  }
-
-  if (i_end % 8 == 0) {
-    return;
-  }
-
-  // set/clear leading bits of last byte
-  bits[bytes_end - 1] &= last_byte_mask;
-  bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
-}
-
-Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>& bytes,
-                                            MemoryPool* pool) {
-  int64_t bit_length = BytesForBits(bytes.size());
-
-  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(bit_length, pool));
-  uint8_t* out_buf = buffer->mutable_data();
-  memset(out_buf, 0, static_cast<size_t>(buffer->capacity()));
-  FillBitsFromBytes(bytes, out_buf);
-  return std::move(buffer);
-}
-
-}  // namespace BitUtil
-
 namespace internal {
 
 int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) {
@@ -503,37 +435,6 @@ Result<std::shared_ptr<Buffer>> BitmapOp(MemoryPool* pool, const uint8_t* left,
 
 }  // namespace
 
-std::string Bitmap::ToString() const {
-  std::string out(length_ + ((length_ - 1) / 8), ' ');
-  for (int64_t i = 0; i < length_; ++i) {
-    out[i + (i / 8)] = GetBit(i) ? '1' : '0';
-  }
-  return out;
-}
-
-std::shared_ptr<BooleanArray> Bitmap::ToArray() const {
-  return std::make_shared<BooleanArray>(length_, buffer_, nullptr, 0, offset_);
-}
-
-std::string Bitmap::Diff(const Bitmap& other) const {
-  return ToArray()->Diff(*other.ToArray());
-}
-
-bool Bitmap::Equals(const Bitmap& other) const {
-  if (length_ != other.length_) {
-    return false;
-  }
-  return BitmapEquals(buffer_->data(), offset_, other.buffer_->data(), other.offset(),
-                      length_);
-}
-
-int64_t Bitmap::BitLength(const Bitmap* bitmaps, size_t N) {
-  for (size_t i = 1; i < N; ++i) {
-    DCHECK_EQ(bitmaps[i].length(), bitmaps[0].length());
-  }
-  return bitmaps[0].length();
-}
-
 Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool, const uint8_t* left,
                                           int64_t left_offset, const uint8_t* right,
                                           int64_t right_offset, int64_t length,
@@ -576,63 +477,5 @@ void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
                                              length, out_offset, out);
 }
 
-Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
-                                                int64_t straggler_pos, bool value) {
-  if (straggler_pos < 0 || straggler_pos >= length) {
-    return Status::Invalid("invalid straggler_pos ", straggler_pos);
-  }
-
-  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(BitUtil::BytesForBits(length), pool));
-
-  auto bitmap_data = buffer->mutable_data();
-  BitUtil::SetBitsTo(bitmap_data, 0, length, value);
-  BitUtil::SetBitTo(bitmap_data, straggler_pos, !value);
-  return std::move(buffer);
-}
-
-BitBlockCounter::Block BitBlockCounter::NextBlock() {
-  auto load_word = [](const uint8_t* bytes) -> uint64_t {
-    return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
-  };
-  auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t {
-    return (current >> shift) | (next << (64 - shift));
-  };
-
-  // When the offset is > 0, we need there to be a word beyond the last aligned
-  // word in the bitmap for the bit shifting logic.
-  const int64_t bits_required_to_scan_words = offset_ == 0 ? 256 : 256 + (64 - offset_);
-  if (bits_remaining_ < bits_required_to_scan_words) {
-    // End of the bitmap, leave it to the caller to decide how to best check
-    // these bits, no need to do redundant computation here.
-    const int16_t run_length = static_cast<int16_t>(bits_remaining_);
-    bits_remaining_ -= run_length;
-    return {run_length, static_cast<int16_t>(CountSetBits(bitmap_, offset_, run_length))};
-  }
-
-  int64_t total_popcount = 0;
-  if (offset_ == 0) {
-    total_popcount += __builtin_popcountll(load_word(bitmap_));
-    total_popcount += __builtin_popcountll(load_word(bitmap_ + 8));
-    total_popcount += __builtin_popcountll(load_word(bitmap_ + 16));
-    total_popcount += __builtin_popcountll(load_word(bitmap_ + 24));
-  } else {
-    auto current = load_word(bitmap_);
-    auto next = load_word(bitmap_ + 8);
-    total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
-    current = next;
-    next = load_word(bitmap_ + 16);
-    total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
-    current = next;
-    next = load_word(bitmap_ + 24);
-    total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
-    current = next;
-    next = load_word(bitmap_ + 32);
-    total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
-  }
-  bitmap_ += BitUtil::BytesForBits(kTargetBlockLength);
-  bits_remaining_ -= 256;
-  return {256, static_cast<int16_t>(total_popcount)};
-}
-
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_ops.h b/cpp/src/arrow/util/bitmap_ops.h
new file mode 100644
index 0000000..4d85397
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_ops.h
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+// ----------------------------------------------------------------------
+// Bitmap utilities
+
+/// Copy a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+///
+/// \return Status message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* bitmap,
+                                           int64_t offset, int64_t length);
+
+/// Copy a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] dest_offset bit offset into the destination
+/// \param[in] restore_trailing_bits don't clobber bits outside the destination range
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
+ARROW_EXPORT
+void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+                int64_t dest_offset, bool restore_trailing_bits = true);
+
+/// Invert a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] dest_offset bit offset into the destination
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
+ARROW_EXPORT
+void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+                  int64_t dest_offset);
+
+/// Invert a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+///
+/// \return Status message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* bitmap,
+                                             int64_t offset, int64_t length);
+
+/// Compute the number of 1's in the given data array
+///
+/// \param[in] data a packed LSB-ordered bitmap as a byte array
+/// \param[in] bit_offset a bitwise offset into the bitmap
+/// \param[in] length the number of bits to inspect in the bitmap relative to
+/// the offset
+///
+/// \return The number of set (1) bits in the range
+ARROW_EXPORT
+int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length);
+
+ARROW_EXPORT
+bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+                  int64_t right_offset, int64_t bit_length);
+
+/// \brief Do a "bitmap and" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool, const uint8_t* left,
+                                          int64_t left_offset, const uint8_t* right,
+                                          int64_t right_offset, int64_t length,
+                                          int64_t out_offset);
+
+/// \brief Do a "bitmap and" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap or" for the given bit length on right and left buffers
+/// starting at their respective bit-offsets and put the results in out_buffer
+/// starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapOr(MemoryPool* pool, const uint8_t* left,
+                                         int64_t left_offset, const uint8_t* right,
+                                         int64_t right_offset, int64_t length,
+                                         int64_t out_offset);
+
+/// \brief Do a "bitmap or" for the given bit length on right and left buffers
+/// starting at their respective bit-offsets and put the results in out
+/// starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+              int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap xor" for the given bit-length on right and left
+/// buffers starting at their respective bit-offsets and put the results in
+/// out_buffer starting at the given bit offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapXor(MemoryPool* pool, const uint8_t* left,
+                                          int64_t left_offset, const uint8_t* right,
+                                          int64_t right_offset, int64_t length,
+                                          int64_t out_offset);
+
+/// \brief Do a "bitmap xor" for the given bit-length on right and left
+/// buffers starting at their respective bit-offsets and put the results in
+/// out starting at the given bit offset.
+ARROW_EXPORT
+void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
new file mode 100644
index 0000000..470fbfb
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+class BitmapReader {
+ public:
+  BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    current_byte_ = 0;
+    byte_offset_ = start_offset / 8;
+    bit_offset_ = start_offset % 8;
+    if (length > 0) {
+      current_byte_ = bitmap[byte_offset_];
+    }
+  }
+
+  bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
+
+  bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
+
+  void Next() {
+    ++bit_offset_;
+    ++position_;
+    if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
+      bit_offset_ = 0;
+      ++byte_offset_;
+      if (ARROW_PREDICT_TRUE(position_ < length_)) {
+        current_byte_ = bitmap_[byte_offset_];
+      }
+    }
+  }
+
+  int64_t position() const { return position_; }
+
+ private:
+  const uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  int64_t byte_offset_;
+  int64_t bit_offset_;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_visit.h b/cpp/src/arrow/util/bitmap_visit.h
new file mode 100644
index 0000000..8a16993
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_visit.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+
+namespace arrow {
+namespace internal {
+
+// A function that visits each bit in a bitmap and calls a visitor function with a
+// boolean representation of that bit. This is intended to be analogous to
+// GenerateBits.
+template <class Visitor>
+void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+               Visitor&& visit) {
+  BitmapReader reader(bitmap, start_offset, length);
+  for (int64_t index = 0; index < length; ++index) {
+    visit(reader.IsSet());
+    reader.Next();
+  }
+}
+
+// Like VisitBits(), but unrolls its main loop for better performance.
+template <class Visitor>
+void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+                       Visitor&& visit) {
+  if (length == 0) {
+    return;
+  }
+
+  // Start by visiting any bits preceding the first full byte.
+  int64_t num_bits_before_full_bytes =
+      BitUtil::RoundUpToMultipleOf8(start_offset) - start_offset;
+  // Truncate num_bits_before_full_bytes if it is greater than length.
+  if (num_bits_before_full_bytes > length) {
+    num_bits_before_full_bytes = length;
+  }
+  // Use the non loop-unrolled VisitBits since we don't want to add branches
+  VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);
+
+  // Shift the start pointer to the first full byte and compute the
+  // number of full bytes to be read.
+  const uint8_t* first_full_byte = bitmap + BitUtil::CeilDiv(start_offset, 8);
+  const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;
+
+  // Iterate over each full byte of the input bitmap and call the visitor in
+  // a loop-unrolled manner.
+  for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
+    // Get the current bit-packed byte value from the bitmap.
+    const uint8_t byte = *(first_full_byte + byte_index);
+
+    // Execute the visitor function on each bit of the current byte.
+    visit(BitUtil::GetBitFromByte(byte, 0));
+    visit(BitUtil::GetBitFromByte(byte, 1));
+    visit(BitUtil::GetBitFromByte(byte, 2));
+    visit(BitUtil::GetBitFromByte(byte, 3));
+    visit(BitUtil::GetBitFromByte(byte, 4));
+    visit(BitUtil::GetBitFromByte(byte, 5));
+    visit(BitUtil::GetBitFromByte(byte, 6));
+    visit(BitUtil::GetBitFromByte(byte, 7));
+  }
+
+  // Write any leftover bits in the last byte.
+  const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
+  VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
+                     visit);
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h
new file mode 100644
index 0000000..7cb2fc6
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_writer.h
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+class BitmapWriter {
+  // A sequential bitwise writer that preserves surrounding bit values.
+
+ public:
+  BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    byte_offset_ = start_offset / 8;
+    bit_mask_ = BitUtil::kBitmask[start_offset % 8];
+    if (length > 0) {
+      current_byte_ = bitmap[byte_offset_];
+    } else {
+      current_byte_ = 0;
+    }
+  }
+
+  void Set() { current_byte_ |= bit_mask_; }
+
+  void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; }
+
+  void Next() {
+    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
+    ++position_;
+    if (bit_mask_ == 0) {
+      // Finished this byte, need advancing
+      bit_mask_ = 0x01;
+      bitmap_[byte_offset_++] = current_byte_;
+      if (ARROW_PREDICT_TRUE(position_ < length_)) {
+        current_byte_ = bitmap_[byte_offset_];
+      }
+    }
+  }
+
+  void Finish() {
+    // Store current byte if we didn't went past bitmap storage
+    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
+      bitmap_[byte_offset_] = current_byte_;
+    }
+  }
+
+  int64_t position() const { return position_; }
+
+ private:
+  uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  uint8_t bit_mask_;
+  int64_t byte_offset_;
+};
+
+class FirstTimeBitmapWriter {
+  // Like BitmapWriter, but any bit values *following* the bits written
+  // might be clobbered.  It is hence faster than BitmapWriter, and can
+  // also avoid false positives with Valgrind.
+
+ public:
+  FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    current_byte_ = 0;
+    byte_offset_ = start_offset / 8;
+    bit_mask_ = BitUtil::kBitmask[start_offset % 8];
+    if (length > 0) {
+      current_byte_ = bitmap[byte_offset_] & BitUtil::kPrecedingBitmask[start_offset % 8];
+    } else {
+      current_byte_ = 0;
+    }
+  }
+
+  /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
+  ///
+  /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed
+  ///            to be unset (i.e. 0).
+  /// \param[in] number_of_bits The number of bits to append from word.
+  void AppendWord(uint64_t word, int64_t number_of_bits) {
+    if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
+      return;
+    }
+
+    // Location that the first byte needs to be written to.
+    uint8_t* append_position = bitmap_ + byte_offset_;
+
+    // Update state variables except for current_byte_ here.
+    position_ += number_of_bits;
+    int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
+    bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8];
+    byte_offset_ += (bit_offset + number_of_bits) / 8;
+
+    if (bit_offset != 0) {
+      // We are in the middle of the byte. This code updates the byte and shifts
+      // bits appropriately within word so it can be memcpy'd below.
+      int64_t bits_to_carry = 8 - bit_offset;
+      // Carry over bits from word to current_byte_. We assume any extra bits in word
+      // unset so no additional accounting is needed for when number_of_bits <
+      // bits_to_carry.
+      current_byte_ |= (word & BitUtil::kPrecedingBitmask[bits_to_carry]) << bit_offset;
+      // Check if everything is transfered into current_byte_.
+      if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
+        return;
+      }
+      *append_position = current_byte_;
+      append_position++;
+      // Move the carry bits off of word.
+      word = word >> bits_to_carry;
+      number_of_bits -= bits_to_carry;
+    }
+    word = BitUtil::ToLittleEndian(word);
+    int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits);
+    std::memcpy(append_position, &word, bytes_for_word);
+    // At this point, the previous current_byte_ has been written to bitmap_.
+    // The new current_byte_ is either the last relevant byte in 'word'
+    // or cleared if the new position is byte aligned (i.e. a fresh byte).
+    if (bit_mask_ == 0x1) {
+      current_byte_ = 0;
+    } else {
+      current_byte_ = *(append_position + bytes_for_word - 1);
+    }
+  }
+
+  void Set() { current_byte_ |= bit_mask_; }
+
+  void Clear() {}
+
+  void Next() {
+    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
+    ++position_;
+    if (bit_mask_ == 0) {
+      // Finished this byte, need advancing
+      bit_mask_ = 0x01;
+      bitmap_[byte_offset_++] = current_byte_;
+      current_byte_ = 0;
+    }
+  }
+
+  void Finish() {
+    // Store current byte if we didn't went go bitmap storage
+    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
+      bitmap_[byte_offset_] = current_byte_;
+    }
+  }
+
+  int64_t position() const { return position_; }
+
+ private:
+  uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  uint8_t bit_mask_;
+  int64_t byte_offset_;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitset_stack.h b/cpp/src/arrow/util/bitset_stack.h
new file mode 100644
index 0000000..addded9
--- /dev/null
+++ b/cpp/src/arrow/util/bitset_stack.h
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_builder.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/type_traits.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief Store a stack of bitsets efficiently. The top bitset may be
+/// accessed and its bits may be modified, but it may not be resized.
+class BitsetStack {
+ public:
+  using reference = typename std::vector<bool>::reference;
+
+  /// \brief push a bitset onto the stack
+  /// \param size number of bits in the next bitset
+  /// \param value initial value for bits in the pushed bitset
+  void Push(int size, bool value) {
+    offsets_.push_back(bit_count());
+    bits_.resize(bit_count() + size, value);
+  }
+
+  /// \brief number of bits in the bitset at the top of the stack
+  int TopSize() const {
+    if (offsets_.size() == 0) return 0;
+    return bit_count() - offsets_.back();
+  }
+
+  /// \brief pop a bitset off the stack
+  void Pop() {
+    bits_.resize(offsets_.back());
+    offsets_.pop_back();
+  }
+
+  /// \brief get the value of a bit in the top bitset
+  /// \param i index of the bit to access
+  bool operator[](int i) const { return bits_[offsets_.back() + i]; }
+
+  /// \brief get a mutable reference to a bit in the top bitset
+  /// \param i index of the bit to access
+  reference operator[](int i) { return bits_[offsets_.back() + i]; }
+
+ private:
+  int bit_count() const { return static_cast<int>(bits_.size()); }
+  std::vector<bool> bits_;
+  std::vector<int> offsets_;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
index 406343f..caede9f 100644
--- a/cpp/src/arrow/util/hashing.h
+++ b/cpp/src/arrow/util/hashing.h
@@ -38,6 +38,7 @@
 #include "arrow/type_fwd.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_builders.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/ubsan.h"
diff --git a/cpp/src/arrow/util/rle_encoding.h b/cpp/src/arrow/util/rle_encoding.h
index 2fe2376..0eab5b3 100644
--- a/cpp/src/arrow/util/rle_encoding.h
+++ b/cpp/src/arrow/util/rle_encoding.h
@@ -26,6 +26,7 @@
 
 #include "arrow/util/bit_stream_utils.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
 #include "arrow/util/macros.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/util/thread_pool.h b/cpp/src/arrow/util/thread_pool.h
index cd93b34..fa6583e 100644
--- a/cpp/src/arrow/util/thread_pool.h
+++ b/cpp/src/arrow/util/thread_pool.h
@@ -21,10 +21,9 @@
 #include <unistd.h>
 #endif
 
-#include <cstdlib>
+#include <cstdint>
 #include <functional>
 #include <memory>
-#include <string>
 #include <type_traits>
 #include <utility>
 
diff --git a/cpp/src/arrow/util/thread_pool_test.cc b/cpp/src/arrow/util/thread_pool_test.cc
index 8f86de8..ac0eae7 100644
--- a/cpp/src/arrow/util/thread_pool_test.cc
+++ b/cpp/src/arrow/util/thread_pool_test.cc
@@ -21,8 +21,6 @@
 #endif
 
 #include <algorithm>
-#include <chrono>
-#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <functional>
diff --git a/cpp/src/arrow/util/trie.h b/cpp/src/arrow/util/trie.h
index 690d39a..23205b5 100644
--- a/cpp/src/arrow/util/trie.h
+++ b/cpp/src/arrow/util/trie.h
@@ -20,6 +20,7 @@
 #include <cassert>
 #include <cstdint>
 #include <cstring>
+#include <iosfwd>
 #include <limits>
 #include <string>
 #include <utility>
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index 336b94c..1600ad9 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -27,6 +27,7 @@
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/functional.h"
 #include "arrow/util/optional.h"
diff --git a/cpp/src/gandiva/bitmap_accumulator.cc b/cpp/src/gandiva/bitmap_accumulator.cc
index 0519533..d544478 100644
--- a/cpp/src/gandiva/bitmap_accumulator.cc
+++ b/cpp/src/gandiva/bitmap_accumulator.cc
@@ -19,6 +19,8 @@
 
 #include <vector>
 
+#include "arrow/util/bitmap_ops.h"
+
 namespace gandiva {
 
 void BitMapAccumulator::ComputeResult(uint8_t* dst_bitmap) {
diff --git a/cpp/src/gandiva/bitmap_accumulator_test.cc b/cpp/src/gandiva/bitmap_accumulator_test.cc
index 5f570eb..ccffab3 100644
--- a/cpp/src/gandiva/bitmap_accumulator_test.cc
+++ b/cpp/src/gandiva/bitmap_accumulator_test.cc
@@ -24,6 +24,7 @@
 
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/util.h"
+#include "arrow/util/bitmap_ops.h"
 
 #include "gandiva/dex.h"
 
diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc
index 56f44d3..8ada325 100644
--- a/cpp/src/parquet/arrow/path_internal.cc
+++ b/cpp/src/parquet/arrow/path_internal.cc
@@ -99,6 +99,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_visit.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/make_unique.h"
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
index 65941e4..23554aa 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -22,6 +22,7 @@
 
 #include "arrow/io/buffered.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/bitmap_builders.h"
 
 #include "parquet/column_reader.h"
 #include "parquet/column_writer.h"
@@ -752,8 +753,8 @@ TEST(TestColumnWriter, RepeatedListsUpdateSpacedBug) {
   auto values_data = reinterpret_cast<const int32_t*>(values_buffer->data());
 
   std::shared_ptr<Buffer> valid_bits;
-  ASSERT_OK_AND_ASSIGN(
-      valid_bits, ::arrow::BitUtil::BytesToBits({1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1}));
+  ASSERT_OK_AND_ASSIGN(valid_bits, ::arrow::internal::BytesToBits(
+                                       {1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1}));
 
   // valgrind will warn about out of bounds access into def_levels_data
   typed_writer->WriteBatchSpaced(14, def_levels.data(), rep_levels.data(),
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 809309d..50cc377 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -29,6 +29,7 @@
 #include "arrow/array/builder_dict.h"
 #include "arrow/stl_allocator.h"
 #include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/byte_stream_split.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/hashing.h"
diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc
index 0bdbad5..d4f3719 100644
--- a/cpp/src/parquet/level_conversion_test.cc
+++ b/cpp/src/parquet/level_conversion_test.cc
@@ -14,6 +14,7 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
+
 #include "parquet/level_conversion.h"
 
 #include <gmock/gmock.h>
@@ -23,6 +24,7 @@
 #include <vector>
 
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap.h"
 
 namespace parquet {
 namespace internal {
diff --git a/cpp/src/parquet/platform.h b/cpp/src/parquet/platform.h
index 98dc928..4c78f2c 100644
--- a/cpp/src/parquet/platform.h
+++ b/cpp/src/parquet/platform.h
@@ -20,15 +20,16 @@
 #include <cstdint>
 #include <memory>
 
-#include "arrow/buffer.h"            // IWYU pragma: export
-#include "arrow/io/interfaces.h"     // IWYU pragma: export
-#include "arrow/io/memory.h"         // IWYU pragma: export
-#include "arrow/memory_pool.h"       // IWYU pragma: export
-#include "arrow/status.h"            // IWYU pragma: export
-#include "arrow/util/bit_util.h"     // IWYU pragma: export
-#include "arrow/util/compression.h"  // IWYU pragma: export
-#include "arrow/util/macros.h"       // IWYU pragma: export
-#include "arrow/util/string_view.h"  // IWYU pragma: export
+#include "arrow/buffer.h"              // IWYU pragma: export
+#include "arrow/io/interfaces.h"       // IWYU pragma: export
+#include "arrow/io/memory.h"           // IWYU pragma: export
+#include "arrow/memory_pool.h"         // IWYU pragma: export
+#include "arrow/status.h"              // IWYU pragma: export
+#include "arrow/util/bit_util.h"       // IWYU pragma: export
+#include "arrow/util/bitmap_writer.h"  // IWYU pragma: export
+#include "arrow/util/compression.h"    // IWYU pragma: export
+#include "arrow/util/macros.h"         // IWYU pragma: export
+#include "arrow/util/string_view.h"    // IWYU pragma: export
 
 #if defined(_WIN32) || defined(__CYGWIN__)
 
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index 6c6cefe..6afeb1c 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -25,6 +25,7 @@
 #include "arrow/array.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
+#include "arrow/util/bitmap_reader.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/optional.h"
diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc
index e1bf520..8edc5ee 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -27,6 +27,7 @@
 #include <vector>
 
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/bitmap_ops.h"
 
 #include "parquet/column_reader.h"
 #include "parquet/column_writer.h"
diff --git a/r/src/array.cpp b/r/src/array.cpp
index cd8374c..9ada75d 100644
--- a/r/src/array.cpp
+++ b/r/src/array.cpp
@@ -23,6 +23,7 @@ using Rcpp::no_init;
 #if defined(ARROW_R_WITH_ARROW)
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
 
 void arrow::r::validate_slice_offset(int offset, int len) {
   if (offset == NA_INTEGER) {
diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp
index 0f6eab2..0032e97 100644
--- a/r/src/array_from_vector.cpp
+++ b/r/src/array_from_vector.cpp
@@ -23,6 +23,7 @@
 #include <arrow/array.h>
 #include <arrow/builder.h>
 #include <arrow/table.h>
+#include <arrow/util/bitmap_writer.h>
 // This says "Private header, not to be exported"
 #include <arrow/visitor_inline.h>
 
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index 28c6719..223c3fe 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -20,6 +20,8 @@
 
 #include <arrow/array.h>
 #include <arrow/table.h>
+#include <arrow/util/bitmap_reader.h>
+#include <arrow/util/bitmap_writer.h>
 #include <arrow/util/parallel.h>
 #include <arrow/util/task_group.h>