You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/02/13 14:03:52 UTC

arrow git commit: ARROW-553: C++: Faster valid bitmap building

Repository: arrow
Updated Branches:
  refs/heads/master 1f26040f5 -> ad0157547


ARROW-553: C++: Faster valid bitmap building

Author: Uwe L. Korn <uw...@xhochy.com>

Closes #338 from xhochy/ARROW-553 and squashes the following commits:

1c1ee3d [Uwe L. Korn] ARROW-553: C++: Faster valid bitmap building


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/ad015754
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/ad015754
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/ad015754

Branch: refs/heads/master
Commit: ad0157547a4f5e6e51fa2f712c2ed9477489a20c
Parents: 1f26040
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Mon Feb 13 09:03:46 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Feb 13 09:03:46 2017 -0500

----------------------------------------------------------------------
 cpp/src/arrow/builder.cc                        | 41 ++++++++++++++++++--
 .../jemalloc/jemalloc-builder-benchmark.cc      |  2 +-
 2 files changed, 38 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/ad015754/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index dddadee..f5c13f9 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -114,18 +114,51 @@ void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t leng
     UnsafeSetNotNull(length);
     return;
   }
+
+  int byte_offset = length_ / 8;
+  int bit_offset = length_ % 8;
+  uint8_t bitset = null_bitmap_data_[byte_offset];
+
   for (int32_t i = 0; i < length; ++i) {
-    // TODO(emkornfield) Optimize for large values of length?
-    UnsafeAppendToBitmap(valid_bytes[i] > 0);
+    if (valid_bytes[i]) {
+      bitset |= (1 << bit_offset);
+    } else {
+      bitset &= ~(1 << bit_offset);
+      ++null_count_;
+    }
+
+    bit_offset++;
+    if (bit_offset == 8) {
+      bit_offset = 0;
+      null_bitmap_data_[byte_offset] = bitset;
+      byte_offset++;
+      // TODO: Except for the last byte, this shouldn't be needed
+      bitset = null_bitmap_data_[byte_offset];
+    }
   }
+  if (bit_offset != 0) { null_bitmap_data_[byte_offset] = bitset; }
+  length_ += length;
 }
 
 void ArrayBuilder::UnsafeSetNotNull(int32_t length) {
   const int32_t new_length = length + length_;
-  // TODO(emkornfield) Optimize for large values of length?
-  for (int32_t i = length_; i < new_length; ++i) {
+
+  // Fill up the bytes until we have a byte alignment
+  int32_t pad_to_byte = 8 - (length_ % 8);
+  if (pad_to_byte == 8) { pad_to_byte = 0; }
+  for (int32_t i = 0; i < pad_to_byte; ++i) {
+    BitUtil::SetBit(null_bitmap_data_, i);
+  }
+
+  // Fast bitsetting
+  int32_t fast_length = (length - pad_to_byte) / 8;
+  memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 255, fast_length);
+
+  // Trailing bytes
+  for (int32_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) {
     BitUtil::SetBit(null_bitmap_data_, i);
   }
+
   length_ = new_length;
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/ad015754/cpp/src/arrow/jemalloc/jemalloc-builder-benchmark.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/jemalloc/jemalloc-builder-benchmark.cc b/cpp/src/arrow/jemalloc/jemalloc-builder-benchmark.cc
index 58dbaa3..d69c304 100644
--- a/cpp/src/arrow/jemalloc/jemalloc-builder-benchmark.cc
+++ b/cpp/src/arrow/jemalloc/jemalloc-builder-benchmark.cc
@@ -42,6 +42,6 @@ static void BM_BuildPrimitiveArrayNoNulls(
       state.iterations() * data.size() * sizeof(int64_t) * kFinalSize);
 }
 
-BENCHMARK(BM_BuildPrimitiveArrayNoNulls)->Repetitions(3)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_BuildPrimitiveArrayNoNulls)->Repetitions(5)->Unit(benchmark::kMillisecond);
 
 }  // namespace arrow