You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/12/02 07:44:21 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #8770: ARROW-10696: [C++] Add SetBitRunReader

pitrou commented on a change in pull request #8770:
URL: https://github.com/apache/arrow/pull/8770#discussion_r533955521



##########
File path: cpp/src/arrow/util/bit_run_reader.h
##########
@@ -166,7 +167,350 @@ class ARROW_EXPORT BitRunReader {
 using BitRunReader = BitRunReaderLinear;
 #endif
 
-// TODO SetBitRunReader?
+struct SetBitRun {
+  int64_t position;
+  int64_t length;
+
+  bool AtEnd() const { return length == 0; }
+
+  std::string ToString() const {
+    return std::string("{pos=") + std::to_string(position) +
+           ", len=" + std::to_string(length) + "}";
+  }
+
+  bool operator==(const SetBitRun& other) const {
+    return position == other.position && length == other.length;
+  }
+  bool operator!=(const SetBitRun& other) const {
+    return position != other.position || length != other.length;
+  }
+};
+
+template <bool Reverse>
+class BaseSetBitRunReader {
+ public:
+  /// \brief Constructs new SetBitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  inline BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  SetBitRun NextRun() {
+    int64_t pos = 0;
+    int64_t len = 0;
+    if (current_num_bits_) {
+      const auto run = FindCurrentRun();
+      assert(remaining_ >= 0);
+      if (run.length && current_num_bits_) {
+        // The run ends in current_word_
+        return AdjustRun(run);
+      }
+      pos = run.position;
+      len = run.length;
+    }
+    if (!len) {
+      // We didn't get any ones in current_word_, so we can skip any zeros
+      // in the following words
+      SkipNextZeros();
+      if (remaining_ == 0) {
+        return {0, 0};
+      }
+      assert(current_num_bits_);
+      pos = position();
+    } else if (!current_num_bits_) {
+      if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+        current_word_ = LoadFullWord();
+        current_num_bits_ = 64;
+      } else if (remaining_ > 0) {
+        current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+        current_num_bits_ = static_cast<int32_t>(remaining_);
+      } else {
+        // No bits remaining, perhaps we found a run?
+        return AdjustRun({pos, len});
+      }
+      // If current word starts with a zero, we got a full run
+      if (!(current_word_ & kFirstBit)) {
+        return AdjustRun({pos, len});
+      }
+    }
+    // Current word should now start with a set bit
+    len += CountNextOnes();
+    return AdjustRun({pos, len});
+  }
+
+ protected:
+  int64_t position() const {
+    if (Reverse) {
+      return remaining_;
+    } else {
+      return length_ - remaining_;
+    }
+  }
+
+  SetBitRun AdjustRun(SetBitRun run) {
+    if (Reverse) {
+      assert(run.position >= run.length);
+      run.position -= run.length;
+    }
+    return run;
+  }
+
+  uint64_t LoadFullWord() {
+    uint64_t word;
+    if (Reverse) {
+      bitmap_ -= 8;
+    }
+    memcpy(&word, bitmap_, 8);
+    if (!Reverse) {
+      bitmap_ += 8;
+    }
+    return BitUtil::ToLittleEndian(word);
+  }
+
+  uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+    assert(num_bits > 0);
+    uint64_t word = 0;
+    const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
+    if (Reverse) {
+      // Read in the most significant bytes of the word
+      bitmap_ -= num_bytes;
+      memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
+      // XXX MostSignificantBitmask
+      return (BitUtil::ToLittleEndian(word) << bit_offset) &
+             ~BitUtil::LeastSignificantBitMask(64 - num_bits);
+    } else {
+      memcpy(&word, bitmap_, num_bytes);
+      bitmap_ += num_bytes;
+      return (BitUtil::ToLittleEndian(word) >> bit_offset) &
+             BitUtil::LeastSignificantBitMask(num_bits);
+    }
+  }
+
+  void SkipNextZeros() {
+    assert(current_num_bits_ == 0);
+    while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+      current_word_ = LoadFullWord();
+      const auto num_zeros = CountFirstZeros(current_word_);
+      if (num_zeros < 64) {
+        current_word_ = ConsumeBits(current_word_, num_zeros);
+        current_num_bits_ = 64 - num_zeros;
+        remaining_ -= num_zeros;
+        assert(remaining_ >= 0);
+        assert(current_num_bits_ >= 0);
+        return;
+      }
+      remaining_ -= 64;
+    }
+    if (remaining_ > 0) {
+      current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+      current_num_bits_ = static_cast<int32_t>(remaining_);
+      const auto num_zeros =
+          std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
+      current_word_ = ConsumeBits(current_word_, num_zeros);
+      current_num_bits_ -= num_zeros;
+      remaining_ -= num_zeros;
+      assert(remaining_ >= 0);
+      assert(current_num_bits_ >= 0);
+    }
+  }
+
+  int64_t CountNextOnes() {
+    assert(current_word_ & kFirstBit);
+
+    int64_t len;
+    if (~current_word_) {
+      const auto num_ones = CountFirstZeros(~current_word_);
+      assert(num_ones <= current_num_bits_);
+      assert(num_ones <= remaining_);
+      remaining_ -= num_ones;
+      current_word_ = ConsumeBits(current_word_, num_ones);
+      current_num_bits_ -= num_ones;
+      if (current_num_bits_) {
+        // There are pending zeros in current_word_
+        return num_ones;
+      }
+      len = num_ones;
+    } else {
+      // current_word_ is all ones
+      remaining_ -= 64;
+      current_num_bits_ = 0;
+      len = 64;
+    }
+
+    while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+      current_word_ = LoadFullWord();
+      const auto num_ones = CountFirstZeros(~current_word_);
+      len += num_ones;
+      remaining_ -= num_ones;
+      if (num_ones < 64) {

Review comment:
       Well, depending on the distribution of input bits, this may be commonly true (same for the other comment above).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org