You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/11/03 00:32:11 UTC

[orc] branch main updated: ORC-1020: [C++] Optimize RleDecoderV2::nextDirect base on bit sizes (#944)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new b499c8a  ORC-1020: [C++] Optimize RleDecoderV2::nextDirect base on bit sizes (#944)
b499c8a is described below

commit b499c8a46365d7c2ed3bd2f6ba0d4e87dadcc05c
Author: Quanlong Huang <hu...@gmail.com>
AuthorDate: Wed Nov 3 08:30:19 2021 +0800

    ORC-1020: [C++] Optimize RleDecoderV2::nextDirect base on bit sizes (#944)
    
    ### What changes were proposed in this pull request?
    
    This PR optimizes the C++ implementation of orc::RleDecoderV2::nextDirect. It provides different methods for different bit sizes, which significantly reduces the instructions and branches. Note that only bit sizes of 4,8,16,24,32,40,48,56,64 are optimized in this PR. bitSize 1 and 2 are skipped since they usually have short runs which won't benefit from loop unrolling. Other deprecated bit sizes, e.g. 3,5,6,7 are also skipped.
    
    When the bit size is specified, we can better exhaust the buffer before calling readByte(). This also improves performance.
    
    ### Why are the changes needed?
    
    Perf tests show that orc::RleDecoderV2::nextDirect() dominates the time spent reading random integers. There are more details in the JIRA description.
    Performance improvement for scanning 1 billion unsigned numbers in different bit sizes:
    
    bitSize | original time(s) | optimized time(s) | speedup
    -- | -- | -- | --
    4 | 5.93 | 3.25 | 1.8246153846
    8 | 6.24 | 2.62 | 2.3816793893
    16 | 7.24 | 2.59 | 2.7953667954
    24 | 8.28 | 3.04 | 2.7236842105
    32 | 9.45 | 2.9 | 3.2586206897
    40 | 10.62 | 3.42 | 3.1052631579
    48 | 11.67 | 3.56 | 3.2780898876
    56 | 12.86 | 4.21 | 3.054631829
    64 | 14.05 | 7.76 | 1.8105670103
    
    Tested on Ubuntu 16.04 with a 6-core CPU (12 virtual cores) and 32GB RAM.
    CPU: Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
    
    ### How was this patch tested?
    
    Add more tests in TestRleDecoder.cc
---
 c++/src/RLEv2.hh           |  61 ++-----
 c++/src/RleDecoderV2.cc    | 358 +++++++++++++++++++++++++++++++++++++-
 c++/test/TestRleDecoder.cc | 421 +++++++++++++++++++++++++++++++++++++++------
 3 files changed, 739 insertions(+), 101 deletions(-)

diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh
index e12688e..28a8f3b 100644
--- a/c++/src/RLEv2.hh
+++ b/c++/src/RLEv2.hh
@@ -161,56 +161,25 @@ private:
     bitSize = 0;
   }
 
-  unsigned char readByte() {
-  if (bufferStart == bufferEnd) {
-    int bufferLength;
-    const void* bufferPointer;
-    if (!inputStream->Next(&bufferPointer, &bufferLength)) {
-      throw ParseError("bad read in RleDecoderV2::readByte");
-    }
-    bufferStart = static_cast<const char*>(bufferPointer);
-    bufferEnd = bufferStart + bufferLength;
-  }
-
-  unsigned char result = static_cast<unsigned char>(*bufferStart++);
-  return result;
-}
+  unsigned char readByte();
 
   int64_t readLongBE(uint64_t bsz);
   int64_t readVslong();
   uint64_t readVulong();
   uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len,
-                     uint64_t fb, const char* notNull = nullptr) {
-  uint64_t ret = 0;
-
-  // TODO: unroll to improve performance
-  for(uint64_t i = offset; i < (offset + len); i++) {
-    // skip null positions
-    if (notNull && !notNull[i]) {
-      continue;
-    }
-    uint64_t result = 0;
-    uint64_t bitsLeftToRead = fb;
-    while (bitsLeftToRead > bitsLeft) {
-      result <<= bitsLeft;
-      result |= curByte & ((1 << bitsLeft) - 1);
-      bitsLeftToRead -= bitsLeft;
-      curByte = readByte();
-      bitsLeft = 8;
-    }
-
-    // handle the left over bits
-    if (bitsLeftToRead > 0) {
-      result <<= bitsLeftToRead;
-      bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
-      result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
-    }
-    data[i] = static_cast<int64_t>(result);
-    ++ret;
-  }
-
-  return ret;
-}
+                     uint64_t fbs, const char* notNull = nullptr);
+
+  void readLongsWithoutNulls(int64_t *data, uint64_t offset, uint64_t len,
+                             uint64_t fbs);  // dispatches on fbs to an unrolledUnpackN, else to readLongs
+  void unrolledUnpack4(int64_t *data, uint64_t offset, uint64_t len);  // 4-bit values, two per byte
+  void unrolledUnpack8(int64_t *data, uint64_t offset, uint64_t len);  // 1 byte per value
+  void unrolledUnpack16(int64_t *data, uint64_t offset, uint64_t len);  // 2 bytes per value, big-endian
+  void unrolledUnpack24(int64_t *data, uint64_t offset, uint64_t len);  // 3 bytes per value, big-endian
+  void unrolledUnpack32(int64_t *data, uint64_t offset, uint64_t len);  // 4 bytes per value, big-endian
+  void unrolledUnpack40(int64_t *data, uint64_t offset, uint64_t len);  // 5 bytes per value, big-endian
+  void unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len);  // 6 bytes per value, big-endian
+  void unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len);  // 7 bytes per value, big-endian
+  void unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len);  // 8 bytes per value, big-endian
 
   uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues,
                             const char* notNull);
@@ -234,7 +203,7 @@ private:
   int64_t firstValue; // Used by SHORT_REPEAT and DELTA
   int64_t prevValue; // Used by DELTA
   uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA
-  uint32_t bitsLeft; // Used by anything that uses readLongs
+  uint32_t bitsLeft; // Unread bits remaining in curByte; used by readLongs (any bit size) and unrolledUnpack4
   uint32_t curByte; // Used by anything that uses readLongs
   uint32_t patchBitSize; // Used by PATCHED_BASE
   uint64_t unpackedIdx; // Used by PATCHED_BASE
diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc
index c5c6f6a..324d398 100644
--- a/c++/src/RleDecoderV2.cc
+++ b/c++/src/RleDecoderV2.cc
@@ -23,6 +23,21 @@
 
 namespace orc {
 
+unsigned char RleDecoderV2::readByte() {  // Returns the next input byte; throws ParseError if the stream has no more data.
+  if (bufferStart == bufferEnd) {  // current buffer is fully consumed
+    int bufferLength;
+    const void* bufferPointer;
+    if (!inputStream->Next(&bufferPointer, &bufferLength)) {  // ask the stream for its next chunk
+      throw ParseError("bad read in RleDecoderV2::readByte");
+    }
+    bufferStart = static_cast<const char*>(bufferPointer);
+    bufferEnd = bufferStart + bufferLength;  // NOTE(review): assumes Next() never hands back an empty chunk - confirm
+  }
+
+  unsigned char result = static_cast<unsigned char>(*bufferStart++);  // consume exactly one byte
+  return result;
+}
+
 int64_t RleDecoderV2::readLongBE(uint64_t bsz) {
   int64_t ret = 0, val;
   uint64_t n = bsz;
@@ -49,6 +64,341 @@ uint64_t RleDecoderV2::readVulong() {
   return ret;
 }
 
+void RleDecoderV2::readLongsWithoutNulls(int64_t *data, uint64_t offset, uint64_t len,
+                                         uint64_t fbs) {  // Decodes 'len' values of 'fbs' bits each into data[offset .. offset+len).
+  switch (fbs) {  // choose the unpacker specialised for this bit width
+    case 4:
+      unrolledUnpack4(data, offset, len);
+      return;
+    case 8:
+      unrolledUnpack8(data, offset, len);
+      return;
+    case 16:
+      unrolledUnpack16(data, offset, len);
+      return;
+    case 24:
+      unrolledUnpack24(data, offset, len);
+      return;
+    case 32:
+      unrolledUnpack32(data, offset, len);
+      return;
+    case 40:
+      unrolledUnpack40(data, offset, len);
+      return;
+    case 48:
+      unrolledUnpack48(data, offset, len);
+      return;
+    case 56:
+      unrolledUnpack56(data, offset, len);
+      return;
+    case 64:
+      unrolledUnpack64(data, offset, len);
+      return;
+    default:
+      // Bit sizes without an unrolled unpacker fall back to the generic bit-by-bit readLongs.
+      readLongs(data, offset, len, fbs);
+      return;
+  }
+}
+
+void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) {  // Unpacks 'len' 4-bit values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8.
+    while (bitsLeft > 0 && curIdx < offset + len) {  // drain nibbles still pending in curByte
+      bitsLeft -= 4;
+      data[curIdx++] = (curByte >> bitsLeft) & 15;
+    }
+    if (curIdx == offset + len) return;
+
+    // Exhaust the buffer
+    uint64_t numGroups = (offset + len - curIdx) / 2;  // whole bytes (two values each) still wanted
+    numGroups = std::min(numGroups, static_cast<uint64_t>(bufferEnd - bufferStart));  // capped by the bytes currently buffered
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto *buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    uint32_t localByte;
+    for (uint64_t i = 0; i < numGroups; ++i) {
+      localByte = *buffer++;
+      data[curIdx] = (localByte >> 4) & 15;  // high nibble holds the earlier value
+      data[curIdx + 1] = localByte & 15;  // low nibble holds the later value
+      curIdx += 2;
+    }
+    bufferStart = reinterpret_cast<const char*>(buffer);
+    if (curIdx == offset + len) return;
+
+    // readByte() refills 'bufferStart' and 'bufferEnd' when the buffer is empty.
+    curByte = readByte();  // buffer ran dry or a single odd value remains; the drain loop above consumes this byte
+    bitsLeft = 8;
+  }
+}
+
+void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) {  // Copies 'len' single-byte values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Exhaust the buffer
+    int64_t bufferNum = bufferEnd - bufferStart;  // bytes currently buffered
+    bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));  // never decode more than requested
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    for (int i = 0; i < bufferNum; ++i) {
+      data[curIdx++] = *buffer++;  // unsigned byte, so the stored value is 0..255
+    }
+    bufferStart = reinterpret_cast<const char*>(buffer);
+    if (curIdx == offset + len) return;
+
+    // readByte() will update 'bufferStart' and 'bufferEnd'.
+    data[curIdx++] = readByte();  // buffer is empty here: fetch the next chunk and take its first byte
+  }
+}
+
+void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) {  // Unpacks 'len' 2-byte big-endian values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Exhaust the buffer
+    int64_t bufferNum = (bufferEnd - bufferStart) / 2;  // whole values available in the current buffer
+    bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));  // never decode more than requested
+    uint16_t b0, b1;
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    for (int i = 0; i < bufferNum; ++i) {
+      b0 = static_cast<uint16_t>(*buffer);
+      b1 = static_cast<uint16_t>(*(buffer + 1));
+      buffer += 2;
+      data[curIdx++] = (b0 << 8) | b1;  // first byte is the most significant
+    }
+    bufferStart = reinterpret_cast<const char*>(buffer);
+    if (curIdx == offset + len) return;
+
+    // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
+    b0 = readByte();  // fewer than 2 bytes were left in the buffer, so this value straddles a refill
+    b1 = readByte();
+    data[curIdx++] = (b0 << 8) | b1;
+  }
+}
+
+void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) {  // Unpacks 'len' 3-byte big-endian values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Exhaust the buffer
+    int64_t bufferNum = (bufferEnd - bufferStart) / 3;  // whole values available in the current buffer
+    bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));  // never decode more than requested
+    uint32_t b0, b1, b2;
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    for (int i = 0; i < bufferNum; ++i) {
+      b0 = static_cast<uint32_t>(*buffer);
+      b1 = static_cast<uint32_t>(*(buffer + 1));
+      b2 = static_cast<uint32_t>(*(buffer + 2));
+      buffer += 3;
+      data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2);  // first byte is the most significant
+    }
+    bufferStart += bufferNum * 3;  // same position 'buffer' reached; sibling unpackers assign 'buffer' back instead
+    if (curIdx == offset + len) return;
+
+    // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
+    b0 = readByte();  // fewer than 3 bytes were left in the buffer, so this value straddles a refill
+    b1 = readByte();
+    b2 = readByte();
+    data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2);
+  }
+}
+
+void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) {  // Unpacks 'len' 4-byte big-endian values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Exhaust the buffer
+    int64_t bufferNum = (bufferEnd - bufferStart) / 4;  // whole values available in the current buffer
+    bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));  // never decode more than requested
+    uint32_t b0, b1, b2, b3;
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    for (int i = 0; i < bufferNum; ++i) {
+      b0 = static_cast<uint32_t>(*buffer);
+      b1 = static_cast<uint32_t>(*(buffer + 1));
+      b2 = static_cast<uint32_t>(*(buffer + 2));
+      b3 = static_cast<uint32_t>(*(buffer + 3));
+      buffer += 4;
+      data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);  // unsigned 32-bit result, so the stored value is non-negative
+    }
+    bufferStart = reinterpret_cast<const char*>(buffer);
+    if (curIdx == offset + len) return;
+
+    // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
+    b0 = readByte();  // fewer than 4 bytes were left in the buffer, so this value straddles a refill
+    b1 = readByte();
+    b2 = readByte();
+    b3 = readByte();
+    data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
+  }
+}
+
+void RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) {  // Unpacks 'len' 5-byte big-endian values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Exhaust the buffer
+    int64_t bufferNum = (bufferEnd - bufferStart) / 5;  // whole values available in the current buffer
+    bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));  // never decode more than requested
+    uint64_t b0, b1, b2, b3, b4;  // 64-bit so the shift by 32 below is well defined
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    for (int i = 0; i < bufferNum; ++i) {
+      b0 = static_cast<uint32_t>(*buffer);
+      b1 = static_cast<uint32_t>(*(buffer + 1));
+      b2 = static_cast<uint32_t>(*(buffer + 2));
+      b3 = static_cast<uint32_t>(*(buffer + 3));
+      b4 = static_cast<uint32_t>(*(buffer + 4));
+      buffer += 5;
+      data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
+    }
+    bufferStart = reinterpret_cast<const char*>(buffer);
+    if (curIdx == offset + len) return;
+
+    // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
+    b0 = readByte();  // fewer than 5 bytes were left in the buffer, so this value straddles a refill
+    b1 = readByte();
+    b2 = readByte();
+    b3 = readByte();
+    b4 = readByte();
+    data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
+  }
+}
+
+void RleDecoderV2::unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len) {  // Unpacks 'len' 6-byte big-endian values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Exhaust the buffer
+    int64_t bufferNum = (bufferEnd - bufferStart) / 6;  // whole values available in the current buffer
+    bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));  // never decode more than requested
+    uint64_t b0, b1, b2, b3, b4, b5;  // 64-bit so the shifts by 32 and 40 below are well defined
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    for (int i = 0; i < bufferNum; ++i) {
+      b0 = static_cast<uint32_t>(*buffer);
+      b1 = static_cast<uint32_t>(*(buffer + 1));
+      b2 = static_cast<uint32_t>(*(buffer + 2));
+      b3 = static_cast<uint32_t>(*(buffer + 3));
+      b4 = static_cast<uint32_t>(*(buffer + 4));
+      b5 = static_cast<uint32_t>(*(buffer + 5));
+      buffer += 6;
+      data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5);
+    }
+    bufferStart = reinterpret_cast<const char*>(buffer);
+    if (curIdx == offset + len) return;
+
+    // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
+    b0 = readByte();  // fewer than 6 bytes were left in the buffer, so this value straddles a refill
+    b1 = readByte();
+    b2 = readByte();
+    b3 = readByte();
+    b4 = readByte();
+    b5 = readByte();
+    data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5);
+  }
+}
+
+void RleDecoderV2::unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len) {  // Unpacks 'len' 7-byte big-endian values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Exhaust the buffer
+    int64_t bufferNum = (bufferEnd - bufferStart) / 7;  // whole values available in the current buffer
+    bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));  // never decode more than requested
+    uint64_t b0, b1, b2, b3, b4, b5, b6;  // 64-bit so the shifts by 32..48 below are well defined
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    for (int i = 0; i < bufferNum; ++i) {
+      b0 = static_cast<uint32_t>(*buffer);
+      b1 = static_cast<uint32_t>(*(buffer + 1));
+      b2 = static_cast<uint32_t>(*(buffer + 2));
+      b3 = static_cast<uint32_t>(*(buffer + 3));
+      b4 = static_cast<uint32_t>(*(buffer + 4));
+      b5 = static_cast<uint32_t>(*(buffer + 5));
+      b6 = static_cast<uint32_t>(*(buffer + 6));
+      buffer += 7;
+      data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6);
+    }
+    bufferStart = reinterpret_cast<const char*>(buffer);
+    if (curIdx == offset + len) return;
+
+    // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
+    b0 = readByte();  // fewer than 7 bytes were left in the buffer, so this value straddles a refill
+    b1 = readByte();
+    b2 = readByte();
+    b3 = readByte();
+    b4 = readByte();
+    b5 = readByte();
+    b6 = readByte();
+    data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6);
+  }
+}
+
+void RleDecoderV2::unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len) {  // Unpacks 'len' 8-byte big-endian values into data[offset .. offset+len).
+  uint64_t curIdx = offset;  // next output slot to fill
+  while (curIdx < offset + len) {
+    // Exhaust the buffer
+    int64_t bufferNum = (bufferEnd - bufferStart) / 8;  // whole values available in the current buffer
+    bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));  // never decode more than requested
+    uint64_t b0, b1, b2, b3, b4, b5, b6, b7;  // 64-bit so the shifts by 32..56 below are well defined
+    // Avoid updating 'bufferStart' inside the loop.
+    const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
+    for (int i = 0; i < bufferNum; ++i) {
+      b0 = static_cast<uint32_t>(*buffer);
+      b1 = static_cast<uint32_t>(*(buffer + 1));
+      b2 = static_cast<uint32_t>(*(buffer + 2));
+      b3 = static_cast<uint32_t>(*(buffer + 3));
+      b4 = static_cast<uint32_t>(*(buffer + 4));
+      b5 = static_cast<uint32_t>(*(buffer + 5));
+      b6 = static_cast<uint32_t>(*(buffer + 6));
+      b7 = static_cast<uint32_t>(*(buffer + 7));
+      buffer += 8;
+      data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7);  // all 64 bits are used, so the top bit of b0 ends up as the sign bit
+    }
+    bufferStart = reinterpret_cast<const char*>(buffer);
+    if (curIdx == offset + len) return;
+
+    // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
+    b0 = readByte();  // fewer than 8 bytes were left in the buffer, so this value straddles a refill
+    b1 = readByte();
+    b2 = readByte();
+    b3 = readByte();
+    b4 = readByte();
+    b5 = readByte();
+    b6 = readByte();
+    b7 = readByte();
+    data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7);
+  }
+}
+
+uint64_t RleDecoderV2::readLongs(int64_t *data, uint64_t offset, uint64_t len,
+                                 uint64_t fbs, const char* notNull) {  // Generic bit-by-bit reader of 'fbs'-bit values; returns how many values were decoded.
+  uint64_t ret = 0;  // number of non-null positions filled
+
+  for (uint64_t i = offset; i < (offset + len); i++) {
+    // skip null positions (they consume no input bits and data[i] is left untouched)
+    if (notNull && !notNull[i]) {
+      continue;
+    }
+    uint64_t result = 0;
+    uint64_t bitsLeftToRead = fbs;
+    while (bitsLeftToRead > bitsLeft) {  // value needs more bits than curByte still holds
+      result <<= bitsLeft;
+      result |= curByte & ((1 << bitsLeft) - 1);  // take all unread (low) bits of curByte
+      bitsLeftToRead -= bitsLeft;
+      curByte = readByte();
+      bitsLeft = 8;
+    }
+
+    // handle the left over bits (taken from the top of the unread part of curByte)
+    if (bitsLeftToRead > 0) {
+      result <<= bitsLeftToRead;
+      bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
+      result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+    }
+    data[i] = static_cast<int64_t>(result);
+    ++ret;
+  }
+  return ret;
+}
+
 RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
                            bool _isSigned, MemoryPool& pool
                            ): inputStream(std::move(input)),
@@ -202,7 +552,13 @@ uint64_t RleDecoderV2::nextDirect(int64_t* const data,
 
   uint64_t nRead = std::min(runLength - runRead, numValues);
 
-  runRead += readLongs(data, offset, nRead, bitSize, notNull);
+  // bitSize 1,2 usually have short runs which won't benefit from loop unrolling.
+  if (notNull || bitSize <= 2) {
+    runRead += readLongs(data, offset, nRead, bitSize, notNull);
+  } else {
+    readLongsWithoutNulls(data, offset, nRead, bitSize);
+    runRead += nRead;
+  }
 
   if (isSigned) {
     if (notNull) {
diff --git a/c++/test/TestRleDecoder.cc b/c++/test/TestRleDecoder.cc
index 9f192ac..1e1447a 100644
--- a/c++/test/TestRleDecoder.cc
+++ b/c++/test/TestRleDecoder.cc
@@ -31,11 +31,13 @@ std::vector<int64_t> decodeRLEv2(const unsigned char *bytes,
                                  unsigned long l,
                                  size_t n,
                                  size_t count,
-                                 const char* notNull = nullptr) {
-  std::unique_ptr<RleDecoder> rle =
-    createRleDecoder(std::unique_ptr<SeekableInputStream>
-                     (new SeekableArrayInputStream(bytes,l)), true,
-                     RleVersion_2, *getDefaultPool());
+                                 const char* notNull = nullptr,
+                                 bool isSigned = true,
+                                 uint64_t blockSize = 0) {
+  std::unique_ptr<RleDecoder> rle = createRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+            new SeekableArrayInputStream(bytes,l, blockSize)),
+        isSigned,RleVersion_2, *getDefaultPool());
   std::vector<int64_t> results;
   for (size_t i = 0; i < count; i+=n) {
     size_t remaining = count - i;
@@ -254,65 +256,376 @@ TEST(RLEv2, 0to2Repeat1Direct) {
   }
 };
 
+TEST(RLEv2, bitSize1Direct) {
+  // 0,1 repeated 20 times (unsigned ints)
+  size_t count = 40;
+  std::vector<int64_t> values;
+  for (size_t i = 0; i < count; ++i) {
+    values.push_back(i % 2);
+  }
+  const unsigned char bytes[] = {0x40, 0x27, 0x55, 0x55, 0x55, 0x55, 0x55};  // header 0x40 0x27 (len - 1 = 39), then 0x55 = 01010101
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // blkSize is passed through decodeRLEv2 to the input stream
+    // Read 1 at a time, then 3 at a time, etc.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, false, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, false, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, false, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, false, blkSize), count);
+  }
+
+  values.clear();
+  // 0,1,1 repeated 10 times (unsigned ints)
+  for (size_t i = 0; i < 10; ++i) {
+    values.push_back(0);
+    values.push_back(1);
+    values.push_back(1);
+  }
+  // bytes2: 0100,0000,0001,1101,0110,1101,1011,0110,1101,1011,0110,1100
+  // First byte: 01 for encoding, 00000 for bitSize index (0), 0 for first bit of len - 1.
+  // Second byte: 0001,1101 for len - 1 = 29.
+  // Following bits repeating 011 ten times. Last byte padding with 00.
+  const unsigned char bytes2[] = {0x40, 0x1D, 0x6D, 0xB6, 0xDB, 0x6C};
+  l = sizeof(bytes2) / sizeof(char);
+  count = 30;
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // 30 one-bit values do not end on a byte boundary
+    // Read 1 at a time, then 3 at a time, etc.
+    checkResults(values, decodeRLEv2(bytes2, l, 1, count, nullptr, false, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes2, l, 3, count, nullptr, false, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes2, l, 7, count, nullptr, false, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes2, l, count, count, nullptr, false, blkSize), count);
+  }
+}
+
 TEST(RLEv2, bitSize2Direct) {
- // 0,1 repeated 10 times (signed ints)
- const size_t count = 20;
- std::vector<int64_t> values;
- for (size_t i = 0; i < count; ++i) {
-     values.push_back(i%2);
- }
-
- const unsigned char bytes[] = {0x42, 0x13, 0x22, 0x22, 0x22, 0x22, 0x22};
- unsigned long l = sizeof(bytes) / sizeof(char);
- // Read 1 at a time, then 3 at a time, etc.
- checkResults(values, decodeRLEv2(bytes, l, 1, count), 1);
- checkResults(values, decodeRLEv2(bytes, l, 3, count), 3);
- checkResults(values, decodeRLEv2(bytes, l, 7, count), 7);
- checkResults(values, decodeRLEv2(bytes, l, count, count), count);
-};
+  // 0,1 repeated 10 times (signed ints)
+  const size_t count = 20;
+  std::vector<int64_t> values;
+  for (size_t i = 0; i < count; ++i) {
+    values.push_back(i % 2);
+  }
+
+  const unsigned char bytes[] = {0x42, 0x13, 0x22, 0x22, 0x22, 0x22, 0x22};  // header 0x42 0x13 (len - 1 = 19), then 0x22 = 00,10,00,10 as raw 2-bit values
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // blkSize is passed through decodeRLEv2 to the input stream
+    // Read 1 at a time, then 3 at a time, etc.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
 
 TEST(RLEv2, bitSize4Direct) {
- // 0,2 repeated 10 times (signed ints)
- const size_t count = 20;
- std::vector<int64_t> values;
- for (size_t i = 0; i < count; ++i) {
+  // 0,2 repeated 10 times (signed ints)
+  size_t count = 20;
+  std::vector<int64_t> values;
+  for (size_t i = 0; i < count; ++i) {
      values.push_back((i%2)*2);
- }
+  }
+
+  const unsigned char bytes[] = {0x46,0x13,0x04,0x04,0x04,0x04,
+                                 0x04,0x04,0x04,0x04,0x04,0x04};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // blkSize is passed through decodeRLEv2 to the input stream
+    // Read 1 at a time, then 3 at a time, etc.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
 
- const unsigned char bytes[] = {0x46,0x13,0x04,0x04,0x04,0x04,
-                                0x04,0x04,0x04,0x04,0x04,0x04};
- unsigned long l = sizeof(bytes) / sizeof(char);
+  // -2,0,2,4 repeated 5 times (signed ints)
+  count = 20;
+  values.clear();
+  for (size_t i = 0; i < count; ++i) {
+    values.push_back(static_cast<int64_t>(i % 4) * 2 - 2);  // signed arithmetic: no reliance on size_t wraparound for -2
+  }
 
- // Read 1 at a time, then 3 at a time, etc.
- checkResults(values, decodeRLEv2(bytes, l, 1, count), 1);
- checkResults(values, decodeRLEv2(bytes, l, 3, count), 3);
- checkResults(values, decodeRLEv2(bytes, l, 7, count), 7);
- checkResults(values, decodeRLEv2(bytes, l, count, count), count);
-};
+  const unsigned char bytes2[] = {0x46,0x13,0x30,0x48,0x30,0x48,
+                                  0x30,0x48,0x30,0x48,0x30,0x48};
+  l = sizeof(bytes2) / sizeof(char);  // length of the array actually decoded below
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {
+    // Read 1 at a time, then 3 at a time, etc.
+    checkResults(values, decodeRLEv2(bytes2, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes2, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes2, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes2, l, count, count, nullptr, true, blkSize), count);
+  }
+}
+
+TEST(RLEv2, bitSize8Direct) {
+  const size_t count = 20;  // expected values are 0..19 (signed ints)
+  std::vector<int64_t> values;
+  for (size_t i = 0; i < count; ++i) {  // size_t index matches 'count' and the sibling tests
+    values.push_back(static_cast<int64_t>(i));
+  }
+
+  const unsigned char bytes[] = {0x4E,0x13,0x00,0x02,0x04,0x06,0x08,0x0A,0x0C,0x0E,0x10,
+                                 0x12,0x14,0x16,0x18,0x1A,0x1C,0x1E,0x20,0x22,0x24,0x26};  // 2-byte header, then one byte per value (2 * i)
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // blkSize is passed through decodeRLEv2 to the input stream
+    // Read 1 at a time, then 3 at a time, etc.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
+
+TEST(RLEv2, bitSize16Direct) {
+  const size_t count = 20;  // expected value i has the byte i repeated in both of its bytes
+  std::vector<int64_t> values;
+  for (size_t i = 0; i < count; ++i) {  // size_t index matches 'count' and the sibling tests
+    values.push_back(static_cast<int64_t>((i << 8) + i));
+  }
+
+  const unsigned char bytes[] = {0x5E,0x13,
+                                 0x00,0x00,0x02,0x02,0x04,0x04,0x06,0x06,
+                                 0x08,0x08,0x0A,0x0A,0x0C,0x0C,0x0E,0x0E,
+                                 0x10,0x10,0x12,0x12,0x14,0x14,0x16,0x16,
+                                 0x18,0x18,0x1A,0x1A,0x1C,0x1C,0x1E,0x1E,
+                                 0x20,0x20,0x22,0x22,0x24,0x24,0x26,0x26};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // blkSize is passed through decodeRLEv2 to the input stream
+    // Read 1 at a time, then 3 at a time, etc. The last checkResults argument mirrors the batch size.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
+
+TEST(RLEv2, bitSize24Direct) {
+  const size_t count = 20;  // expected value i has the byte i repeated in all 3 of its bytes
+  std::vector<int64_t> values;
+  for (int64_t i = 0; i < count; ++i) {
+    values.push_back((i << 16) + (i << 8) + i);
+  }
+
+  const unsigned char bytes[] = {0x6E,0x13,
+                                 0x00,0x00,0x00,0x02,0x02,0x02,0x04,0x04,0x04,
+                                 0x06,0x06,0x06,0x08,0x08,0x08,0x0A,0x0A,0x0A,
+                                 0x0C,0x0C,0x0C,0x0E,0x0E,0x0E,0x10,0x10,0x10,
+                                 0x12,0x12,0x12,0x14,0x14,0x14,0x16,0x16,0x16,
+                                 0x18,0x18,0x18,0x1A,0x1A,0x1A,0x1C,0x1C,0x1C,
+                                 0x1E,0x1E,0x1E,0x20,0x20,0x20,0x22,0x22,0x22,
+                                 0x24,0x24,0x24,0x26,0x26,0x26};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // blkSize is passed through decodeRLEv2 to the input stream
+    // Read 1 at a time, then 3 at a time, etc. The last checkResults argument mirrors the batch size.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
+
+TEST(RLEv2, bitSize32Direct) {
+  const size_t count = 20;  // expected value i has the byte i repeated in all 4 of its bytes
+  std::vector<int64_t> values;
+  for (int64_t i = 0; i < count; ++i) {
+    values.push_back((i << 24) + (i << 16) + (i << 8) + i);
+  }
+
+  const unsigned char bytes[] = {0x76,0x13,
+                                 0x00,0x00,0x00,0x00,0x02,0x02,0x02,0x02,
+                                 0x04,0x04,0x04,0x04,0x06,0x06,0x06,0x06,
+                                 0x08,0x08,0x08,0x08,0x0A,0x0A,0x0A,0x0A,
+                                 0x0C,0x0C,0x0C,0x0C,0x0E,0x0E,0x0E,0x0E,
+                                 0x10,0x10,0x10,0x10,0x12,0x12,0x12,0x12,
+                                 0x14,0x14,0x14,0x14,0x16,0x16,0x16,0x16,
+                                 0x18,0x18,0x18,0x18,0x1A,0x1A,0x1A,0x1A,
+                                 0x1C,0x1C,0x1C,0x1C,0x1E,0x1E,0x1E,0x1E,
+                                 0x20,0x20,0x20,0x20,0x22,0x22,0x22,0x22,
+                                 0x24,0x24,0x24,0x24,0x26,0x26,0x26,0x26};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // blkSize is passed through decodeRLEv2 to the input stream
+    // Read 1 at a time, then 3 at a time, etc. The last checkResults argument mirrors the batch size.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
+
+TEST(RLEv2, bitSize40Direct) {
+  const size_t count = 20;  // expected value i has the byte i repeated in all 5 of its bytes
+  std::vector<int64_t> values;
+  for (int64_t i = 0; i < count; ++i) {
+    values.push_back((i << 32) + (i << 24) + (i << 16) + (i << 8) + i);
+  }
+
+  const unsigned char bytes[] = {0x78,0x13,
+                                 0x00,0x00,0x00,0x00,0x00,
+                                 0x02,0x02,0x02,0x02,0x02,
+                                 0x04,0x04,0x04,0x04,0x04,
+                                 0x06,0x06,0x06,0x06,0x06,
+                                 0x08,0x08,0x08,0x08,0x08,
+                                 0x0A,0x0A,0x0A,0x0A,0x0A,
+                                 0x0C,0x0C,0x0C,0x0C,0x0C,
+                                 0x0E,0x0E,0x0E,0x0E,0x0E,
+                                 0x10,0x10,0x10,0x10,0x10,
+                                 0x12,0x12,0x12,0x12,0x12,
+                                 0x14,0x14,0x14,0x14,0x14,
+                                 0x16,0x16,0x16,0x16,0x16,
+                                 0x18,0x18,0x18,0x18,0x18,
+                                 0x1A,0x1A,0x1A,0x1A,0x1A,
+                                 0x1C,0x1C,0x1C,0x1C,0x1C,
+                                 0x1E,0x1E,0x1E,0x1E,0x1E,
+                                 0x20,0x20,0x20,0x20,0x20,
+                                 0x22,0x22,0x22,0x22,0x22,
+                                 0x24,0x24,0x24,0x24,0x24,
+                                 0x26,0x26,0x26,0x26,0x26};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {  // blkSize is passed through decodeRLEv2 to the input stream
+    // Read 1 at a time, then 3 at a time, etc. The last checkResults argument mirrors the batch size.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
+
+TEST(RLEv2, bitSize48Direct) {
+  const size_t count = 20;  // number of expected values
+  std::vector<int64_t> values;  // values[i] = byte i replicated into the low 6 bytes
+  for (int64_t i = 0; i < static_cast<int64_t>(count); ++i) {
+    values.push_back((i << 40) + (i << 32) + (i << 24) + (i << 16) + (i << 8) + i);
+  }
+
+  const unsigned char bytes[] = {0x7A,0x13,  // 2 leading bytes (presumably the run header), then 6 bytes of 2*i per value
+                                 0x00,0x00,0x00,0x00,0x00,0x00,
+                                 0x02,0x02,0x02,0x02,0x02,0x02,
+                                 0x04,0x04,0x04,0x04,0x04,0x04,
+                                 0x06,0x06,0x06,0x06,0x06,0x06,
+                                 0x08,0x08,0x08,0x08,0x08,0x08,
+                                 0x0A,0x0A,0x0A,0x0A,0x0A,0x0A,
+                                 0x0C,0x0C,0x0C,0x0C,0x0C,0x0C,
+                                 0x0E,0x0E,0x0E,0x0E,0x0E,0x0E,
+                                 0x10,0x10,0x10,0x10,0x10,0x10,
+                                 0x12,0x12,0x12,0x12,0x12,0x12,
+                                 0x14,0x14,0x14,0x14,0x14,0x14,
+                                 0x16,0x16,0x16,0x16,0x16,0x16,
+                                 0x18,0x18,0x18,0x18,0x18,0x18,
+                                 0x1A,0x1A,0x1A,0x1A,0x1A,0x1A,
+                                 0x1C,0x1C,0x1C,0x1C,0x1C,0x1C,
+                                 0x1E,0x1E,0x1E,0x1E,0x1E,0x1E,
+                                 0x20,0x20,0x20,0x20,0x20,0x20,
+                                 0x22,0x22,0x22,0x22,0x22,0x22,
+                                 0x24,0x24,0x24,0x24,0x24,0x24,
+                                 0x26,0x26,0x26,0x26,0x26,0x26};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {
+    // For each blkSize up to the input length: read 1, 3, 7, then all values per call; the last argument labels that read size.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
+
+TEST(RLEv2, bitSize56Direct) {
+  // Expected values: values[i] = byte i replicated into the low 7 bytes.
+  const size_t count = 20;  // number of expected values
+  std::vector<int64_t> values;
+  for (int64_t i = 0; i < static_cast<int64_t>(count); ++i) {
+    values.push_back((i << 48) + (i << 40) + (i << 32) + (i << 24) + (i << 16) + (i << 8) + i);
+  }
+
+  const unsigned char bytes[] = {0x7C,0x13,  // 2 leading bytes (presumably the run header), then 7 bytes of 2*i per value
+                                 0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                                 0x02,0x02,0x02,0x02,0x02,0x02,0x02,
+                                 0x04,0x04,0x04,0x04,0x04,0x04,0x04,
+                                 0x06,0x06,0x06,0x06,0x06,0x06,0x06,
+                                 0x08,0x08,0x08,0x08,0x08,0x08,0x08,
+                                 0x0A,0x0A,0x0A,0x0A,0x0A,0x0A,0x0A,
+                                 0x0C,0x0C,0x0C,0x0C,0x0C,0x0C,0x0C,
+                                 0x0E,0x0E,0x0E,0x0E,0x0E,0x0E,0x0E,
+                                 0x10,0x10,0x10,0x10,0x10,0x10,0x10,
+                                 0x12,0x12,0x12,0x12,0x12,0x12,0x12,
+                                 0x14,0x14,0x14,0x14,0x14,0x14,0x14,
+                                 0x16,0x16,0x16,0x16,0x16,0x16,0x16,
+                                 0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+                                 0x1A,0x1A,0x1A,0x1A,0x1A,0x1A,0x1A,
+                                 0x1C,0x1C,0x1C,0x1C,0x1C,0x1C,0x1C,
+                                 0x1E,0x1E,0x1E,0x1E,0x1E,0x1E,0x1E,
+                                 0x20,0x20,0x20,0x20,0x20,0x20,0x20,
+                                 0x22,0x22,0x22,0x22,0x22,0x22,0x22,
+                                 0x24,0x24,0x24,0x24,0x24,0x24,0x24,
+                                 0x26,0x26,0x26,0x26,0x26,0x26,0x26};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {
+    // For each blkSize up to the input length: read 1, 3, 7, then all values per call; the last argument labels that read size.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
+
+TEST(RLEv2, bitSize64Direct) {
+  const size_t count = 20;  // number of expected values
+  std::vector<int64_t> values;  // values[i] = byte i replicated into all 8 bytes
+  for (int64_t i = 0; i < static_cast<int64_t>(count); ++i) {
+    values.push_back((i << 56) + (i << 48) + (i << 40) + (i << 32) + (i << 24) + (i << 16) + (i << 8) + i);
+  }
+
+  const unsigned char bytes[] = {0x7E,0x13,  // 2 leading bytes (presumably the run header), then 8 bytes of 2*i per value
+                                 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                                 0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,
+                                 0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,
+                                 0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,
+                                 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,
+                                 0x0A,0x0A,0x0A,0x0A,0x0A,0x0A,0x0A,0x0A,
+                                 0x0C,0x0C,0x0C,0x0C,0x0C,0x0C,0x0C,0x0C,
+                                 0x0E,0x0E,0x0E,0x0E,0x0E,0x0E,0x0E,0x0E,
+                                 0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
+                                 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
+                                 0x14,0x14,0x14,0x14,0x14,0x14,0x14,0x14,
+                                 0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16,
+                                 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+                                 0x1A,0x1A,0x1A,0x1A,0x1A,0x1A,0x1A,0x1A,
+                                 0x1C,0x1C,0x1C,0x1C,0x1C,0x1C,0x1C,0x1C,
+                                 0x1E,0x1E,0x1E,0x1E,0x1E,0x1E,0x1E,0x1E,
+                                 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
+                                 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22,
+                                 0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,
+                                 0x26,0x26,0x26,0x26,0x26,0x26,0x26,0x26};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {
+    // For each blkSize up to the input length: read 1, 3, 7, then all values per call; the last argument labels that read size.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
 
 TEST(RLEv2, multipleRunsDirect) {
- std::vector<int64_t> values;
- // 0,1 repeated 10 times (signed ints)
- for (size_t i = 0; i < 20; ++i) {
+  std::vector<int64_t> values;
+  // 0,1 repeated 10 times (signed ints)
+  for (size_t i = 0; i < 20; ++i) {
      values.push_back(i%2);
- }
- // 0,2 repeated 10 times (signed ints)
- for (size_t i = 0; i < 20; ++i) {
+  }
+  // 0,2 repeated 10 times (signed ints)
+  for (size_t i = 0; i < 20; ++i) {
      values.push_back((i%2)*2);
- }
-
- const unsigned char bytes[] = {0x42,0x13,0x22,0x22,0x22,0x22,0x22,
-                                0x46,0x13,0x04,0x04,0x04,0x04,0x04,
-                                0x04,0x04,0x04,0x04,0x04};
- unsigned long l = sizeof(bytes) / sizeof(char);
-
- // Read 1 at a time, then 3 at a time, etc.
- checkResults(values, decodeRLEv2(bytes, l, 1, values.size()), 1);
- checkResults(values, decodeRLEv2(bytes, l, 3, values.size()), 3);
- checkResults(values, decodeRLEv2(bytes, l, 7, values.size()), 7);
- checkResults(values, decodeRLEv2(bytes, l, values.size(), values.size()),
-              values.size());
-};
+  }
+
+  const unsigned char bytes[] = {0x42,0x13,0x22,0x22,0x22,0x22,0x22,
+                                 0x46,0x13,0x04,0x04,0x04,0x04,0x04,
+                                 0x04,0x04,0x04,0x04,0x04};
+  unsigned long l = sizeof(bytes) / sizeof(char);
+
+  size_t count = values.size();  // total values across both loops above (40)
+  for (uint32_t blkSize = 1; blkSize <= l; ++blkSize) {
+    // For each blkSize up to the input length: read 1, 3, 7, then all values per call; the last argument labels that read size.
+    checkResults(values, decodeRLEv2(bytes, l, 1, count, nullptr, true, blkSize), 1);
+    checkResults(values, decodeRLEv2(bytes, l, 3, count, nullptr, true, blkSize), 3);
+    checkResults(values, decodeRLEv2(bytes, l, 7, count, nullptr, true, blkSize), 7);
+    checkResults(values, decodeRLEv2(bytes, l, count, count, nullptr, true, blkSize), count);
+  }
+}
 
 TEST(RLEv2, largeNegativesDirect) {
   const unsigned char buffer[] =