You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2023/01/10 01:19:36 UTC

[datasketches-cpp] 01/01: compressed theta sketch

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch theta_compression
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit c88f1a52265fb41ac5f82bd8005cc11f834d0ff7
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Mon Jan 9 17:19:28 2023 -0800

    compressed theta sketch
---
 common/include/count_zeros.hpp                     |   11 +
 theta/include/bit_packing.hpp                      | 6286 ++++++++++++++++++++
 theta/include/compact_theta_sketch_parser.hpp      |   23 +-
 theta/include/compact_theta_sketch_parser_impl.hpp |   95 +-
 theta/include/theta_sketch.hpp                     |   46 +-
 theta/include/theta_sketch_impl.hpp                |  437 +-
 theta/test/theta_sketch_test.cpp                   |   19 +-
 7 files changed, 6831 insertions(+), 86 deletions(-)

diff --git a/common/include/count_zeros.hpp b/common/include/count_zeros.hpp
index cdd9940..51cbc0c 100644
--- a/common/include/count_zeros.hpp
+++ b/common/include/count_zeros.hpp
@@ -91,6 +91,17 @@ static inline uint8_t count_leading_zeros_in_u64(uint64_t input) {
     return 56 + byte_leading_zeros_table[(input      ) & FCLZ_MASK_08];
 }
 
+static inline uint8_t count_leading_zeros_in_u32(uint32_t input) { // Counts leading zero bits of a 32-bit value: picks a byte by comparing against the FCLZ masks, then looks that byte up in byte_leading_zeros_table.
+  if (input > FCLZ_MASK_24) // NOTE(review): FCLZ_MASK_* are defined outside this hunk; presumably 0xFFFFFF here, i.e. the top byte is non-zero - confirm
+    return      byte_leading_zeros_table[(input >> 24) & FCLZ_MASK_08];
+  if (input > FCLZ_MASK_16) // otherwise try bits 23..16, adding the 8 bits already skipped
+    return 8 + byte_leading_zeros_table[(input >> 16) & FCLZ_MASK_08];
+  if (input > FCLZ_MASK_08) // otherwise try bits 15..8, adding the 16 bits already skipped
+    return 16 + byte_leading_zeros_table[(input >>  8) & FCLZ_MASK_08];
+  if (true) // always taken: fallback for inputs that passed none of the tests above (uses the lowest byte)
+    return 24 + byte_leading_zeros_table[(input      ) & FCLZ_MASK_08];
+}
+
 static inline uint8_t count_trailing_zeros_in_u32(uint32_t input) {
   for (int i = 0; i < 4; i++) {
     const int byte = input & 0xff;
diff --git a/theta/include/bit_packing.hpp b/theta/include/bit_packing.hpp
new file mode 100644
index 0000000..48166e8
--- /dev/null
+++ b/theta/include/bit_packing.hpp
@@ -0,0 +1,6286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef BIT_PACKING_HPP_
+#define BIT_PACKING_HPP_
+
+#include <memory>
+
+namespace datasketches {
+
+static inline uint8_t pack_bits(uint64_t value, uint8_t width, uint8_t*& ptr, uint8_t offset) { // Appends the low `width` bits of value, MSB first, to the bit stream at ptr; offset = bits already used in *ptr. Advances ptr past completed bytes and returns the new bit offset (0 = byte boundary).
+  if (offset > 0) { // the current byte is partially filled
+    const uint8_t chunk_bits = 8 - offset; // free bits left in *ptr
+    const uint8_t mask = (1 << chunk_bits) - 1; // selects those free low bits
+    if (width < chunk_bits) { // value fits in the current byte without completing it
+      *ptr |= (value << (chunk_bits - width)) & mask; // ORed in, so this relies on the free bits of *ptr being 0
+      return offset + width;
+    }
+    *ptr++ |= (value >> (width - chunk_bits)) & mask; // the top chunk_bits of value complete this byte
+    width -= chunk_bits;
+  }
+  while (width >= 8) { // whole bytes, most significant first
+    *ptr++ = value >> (width - 8);
+    width -= 8;
+  }
+  if (width > 0) { // leftover bits start a new byte, left-aligned; the assignment zeroes its remaining low bits
+    *ptr = value << (8 - width);
+    return width;
+  }
+  return 0;
+}
+
+static inline uint8_t unpack_bits(uint64_t& value, uint8_t width, const uint8_t*& ptr, uint8_t offset) { // Reads `width` bits, MSB first, from the bit stream at ptr starting `offset` bits into *ptr; stores them in value, advances ptr past consumed bytes and returns the new bit offset.
+  const uint8_t avail_bits = 8 - offset; // unread bits left in *ptr
+  const uint8_t chunk_bits = std::min(avail_bits, width); // NOTE(review): std::min needs <algorithm>; the only include visible in this header is <memory>
+  const uint8_t mask = (1 << chunk_bits) - 1;
+  value = (*ptr >> (avail_bits - chunk_bits)) & mask; // first chunk comes from the current byte
+  ptr += avail_bits == chunk_bits; // step to the next byte only if the current one is used up
+  offset = (offset + chunk_bits) & 7;
+  width -= chunk_bits;
+  while (width >= 8) { // whole bytes
+    value <<= 8;
+    value |= *ptr++;
+    width -= 8;
+  }
+  if (width > 0) { // remaining bits are the top `width` bits of the next byte; ptr stays on that byte
+    value <<= width;
+    value |= *ptr >> (8 - width);
+    return width;
+  }
+  return offset;
+}
+
+static inline size_t pack_ULEB128(uint64_t value, uint8_t* ptr) { // Writes value as unsigned LEB128 (7 bits per byte, least significant group first) at ptr and returns the number of bytes written.
+  const uint8_t* start = ptr;
+  while (value >= 0x80) { // more than 7 bits remain
+    *ptr++ = value | 0x80; // low 7 bits plus the continuation flag in bit 7
+    value >>= 7;
+  }
+  *ptr++ = value; // final group, bit 7 clear
+  return ptr - start;
+}
+
+// pack_bits_N: pack a block of 8 64-bit values, N bits each, into N bytes (most significant bits first)
+
+static inline void pack_bits_1(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 1 bit each, into the single byte at ptr (values[0] in the top bit). No masking: bits above bit 0 of a value can spill into the neighbouring field.
+  *ptr = values[0] << 7;
+  *ptr |= values[1] << 6;
+  *ptr |= values[2] << 5;
+  *ptr |= values[3] << 4;
+  *ptr |= values[4] << 3;
+  *ptr |= values[5] << 2;
+  *ptr |= values[6] << 1;
+  *ptr |= values[7];
+}
+
+static inline void pack_bits_2(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 2 bits each, MSB first into 2 bytes at ptr. No masking: bits above bit 1 of a value can spill into the neighbouring field.
+  *ptr = values[0] << 6;
+  *ptr |= values[1] << 4;
+  *ptr |= values[2] << 2;
+  *ptr++ |= values[3];
+
+  *ptr = values[4] << 6;
+  *ptr |= values[5] << 4;
+  *ptr |= values[6] << 2;
+  *ptr |= values[7];
+}
+
+static inline void pack_bits_3(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 3 bits each, MSB first into 3 bytes at ptr. No masking: bits above bit 2 of a value can spill into the neighbouring field.
+  *ptr = values[0] << 5;
+  *ptr |= values[1] << 2;
+  *ptr++ |= values[2] >> 1;
+
+  *ptr = values[2] << 7;
+  *ptr |= values[3] << 4;
+  *ptr |= values[4] << 1;
+  *ptr++ |= values[5] >> 2;
+
+  *ptr = values[5] << 6;
+  *ptr |= values[6] << 3;
+  *ptr |= values[7];
+}
+
+static inline void pack_bits_4(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 4 bits each, MSB first into 4 bytes at ptr (two values per byte). No masking: bits above bit 3 of a value can spill into the neighbouring field.
+  *ptr = values[0] << 4;
+  *ptr++ |= values[1];
+
+  *ptr = values[2] << 4;
+  *ptr++ |= values[3];
+
+  *ptr = values[4] << 4;
+  *ptr++ |= values[5];
+
+  *ptr = values[6] << 4;
+  *ptr |= values[7];
+}
+
+static inline void pack_bits_5(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 5 bits each, MSB first into 5 bytes at ptr. No masking: bits above bit 4 of a value can spill into the neighbouring field.
+  *ptr = values[0] << 3;
+  *ptr++ |= values[1] >> 2;
+
+  *ptr = values[1] << 6;
+  *ptr |= values[2] << 1;
+  *ptr++ |= values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 1;
+
+  *ptr = values[4] << 7;
+  *ptr |= values[5] << 2;
+  *ptr++ |= values[6] >> 3;
+
+  *ptr = values[6] << 5;
+  *ptr |= values[7];
+}
+
+static inline void pack_bits_6(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 6 bits each, MSB first into 6 bytes at ptr. No masking: bits above bit 5 of a value can spill into the neighbouring field.
+  *ptr = values[0] << 2;
+  *ptr++ |= values[1] >> 4;
+
+  *ptr = values[1] << 4;
+  *ptr++ |= values[2] >> 2;
+
+  *ptr = values[2] << 6;
+  *ptr++ |= values[3];
+
+  *ptr = values[4] << 2;
+  *ptr++ |= values[5] >> 4;
+
+  *ptr = values[5] << 4;
+  *ptr++ |= values[6] >> 2;
+
+  *ptr = values[6] << 6;
+  *ptr |= values[7];
+}
+
+static inline void pack_bits_7(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 7 bits each, MSB first into 7 bytes at ptr. No masking: bits above bit 6 of a value can spill into the neighbouring field.
+  *ptr = values[0] << 1;
+  *ptr++ |= values[1] >> 6;
+
+  *ptr = values[1] << 2;
+  *ptr++ |= values[2] >> 5;
+
+  *ptr = values[2] << 3;
+  *ptr++ |= values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 3;
+
+  *ptr = values[4] << 5;
+  *ptr++ |= values[5] >> 2;
+
+  *ptr = values[5] << 6;
+  *ptr++ |= values[6] >> 1;
+
+  *ptr = values[6] << 7;
+  *ptr |= values[7];
+}
+
+static inline void pack_bits_8(const uint64_t* values, uint8_t* ptr) { // Packs the low 8 bits of each of values[0..7] into 8 bytes at ptr; higher bits are dropped by the uint8_t truncation.
+  *ptr++ = values[0];
+  *ptr++ = values[1];
+  *ptr++ = values[2];
+  *ptr++ = values[3];
+  *ptr++ = values[4];
+  *ptr++ = values[5];
+  *ptr++ = values[6];
+  *ptr = values[7];
+}
+
+static inline void pack_bits_9(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 9 bits each, MSB first into 9 bytes at ptr. No masking: bits above bit 8 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 1;
+
+  *ptr = values[0] << 7;
+  *ptr++ |= values[1] >> 2;
+
+  *ptr = values[1] << 6;
+  *ptr++ |= values[2] >> 3;
+
+  *ptr = values[2] << 5;
+  *ptr++ |= values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 5;
+
+  *ptr = values[4] << 3;
+  *ptr++ |= values[5] >> 6;
+
+  *ptr = values[5] << 2;
+  *ptr++ |= values[6] >> 7;
+
+  *ptr = values[6] << 1;
+  *ptr++ |= values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_10(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 10 bits each, MSB first into 10 bytes at ptr. No masking: bits above bit 9 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 2;
+
+  *ptr = values[0] << 6;
+  *ptr++ |= values[1] >> 4;
+
+  *ptr = values[1] << 4;
+  *ptr++ |= values[2] >> 6;
+
+  *ptr = values[2] << 2;
+  *ptr++ |= values[3] >> 8;
+
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 2;
+
+  *ptr = values[4] << 6;
+  *ptr++ |= values[5] >> 4;
+
+  *ptr = values[5] << 4;
+  *ptr++ |= values[6] >> 6;
+
+  *ptr = values[6] << 2;
+  *ptr++ |= values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_11(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 11 bits each, MSB first into 11 bytes at ptr. No masking: bits above bit 10 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 3;
+
+  *ptr = values[0] << 5;
+  *ptr++ |= values[1] >> 6;
+
+  *ptr = values[1] << 2;
+  *ptr++ |= values[2] >> 9;
+
+  *ptr++ = values[2] >> 1;
+
+  *ptr = values[2] << 7;
+  *ptr++ |= values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 7;
+
+  *ptr = values[4] << 1;
+  *ptr++ |= values[5] >> 10;
+
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6;
+  *ptr++ |= values[6] >> 5;
+
+  *ptr = values[6] << 3;
+  *ptr++ |= values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_12(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 12 bits each, MSB first into 12 bytes at ptr (each pair of values fills 3 bytes). No masking: bits above bit 11 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 4;
+
+  *ptr = values[0] << 4;
+  *ptr++ |= values[1] >> 8;
+
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 4;
+
+  *ptr = values[2] << 4;
+  *ptr++ |= values[3] >> 8;
+
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 4;
+
+  *ptr = values[4] << 4;
+  *ptr++ |= values[5] >> 8;
+
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 4;
+
+  *ptr = values[6] << 4;
+  *ptr++ |= values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 13 bits each, MSB first into 13 bytes at ptr. No masking: bits above bit 12 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 5; // byte 0: values[0] bits 12..5
+  // byte 1: values[0] bits 4..0 | values[1] bits 12..10
+  *ptr = values[0] << 3;
+  *ptr++ |= values[1] >> 10;
+  // byte 2: values[1] bits 9..2
+  *ptr++ = values[1] >> 2;
+  // byte 3: values[1] bits 1..0 | values[2] bits 12..7
+  *ptr = values[1] << 6;
+  *ptr++ |= values[2] >> 7;
+  // byte 4: values[2] bits 6..0 | values[3] bit 12
+  *ptr = values[2] << 1;
+  *ptr++ |= values[3] >> 12;
+  // byte 5: values[3] bits 11..4
+  *ptr++ = values[3] >> 4;
+  // byte 6: values[3] bits 3..0 | values[4] bits 12..9
+  *ptr = values[3] << 4; // left shift: the low 4 bits of values[3] belong in the top nibble of this byte
+  *ptr++ |= values[4] >> 9;
+  // byte 7: values[4] bits 8..1
+  *ptr++ = values[4] >> 1;
+  // byte 8: values[4] bit 0 | values[5] bits 12..6
+  *ptr = values[4] << 7;
+  *ptr++ |= values[5] >> 6;
+  // byte 9: values[5] bits 5..0 | values[6] bits 12..11
+  *ptr = values[5] << 2;
+  *ptr++ |= values[6] >> 11;
+  // byte 10: values[6] bits 10..3
+  *ptr++ = values[6] >> 3;
+  // byte 11: values[6] bits 2..0 | values[7] bits 12..8
+  *ptr = values[6] << 5;
+  *ptr++ |= values[7] >> 8;
+  // byte 12: values[7] bits 7..0
+  *ptr = values[7];
+}
+
+static inline void pack_bits_14(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 14 bits each, MSB first into 14 bytes at ptr. No masking: bits above bit 13 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 6;
+
+  *ptr = values[0] << 2;
+  *ptr++ |= values[1] >> 12;
+
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4;
+  *ptr++ |= values[2] >> 10;
+
+  *ptr++ = values[2] >> 2;
+
+  *ptr = values[2] << 6;
+  *ptr++ |= values[3] >> 8;
+
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 6;
+
+  *ptr = values[4] << 2;
+  *ptr++ |= values[5] >> 12;
+
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4;
+  *ptr++ |= values[6] >> 10;
+
+  *ptr++ = values[6] >> 2;
+
+  *ptr = values[6] << 6;
+  *ptr++ |= values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_15(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 15 bits each, MSB first into 15 bytes at ptr. No masking: bits above bit 14 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 7;
+
+  *ptr = values[0] << 1;
+  *ptr++ |= values[1] >> 14;
+
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2;
+  *ptr++ |= values[2] >> 13;
+
+  *ptr++ = values[2] >> 5;
+
+  *ptr = values[2] << 3;
+  *ptr++ |= values[3] >> 12;
+
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 11;
+
+  *ptr++ = values[4] >> 3;
+
+  *ptr = values[4] << 5;
+  *ptr++ |= values[5] >> 10;
+
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6;
+  *ptr++ |= values[6] >> 9;
+
+  *ptr++ = values[6] >> 1;
+
+  *ptr = values[6] << 7;
+  *ptr++ |= values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_16(const uint64_t* values, uint8_t* ptr) { // Packs the low 16 bits of each of values[0..7], MSB first, into 16 bytes at ptr; higher bits are dropped by the uint8_t truncation.
+  *ptr++ = values[0] >> 8;
+  *ptr++ = values[0];
+
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 8;
+  *ptr++ = values[2];
+
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 8;
+  *ptr++ = values[4];
+
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 8;
+  *ptr++ = values[6];
+
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_17(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 17 bits each, MSB first into 17 bytes at ptr. No masking: bits above bit 16 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 9;
+
+  *ptr++ = values[0] >> 1;
+
+  *ptr = values[0] << 7;
+  *ptr++ |= values[1] >> 10;
+
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6;
+  *ptr++ |= values[2] >> 11;
+
+  *ptr++ = values[2] >> 3;
+
+  *ptr = values[2] << 5;
+  *ptr++ |= values[3] >> 12;
+
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 13;
+
+  *ptr++ = values[4] >> 5;
+
+  *ptr = values[4] << 3;
+  *ptr++ |= values[5] >> 14;
+
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2;
+  *ptr++ |= values[6] >> 15;
+
+  *ptr++ = values[6] >> 7;
+
+  *ptr = values[6] << 1;
+  *ptr++ |= values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_18(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 18 bits each, MSB first into 18 bytes at ptr. No masking: bits above bit 17 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 10;
+
+  *ptr++ = values[0] >> 2;
+
+  *ptr = values[0] << 6;
+  *ptr++ |= values[1] >> 12;
+
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4;
+  *ptr++ |= values[2] >> 14;
+
+  *ptr++ = values[2] >> 6;
+
+  *ptr = values[2] << 2;
+  *ptr++ |= values[3] >> 16;
+
+  *ptr++ = values[3] >> 8;
+
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 10;
+
+  *ptr++ = values[4] >> 2;
+
+  *ptr = values[4] << 6;
+  *ptr++ |= values[5] >> 12;
+
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4;
+  *ptr++ |= values[6] >> 14;
+
+  *ptr++ = values[6] >> 6;
+
+  *ptr = values[6] << 2;
+  *ptr++ |= values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_19(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 19 bits each, MSB first into 19 bytes at ptr. No masking: bits above bit 18 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 11; // byte 0: values[0] bits 18..11
+  // byte 1: values[0] bits 10..3
+  *ptr++ = values[0] >> 3;
+  // byte 2: values[0] bits 2..0 | values[1] bits 18..14
+  *ptr = values[0] << 5;
+  *ptr++ |= values[1] >> 14;
+  // byte 3: values[1] bits 13..6
+  *ptr++ = values[1] >> 6;
+  // byte 4: values[1] bits 5..0 | values[2] bits 18..17
+  *ptr = values[1] << 2;
+  *ptr++ |= values[2] >> 17;
+  // byte 5: values[2] bits 16..9
+  *ptr++ = values[2] >> 9;
+  // byte 6: values[2] bits 8..1
+  *ptr++ = values[2] >> 1;
+  // byte 7: values[2] bit 0 | values[3] bits 18..12
+  *ptr = values[2] << 7;
+  *ptr++ |= values[3] >> 12;
+  // byte 8: values[3] bits 11..4
+  *ptr++ = values[3] >> 4;
+  // byte 9: values[3] bits 3..0 | values[4] bits 18..15
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 15;
+  // byte 10: values[4] bits 14..7 (plain store: nothing was written to this byte yet, so OR-ing would keep stale buffer bits)
+  *ptr++ = values[4] >> 7;
+  // byte 11: values[4] bits 6..0 | values[5] bit 18
+  *ptr = values[4] << 1;
+  *ptr++ |= values[5] >> 18;
+  // byte 12: values[5] bits 17..10
+  *ptr++ = values[5] >> 10;
+  // byte 13: values[5] bits 9..2
+  *ptr++ = values[5] >> 2;
+  // byte 14: values[5] bits 1..0 | values[6] bits 18..13
+  *ptr = values[5] << 6;
+  *ptr++ |= values[6] >> 13;
+  // byte 15: values[6] bits 12..5
+  *ptr++ = values[6] >> 5;
+  // byte 16: values[6] bits 4..0 | values[7] bits 18..16
+  *ptr = values[6] << 3;
+  *ptr++ |= values[7] >> 16;
+  // byte 17: values[7] bits 15..8
+  *ptr++ = values[7] >> 8;
+  // byte 18: values[7] bits 7..0
+  *ptr = values[7];
+}
+
+static inline void pack_bits_20(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 20 bits each, MSB first into 20 bytes at ptr (each pair of values fills 5 bytes). No masking: bits above bit 19 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 12;
+
+  *ptr++ = values[0] >> 4;
+
+  *ptr = values[0] << 4;
+  *ptr++ |= values[1] >> 16;
+
+  *ptr++ = values[1] >> 8;
+
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 12;
+
+  *ptr++ = values[2] >> 4;
+
+  *ptr = values[2] << 4;
+  *ptr++ |= values[3] >> 16;
+
+  *ptr++ = values[3] >> 8;
+
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 12;
+
+  *ptr++ = values[4] >> 4;
+
+  *ptr = values[4] << 4;
+  *ptr++ |= values[5] >> 16;
+
+  *ptr++ = values[5] >> 8;
+
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 12;
+
+  *ptr++ = values[6] >> 4;
+
+  *ptr = values[6] << 4;
+  *ptr++ |= values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_21(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 21 bits each, MSB first into 21 bytes at ptr. No masking: bits above bit 20 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 13;
+
+  *ptr++ = values[0] >> 5;
+
+  *ptr = values[0] << 3;
+  *ptr++ |= values[1] >> 18;
+
+  *ptr++ = values[1] >> 10;
+
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6;
+  *ptr++ |= values[2] >> 15;
+
+  *ptr++ = values[2] >> 7;
+
+  *ptr = values[2] << 1;
+  *ptr++ |= values[3] >> 20;
+
+  *ptr++ = values[3] >> 12;
+
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 17;
+
+  *ptr++ = values[4] >> 9;
+
+  *ptr++ = values[4] >> 1;
+
+  *ptr = values[4] << 7;
+  *ptr++ |= values[5] >> 14;
+
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2;
+  *ptr++ |= values[6] >> 19;
+
+  *ptr++ = values[6] >> 11;
+
+  *ptr++ = values[6] >> 3;
+
+  *ptr = values[6] << 5;
+  *ptr++ |= values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_22(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 22 bits each, MSB first into 22 bytes at ptr. No masking: bits above bit 21 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 14;
+
+  *ptr++ = values[0] >> 6;
+
+  *ptr = values[0] << 2;
+  *ptr++ |= values[1] >> 20;
+
+  *ptr++ = values[1] >> 12;
+
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4;
+  *ptr++ |= values[2] >> 18;
+
+  *ptr++ = values[2] >> 10;
+
+  *ptr++ = values[2] >> 2;
+
+  *ptr = values[2] << 6;
+  *ptr++ |= values[3] >> 16;
+
+  *ptr++ = values[3] >> 8;
+
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 14;
+
+  *ptr++ = values[4] >> 6;
+
+  *ptr = values[4] << 2;
+  *ptr++ |= values[5] >> 20;
+
+  *ptr++ = values[5] >> 12;
+
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4;
+  *ptr++ |= values[6] >> 18;
+
+  *ptr++ = values[6] >> 10;
+
+  *ptr++ = values[6] >> 2;
+
+  *ptr = values[6] << 6;
+  *ptr++ |= values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_23(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 23 bits each, MSB first into 23 bytes at ptr. No masking: bits above bit 22 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 15;
+
+  *ptr++ = values[0] >> 7;
+
+  *ptr = values[0] << 1;
+  *ptr++ |= values[1] >> 22;
+
+  *ptr++ = values[1] >> 14;
+
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2;
+  *ptr++ |= values[2] >> 21;
+
+  *ptr++ = values[2] >> 13;
+
+  *ptr++ = values[2] >> 5;
+
+  *ptr = values[2] << 3;
+  *ptr++ |= values[3] >> 20;
+
+  *ptr++ = values[3] >> 12;
+
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 19;
+
+  *ptr++ = values[4] >> 11;
+
+  *ptr++ = values[4] >> 3;
+
+  *ptr = values[4] << 5;
+  *ptr++ |= values[5] >> 18;
+
+  *ptr++ = values[5] >> 10;
+
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6;
+  *ptr++ |= values[6] >> 17;
+
+  *ptr++ = values[6] >> 9;
+
+  *ptr++ = values[6] >> 1;
+
+  *ptr = values[6] << 7;
+  *ptr++ |= values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_24(const uint64_t* values, uint8_t* ptr) { // Packs the low 24 bits of each of values[0..7], MSB first, into 24 bytes at ptr; higher bits are dropped by the uint8_t truncation.
+  *ptr++ = values[0] >> 16;
+  *ptr++ = values[0] >> 8;
+  *ptr++ = values[0];
+
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 16;
+  *ptr++ = values[2] >> 8;
+  *ptr++ = values[2];
+
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 16;
+  *ptr++ = values[4] >> 8;
+  *ptr++ = values[4];
+
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 16;
+  *ptr++ = values[6] >> 8;
+  *ptr++ = values[6];
+
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_25(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 25 bits each, MSB first into 25 bytes at ptr. No masking: bits above bit 24 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 17;
+
+  *ptr++ = values[0] >> 9;
+
+  *ptr++ = values[0] >> 1;
+
+  *ptr = values[0] << 7;
+  *ptr++ |= values[1] >> 18;
+
+  *ptr++ = values[1] >> 10;
+
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6;
+  *ptr++ |= values[2] >> 19;
+
+  *ptr++ = values[2] >> 11;
+
+  *ptr++ = values[2] >> 3;
+
+  *ptr = values[2] << 5;
+  *ptr++ |= values[3] >> 20;
+
+  *ptr++ = values[3] >> 12;
+
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 21;
+
+  *ptr++ = values[4] >> 13;
+
+  *ptr++ = values[4] >> 5;
+
+  *ptr = values[4] << 3;
+  *ptr++ |= values[5] >> 22;
+
+  *ptr++ = values[5] >> 14;
+
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2;
+  *ptr++ |= values[6] >> 23;
+
+  *ptr++ = values[6] >> 15;
+
+  *ptr++ = values[6] >> 7;
+
+  *ptr = values[6] << 1;
+  *ptr++ |= values[7] >> 24;
+
+  *ptr++ = values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_26(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 26 bits each, MSB first into 26 bytes at ptr. No masking: bits above bit 25 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 18;
+
+  *ptr++ = values[0] >> 10;
+
+  *ptr++ = values[0] >> 2;
+
+  *ptr = values[0] << 6;
+  *ptr++ |= values[1] >> 20;
+
+  *ptr++ = values[1] >> 12;
+
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4;
+  *ptr++ |= values[2] >> 22;
+
+  *ptr++ = values[2] >> 14;
+
+  *ptr++ = values[2] >> 6;
+
+  *ptr = values[2] << 2;
+  *ptr++ |= values[3] >> 24;
+
+  *ptr++ = values[3] >> 16;
+
+  *ptr++ = values[3] >> 8;
+
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 18;
+
+  *ptr++ = values[4] >> 10;
+
+  *ptr++ = values[4] >> 2;
+
+  *ptr = values[4] << 6;
+  *ptr++ |= values[5] >> 20;
+
+  *ptr++ = values[5] >> 12;
+
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4;
+  *ptr++ |= values[6] >> 22;
+
+  *ptr++ = values[6] >> 14;
+
+  *ptr++ = values[6] >> 6;
+
+  *ptr = values[6] << 2;
+  *ptr++ |= values[7] >> 24;
+
+  *ptr++ = values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_27(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 27 bits each, MSB first into 27 bytes at ptr. No masking: bits above bit 26 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 19;
+
+  *ptr++ = values[0] >> 11;
+
+  *ptr++ = values[0] >> 3;
+
+  *ptr = values[0] << 5;
+  *ptr++ |= values[1] >> 22;
+
+  *ptr++ = values[1] >> 14;
+
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2;
+  *ptr++ |= values[2] >> 25;
+
+  *ptr++ = values[2] >> 17;
+
+  *ptr++ = values[2] >> 9;
+
+  *ptr++ = values[2] >> 1;
+
+  *ptr = values[2] << 7;
+  *ptr++ |= values[3] >> 20;
+
+  *ptr++ = values[3] >> 12;
+
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 23;
+
+  *ptr++ = values[4] >> 15;
+
+  *ptr++ = values[4] >> 7;
+
+  *ptr = values[4] << 1;
+  *ptr++ |= values[5] >> 26;
+
+  *ptr++ = values[5] >> 18;
+
+  *ptr++ = values[5] >> 10;
+
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6;
+  *ptr++ |= values[6] >> 21;
+
+  *ptr++ = values[6] >> 13;
+
+  *ptr++ = values[6] >> 5;
+
+  *ptr = values[6] << 3;
+  *ptr++ |= values[7] >> 24;
+
+  *ptr++ = values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_28(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 28 bits each, MSB first into 28 bytes at ptr (each pair of values fills 7 bytes). No masking: bits above bit 27 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 20;
+  *ptr++ = values[0] >> 12;
+  *ptr++ = values[0] >> 4;
+  *ptr = values[0] << 4;
+  *ptr++ |= values[1] >> 24;
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1];
+  *ptr++ = values[2] >> 20;
+  *ptr++ = values[2] >> 12;
+  *ptr++ = values[2] >> 4;
+  *ptr = values[2] << 4;
+  *ptr++ |= values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+  *ptr++ = values[4] >> 20;
+  *ptr++ = values[4] >> 12;
+  *ptr++ = values[4] >> 4;
+  *ptr = values[4] << 4;
+  *ptr++ |= values[5] >> 24;
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5];
+  *ptr++ = values[6] >> 20;
+  *ptr++ = values[6] >> 12;
+  *ptr++ = values[6] >> 4;
+  *ptr = values[6] << 4;
+  *ptr++ |= values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_29(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 29 bits each, MSB first into 29 bytes at ptr. No masking: bits above bit 28 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 21;
+
+  *ptr++ = values[0] >> 13;
+
+  *ptr++ = values[0] >> 5;
+
+  *ptr = values[0] << 3;
+  *ptr++ |= values[1] >> 26;
+
+  *ptr++ = values[1] >> 18;
+
+  *ptr++ = values[1] >> 10;
+
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6;
+  *ptr++ |= values[2] >> 23;
+
+  *ptr++ = values[2] >> 15;
+
+  *ptr++ = values[2] >> 7;
+
+  *ptr = values[2] << 1;
+  *ptr++ |= values[3] >> 28;
+
+  *ptr++ = values[3] >> 20;
+
+  *ptr++ = values[3] >> 12;
+
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 25;
+
+  *ptr++ = values[4] >> 17;
+
+  *ptr++ = values[4] >> 9;
+
+  *ptr++ = values[4] >> 1;
+
+  *ptr = values[4] << 7;
+  *ptr++ |= values[5] >> 22;
+
+  *ptr++ = values[5] >> 14;
+
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2;
+  *ptr++ |= values[6] >> 27;
+
+  *ptr++ = values[6] >> 19;
+
+  *ptr++ = values[6] >> 11;
+
+  *ptr++ = values[6] >> 3;
+
+  *ptr = values[6] << 5;
+  *ptr++ |= values[7] >> 24;
+
+  *ptr++ = values[7] >> 16;
+
+  *ptr++ = values[7] >> 8;
+
+  *ptr = values[7];
+}
+
+static inline void pack_bits_30(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 30 bits each, MSB first into 30 bytes at ptr. No masking: bits above bit 29 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 22;
+  *ptr++ = values[0] >> 14;
+  *ptr++ = values[0] >> 6;
+
+  *ptr = values[0] << 2;
+  *ptr++ |= values[1] >> 28;
+  *ptr++ = values[1] >> 20;
+  *ptr++ = values[1] >> 12;
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4;
+  *ptr++ |= values[2] >> 26;
+  *ptr++ = values[2] >> 18;
+  *ptr++ = values[2] >> 10;
+  *ptr++ = values[2] >> 2;
+
+  *ptr = values[2] << 6;
+  *ptr++ |= values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 22;
+  *ptr++ = values[4] >> 14;
+  *ptr++ = values[4] >> 6;
+
+  *ptr = values[4] << 2;
+  *ptr++ |= values[5] >> 28;
+  *ptr++ = values[5] >> 20;
+  *ptr++ = values[5] >> 12;
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4;
+  *ptr++ |= values[6] >> 26;
+  *ptr++ = values[6] >> 18;
+  *ptr++ = values[6] >> 10;
+  *ptr++ = values[6] >> 2;
+
+  *ptr = values[6] << 6;
+  *ptr++ |= values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_31(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 31 bits each, MSB first into 31 bytes at ptr. No masking: bits above bit 30 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 23;
+  *ptr++ = values[0] >> 15;
+  *ptr++ = values[0] >> 7;
+
+  *ptr = values[0] << 1;
+  *ptr++ |= values[1] >> 30;
+  *ptr++ = values[1] >> 22;
+  *ptr++ = values[1] >> 14;
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2;
+  *ptr++ |= values[2] >> 29;
+  *ptr++ = values[2] >> 21;
+  *ptr++ = values[2] >> 13;
+  *ptr++ = values[2] >> 5;
+
+  *ptr = values[2] << 3;
+  *ptr++ |= values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 27;
+  *ptr++ = values[4] >> 19;
+  *ptr++ = values[4] >> 11;
+  *ptr++ = values[4] >> 3;
+
+  *ptr = values[4] << 5;
+  *ptr++ |= values[5] >> 26;
+  *ptr++ = values[5] >> 18;
+  *ptr++ = values[5] >> 10;
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6;
+  *ptr++ |= values[6] >> 25;
+  *ptr++ = values[6] >> 17;
+  *ptr++ = values[6] >> 9;
+  *ptr++ = values[6] >> 1;
+
+  *ptr = values[6] << 7;
+  *ptr++ |= values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_32(const uint64_t* values, uint8_t* ptr) { // Packs the low 32 bits of each of values[0..7], MSB first, into 32 bytes at ptr; higher bits are dropped by the uint8_t truncation.
+  *ptr++ = values[0] >> 24;
+  *ptr++ = values[0] >> 16;
+  *ptr++ = values[0] >> 8;
+  *ptr++ = values[0];
+
+  *ptr++ = values[1] >> 24;
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 24;
+  *ptr++ = values[2] >> 16;
+  *ptr++ = values[2] >> 8;
+  *ptr++ = values[2];
+
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 24;
+  *ptr++ = values[4] >> 16;
+  *ptr++ = values[4] >> 8;
+  *ptr++ = values[4];
+
+  *ptr++ = values[5] >> 24;
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 24;
+  *ptr++ = values[6] >> 16;
+  *ptr++ = values[6] >> 8;
+  *ptr++ = values[6];
+
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_33(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 33 bits each, MSB first into 33 bytes at ptr. No masking: bits above bit 32 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 25;
+  *ptr++ = values[0] >> 17;
+  *ptr++ = values[0] >> 9;
+  *ptr++ = values[0] >> 1;
+
+  *ptr = values[0] << 7;
+  *ptr++ |= values[1] >> 26;
+  *ptr++ = values[1] >> 18;
+  *ptr++ = values[1] >> 10;
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6;
+  *ptr++ |= values[2] >> 27;
+  *ptr++ = values[2] >> 19;
+  *ptr++ = values[2] >> 11;
+  *ptr++ = values[2] >> 3;
+
+  *ptr = values[2] << 5;
+  *ptr++ |= values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 29;
+  *ptr++ = values[4] >> 21;
+  *ptr++ = values[4] >> 13;
+  *ptr++ = values[4] >> 5;
+
+  *ptr = values[4] << 3;
+  *ptr++ |= values[5] >> 30;
+  *ptr++ = values[5] >> 22;
+  *ptr++ = values[5] >> 14;
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2;
+  *ptr++ |= values[6] >> 31;
+  *ptr++ = values[6] >> 23;
+  *ptr++ = values[6] >> 15;
+  *ptr++ = values[6] >> 7;
+
+  *ptr = values[6] << 1;
+  *ptr++ |= values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_34(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 34 bits each, MSB first into 34 bytes at ptr. No masking: bits above bit 33 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 26;
+  *ptr++ = values[0] >> 18;
+  *ptr++ = values[0] >> 10;
+  *ptr++ = values[0] >> 2;
+
+  *ptr = values[0] << 6;
+  *ptr++ |= values[1] >> 28;
+  *ptr++ = values[1] >> 20;
+  *ptr++ = values[1] >> 12;
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4;
+  *ptr++ |= values[2] >> 30;
+  *ptr++ = values[2] >> 22;
+  *ptr++ = values[2] >> 14;
+  *ptr++ = values[2] >> 6;
+
+  *ptr = values[2] << 2;
+  *ptr++ |= values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 26;
+  *ptr++ = values[4] >> 18;
+  *ptr++ = values[4] >> 10;
+  *ptr++ = values[4] >> 2;
+
+  *ptr = values[4] << 6;
+  *ptr++ |= values[5] >> 28;
+  *ptr++ = values[5] >> 20;
+  *ptr++ = values[5] >> 12;
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4;
+  *ptr++ |= values[6] >> 30;
+  *ptr++ = values[6] >> 22;
+  *ptr++ = values[6] >> 14;
+  *ptr++ = values[6] >> 6;
+
+  *ptr = values[6] << 2;
+  *ptr++ |= values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_35(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 35 bits each, MSB first into 35 bytes at ptr. No masking: bits above bit 34 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 27;
+  *ptr++ = values[0] >> 19;
+  *ptr++ = values[0] >> 11;
+  *ptr++ = values[0] >> 3;
+
+  *ptr = values[0] << 5;
+  *ptr++ |= values[1] >> 30;
+  *ptr++ = values[1] >> 22;
+  *ptr++ = values[1] >> 14;
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2;
+  *ptr++ |= values[2] >> 33;
+  *ptr++ = values[2] >> 25;
+  *ptr++ = values[2] >> 17;
+  *ptr++ = values[2] >> 9;
+  *ptr++ = values[2] >> 1;
+
+  *ptr = values[2] << 7;
+  *ptr++ |= values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4;
+  *ptr++ |= values[4] >> 31;
+  *ptr++ = values[4] >> 23;
+  *ptr++ = values[4] >> 15;
+  *ptr++ = values[4] >> 7;
+
+  *ptr = values[4] << 1;
+  *ptr++ |= values[5] >> 34;
+  *ptr++ = values[5] >> 26;
+  *ptr++ = values[5] >> 18;
+  *ptr++ = values[5] >> 10;
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6;
+  *ptr++ |= values[6] >> 29;
+  *ptr++ = values[6] >> 21;
+  *ptr++ = values[6] >> 13;
+  *ptr++ = values[6] >> 5;
+
+  *ptr = values[6] << 3;
+  *ptr++ |= values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_36(const uint64_t* values, uint8_t* ptr) { // Packs values[0..7], 36 bits each, MSB first into 36 bytes at ptr (each pair of values fills 9 bytes). No masking: bits above bit 35 of a value can spill into the neighbouring field.
+  *ptr++ = values[0] >> 28;
+  *ptr++ = values[0] >> 20;
+  *ptr++ = values[0] >> 12;
+  *ptr++ = values[0] >> 4;
+
+  *ptr = values[0] << 4;
+  *ptr++ |= values[1] >> 32;
+  *ptr++ = values[1] >> 24;
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 28;
+  *ptr++ = values[2] >> 20;
+  *ptr++ = values[2] >> 12;
+  *ptr++ = values[2] >> 4;
+
+  *ptr = values[2] << 4;
+  *ptr++ |= values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 28;
+  *ptr++ = values[4] >> 20;
+  *ptr++ = values[4] >> 12;
+  *ptr++ = values[4] >> 4;
+
+  *ptr = values[4] << 4;
+  *ptr++ |= values[5] >> 32;
+  *ptr++ = values[5] >> 24;
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 28;
+  *ptr++ = values[6] >> 20;
+  *ptr++ = values[6] >> 12;
+  *ptr++ = values[6] >> 4;
+
+  *ptr = values[6] << 4;
+  *ptr++ |= values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7];
+}
+
+static inline void pack_bits_37(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 37-bit fields into 37 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 29; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 21;
+  *ptr++ = values[0] >> 13;
+  *ptr++ = values[0] >> 5;
+
+  *ptr = values[0] << 3; // shared byte: low 5 bits of values[0], then top 3 bits of values[1]
+  *ptr++ |= values[1] >> 34; // NOTE(review): OR without a mask - presumably each value fits in 37 bits; confirm against callers
+  *ptr++ = values[1] >> 26;
+  *ptr++ = values[1] >> 18;
+  *ptr++ = values[1] >> 10;
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6; // shared byte: low 2 bits of values[1], then top 6 bits of values[2]
+  *ptr++ |= values[2] >> 31;
+  *ptr++ = values[2] >> 23;
+  *ptr++ = values[2] >> 15;
+  *ptr++ = values[2] >> 7;
+
+  *ptr = values[2] << 1; // shared byte: low 7 bits of values[2], then top 1 bit of values[3]
+  *ptr++ |= values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 33;
+  *ptr++ = values[4] >> 25;
+  *ptr++ = values[4] >> 17;
+  *ptr++ = values[4] >> 9;
+  *ptr++ = values[4] >> 1;
+
+  *ptr = values[4] << 7; // shared byte: low 1 bit of values[4], then top 7 bits of values[5]
+  *ptr++ |= values[5] >> 30;
+  *ptr++ = values[5] >> 22;
+  *ptr++ = values[5] >> 14;
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2; // shared byte: low 6 bits of values[5], then top 2 bits of values[6]
+  *ptr++ |= values[6] >> 35;
+  *ptr++ = values[6] >> 27;
+  *ptr++ = values[6] >> 19;
+  *ptr++ = values[6] >> 11;
+  *ptr++ = values[6] >> 3;
+
+  *ptr = values[6] << 5; // shared byte: low 3 bits of values[6], then top 5 bits of values[7]
+  *ptr++ |= values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_38(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 38-bit fields into 38 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 30; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 22;
+  *ptr++ = values[0] >> 14;
+  *ptr++ = values[0] >> 6;
+
+  *ptr = values[0] << 2; // shared byte: low 6 bits of values[0], then top 2 bits of values[1]
+  *ptr++ |= values[1] >> 36; // NOTE(review): OR without a mask - presumably each value fits in 38 bits; confirm against callers
+  *ptr++ = values[1] >> 28;
+  *ptr++ = values[1] >> 20;
+  *ptr++ = values[1] >> 12;
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4; // shared byte: low 4 bits of values[1], then top 4 bits of values[2]
+  *ptr++ |= values[2] >> 34;
+  *ptr++ = values[2] >> 26;
+  *ptr++ = values[2] >> 18;
+  *ptr++ = values[2] >> 10;
+  *ptr++ = values[2] >> 2;
+
+  *ptr = values[2] << 6; // shared byte: low 2 bits of values[2], then top 6 bits of values[3]
+  *ptr++ |= values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3]; // values[3] ends on a byte boundary; values[4..7] repeat the same pattern
+
+  *ptr++ = values[4] >> 30;
+  *ptr++ = values[4] >> 22;
+  *ptr++ = values[4] >> 14;
+  *ptr++ = values[4] >> 6;
+
+  *ptr = values[4] << 2; // shared byte: low 6 bits of values[4], then top 2 bits of values[5]
+  *ptr++ |= values[5] >> 36;
+  *ptr++ = values[5] >> 28;
+  *ptr++ = values[5] >> 20;
+  *ptr++ = values[5] >> 12;
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4; // shared byte: low 4 bits of values[5], then top 4 bits of values[6]
+  *ptr++ |= values[6] >> 34;
+  *ptr++ = values[6] >> 26;
+  *ptr++ = values[6] >> 18;
+  *ptr++ = values[6] >> 10;
+  *ptr++ = values[6] >> 2;
+
+  *ptr = values[6] << 6; // shared byte: low 2 bits of values[6], then top 6 bits of values[7]
+  *ptr++ |= values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_39(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 39-bit fields into 39 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 31; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 23;
+  *ptr++ = values[0] >> 15;
+  *ptr++ = values[0] >> 7;
+
+  *ptr = values[0] << 1; // shared byte: low 7 bits of values[0], then top 1 bit of values[1]
+  *ptr++ |= values[1] >> 38; // NOTE(review): OR without a mask - presumably each value fits in 39 bits; confirm against callers
+  *ptr++ = values[1] >> 30;
+  *ptr++ = values[1] >> 22;
+  *ptr++ = values[1] >> 14;
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2; // shared byte: low 6 bits of values[1], then top 2 bits of values[2]
+  *ptr++ |= values[2] >> 37;
+  *ptr++ = values[2] >> 29;
+  *ptr++ = values[2] >> 21;
+  *ptr++ = values[2] >> 13;
+  *ptr++ = values[2] >> 5;
+
+  *ptr = values[2] << 3; // shared byte: low 5 bits of values[2], then top 3 bits of values[3]
+  *ptr++ |= values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 35;
+  *ptr++ = values[4] >> 27;
+  *ptr++ = values[4] >> 19;
+  *ptr++ = values[4] >> 11;
+  *ptr++ = values[4] >> 3;
+
+  *ptr = values[4] << 5; // shared byte: low 3 bits of values[4], then top 5 bits of values[5]
+  *ptr++ |= values[5] >> 34;
+  *ptr++ = values[5] >> 26;
+  *ptr++ = values[5] >> 18;
+  *ptr++ = values[5] >> 10;
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6; // shared byte: low 2 bits of values[5], then top 6 bits of values[6]
+  *ptr++ |= values[6] >> 33;
+  *ptr++ = values[6] >> 25;
+  *ptr++ = values[6] >> 17;
+  *ptr++ = values[6] >> 9;
+  *ptr++ = values[6] >> 1;
+
+  *ptr = values[6] << 7; // shared byte: low 1 bit of values[6], then top 7 bits of values[7]
+  *ptr++ |= values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_40(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 40-bit fields into 40 consecutive bytes at ptr, most significant byte first; each field is exactly 5 bytes, so no byte is shared
+  *ptr++ = values[0] >> 32; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 24;
+  *ptr++ = values[0] >> 16;
+  *ptr++ = values[0] >> 8;
+  *ptr++ = values[0];
+
+  *ptr++ = values[1] >> 32;
+  *ptr++ = values[1] >> 24;
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 32;
+  *ptr++ = values[2] >> 24;
+  *ptr++ = values[2] >> 16;
+  *ptr++ = values[2] >> 8;
+  *ptr++ = values[2];
+
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 32;
+  *ptr++ = values[4] >> 24;
+  *ptr++ = values[4] >> 16;
+  *ptr++ = values[4] >> 8;
+  *ptr++ = values[4];
+
+  *ptr++ = values[5] >> 32;
+  *ptr++ = values[5] >> 24;
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 32;
+  *ptr++ = values[6] >> 24;
+  *ptr++ = values[6] >> 16;
+  *ptr++ = values[6] >> 8;
+  *ptr++ = values[6];
+
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_41(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 41-bit fields into 41 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 33; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 25;
+  *ptr++ = values[0] >> 17;
+  *ptr++ = values[0] >> 9;
+  *ptr++ = values[0] >> 1;
+
+  *ptr = values[0] << 7; // shared byte: low 1 bit of values[0], then top 7 bits of values[1]
+  *ptr++ |= values[1] >> 34; // NOTE(review): OR without a mask - presumably each value fits in 41 bits; confirm against callers
+  *ptr++ = values[1] >> 26;
+  *ptr++ = values[1] >> 18;
+  *ptr++ = values[1] >> 10;
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6; // shared byte: low 2 bits of values[1], then top 6 bits of values[2]
+  *ptr++ |= values[2] >> 35;
+  *ptr++ = values[2] >> 27;
+  *ptr++ = values[2] >> 19;
+  *ptr++ = values[2] >> 11;
+  *ptr++ = values[2] >> 3;
+
+  *ptr = values[2] << 5; // shared byte: low 3 bits of values[2], then top 5 bits of values[3]
+  *ptr++ |= values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 37;
+  *ptr++ = values[4] >> 29;
+  *ptr++ = values[4] >> 21;
+  *ptr++ = values[4] >> 13;
+  *ptr++ = values[4] >> 5;
+
+  *ptr = values[4] << 3; // shared byte: low 5 bits of values[4], then top 3 bits of values[5]
+  *ptr++ |= values[5] >> 38;
+  *ptr++ = values[5] >> 30;
+  *ptr++ = values[5] >> 22;
+  *ptr++ = values[5] >> 14;
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2; // shared byte: low 6 bits of values[5], then top 2 bits of values[6]
+  *ptr++ |= values[6] >> 39;
+  *ptr++ = values[6] >> 31;
+  *ptr++ = values[6] >> 23;
+  *ptr++ = values[6] >> 15;
+  *ptr++ = values[6] >> 7;
+
+  *ptr = values[6] << 1; // shared byte: low 7 bits of values[6], then top 1 bit of values[7]
+  *ptr++ |= values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_42(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 42-bit fields into 42 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 34; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 26;
+  *ptr++ = values[0] >> 18;
+  *ptr++ = values[0] >> 10;
+  *ptr++ = values[0] >> 2;
+
+  *ptr = values[0] << 6; // shared byte: low 2 bits of values[0], then top 6 bits of values[1]
+  *ptr++ |= values[1] >> 36; // NOTE(review): OR without a mask - presumably each value fits in 42 bits; confirm against callers
+  *ptr++ = values[1] >> 28;
+  *ptr++ = values[1] >> 20;
+  *ptr++ = values[1] >> 12;
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4; // shared byte: low 4 bits of values[1], then top 4 bits of values[2]
+  *ptr++ |= values[2] >> 38;
+  *ptr++ = values[2] >> 30;
+  *ptr++ = values[2] >> 22;
+  *ptr++ = values[2] >> 14;
+  *ptr++ = values[2] >> 6;
+
+  *ptr = values[2] << 2; // shared byte: low 6 bits of values[2], then top 2 bits of values[3]
+  *ptr++ |= values[3] >> 40;
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3]; // values[3] ends on a byte boundary; values[4..7] repeat the same pattern
+
+  *ptr++ = values[4] >> 34;
+  *ptr++ = values[4] >> 26;
+  *ptr++ = values[4] >> 18;
+  *ptr++ = values[4] >> 10;
+  *ptr++ = values[4] >> 2;
+
+  *ptr = values[4] << 6; // shared byte: low 2 bits of values[4], then top 6 bits of values[5]
+  *ptr++ |= values[5] >> 36;
+  *ptr++ = values[5] >> 28;
+  *ptr++ = values[5] >> 20;
+  *ptr++ = values[5] >> 12;
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4; // shared byte: low 4 bits of values[5], then top 4 bits of values[6]
+  *ptr++ |= values[6] >> 38;
+  *ptr++ = values[6] >> 30;
+  *ptr++ = values[6] >> 22;
+  *ptr++ = values[6] >> 14;
+  *ptr++ = values[6] >> 6;
+
+  *ptr = values[6] << 2; // shared byte: low 6 bits of values[6], then top 2 bits of values[7]
+  *ptr++ |= values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_43(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 43-bit fields into 43 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 35; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 27;
+  *ptr++ = values[0] >> 19;
+  *ptr++ = values[0] >> 11;
+  *ptr++ = values[0] >> 3;
+
+  *ptr = values[0] << 5; // shared byte: low 3 bits of values[0], then top 5 bits of values[1]
+  *ptr++ |= values[1] >> 38; // NOTE(review): OR without a mask - presumably each value fits in 43 bits; confirm against callers
+  *ptr++ = values[1] >> 30;
+  *ptr++ = values[1] >> 22;
+  *ptr++ = values[1] >> 14;
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2; // shared byte: low 6 bits of values[1], then top 2 bits of values[2]
+  *ptr++ |= values[2] >> 41;
+  *ptr++ = values[2] >> 33;
+  *ptr++ = values[2] >> 25;
+  *ptr++ = values[2] >> 17;
+  *ptr++ = values[2] >> 9;
+  *ptr++ = values[2] >> 1;
+
+  *ptr = values[2] << 7; // shared byte: low 1 bit of values[2], then top 7 bits of values[3]
+  *ptr++ |= values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 39;
+  *ptr++ = values[4] >> 31;
+  *ptr++ = values[4] >> 23;
+  *ptr++ = values[4] >> 15;
+  *ptr++ = values[4] >> 7;
+
+  *ptr = values[4] << 1; // shared byte: low 7 bits of values[4], then top 1 bit of values[5]
+  *ptr++ |= values[5] >> 42;
+  *ptr++ = values[5] >> 34;
+  *ptr++ = values[5] >> 26;
+  *ptr++ = values[5] >> 18;
+  *ptr++ = values[5] >> 10;
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6; // shared byte: low 2 bits of values[5], then top 6 bits of values[6]
+  *ptr++ |= values[6] >> 37;
+  *ptr++ = values[6] >> 29;
+  *ptr++ = values[6] >> 21;
+  *ptr++ = values[6] >> 13;
+  *ptr++ = values[6] >> 5;
+
+  *ptr = values[6] << 3; // shared byte: low 5 bits of values[6], then top 3 bits of values[7]
+  *ptr++ |= values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_44(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 44-bit fields into 44 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 36; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 28;
+  *ptr++ = values[0] >> 20;
+  *ptr++ = values[0] >> 12;
+  *ptr++ = values[0] >> 4;
+
+  *ptr = values[0] << 4; // shared byte: low 4 bits of values[0], then top 4 bits of values[1]
+  *ptr++ |= values[1] >> 40; // NOTE(review): OR without a mask - presumably each value fits in 44 bits; confirm against callers
+  *ptr++ = values[1] >> 32;
+  *ptr++ = values[1] >> 24;
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1]; // values[1] ends on a byte boundary, so values[2] starts a fresh byte
+
+  *ptr++ = values[2] >> 36;
+  *ptr++ = values[2] >> 28;
+  *ptr++ = values[2] >> 20;
+  *ptr++ = values[2] >> 12;
+  *ptr++ = values[2] >> 4;
+
+  *ptr = values[2] << 4; // shared byte: low 4 bits of values[2], then top 4 bits of values[3]
+  *ptr++ |= values[3] >> 40;
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3]; // values[3] ends on a byte boundary
+
+  *ptr++ = values[4] >> 36;
+  *ptr++ = values[4] >> 28;
+  *ptr++ = values[4] >> 20;
+  *ptr++ = values[4] >> 12;
+  *ptr++ = values[4] >> 4;
+
+  *ptr = values[4] << 4; // shared byte: low 4 bits of values[4], then top 4 bits of values[5]
+  *ptr++ |= values[5] >> 40;
+  *ptr++ = values[5] >> 32;
+  *ptr++ = values[5] >> 24;
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5]; // values[5] ends on a byte boundary
+
+  *ptr++ = values[6] >> 36;
+  *ptr++ = values[6] >> 28;
+  *ptr++ = values[6] >> 20;
+  *ptr++ = values[6] >> 12;
+  *ptr++ = values[6] >> 4;
+
+  *ptr = values[6] << 4; // shared byte: low 4 bits of values[6], then top 4 bits of values[7]
+  *ptr++ |= values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_45(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 45-bit fields into 45 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 37; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 29;
+  *ptr++ = values[0] >> 21;
+  *ptr++ = values[0] >> 13;
+  *ptr++ = values[0] >> 5;
+
+  *ptr = values[0] << 3; // shared byte: low 5 bits of values[0], then top 3 bits of values[1]
+  *ptr++ |= values[1] >> 42; // NOTE(review): OR without a mask - presumably each value fits in 45 bits; confirm against callers
+  *ptr++ = values[1] >> 34;
+  *ptr++ = values[1] >> 26;
+  *ptr++ = values[1] >> 18;
+  *ptr++ = values[1] >> 10;
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6; // shared byte: low 2 bits of values[1], then top 6 bits of values[2]
+  *ptr++ |= values[2] >> 39;
+  *ptr++ = values[2] >> 31;
+  *ptr++ = values[2] >> 23;
+  *ptr++ = values[2] >> 15;
+  *ptr++ = values[2] >> 7;
+
+  *ptr = values[2] << 1; // shared byte: low 7 bits of values[2], then top 1 bit of values[3]
+  *ptr++ |= values[3] >> 44;
+  *ptr++ = values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 41;
+  *ptr++ = values[4] >> 33;
+  *ptr++ = values[4] >> 25;
+  *ptr++ = values[4] >> 17;
+  *ptr++ = values[4] >> 9;
+  *ptr++ = values[4] >> 1;
+
+  *ptr = values[4] << 7; // shared byte: low 1 bit of values[4], then top 7 bits of values[5]
+  *ptr++ |= values[5] >> 38;
+  *ptr++ = values[5] >> 30;
+  *ptr++ = values[5] >> 22;
+  *ptr++ = values[5] >> 14;
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2; // shared byte: low 6 bits of values[5], then top 2 bits of values[6]
+  *ptr++ |= values[6] >> 43;
+  *ptr++ = values[6] >> 35;
+  *ptr++ = values[6] >> 27;
+  *ptr++ = values[6] >> 19;
+  *ptr++ = values[6] >> 11;
+  *ptr++ = values[6] >> 3;
+
+  *ptr = values[6] << 5; // shared byte: low 3 bits of values[6], then top 5 bits of values[7]
+  *ptr++ |= values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_46(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 46-bit fields into 46 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 38; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 30;
+  *ptr++ = values[0] >> 22;
+  *ptr++ = values[0] >> 14;
+  *ptr++ = values[0] >> 6;
+
+  *ptr = values[0] << 2; // shared byte: low 6 bits of values[0], then top 2 bits of values[1]
+  *ptr++ |= values[1] >> 44; // NOTE(review): OR without a mask - presumably each value fits in 46 bits; confirm against callers
+  *ptr++ = values[1] >> 36;
+  *ptr++ = values[1] >> 28;
+  *ptr++ = values[1] >> 20;
+  *ptr++ = values[1] >> 12;
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4; // shared byte: low 4 bits of values[1], then top 4 bits of values[2]
+  *ptr++ |= values[2] >> 42;
+  *ptr++ = values[2] >> 34;
+  *ptr++ = values[2] >> 26;
+  *ptr++ = values[2] >> 18;
+  *ptr++ = values[2] >> 10;
+  *ptr++ = values[2] >> 2;
+
+  *ptr = values[2] << 6; // shared byte: low 2 bits of values[2], then top 6 bits of values[3]
+  *ptr++ |= values[3] >> 40;
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3]; // values[3] ends on a byte boundary; values[4..7] repeat the same pattern
+
+  *ptr++ = values[4] >> 38;
+  *ptr++ = values[4] >> 30;
+  *ptr++ = values[4] >> 22;
+  *ptr++ = values[4] >> 14;
+  *ptr++ = values[4] >> 6;
+
+  *ptr = values[4] << 2; // shared byte: low 6 bits of values[4], then top 2 bits of values[5]
+  *ptr++ |= values[5] >> 44;
+  *ptr++ = values[5] >> 36;
+  *ptr++ = values[5] >> 28;
+  *ptr++ = values[5] >> 20;
+  *ptr++ = values[5] >> 12;
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4; // shared byte: low 4 bits of values[5], then top 4 bits of values[6]
+  *ptr++ |= values[6] >> 42;
+  *ptr++ = values[6] >> 34;
+  *ptr++ = values[6] >> 26;
+  *ptr++ = values[6] >> 18;
+  *ptr++ = values[6] >> 10;
+  *ptr++ = values[6] >> 2;
+
+  *ptr = values[6] << 6; // shared byte: low 2 bits of values[6], then top 6 bits of values[7]
+  *ptr++ |= values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_47(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 47-bit fields into 47 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 39; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 31;
+  *ptr++ = values[0] >> 23;
+  *ptr++ = values[0] >> 15;
+  *ptr++ = values[0] >> 7;
+
+  *ptr = values[0] << 1; // shared byte: low 7 bits of values[0], then top 1 bit of values[1]
+  *ptr++ |= values[1] >> 46; // NOTE(review): OR without a mask - presumably each value fits in 47 bits; confirm against callers
+  *ptr++ = values[1] >> 38;
+  *ptr++ = values[1] >> 30;
+  *ptr++ = values[1] >> 22;
+  *ptr++ = values[1] >> 14;
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2; // shared byte: low 6 bits of values[1], then top 2 bits of values[2]
+  *ptr++ |= values[2] >> 45;
+  *ptr++ = values[2] >> 37;
+  *ptr++ = values[2] >> 29;
+  *ptr++ = values[2] >> 21;
+  *ptr++ = values[2] >> 13;
+  *ptr++ = values[2] >> 5;
+
+  *ptr = values[2] << 3; // shared byte: low 5 bits of values[2], then top 3 bits of values[3]
+  *ptr++ |= values[3] >> 44;
+  *ptr++ = values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 43;
+  *ptr++ = values[4] >> 35;
+  *ptr++ = values[4] >> 27;
+  *ptr++ = values[4] >> 19;
+  *ptr++ = values[4] >> 11;
+  *ptr++ = values[4] >> 3;
+
+  *ptr = values[4] << 5; // shared byte: low 3 bits of values[4], then top 5 bits of values[5]
+  *ptr++ |= values[5] >> 42;
+  *ptr++ = values[5] >> 34;
+  *ptr++ = values[5] >> 26;
+  *ptr++ = values[5] >> 18;
+  *ptr++ = values[5] >> 10;
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6; // shared byte: low 2 bits of values[5], then top 6 bits of values[6]
+  *ptr++ |= values[6] >> 41;
+  *ptr++ = values[6] >> 33;
+  *ptr++ = values[6] >> 25;
+  *ptr++ = values[6] >> 17;
+  *ptr++ = values[6] >> 9;
+  *ptr++ = values[6] >> 1;
+
+  *ptr = values[6] << 7; // shared byte: low 1 bit of values[6], then top 7 bits of values[7]
+  *ptr++ |= values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_48(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 48-bit fields into 48 consecutive bytes at ptr, most significant byte first; each field is exactly 6 bytes, so no byte is shared
+  *ptr++ = values[0] >> 40; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 32;
+  *ptr++ = values[0] >> 24;
+  *ptr++ = values[0] >> 16;
+  *ptr++ = values[0] >> 8;
+  *ptr++ = values[0];
+
+  *ptr++ = values[1] >> 40;
+  *ptr++ = values[1] >> 32;
+  *ptr++ = values[1] >> 24;
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 40;
+  *ptr++ = values[2] >> 32;
+  *ptr++ = values[2] >> 24;
+  *ptr++ = values[2] >> 16;
+  *ptr++ = values[2] >> 8;
+  *ptr++ = values[2];
+
+  *ptr++ = values[3] >> 40;
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 40;
+  *ptr++ = values[4] >> 32;
+  *ptr++ = values[4] >> 24;
+  *ptr++ = values[4] >> 16;
+  *ptr++ = values[4] >> 8;
+  *ptr++ = values[4];
+
+  *ptr++ = values[5] >> 40;
+  *ptr++ = values[5] >> 32;
+  *ptr++ = values[5] >> 24;
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 40;
+  *ptr++ = values[6] >> 32;
+  *ptr++ = values[6] >> 24;
+  *ptr++ = values[6] >> 16;
+  *ptr++ = values[6] >> 8;
+  *ptr++ = values[6];
+
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_49(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 49-bit fields into 49 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 41; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 33;
+  *ptr++ = values[0] >> 25;
+  *ptr++ = values[0] >> 17;
+  *ptr++ = values[0] >> 9;
+  *ptr++ = values[0] >> 1;
+
+  *ptr = values[0] << 7; // shared byte: low 1 bit of values[0], then top 7 bits of values[1]
+  *ptr++ |= values[1] >> 42; // NOTE(review): OR without a mask - presumably each value fits in 49 bits; confirm against callers
+  *ptr++ = values[1] >> 34;
+  *ptr++ = values[1] >> 26;
+  *ptr++ = values[1] >> 18;
+  *ptr++ = values[1] >> 10;
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6; // shared byte: low 2 bits of values[1], then top 6 bits of values[2]
+  *ptr++ |= values[2] >> 43;
+  *ptr++ = values[2] >> 35;
+  *ptr++ = values[2] >> 27;
+  *ptr++ = values[2] >> 19;
+  *ptr++ = values[2] >> 11;
+  *ptr++ = values[2] >> 3;
+
+  *ptr = values[2] << 5; // shared byte: low 3 bits of values[2], then top 5 bits of values[3]
+  *ptr++ |= values[3] >> 44;
+  *ptr++ = values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 45;
+  *ptr++ = values[4] >> 37;
+  *ptr++ = values[4] >> 29;
+  *ptr++ = values[4] >> 21;
+  *ptr++ = values[4] >> 13;
+  *ptr++ = values[4] >> 5;
+
+  *ptr = values[4] << 3; // shared byte: low 5 bits of values[4], then top 3 bits of values[5]
+  *ptr++ |= values[5] >> 46;
+  *ptr++ = values[5] >> 38;
+  *ptr++ = values[5] >> 30;
+  *ptr++ = values[5] >> 22;
+  *ptr++ = values[5] >> 14;
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2; // shared byte: low 6 bits of values[5], then top 2 bits of values[6]
+  *ptr++ |= values[6] >> 47;
+  *ptr++ = values[6] >> 39;
+  *ptr++ = values[6] >> 31;
+  *ptr++ = values[6] >> 23;
+  *ptr++ = values[6] >> 15;
+  *ptr++ = values[6] >> 7;
+
+  *ptr = values[6] << 1; // shared byte: low 7 bits of values[6], then top 1 bit of values[7]
+  *ptr++ |= values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_50(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 50-bit fields into 50 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 42; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 34;
+  *ptr++ = values[0] >> 26;
+  *ptr++ = values[0] >> 18;
+  *ptr++ = values[0] >> 10;
+  *ptr++ = values[0] >> 2;
+
+  *ptr = values[0] << 6; // shared byte: low 2 bits of values[0], then top 6 bits of values[1]
+  *ptr++ |= values[1] >> 44; // NOTE(review): OR without a mask - presumably each value fits in 50 bits; confirm against callers
+  *ptr++ = values[1] >> 36;
+  *ptr++ = values[1] >> 28;
+  *ptr++ = values[1] >> 20;
+  *ptr++ = values[1] >> 12;
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4; // shared byte: low 4 bits of values[1], then top 4 bits of values[2]
+  *ptr++ |= values[2] >> 46;
+  *ptr++ = values[2] >> 38;
+  *ptr++ = values[2] >> 30;
+  *ptr++ = values[2] >> 22;
+  *ptr++ = values[2] >> 14;
+  *ptr++ = values[2] >> 6;
+
+  *ptr = values[2] << 2; // shared byte: low 6 bits of values[2], then top 2 bits of values[3]
+  *ptr++ |= values[3] >> 48;
+  *ptr++ = values[3] >> 40;
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3]; // values[3] ends on a byte boundary; values[4..7] repeat the same pattern
+
+  *ptr++ = values[4] >> 42;
+  *ptr++ = values[4] >> 34;
+  *ptr++ = values[4] >> 26;
+  *ptr++ = values[4] >> 18;
+  *ptr++ = values[4] >> 10;
+  *ptr++ = values[4] >> 2;
+
+  *ptr = values[4] << 6; // shared byte: low 2 bits of values[4], then top 6 bits of values[5]
+  *ptr++ |= values[5] >> 44;
+  *ptr++ = values[5] >> 36;
+  *ptr++ = values[5] >> 28;
+  *ptr++ = values[5] >> 20;
+  *ptr++ = values[5] >> 12;
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4; // shared byte: low 4 bits of values[5], then top 4 bits of values[6]
+  *ptr++ |= values[6] >> 46;
+  *ptr++ = values[6] >> 38;
+  *ptr++ = values[6] >> 30;
+  *ptr++ = values[6] >> 22;
+  *ptr++ = values[6] >> 14;
+  *ptr++ = values[6] >> 6;
+
+  *ptr = values[6] << 2; // shared byte: low 6 bits of values[6], then top 2 bits of values[7]
+  *ptr++ |= values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_51(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 51-bit fields into 51 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 43; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 35;
+  *ptr++ = values[0] >> 27;
+  *ptr++ = values[0] >> 19;
+  *ptr++ = values[0] >> 11;
+  *ptr++ = values[0] >> 3;
+
+  *ptr = values[0] << 5; // shared byte: low 3 bits of values[0], then top 5 bits of values[1]
+  *ptr++ |= values[1] >> 46; // NOTE(review): OR without a mask - presumably each value fits in 51 bits; confirm against callers
+  *ptr++ = values[1] >> 38;
+  *ptr++ = values[1] >> 30;
+  *ptr++ = values[1] >> 22;
+  *ptr++ = values[1] >> 14;
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2; // shared byte: low 6 bits of values[1], then top 2 bits of values[2]
+  *ptr++ |= values[2] >> 49;
+  *ptr++ = values[2] >> 41;
+  *ptr++ = values[2] >> 33;
+  *ptr++ = values[2] >> 25;
+  *ptr++ = values[2] >> 17;
+  *ptr++ = values[2] >> 9;
+  *ptr++ = values[2] >> 1;
+
+  *ptr = values[2] << 7; // shared byte: low 1 bit of values[2], then top 7 bits of values[3]
+  *ptr++ |= values[3] >> 44;
+  *ptr++ = values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 47;
+  *ptr++ = values[4] >> 39;
+  *ptr++ = values[4] >> 31;
+  *ptr++ = values[4] >> 23;
+  *ptr++ = values[4] >> 15;
+  *ptr++ = values[4] >> 7;
+
+  *ptr = values[4] << 1; // shared byte: low 7 bits of values[4], then top 1 bit of values[5]
+  *ptr++ |= values[5] >> 50;
+  *ptr++ = values[5] >> 42;
+  *ptr++ = values[5] >> 34;
+  *ptr++ = values[5] >> 26;
+  *ptr++ = values[5] >> 18;
+  *ptr++ = values[5] >> 10;
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6; // shared byte: low 2 bits of values[5], then top 6 bits of values[6]
+  *ptr++ |= values[6] >> 45;
+  *ptr++ = values[6] >> 37;
+  *ptr++ = values[6] >> 29;
+  *ptr++ = values[6] >> 21;
+  *ptr++ = values[6] >> 13;
+  *ptr++ = values[6] >> 5;
+
+  *ptr = values[6] << 3; // shared byte: low 5 bits of values[6], then top 3 bits of values[7]
+  *ptr++ |= values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_52(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 52-bit fields into 52 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 44; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 36;
+  *ptr++ = values[0] >> 28;
+  *ptr++ = values[0] >> 20;
+  *ptr++ = values[0] >> 12;
+  *ptr++ = values[0] >> 4;
+
+  *ptr = values[0] << 4; // shared byte: low 4 bits of values[0], then top 4 bits of values[1]
+  *ptr++ |= values[1] >> 48; // NOTE(review): OR without a mask - presumably each value fits in 52 bits; confirm against callers
+  *ptr++ = values[1] >> 40;
+  *ptr++ = values[1] >> 32;
+  *ptr++ = values[1] >> 24;
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1]; // values[1] ends on a byte boundary, so values[2] starts a fresh byte
+
+  *ptr++ = values[2] >> 44;
+  *ptr++ = values[2] >> 36;
+  *ptr++ = values[2] >> 28;
+  *ptr++ = values[2] >> 20;
+  *ptr++ = values[2] >> 12;
+  *ptr++ = values[2] >> 4;
+
+  *ptr = values[2] << 4; // shared byte: low 4 bits of values[2], then top 4 bits of values[3]
+  *ptr++ |= values[3] >> 48;
+  *ptr++ = values[3] >> 40;
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3]; // values[3] ends on a byte boundary
+
+  *ptr++ = values[4] >> 44;
+  *ptr++ = values[4] >> 36;
+  *ptr++ = values[4] >> 28;
+  *ptr++ = values[4] >> 20;
+  *ptr++ = values[4] >> 12;
+  *ptr++ = values[4] >> 4;
+
+  *ptr = values[4] << 4; // shared byte: low 4 bits of values[4], then top 4 bits of values[5]
+  *ptr++ |= values[5] >> 48;
+  *ptr++ = values[5] >> 40;
+  *ptr++ = values[5] >> 32;
+  *ptr++ = values[5] >> 24;
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5]; // values[5] ends on a byte boundary
+
+  *ptr++ = values[6] >> 44;
+  *ptr++ = values[6] >> 36;
+  *ptr++ = values[6] >> 28;
+  *ptr++ = values[6] >> 20;
+  *ptr++ = values[6] >> 12;
+  *ptr++ = values[6] >> 4;
+
+  *ptr = values[6] << 4; // shared byte: low 4 bits of values[6], then top 4 bits of values[7]
+  *ptr++ |= values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_53(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 53-bit fields into 53 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 45; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 37;
+  *ptr++ = values[0] >> 29;
+  *ptr++ = values[0] >> 21;
+  *ptr++ = values[0] >> 13;
+  *ptr++ = values[0] >> 5;
+
+  *ptr = values[0] << 3; // shared byte: low 5 bits of values[0], then top 3 bits of values[1]
+  *ptr++ |= values[1] >> 50; // NOTE(review): OR without a mask - presumably each value fits in 53 bits; confirm against callers
+  *ptr++ = values[1] >> 42;
+  *ptr++ = values[1] >> 34;
+  *ptr++ = values[1] >> 26;
+  *ptr++ = values[1] >> 18;
+  *ptr++ = values[1] >> 10;
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6; // shared byte: low 2 bits of values[1], then top 6 bits of values[2]
+  *ptr++ |= values[2] >> 47;
+  *ptr++ = values[2] >> 39;
+  *ptr++ = values[2] >> 31;
+  *ptr++ = values[2] >> 23;
+  *ptr++ = values[2] >> 15;
+  *ptr++ = values[2] >> 7;
+
+  *ptr = values[2] << 1; // shared byte: low 7 bits of values[2], then top 1 bit of values[3]
+  *ptr++ |= values[3] >> 52;
+  *ptr++ = values[3] >> 44;
+  *ptr++ = values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 49;
+  *ptr++ = values[4] >> 41;
+  *ptr++ = values[4] >> 33;
+  *ptr++ = values[4] >> 25;
+  *ptr++ = values[4] >> 17;
+  *ptr++ = values[4] >> 9;
+  *ptr++ = values[4] >> 1;
+
+  *ptr = values[4] << 7; // shared byte: low 1 bit of values[4], then top 7 bits of values[5]
+  *ptr++ |= values[5] >> 46;
+  *ptr++ = values[5] >> 38;
+  *ptr++ = values[5] >> 30;
+  *ptr++ = values[5] >> 22;
+  *ptr++ = values[5] >> 14;
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2; // shared byte: low 6 bits of values[5], then top 2 bits of values[6]
+  *ptr++ |= values[6] >> 51;
+  *ptr++ = values[6] >> 43;
+  *ptr++ = values[6] >> 35;
+  *ptr++ = values[6] >> 27;
+  *ptr++ = values[6] >> 19;
+  *ptr++ = values[6] >> 11;
+  *ptr++ = values[6] >> 3;
+
+  *ptr = values[6] << 5; // shared byte: low 3 bits of values[6], then top 5 bits of values[7]
+  *ptr++ |= values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_54(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 54-bit fields into 54 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 46; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 38;
+  *ptr++ = values[0] >> 30;
+  *ptr++ = values[0] >> 22;
+  *ptr++ = values[0] >> 14;
+  *ptr++ = values[0] >> 6;
+
+  *ptr = values[0] << 2; // shared byte: low 6 bits of values[0], then top 2 bits of values[1]
+  *ptr++ |= values[1] >> 52; // NOTE(review): OR without a mask - presumably each value fits in 54 bits; confirm against callers
+  *ptr++ = values[1] >> 44;
+  *ptr++ = values[1] >> 36;
+  *ptr++ = values[1] >> 28;
+  *ptr++ = values[1] >> 20;
+  *ptr++ = values[1] >> 12;
+  *ptr++ = values[1] >> 4;
+
+  *ptr = values[1] << 4; // shared byte: low 4 bits of values[1], then top 4 bits of values[2]
+  *ptr++ |= values[2] >> 50;
+  *ptr++ = values[2] >> 42;
+  *ptr++ = values[2] >> 34;
+  *ptr++ = values[2] >> 26;
+  *ptr++ = values[2] >> 18;
+  *ptr++ = values[2] >> 10;
+  *ptr++ = values[2] >> 2;
+
+  *ptr = values[2] << 6; // shared byte: low 2 bits of values[2], then top 6 bits of values[3]
+  *ptr++ |= values[3] >> 48;
+  *ptr++ = values[3] >> 40;
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3]; // values[3] ends on a byte boundary; values[4..7] repeat the same pattern
+
+  *ptr++ = values[4] >> 46;
+  *ptr++ = values[4] >> 38;
+  *ptr++ = values[4] >> 30;
+  *ptr++ = values[4] >> 22;
+  *ptr++ = values[4] >> 14;
+  *ptr++ = values[4] >> 6;
+
+  *ptr = values[4] << 2; // shared byte: low 6 bits of values[4], then top 2 bits of values[5]
+  *ptr++ |= values[5] >> 52;
+  *ptr++ = values[5] >> 44;
+  *ptr++ = values[5] >> 36;
+  *ptr++ = values[5] >> 28;
+  *ptr++ = values[5] >> 20;
+  *ptr++ = values[5] >> 12;
+  *ptr++ = values[5] >> 4;
+
+  *ptr = values[5] << 4; // shared byte: low 4 bits of values[5], then top 4 bits of values[6]
+  *ptr++ |= values[6] >> 50;
+  *ptr++ = values[6] >> 42;
+  *ptr++ = values[6] >> 34;
+  *ptr++ = values[6] >> 26;
+  *ptr++ = values[6] >> 18;
+  *ptr++ = values[6] >> 10;
+  *ptr++ = values[6] >> 2;
+
+  *ptr = values[6] << 6; // shared byte: low 2 bits of values[6], then top 6 bits of values[7]
+  *ptr++ |= values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_55(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 55-bit fields into 55 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 47; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 39;
+  *ptr++ = values[0] >> 31;
+  *ptr++ = values[0] >> 23;
+  *ptr++ = values[0] >> 15;
+  *ptr++ = values[0] >> 7;
+
+  *ptr = values[0] << 1; // shared byte: low 7 bits of values[0], then top 1 bit of values[1]
+  *ptr++ |= values[1] >> 54; // NOTE(review): OR without a mask - presumably each value fits in 55 bits; confirm against callers
+  *ptr++ = values[1] >> 46;
+  *ptr++ = values[1] >> 38;
+  *ptr++ = values[1] >> 30;
+  *ptr++ = values[1] >> 22;
+  *ptr++ = values[1] >> 14;
+  *ptr++ = values[1] >> 6;
+
+  *ptr = values[1] << 2; // shared byte: low 6 bits of values[1], then top 2 bits of values[2]
+  *ptr++ |= values[2] >> 53;
+  *ptr++ = values[2] >> 45;
+  *ptr++ = values[2] >> 37;
+  *ptr++ = values[2] >> 29;
+  *ptr++ = values[2] >> 21;
+  *ptr++ = values[2] >> 13;
+  *ptr++ = values[2] >> 5;
+
+  *ptr = values[2] << 3; // shared byte: low 5 bits of values[2], then top 3 bits of values[3]
+  *ptr++ |= values[3] >> 52;
+  *ptr++ = values[3] >> 44;
+  *ptr++ = values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 51;
+  *ptr++ = values[4] >> 43;
+  *ptr++ = values[4] >> 35;
+  *ptr++ = values[4] >> 27;
+  *ptr++ = values[4] >> 19;
+  *ptr++ = values[4] >> 11;
+  *ptr++ = values[4] >> 3;
+
+  *ptr = values[4] << 5; // shared byte: low 3 bits of values[4], then top 5 bits of values[5]
+  *ptr++ |= values[5] >> 50;
+  *ptr++ = values[5] >> 42;
+  *ptr++ = values[5] >> 34;
+  *ptr++ = values[5] >> 26;
+  *ptr++ = values[5] >> 18;
+  *ptr++ = values[5] >> 10;
+  *ptr++ = values[5] >> 2;
+
+  *ptr = values[5] << 6; // shared byte: low 2 bits of values[5], then top 6 bits of values[6]
+  *ptr++ |= values[6] >> 49;
+  *ptr++ = values[6] >> 41;
+  *ptr++ = values[6] >> 33;
+  *ptr++ = values[6] >> 25;
+  *ptr++ = values[6] >> 17;
+  *ptr++ = values[6] >> 9;
+  *ptr++ = values[6] >> 1;
+
+  *ptr = values[6] << 7; // shared byte: low 1 bit of values[6], then top 7 bits of values[7]
+  *ptr++ |= values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_56(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 56-bit fields into 56 consecutive bytes at ptr, most significant byte first; each field is exactly 7 bytes, so no byte is shared
+  *ptr++ = values[0] >> 48; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 40;
+  *ptr++ = values[0] >> 32;
+  *ptr++ = values[0] >> 24;
+  *ptr++ = values[0] >> 16;
+  *ptr++ = values[0] >> 8;
+  *ptr++ = values[0];
+
+  *ptr++ = values[1] >> 48;
+  *ptr++ = values[1] >> 40;
+  *ptr++ = values[1] >> 32;
+  *ptr++ = values[1] >> 24;
+  *ptr++ = values[1] >> 16;
+  *ptr++ = values[1] >> 8;
+  *ptr++ = values[1];
+
+  *ptr++ = values[2] >> 48;
+  *ptr++ = values[2] >> 40;
+  *ptr++ = values[2] >> 32;
+  *ptr++ = values[2] >> 24;
+  *ptr++ = values[2] >> 16;
+  *ptr++ = values[2] >> 8;
+  *ptr++ = values[2];
+
+  *ptr++ = values[3] >> 48;
+  *ptr++ = values[3] >> 40;
+  *ptr++ = values[3] >> 32;
+  *ptr++ = values[3] >> 24;
+  *ptr++ = values[3] >> 16;
+  *ptr++ = values[3] >> 8;
+  *ptr++ = values[3];
+
+  *ptr++ = values[4] >> 48;
+  *ptr++ = values[4] >> 40;
+  *ptr++ = values[4] >> 32;
+  *ptr++ = values[4] >> 24;
+  *ptr++ = values[4] >> 16;
+  *ptr++ = values[4] >> 8;
+  *ptr++ = values[4];
+
+  *ptr++ = values[5] >> 48;
+  *ptr++ = values[5] >> 40;
+  *ptr++ = values[5] >> 32;
+  *ptr++ = values[5] >> 24;
+  *ptr++ = values[5] >> 16;
+  *ptr++ = values[5] >> 8;
+  *ptr++ = values[5];
+
+  *ptr++ = values[6] >> 48;
+  *ptr++ = values[6] >> 40;
+  *ptr++ = values[6] >> 32;
+  *ptr++ = values[6] >> 24;
+  *ptr++ = values[6] >> 16;
+  *ptr++ = values[6] >> 8;
+  *ptr++ = values[6];
+
+  *ptr++ = values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+static inline void pack_bits_57(const uint64_t* values, uint8_t* ptr) { // writes values[0..7] as eight 57-bit fields into 57 consecutive bytes at ptr, most significant bits first
+  *ptr++ = values[0] >> 49; // the uint8_t store keeps only the low 8 bits of the shifted value
+  *ptr++ = values[0] >> 41;
+  *ptr++ = values[0] >> 33;
+  *ptr++ = values[0] >> 25;
+  *ptr++ = values[0] >> 17;
+  *ptr++ = values[0] >> 9;
+  *ptr++ = values[0] >> 1;
+
+  *ptr = values[0] << 7; // shared byte: low 1 bit of values[0], then top 7 bits of values[1]
+  *ptr++ |= values[1] >> 50; // NOTE(review): OR without a mask - presumably each value fits in 57 bits; confirm against callers
+  *ptr++ = values[1] >> 42;
+  *ptr++ = values[1] >> 34;
+  *ptr++ = values[1] >> 26;
+  *ptr++ = values[1] >> 18;
+  *ptr++ = values[1] >> 10;
+  *ptr++ = values[1] >> 2;
+
+  *ptr = values[1] << 6; // shared byte: low 2 bits of values[1], then top 6 bits of values[2]
+  *ptr++ |= values[2] >> 51;
+  *ptr++ = values[2] >> 43;
+  *ptr++ = values[2] >> 35;
+  *ptr++ = values[2] >> 27;
+  *ptr++ = values[2] >> 19;
+  *ptr++ = values[2] >> 11;
+  *ptr++ = values[2] >> 3;
+
+  *ptr = values[2] << 5; // shared byte: low 3 bits of values[2], then top 5 bits of values[3]
+  *ptr++ |= values[3] >> 52;
+  *ptr++ = values[3] >> 44;
+  *ptr++ = values[3] >> 36;
+  *ptr++ = values[3] >> 28;
+  *ptr++ = values[3] >> 20;
+  *ptr++ = values[3] >> 12;
+  *ptr++ = values[3] >> 4;
+
+  *ptr = values[3] << 4; // shared byte: low 4 bits of values[3], then top 4 bits of values[4]
+  *ptr++ |= values[4] >> 53;
+  *ptr++ = values[4] >> 45;
+  *ptr++ = values[4] >> 37;
+  *ptr++ = values[4] >> 29;
+  *ptr++ = values[4] >> 21;
+  *ptr++ = values[4] >> 13;
+  *ptr++ = values[4] >> 5;
+
+  *ptr = values[4] << 3; // shared byte: low 5 bits of values[4], then top 3 bits of values[5]
+  *ptr++ |= values[5] >> 54;
+  *ptr++ = values[5] >> 46;
+  *ptr++ = values[5] >> 38;
+  *ptr++ = values[5] >> 30;
+  *ptr++ = values[5] >> 22;
+  *ptr++ = values[5] >> 14;
+  *ptr++ = values[5] >> 6;
+
+  *ptr = values[5] << 2; // shared byte: low 6 bits of values[5], then top 2 bits of values[6]
+  *ptr++ |= values[6] >> 55;
+  *ptr++ = values[6] >> 47;
+  *ptr++ = values[6] >> 39;
+  *ptr++ = values[6] >> 31;
+  *ptr++ = values[6] >> 23;
+  *ptr++ = values[6] >> 15;
+  *ptr++ = values[6] >> 7;
+
+  *ptr = values[6] << 1; // shared byte: low 7 bits of values[6], then top 1 bit of values[7]
+  *ptr++ |= values[7] >> 56;
+  *ptr++ = values[7] >> 48;
+  *ptr++ = values[7] >> 40;
+  *ptr++ = values[7] >> 32;
+  *ptr++ = values[7] >> 24;
+  *ptr++ = values[7] >> 16;
+  *ptr++ = values[7] >> 8;
+  *ptr = values[7]; // final byte (no increment needed)
+}
+
+// Packs eight 58-bit values into 58 consecutive bytes at ptr, most significant
+// bit first, with no padding between values. Input bits above bit 57 are not
+// masked off, so every value is expected to fit in 58 bits.
+static inline void pack_bits_58(const uint64_t* values, uint8_t* ptr) {
+  ptr[0] = values[0] >> 50;
+  ptr[1] = values[0] >> 42;
+  ptr[2] = values[0] >> 34;
+  ptr[3] = values[0] >> 26;
+  ptr[4] = values[0] >> 18;
+  ptr[5] = values[0] >> 10;
+  ptr[6] = values[0] >> 2;
+  ptr[7] = (values[0] << 6) | (values[1] >> 52);  // 2 bits of [0], 6 bits of [1]
+
+  ptr[8] = values[1] >> 44;
+  ptr[9] = values[1] >> 36;
+  ptr[10] = values[1] >> 28;
+  ptr[11] = values[1] >> 20;
+  ptr[12] = values[1] >> 12;
+  ptr[13] = values[1] >> 4;
+  ptr[14] = (values[1] << 4) | (values[2] >> 54);  // 4 bits of [1], 4 bits of [2]
+
+  ptr[15] = values[2] >> 46;
+  ptr[16] = values[2] >> 38;
+  ptr[17] = values[2] >> 30;
+  ptr[18] = values[2] >> 22;
+  ptr[19] = values[2] >> 14;
+  ptr[20] = values[2] >> 6;
+  ptr[21] = (values[2] << 2) | (values[3] >> 56);  // 6 bits of [2], 2 bits of [3]
+
+  ptr[22] = values[3] >> 48;
+  ptr[23] = values[3] >> 40;
+  ptr[24] = values[3] >> 32;
+  ptr[25] = values[3] >> 24;
+  ptr[26] = values[3] >> 16;
+  ptr[27] = values[3] >> 8;
+  ptr[28] = values[3];
+
+  // byte-aligned again: the second half repeats the same layout
+  ptr[29] = values[4] >> 50;
+  ptr[30] = values[4] >> 42;
+  ptr[31] = values[4] >> 34;
+  ptr[32] = values[4] >> 26;
+  ptr[33] = values[4] >> 18;
+  ptr[34] = values[4] >> 10;
+  ptr[35] = values[4] >> 2;
+  ptr[36] = (values[4] << 6) | (values[5] >> 52);  // 2 bits of [4], 6 bits of [5]
+
+  ptr[37] = values[5] >> 44;
+  ptr[38] = values[5] >> 36;
+  ptr[39] = values[5] >> 28;
+  ptr[40] = values[5] >> 20;
+  ptr[41] = values[5] >> 12;
+  ptr[42] = values[5] >> 4;
+  ptr[43] = (values[5] << 4) | (values[6] >> 54);  // 4 bits of [5], 4 bits of [6]
+
+  ptr[44] = values[6] >> 46;
+  ptr[45] = values[6] >> 38;
+  ptr[46] = values[6] >> 30;
+  ptr[47] = values[6] >> 22;
+  ptr[48] = values[6] >> 14;
+  ptr[49] = values[6] >> 6;
+  ptr[50] = (values[6] << 2) | (values[7] >> 56);  // 6 bits of [6], 2 bits of [7]
+
+  ptr[51] = values[7] >> 48;
+  ptr[52] = values[7] >> 40;
+  ptr[53] = values[7] >> 32;
+  ptr[54] = values[7] >> 24;
+  ptr[55] = values[7] >> 16;
+  ptr[56] = values[7] >> 8;
+  ptr[57] = values[7];
+}
+
+// Packs eight 59-bit values into 59 consecutive bytes at ptr, most significant
+// bit first, with no padding between values. Input bits above bit 58 are not
+// masked off, so every value is expected to fit in 59 bits.
+static inline void pack_bits_59(const uint64_t* values, uint8_t* ptr) {
+  ptr[0] = values[0] >> 51;
+  ptr[1] = values[0] >> 43;
+  ptr[2] = values[0] >> 35;
+  ptr[3] = values[0] >> 27;
+  ptr[4] = values[0] >> 19;
+  ptr[5] = values[0] >> 11;
+  ptr[6] = values[0] >> 3;
+  ptr[7] = (values[0] << 5) | (values[1] >> 54);  // 3 bits of [0], 5 bits of [1]
+
+  ptr[8] = values[1] >> 46;
+  ptr[9] = values[1] >> 38;
+  ptr[10] = values[1] >> 30;
+  ptr[11] = values[1] >> 22;
+  ptr[12] = values[1] >> 14;
+  ptr[13] = values[1] >> 6;
+  ptr[14] = (values[1] << 2) | (values[2] >> 57);  // 6 bits of [1], 2 bits of [2]
+
+  ptr[15] = values[2] >> 49;
+  ptr[16] = values[2] >> 41;
+  ptr[17] = values[2] >> 33;
+  ptr[18] = values[2] >> 25;
+  ptr[19] = values[2] >> 17;
+  ptr[20] = values[2] >> 9;
+  ptr[21] = values[2] >> 1;
+  ptr[22] = (values[2] << 7) | (values[3] >> 52);  // 1 bit of [2], 7 bits of [3]
+
+  ptr[23] = values[3] >> 44;
+  ptr[24] = values[3] >> 36;
+  ptr[25] = values[3] >> 28;
+  ptr[26] = values[3] >> 20;
+  ptr[27] = values[3] >> 12;
+  ptr[28] = values[3] >> 4;
+  ptr[29] = (values[3] << 4) | (values[4] >> 55);  // 4 bits of [3], 4 bits of [4]
+
+  ptr[30] = values[4] >> 47;
+  ptr[31] = values[4] >> 39;
+  ptr[32] = values[4] >> 31;
+  ptr[33] = values[4] >> 23;
+  ptr[34] = values[4] >> 15;
+  ptr[35] = values[4] >> 7;
+  ptr[36] = (values[4] << 1) | (values[5] >> 58);  // 7 bits of [4], 1 bit of [5]
+
+  ptr[37] = values[5] >> 50;
+  ptr[38] = values[5] >> 42;
+  ptr[39] = values[5] >> 34;
+  ptr[40] = values[5] >> 26;
+  ptr[41] = values[5] >> 18;
+  ptr[42] = values[5] >> 10;
+  ptr[43] = values[5] >> 2;
+  ptr[44] = (values[5] << 6) | (values[6] >> 53);  // 2 bits of [5], 6 bits of [6]
+
+  ptr[45] = values[6] >> 45;
+  ptr[46] = values[6] >> 37;
+  ptr[47] = values[6] >> 29;
+  ptr[48] = values[6] >> 21;
+  ptr[49] = values[6] >> 13;
+  ptr[50] = values[6] >> 5;
+  ptr[51] = (values[6] << 3) | (values[7] >> 56);  // 5 bits of [6], 3 bits of [7]
+
+  ptr[52] = values[7] >> 48;
+  ptr[53] = values[7] >> 40;
+  ptr[54] = values[7] >> 32;
+  ptr[55] = values[7] >> 24;
+  ptr[56] = values[7] >> 16;
+  ptr[57] = values[7] >> 8;
+  ptr[58] = values[7];
+}
+
+// Packs eight 60-bit values into 60 consecutive bytes at ptr, most significant
+// bit first, with no padding between values. Each pair of values fills exactly
+// 15 bytes. Input bits above bit 59 are not masked off, so every value is
+// expected to fit in 60 bits.
+static inline void pack_bits_60(const uint64_t* values, uint8_t* ptr) {
+  ptr[0] = values[0] >> 52;
+  ptr[1] = values[0] >> 44;
+  ptr[2] = values[0] >> 36;
+  ptr[3] = values[0] >> 28;
+  ptr[4] = values[0] >> 20;
+  ptr[5] = values[0] >> 12;
+  ptr[6] = values[0] >> 4;
+  ptr[7] = (values[0] << 4) | (values[1] >> 56);  // 4 bits of [0], 4 bits of [1]
+  ptr[8] = values[1] >> 48;
+  ptr[9] = values[1] >> 40;
+  ptr[10] = values[1] >> 32;
+  ptr[11] = values[1] >> 24;
+  ptr[12] = values[1] >> 16;
+  ptr[13] = values[1] >> 8;
+  ptr[14] = values[1];
+
+  ptr[15] = values[2] >> 52;
+  ptr[16] = values[2] >> 44;
+  ptr[17] = values[2] >> 36;
+  ptr[18] = values[2] >> 28;
+  ptr[19] = values[2] >> 20;
+  ptr[20] = values[2] >> 12;
+  ptr[21] = values[2] >> 4;
+  ptr[22] = (values[2] << 4) | (values[3] >> 56);  // 4 bits of [2], 4 bits of [3]
+  ptr[23] = values[3] >> 48;
+  ptr[24] = values[3] >> 40;
+  ptr[25] = values[3] >> 32;
+  ptr[26] = values[3] >> 24;
+  ptr[27] = values[3] >> 16;
+  ptr[28] = values[3] >> 8;
+  ptr[29] = values[3];
+
+  ptr[30] = values[4] >> 52;
+  ptr[31] = values[4] >> 44;
+  ptr[32] = values[4] >> 36;
+  ptr[33] = values[4] >> 28;
+  ptr[34] = values[4] >> 20;
+  ptr[35] = values[4] >> 12;
+  ptr[36] = values[4] >> 4;
+  ptr[37] = (values[4] << 4) | (values[5] >> 56);  // 4 bits of [4], 4 bits of [5]
+  ptr[38] = values[5] >> 48;
+  ptr[39] = values[5] >> 40;
+  ptr[40] = values[5] >> 32;
+  ptr[41] = values[5] >> 24;
+  ptr[42] = values[5] >> 16;
+  ptr[43] = values[5] >> 8;
+  ptr[44] = values[5];
+
+  ptr[45] = values[6] >> 52;
+  ptr[46] = values[6] >> 44;
+  ptr[47] = values[6] >> 36;
+  ptr[48] = values[6] >> 28;
+  ptr[49] = values[6] >> 20;
+  ptr[50] = values[6] >> 12;
+  ptr[51] = values[6] >> 4;
+  ptr[52] = (values[6] << 4) | (values[7] >> 56);  // 4 bits of [6], 4 bits of [7]
+  ptr[53] = values[7] >> 48;
+  ptr[54] = values[7] >> 40;
+  ptr[55] = values[7] >> 32;
+  ptr[56] = values[7] >> 24;
+  ptr[57] = values[7] >> 16;
+  ptr[58] = values[7] >> 8;
+  ptr[59] = values[7];
+}
+
+// Packs eight 61-bit values into 61 consecutive bytes at ptr, most significant
+// bit first, with no padding between values. Input bits above bit 60 are not
+// masked off, so every value is expected to fit in 61 bits.
+static inline void pack_bits_61(const uint64_t* values, uint8_t* ptr) {
+  ptr[0] = values[0] >> 53;
+  ptr[1] = values[0] >> 45;
+  ptr[2] = values[0] >> 37;
+  ptr[3] = values[0] >> 29;
+  ptr[4] = values[0] >> 21;
+  ptr[5] = values[0] >> 13;
+  ptr[6] = values[0] >> 5;
+  ptr[7] = (values[0] << 3) | (values[1] >> 58);  // 5 bits of [0], 3 bits of [1]
+
+  ptr[8] = values[1] >> 50;
+  ptr[9] = values[1] >> 42;
+  ptr[10] = values[1] >> 34;
+  ptr[11] = values[1] >> 26;
+  ptr[12] = values[1] >> 18;
+  ptr[13] = values[1] >> 10;
+  ptr[14] = values[1] >> 2;
+  ptr[15] = (values[1] << 6) | (values[2] >> 55);  // 2 bits of [1], 6 bits of [2]
+
+  ptr[16] = values[2] >> 47;
+  ptr[17] = values[2] >> 39;
+  ptr[18] = values[2] >> 31;
+  ptr[19] = values[2] >> 23;
+  ptr[20] = values[2] >> 15;
+  ptr[21] = values[2] >> 7;
+  ptr[22] = (values[2] << 1) | (values[3] >> 60);  // 7 bits of [2], 1 bit of [3]
+
+  ptr[23] = values[3] >> 52;
+  ptr[24] = values[3] >> 44;
+  ptr[25] = values[3] >> 36;
+  ptr[26] = values[3] >> 28;
+  ptr[27] = values[3] >> 20;
+  ptr[28] = values[3] >> 12;
+  ptr[29] = values[3] >> 4;
+  ptr[30] = (values[3] << 4) | (values[4] >> 57);  // 4 bits of [3], 4 bits of [4]
+
+  ptr[31] = values[4] >> 49;
+  ptr[32] = values[4] >> 41;
+  ptr[33] = values[4] >> 33;
+  ptr[34] = values[4] >> 25;
+  ptr[35] = values[4] >> 17;
+  ptr[36] = values[4] >> 9;
+  ptr[37] = values[4] >> 1;
+  ptr[38] = (values[4] << 7) | (values[5] >> 54);  // 1 bit of [4], 7 bits of [5]
+
+  ptr[39] = values[5] >> 46;
+  ptr[40] = values[5] >> 38;
+  ptr[41] = values[5] >> 30;
+  ptr[42] = values[5] >> 22;
+  ptr[43] = values[5] >> 14;
+  ptr[44] = values[5] >> 6;
+  ptr[45] = (values[5] << 2) | (values[6] >> 59);  // 6 bits of [5], 2 bits of [6]
+
+  ptr[46] = values[6] >> 51;
+  ptr[47] = values[6] >> 43;
+  ptr[48] = values[6] >> 35;
+  ptr[49] = values[6] >> 27;
+  ptr[50] = values[6] >> 19;
+  ptr[51] = values[6] >> 11;
+  ptr[52] = values[6] >> 3;
+  ptr[53] = (values[6] << 5) | (values[7] >> 56);  // 3 bits of [6], 5 bits of [7]
+
+  ptr[54] = values[7] >> 48;
+  ptr[55] = values[7] >> 40;
+  ptr[56] = values[7] >> 32;
+  ptr[57] = values[7] >> 24;
+  ptr[58] = values[7] >> 16;
+  ptr[59] = values[7] >> 8;
+  ptr[60] = values[7];
+}
+
+// Packs eight 62-bit values into 62 consecutive bytes at ptr, most significant
+// bit first, with no padding between values. Input bits above bit 61 are not
+// masked off, so every value is expected to fit in 62 bits.
+static inline void pack_bits_62(const uint64_t* values, uint8_t* ptr) {
+  ptr[0] = values[0] >> 54;
+  ptr[1] = values[0] >> 46;
+  ptr[2] = values[0] >> 38;
+  ptr[3] = values[0] >> 30;
+  ptr[4] = values[0] >> 22;
+  ptr[5] = values[0] >> 14;
+  ptr[6] = values[0] >> 6;
+  ptr[7] = (values[0] << 2) | (values[1] >> 60);  // 6 bits of [0], 2 bits of [1]
+
+  ptr[8] = values[1] >> 52;
+  ptr[9] = values[1] >> 44;
+  ptr[10] = values[1] >> 36;
+  ptr[11] = values[1] >> 28;
+  ptr[12] = values[1] >> 20;
+  ptr[13] = values[1] >> 12;
+  ptr[14] = values[1] >> 4;
+  ptr[15] = (values[1] << 4) | (values[2] >> 58);  // 4 bits of [1], 4 bits of [2]
+
+  ptr[16] = values[2] >> 50;
+  ptr[17] = values[2] >> 42;
+  ptr[18] = values[2] >> 34;
+  ptr[19] = values[2] >> 26;
+  ptr[20] = values[2] >> 18;
+  ptr[21] = values[2] >> 10;
+  ptr[22] = values[2] >> 2;
+  ptr[23] = (values[2] << 6) | (values[3] >> 56);  // 2 bits of [2], 6 bits of [3]
+
+  ptr[24] = values[3] >> 48;
+  ptr[25] = values[3] >> 40;
+  ptr[26] = values[3] >> 32;
+  ptr[27] = values[3] >> 24;
+  ptr[28] = values[3] >> 16;
+  ptr[29] = values[3] >> 8;
+  ptr[30] = values[3];
+
+  // byte-aligned again: the second half repeats the same layout
+  ptr[31] = values[4] >> 54;
+  ptr[32] = values[4] >> 46;
+  ptr[33] = values[4] >> 38;
+  ptr[34] = values[4] >> 30;
+  ptr[35] = values[4] >> 22;
+  ptr[36] = values[4] >> 14;
+  ptr[37] = values[4] >> 6;
+  ptr[38] = (values[4] << 2) | (values[5] >> 60);  // 6 bits of [4], 2 bits of [5]
+
+  ptr[39] = values[5] >> 52;
+  ptr[40] = values[5] >> 44;
+  ptr[41] = values[5] >> 36;
+  ptr[42] = values[5] >> 28;
+  ptr[43] = values[5] >> 20;
+  ptr[44] = values[5] >> 12;
+  ptr[45] = values[5] >> 4;
+  ptr[46] = (values[5] << 4) | (values[6] >> 58);  // 4 bits of [5], 4 bits of [6]
+
+  ptr[47] = values[6] >> 50;
+  ptr[48] = values[6] >> 42;
+  ptr[49] = values[6] >> 34;
+  ptr[50] = values[6] >> 26;
+  ptr[51] = values[6] >> 18;
+  ptr[52] = values[6] >> 10;
+  ptr[53] = values[6] >> 2;
+  ptr[54] = (values[6] << 6) | (values[7] >> 56);  // 2 bits of [6], 6 bits of [7]
+
+  ptr[55] = values[7] >> 48;
+  ptr[56] = values[7] >> 40;
+  ptr[57] = values[7] >> 32;
+  ptr[58] = values[7] >> 24;
+  ptr[59] = values[7] >> 16;
+  ptr[60] = values[7] >> 8;
+  ptr[61] = values[7];
+}
+
+// Packs eight 63-bit values into 63 consecutive bytes at ptr, most significant
+// bit first, with no padding between values. Bit 63 of an input is not masked
+// off, so every value is expected to fit in 63 bits.
+static inline void pack_bits_63(const uint64_t* values, uint8_t* ptr) {
+  ptr[0] = values[0] >> 55;
+  ptr[1] = values[0] >> 47;
+  ptr[2] = values[0] >> 39;
+  ptr[3] = values[0] >> 31;
+  ptr[4] = values[0] >> 23;
+  ptr[5] = values[0] >> 15;
+  ptr[6] = values[0] >> 7;
+  ptr[7] = (values[0] << 1) | (values[1] >> 62);  // 7 bits of [0], 1 bit of [1]
+
+  ptr[8] = values[1] >> 54;
+  ptr[9] = values[1] >> 46;
+  ptr[10] = values[1] >> 38;
+  ptr[11] = values[1] >> 30;
+  ptr[12] = values[1] >> 22;
+  ptr[13] = values[1] >> 14;
+  ptr[14] = values[1] >> 6;
+  ptr[15] = (values[1] << 2) | (values[2] >> 61);  // 6 bits of [1], 2 bits of [2]
+
+  ptr[16] = values[2] >> 53;
+  ptr[17] = values[2] >> 45;
+  ptr[18] = values[2] >> 37;
+  ptr[19] = values[2] >> 29;
+  ptr[20] = values[2] >> 21;
+  ptr[21] = values[2] >> 13;
+  ptr[22] = values[2] >> 5;
+  ptr[23] = (values[2] << 3) | (values[3] >> 60);  // 5 bits of [2], 3 bits of [3]
+
+  ptr[24] = values[3] >> 52;
+  ptr[25] = values[3] >> 44;
+  ptr[26] = values[3] >> 36;
+  ptr[27] = values[3] >> 28;
+  ptr[28] = values[3] >> 20;
+  ptr[29] = values[3] >> 12;
+  ptr[30] = values[3] >> 4;
+  ptr[31] = (values[3] << 4) | (values[4] >> 59);  // 4 bits of [3], 4 bits of [4]
+
+  ptr[32] = values[4] >> 51;
+  ptr[33] = values[4] >> 43;
+  ptr[34] = values[4] >> 35;
+  ptr[35] = values[4] >> 27;
+  ptr[36] = values[4] >> 19;
+  ptr[37] = values[4] >> 11;
+  ptr[38] = values[4] >> 3;
+  ptr[39] = (values[4] << 5) | (values[5] >> 58);  // 3 bits of [4], 5 bits of [5]
+
+  ptr[40] = values[5] >> 50;
+  ptr[41] = values[5] >> 42;
+  ptr[42] = values[5] >> 34;
+  ptr[43] = values[5] >> 26;
+  ptr[44] = values[5] >> 18;
+  ptr[45] = values[5] >> 10;
+  ptr[46] = values[5] >> 2;
+  ptr[47] = (values[5] << 6) | (values[6] >> 57);  // 2 bits of [5], 6 bits of [6]
+
+  ptr[48] = values[6] >> 49;
+  ptr[49] = values[6] >> 41;
+  ptr[50] = values[6] >> 33;
+  ptr[51] = values[6] >> 25;
+  ptr[52] = values[6] >> 17;
+  ptr[53] = values[6] >> 9;
+  ptr[54] = values[6] >> 1;
+  ptr[55] = (values[6] << 7) | (values[7] >> 56);  // 1 bit of [6], 7 bits of [7]
+
+  ptr[56] = values[7] >> 48;
+  ptr[57] = values[7] >> 40;
+  ptr[58] = values[7] >> 32;
+  ptr[59] = values[7] >> 24;
+  ptr[60] = values[7] >> 16;
+  ptr[61] = values[7] >> 8;
+  ptr[62] = values[7];
+}
+
+// Unpacks eight 1-bit values from the single byte at ptr into values[0..7],
+// taking the most significant bit first.
+static inline void unpack_bits_1(uint64_t* values, const uint8_t* ptr) {
+  for (int i = 0; i < 8; ++i) {
+    values[i] = (ptr[0] >> (7 - i)) & 1;
+  }
+}
+
+// Unpacks eight 2-bit values from 2 bytes at ptr into values[0..7],
+// most significant bits first (four values per byte).
+static inline void unpack_bits_2(uint64_t* values, const uint8_t* ptr) {
+  values[0] = ptr[0] >> 6;
+  values[1] = (ptr[0] >> 4) & 3;
+  values[2] = (ptr[0] >> 2) & 3;
+  values[3] = ptr[0] & 3;
+  values[4] = ptr[1] >> 6;
+  values[5] = (ptr[1] >> 4) & 3;
+  values[6] = (ptr[1] >> 2) & 3;
+  values[7] = ptr[1] & 3;
+}
+
+// Unpacks eight 3-bit values from 3 bytes at ptr into values[0..7],
+// most significant bits first. values[2] and values[5] straddle a byte boundary.
+static inline void unpack_bits_3(uint64_t* values, const uint8_t* ptr) {
+  values[0] = ptr[0] >> 5;
+  values[1] = (ptr[0] >> 2) & 7;
+  values[2] = ((ptr[0] & 3) << 1) | (ptr[1] >> 7);
+  values[3] = (ptr[1] >> 4) & 7;
+  values[4] = (ptr[1] >> 1) & 7;
+  values[5] = ((ptr[1] & 1) << 2) | (ptr[2] >> 6);
+  values[6] = (ptr[2] >> 3) & 7;
+  values[7] = ptr[2] & 7;
+}
+
+// Unpacks eight 4-bit values from 4 bytes at ptr into values[0..7]:
+// the high nibble of each byte comes first, then the low nibble.
+static inline void unpack_bits_4(uint64_t* values, const uint8_t* ptr) {
+  values[0] = ptr[0] >> 4;
+  values[1] = ptr[0] & 0xf;
+  values[2] = ptr[1] >> 4;
+  values[3] = ptr[1] & 0xf;
+  values[4] = ptr[2] >> 4;
+  values[5] = ptr[2] & 0xf;
+  values[6] = ptr[3] >> 4;
+  values[7] = ptr[3] & 0xf;
+}
+
+// Unpacks eight 5-bit values from 5 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_5(uint64_t* values, const uint8_t* ptr) {
+  values[0] = ptr[0] >> 3;
+  values[1] = ((ptr[0] & 7) << 2) | (ptr[1] >> 6);
+  values[2] = (ptr[1] >> 1) & 0x1f;
+  values[3] = ((ptr[1] & 1) << 4) | (ptr[2] >> 4);
+  values[4] = ((ptr[2] & 0xf) << 1) | (ptr[3] >> 7);
+  values[5] = (ptr[3] >> 2) & 0x1f;
+  values[6] = ((ptr[3] & 3) << 3) | (ptr[4] >> 5);
+  values[7] = ptr[4] & 0x1f;
+}
+
+// Unpacks eight 6-bit values from 6 bytes at ptr into values[0..7],
+// most significant bits first. Every four values occupy exactly three bytes.
+static inline void unpack_bits_6(uint64_t* values, const uint8_t* ptr) {
+  values[0] = ptr[0] >> 2;
+  values[1] = ((ptr[0] & 3) << 4) | (ptr[1] >> 4);
+  values[2] = ((ptr[1] & 0xf) << 2) | (ptr[2] >> 6);
+  values[3] = ptr[2] & 0x3f;
+
+  values[4] = ptr[3] >> 2;
+  values[5] = ((ptr[3] & 3) << 4) | (ptr[4] >> 4);
+  values[6] = ((ptr[4] & 0xf) << 2) | (ptr[5] >> 6);
+  values[7] = ptr[5] & 0x3f;
+}
+
+// Unpacks eight 7-bit values from 7 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_7(uint64_t* values, const uint8_t* ptr) {
+  values[0] = ptr[0] >> 1;
+  values[1] = ((ptr[0] & 1) << 6) | (ptr[1] >> 2);
+  values[2] = ((ptr[1] & 3) << 5) | (ptr[2] >> 3);
+  values[3] = ((ptr[2] & 7) << 4) | (ptr[3] >> 4);
+  values[4] = ((ptr[3] & 0xf) << 3) | (ptr[4] >> 5);
+  values[5] = ((ptr[4] & 0x1f) << 2) | (ptr[5] >> 6);
+  values[6] = ((ptr[5] & 0x3f) << 1) | (ptr[6] >> 7);
+  values[7] = ptr[6] & 0x7f;
+}
+
+// Unpacks eight 8-bit values from 8 bytes at ptr: values[i] is simply byte i.
+static inline void unpack_bits_8(uint64_t* values, const uint8_t* ptr) {
+  for (int i = 0; i < 8; ++i) {
+    values[i] = ptr[i];
+  }
+}
+
+// Unpacks eight 9-bit values from 9 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_9(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 1) | (ptr[1] >> 7);
+  values[1] = ((ptr[1] & 0x7f) << 2) | (ptr[2] >> 6);
+  values[2] = ((ptr[2] & 0x3f) << 3) | (ptr[3] >> 5);
+  values[3] = ((ptr[3] & 0x1f) << 4) | (ptr[4] >> 4);
+  values[4] = ((ptr[4] & 0xf) << 5) | (ptr[5] >> 3);
+  values[5] = ((ptr[5] & 7) << 6) | (ptr[6] >> 2);
+  values[6] = ((ptr[6] & 3) << 7) | (ptr[7] >> 1);
+  values[7] = ((ptr[7] & 1) << 8) | ptr[8];
+}
+
+// Unpacks eight 10-bit values from 10 bytes at ptr into values[0..7],
+// most significant bits first. Every four values occupy exactly five bytes.
+static inline void unpack_bits_10(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 2) | (ptr[1] >> 6);
+  values[1] = ((ptr[1] & 0x3f) << 4) | (ptr[2] >> 4);
+  values[2] = ((ptr[2] & 0xf) << 6) | (ptr[3] >> 2);
+  values[3] = ((ptr[3] & 3) << 8) | ptr[4];
+
+  values[4] = (ptr[5] << 2) | (ptr[6] >> 6);
+  values[5] = ((ptr[6] & 0x3f) << 4) | (ptr[7] >> 4);
+  values[6] = ((ptr[7] & 0xf) << 6) | (ptr[8] >> 2);
+  values[7] = ((ptr[8] & 3) << 8) | ptr[9];
+}
+
+// Unpacks eight 11-bit values from 11 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_11(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 3) | (ptr[1] >> 5);
+  values[1] = ((ptr[1] & 0x1f) << 6) | (ptr[2] >> 2);
+  values[2] = ((ptr[2] & 3) << 9) | (ptr[3] << 1) | (ptr[4] >> 7);
+  values[3] = ((ptr[4] & 0x7f) << 4) | (ptr[5] >> 4);
+  values[4] = ((ptr[5] & 0xf) << 7) | (ptr[6] >> 1);
+  values[5] = ((ptr[6] & 1) << 10) | (ptr[7] << 2) | (ptr[8] >> 6);
+  values[6] = ((ptr[8] & 0x3f) << 5) | (ptr[9] >> 3);
+  values[7] = ((ptr[9] & 7) << 8) | ptr[10];
+}
+
+// Unpacks eight 12-bit values from 12 bytes at ptr into values[0..7],
+// most significant bits first. Every two values occupy exactly three bytes.
+static inline void unpack_bits_12(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 4) | (ptr[1] >> 4);
+  values[1] = ((ptr[1] & 0xf) << 8) | ptr[2];
+
+  values[2] = (ptr[3] << 4) | (ptr[4] >> 4);
+  values[3] = ((ptr[4] & 0xf) << 8) | ptr[5];
+
+  values[4] = (ptr[6] << 4) | (ptr[7] >> 4);
+  values[5] = ((ptr[7] & 0xf) << 8) | ptr[8];
+
+  values[6] = (ptr[9] << 4) | (ptr[10] >> 4);
+  values[7] = ((ptr[10] & 0xf) << 8) | ptr[11];
+}
+
+// Unpacks eight 13-bit values from 13 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_13(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 5) | (ptr[1] >> 3);
+  values[1] = ((ptr[1] & 7) << 10) | (ptr[2] << 2) | (ptr[3] >> 6);
+  values[2] = ((ptr[3] & 0x3f) << 7) | (ptr[4] >> 1);
+  values[3] = ((ptr[4] & 1) << 12) | (ptr[5] << 4) | (ptr[6] >> 4);
+  values[4] = ((ptr[6] & 0xf) << 9) | (ptr[7] << 1) | (ptr[8] >> 7);
+  values[5] = ((ptr[8] & 0x7f) << 6) | (ptr[9] >> 2);
+  values[6] = ((ptr[9] & 3) << 11) | (ptr[10] << 3) | (ptr[11] >> 5);
+  values[7] = ((ptr[11] & 0x1f) << 8) | ptr[12];
+}
+
+// Unpacks eight 14-bit values from 14 bytes at ptr into values[0..7],
+// most significant bits first. Every four values occupy exactly seven bytes.
+static inline void unpack_bits_14(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 6) | (ptr[1] >> 2);
+  values[1] = ((ptr[1] & 3) << 12) | (ptr[2] << 4) | (ptr[3] >> 4);
+  values[2] = ((ptr[3] & 0xf) << 10) | (ptr[4] << 2) | (ptr[5] >> 6);
+  values[3] = ((ptr[5] & 0x3f) << 8) | ptr[6];
+
+  values[4] = (ptr[7] << 6) | (ptr[8] >> 2);
+  values[5] = ((ptr[8] & 3) << 12) | (ptr[9] << 4) | (ptr[10] >> 4);
+  values[6] = ((ptr[10] & 0xf) << 10) | (ptr[11] << 2) | (ptr[12] >> 6);
+  values[7] = ((ptr[12] & 0x3f) << 8) | ptr[13];
+}
+
+// Unpacks eight 15-bit values from 15 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_15(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 7) | (ptr[1] >> 1);
+  values[1] = ((ptr[1] & 1) << 14) | (ptr[2] << 6) | (ptr[3] >> 2);
+  values[2] = ((ptr[3] & 3) << 13) | (ptr[4] << 5) | (ptr[5] >> 3);
+  values[3] = ((ptr[5] & 7) << 12) | (ptr[6] << 4) | (ptr[7] >> 4);
+  values[4] = ((ptr[7] & 0xf) << 11) | (ptr[8] << 3) | (ptr[9] >> 5);
+  values[5] = ((ptr[9] & 0x1f) << 10) | (ptr[10] << 2) | (ptr[11] >> 6);
+  values[6] = ((ptr[11] & 0x3f) << 9) | (ptr[12] << 1) | (ptr[13] >> 7);
+  values[7] = ((ptr[13] & 0x7f) << 8) | ptr[14];
+}
+
+// Unpacks eight 16-bit values from 16 bytes at ptr into values[0..7];
+// each value is one big-endian byte pair.
+static inline void unpack_bits_16(uint64_t* values, const uint8_t* ptr) {
+  for (int i = 0; i < 8; ++i) {
+    values[i] = (ptr[2 * i] << 8) | ptr[2 * i + 1];
+  }
+}
+
+// Unpacks eight 17-bit values from 17 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_17(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 9) | (ptr[1] << 1) | (ptr[2] >> 7);
+  values[1] = ((ptr[2] & 0x7f) << 10) | (ptr[3] << 2) | (ptr[4] >> 6);
+  values[2] = ((ptr[4] & 0x3f) << 11) | (ptr[5] << 3) | (ptr[6] >> 5);
+  values[3] = ((ptr[6] & 0x1f) << 12) | (ptr[7] << 4) | (ptr[8] >> 4);
+  values[4] = ((ptr[8] & 0xf) << 13) | (ptr[9] << 5) | (ptr[10] >> 3);
+  values[5] = ((ptr[10] & 7) << 14) | (ptr[11] << 6) | (ptr[12] >> 2);
+  values[6] = ((ptr[12] & 3) << 15) | (ptr[13] << 7) | (ptr[14] >> 1);
+  values[7] = ((ptr[14] & 1) << 16) | (ptr[15] << 8) | ptr[16];
+}
+
+// Unpacks eight 18-bit values from 18 bytes at ptr into values[0..7],
+// most significant bits first. Every four values occupy exactly nine bytes.
+static inline void unpack_bits_18(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 10) | (ptr[1] << 2) | (ptr[2] >> 6);
+  values[1] = ((ptr[2] & 0x3f) << 12) | (ptr[3] << 4) | (ptr[4] >> 4);
+  values[2] = ((ptr[4] & 0xf) << 14) | (ptr[5] << 6) | (ptr[6] >> 2);
+  values[3] = ((ptr[6] & 3) << 16) | (ptr[7] << 8) | ptr[8];
+
+  values[4] = (ptr[9] << 10) | (ptr[10] << 2) | (ptr[11] >> 6);
+  values[5] = ((ptr[11] & 0x3f) << 12) | (ptr[12] << 4) | (ptr[13] >> 4);
+  values[6] = ((ptr[13] & 0xf) << 14) | (ptr[14] << 6) | (ptr[15] >> 2);
+  values[7] = ((ptr[15] & 3) << 16) | (ptr[16] << 8) | ptr[17];
+}
+
+// Unpacks eight 19-bit values from 19 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_19(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 11) | (ptr[1] << 3) | (ptr[2] >> 5);
+  values[1] = ((ptr[2] & 0x1f) << 14) | (ptr[3] << 6) | (ptr[4] >> 2);
+  values[2] = ((ptr[4] & 3) << 17) | (ptr[5] << 9) | (ptr[6] << 1) | (ptr[7] >> 7);
+  values[3] = ((ptr[7] & 0x7f) << 12) | (ptr[8] << 4) | (ptr[9] >> 4);
+  values[4] = ((ptr[9] & 0xf) << 15) | (ptr[10] << 7) | (ptr[11] >> 1);
+  values[5] = ((ptr[11] & 1) << 18) | (ptr[12] << 10) | (ptr[13] << 2) | (ptr[14] >> 6);
+  values[6] = ((ptr[14] & 0x3f) << 13) | (ptr[15] << 5) | (ptr[16] >> 3);
+  values[7] = ((ptr[16] & 7) << 16) | (ptr[17] << 8) | ptr[18];
+}
+
+// Unpacks eight 20-bit values from 20 bytes at ptr into values[0..7],
+// most significant bits first. Every two values occupy exactly five bytes.
+static inline void unpack_bits_20(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 12) | (ptr[1] << 4) | (ptr[2] >> 4);
+  values[1] = ((ptr[2] & 0xf) << 16) | (ptr[3] << 8) | ptr[4];
+
+  values[2] = (ptr[5] << 12) | (ptr[6] << 4) | (ptr[7] >> 4);
+  values[3] = ((ptr[7] & 0xf) << 16) | (ptr[8] << 8) | ptr[9];
+
+  values[4] = (ptr[10] << 12) | (ptr[11] << 4) | (ptr[12] >> 4);
+  values[5] = ((ptr[12] & 0xf) << 16) | (ptr[13] << 8) | ptr[14];
+
+  values[6] = (ptr[15] << 12) | (ptr[16] << 4) | (ptr[17] >> 4);
+  values[7] = ((ptr[17] & 0xf) << 16) | (ptr[18] << 8) | ptr[19];
+}
+
+// Unpacks eight 21-bit values from 21 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_21(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 13) | (ptr[1] << 5) | (ptr[2] >> 3);
+  values[1] = ((ptr[2] & 7) << 18) | (ptr[3] << 10) | (ptr[4] << 2) | (ptr[5] >> 6);
+  values[2] = ((ptr[5] & 0x3f) << 15) | (ptr[6] << 7) | (ptr[7] >> 1);
+  values[3] = ((ptr[7] & 1) << 20) | (ptr[8] << 12) | (ptr[9] << 4) | (ptr[10] >> 4);
+  values[4] = ((ptr[10] & 0xf) << 17) | (ptr[11] << 9) | (ptr[12] << 1) | (ptr[13] >> 7);
+  values[5] = ((ptr[13] & 0x7f) << 14) | (ptr[14] << 6) | (ptr[15] >> 2);
+  values[6] = ((ptr[15] & 3) << 19) | (ptr[16] << 11) | (ptr[17] << 3) | (ptr[18] >> 5);
+  values[7] = ((ptr[18] & 0x1f) << 16) | (ptr[19] << 8) | ptr[20];
+}
+
+// Unpacks eight 22-bit values from 22 bytes at ptr into values[0..7],
+// most significant bits first. Every four values occupy exactly eleven bytes.
+static inline void unpack_bits_22(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 14) | (ptr[1] << 6) | (ptr[2] >> 2);
+  values[1] = ((ptr[2] & 3) << 20) | (ptr[3] << 12) | (ptr[4] << 4) | (ptr[5] >> 4);
+  values[2] = ((ptr[5] & 0xf) << 18) | (ptr[6] << 10) | (ptr[7] << 2) | (ptr[8] >> 6);
+  values[3] = ((ptr[8] & 0x3f) << 16) | (ptr[9] << 8) | ptr[10];
+
+  values[4] = (ptr[11] << 14) | (ptr[12] << 6) | (ptr[13] >> 2);
+  values[5] = ((ptr[13] & 3) << 20) | (ptr[14] << 12) | (ptr[15] << 4) | (ptr[16] >> 4);
+  values[6] = ((ptr[16] & 0xf) << 18) | (ptr[17] << 10) | (ptr[18] << 2) | (ptr[19] >> 6);
+  values[7] = ((ptr[19] & 0x3f) << 16) | (ptr[20] << 8) | ptr[21];
+}
+
+// Unpacks eight 23-bit values from 23 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_23(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 15) | (ptr[1] << 7) | (ptr[2] >> 1);
+  values[1] = ((ptr[2] & 1) << 22) | (ptr[3] << 14) | (ptr[4] << 6) | (ptr[5] >> 2);
+  values[2] = ((ptr[5] & 3) << 21) | (ptr[6] << 13) | (ptr[7] << 5) | (ptr[8] >> 3);
+  values[3] = ((ptr[8] & 7) << 20) | (ptr[9] << 12) | (ptr[10] << 4) | (ptr[11] >> 4);
+  values[4] = ((ptr[11] & 0xf) << 19) | (ptr[12] << 11) | (ptr[13] << 3) | (ptr[14] >> 5);
+  values[5] = ((ptr[14] & 0x1f) << 18) | (ptr[15] << 10) | (ptr[16] << 2) | (ptr[17] >> 6);
+  values[6] = ((ptr[17] & 0x3f) << 17) | (ptr[18] << 9) | (ptr[19] << 1) | (ptr[20] >> 7);
+  values[7] = ((ptr[20] & 0x7f) << 16) | (ptr[21] << 8) | ptr[22];
+}
+
+// Unpacks eight 24-bit values from 24 bytes at ptr into values[0..7];
+// each value is one big-endian byte triple.
+static inline void unpack_bits_24(uint64_t* values, const uint8_t* ptr) {
+  for (int i = 0; i < 8; ++i) {
+    values[i] = (ptr[3 * i] << 16) | (ptr[3 * i + 1] << 8) | ptr[3 * i + 2];
+  }
+}
+
+// Unpacks eight 25-bit values from 25 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+static inline void unpack_bits_25(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 17) | (ptr[1] << 9) | (ptr[2] << 1) | (ptr[3] >> 7);
+  values[1] = ((ptr[3] & 0x7f) << 18) | (ptr[4] << 10) | (ptr[5] << 2) | (ptr[6] >> 6);
+  values[2] = ((ptr[6] & 0x3f) << 19) | (ptr[7] << 11) | (ptr[8] << 3) | (ptr[9] >> 5);
+  values[3] = ((ptr[9] & 0x1f) << 20) | (ptr[10] << 12) | (ptr[11] << 4) | (ptr[12] >> 4);
+  values[4] = ((ptr[12] & 0xf) << 21) | (ptr[13] << 13) | (ptr[14] << 5) | (ptr[15] >> 3);
+  values[5] = ((ptr[15] & 7) << 22) | (ptr[16] << 14) | (ptr[17] << 6) | (ptr[18] >> 2);
+  values[6] = ((ptr[18] & 3) << 23) | (ptr[19] << 15) | (ptr[20] << 7) | (ptr[21] >> 1);
+  // shifts by 24 are done in 64-bit arithmetic
+  values[7] = (static_cast<uint64_t>(ptr[21] & 1) << 24) | (ptr[22] << 16) | (ptr[23] << 8) | ptr[24];
+}
+
+// Unpacks eight 26-bit values from 26 bytes at ptr into values[0..7],
+// most significant bits first. Every four values occupy exactly thirteen bytes.
+// Shifts by 24 are done in 64-bit arithmetic.
+static inline void unpack_bits_26(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 18) | (ptr[1] << 10) | (ptr[2] << 2) | (ptr[3] >> 6);
+  values[1] = ((ptr[3] & 0x3f) << 20) | (ptr[4] << 12) | (ptr[5] << 4) | (ptr[6] >> 4);
+  values[2] = ((ptr[6] & 0xf) << 22) | (ptr[7] << 14) | (ptr[8] << 6) | (ptr[9] >> 2);
+  values[3] = (static_cast<uint64_t>(ptr[9] & 3) << 24) | (ptr[10] << 16) | (ptr[11] << 8) | ptr[12];
+
+  values[4] = (ptr[13] << 18) | (ptr[14] << 10) | (ptr[15] << 2) | (ptr[16] >> 6);
+  values[5] = ((ptr[16] & 0x3f) << 20) | (ptr[17] << 12) | (ptr[18] << 4) | (ptr[19] >> 4);
+  values[6] = ((ptr[19] & 0xf) << 22) | (ptr[20] << 14) | (ptr[21] << 6) | (ptr[22] >> 2);
+  values[7] = (static_cast<uint64_t>(ptr[22] & 3) << 24) | (ptr[23] << 16) | (ptr[24] << 8) | ptr[25];
+}
+
+// Unpacks eight 27-bit values from 27 bytes at ptr into values[0..7],
+// most significant bits first with no padding between values.
+// Shifts by 24 or more are done in 64-bit arithmetic.
+static inline void unpack_bits_27(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 19) | (ptr[1] << 11) | (ptr[2] << 3) | (ptr[3] >> 5);
+  values[1] = ((ptr[3] & 0x1f) << 22) | (ptr[4] << 14) | (ptr[5] << 6) | (ptr[6] >> 2);
+  values[2] = (static_cast<uint64_t>(ptr[6] & 3) << 25) | (ptr[7] << 17) | (ptr[8] << 9) | (ptr[9] << 1)
+      | (ptr[10] >> 7);
+  values[3] = ((ptr[10] & 0x7f) << 20) | (ptr[11] << 12) | (ptr[12] << 4) | (ptr[13] >> 4);
+  values[4] = ((ptr[13] & 0xf) << 23) | (ptr[14] << 15) | (ptr[15] << 7) | (ptr[16] >> 1);
+  values[5] = (static_cast<uint64_t>(ptr[16] & 1) << 26) | (ptr[17] << 18) | (ptr[18] << 10) | (ptr[19] << 2)
+      | (ptr[20] >> 6);
+  values[6] = ((ptr[20] & 0x3f) << 21) | (ptr[21] << 13) | (ptr[22] << 5) | (ptr[23] >> 3);
+  values[7] = (static_cast<uint64_t>(ptr[23] & 7) << 24) | (ptr[24] << 16) | (ptr[25] << 8) | ptr[26];
+}
+
+// Unpacks eight 28-bit values from 28 bytes at ptr into values[0..7],
+// most significant bits first. Every two values occupy exactly seven bytes.
+// Shifts by 24 are done in 64-bit arithmetic.
+static inline void unpack_bits_28(uint64_t* values, const uint8_t* ptr) {
+  values[0] = (ptr[0] << 20) | (ptr[1] << 12) | (ptr[2] << 4) | (ptr[3] >> 4);
+  values[1] = (static_cast<uint64_t>(ptr[3] & 0xf) << 24) | (ptr[4] << 16) | (ptr[5] << 8) | ptr[6];
+
+  values[2] = (ptr[7] << 20) | (ptr[8] << 12) | (ptr[9] << 4) | (ptr[10] >> 4);
+  values[3] = (static_cast<uint64_t>(ptr[10] & 0xf) << 24) | (ptr[11] << 16) | (ptr[12] << 8) | ptr[13];
+
+  values[4] = (ptr[14] << 20) | (ptr[15] << 12) | (ptr[16] << 4) | (ptr[17] >> 4);
+  values[5] = (static_cast<uint64_t>(ptr[17] & 0xf) << 24) | (ptr[18] << 16) | (ptr[19] << 8) | ptr[20];
+
+  values[6] = (ptr[21] << 20) | (ptr[22] << 12) | (ptr[23] << 4) | (ptr[24] >> 4);
+  values[7] = (static_cast<uint64_t>(ptr[24] & 0xf) << 24) | (ptr[25] << 16) | (ptr[26] << 8) | ptr[27];
+}
+
+// Extracts eight 29-bit fields from the 29 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[28].
+static inline void unpack_bits_29(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 21) | (byte(1) << 13) | (byte(2) << 5) | (byte(3) >> 3);
+  values[1] = ((byte(3) & 7) << 26) | (byte(4) << 18) | (byte(5) << 10) | (byte(6) << 2)
+            | (byte(7) >> 6);
+  values[2] = ((byte(7) & 0x3f) << 23) | (byte(8) << 15) | (byte(9) << 7) | (byte(10) >> 1);
+  values[3] = ((byte(10) & 1) << 28) | (byte(11) << 20) | (byte(12) << 12) | (byte(13) << 4)
+            | (byte(14) >> 4);
+  values[4] = ((byte(14) & 0xf) << 25) | (byte(15) << 17) | (byte(16) << 9) | (byte(17) << 1)
+            | (byte(18) >> 7);
+  values[5] = ((byte(18) & 0x7f) << 22) | (byte(19) << 14) | (byte(20) << 6) | (byte(21) >> 2);
+  values[6] = ((byte(21) & 3) << 27) | (byte(22) << 19) | (byte(23) << 11) | (byte(24) << 3)
+            | (byte(25) >> 5);
+  values[7] = ((byte(25) & 0x1f) << 24) | (byte(26) << 16) | (byte(27) << 8) | byte(28);
+}
+
+// Extracts eight 30-bit fields from the 30 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first; every group of
+// four fields occupies exactly fifteen bytes.
+static inline void unpack_bits_30(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 22) | (byte(1) << 14) | (byte(2) << 6) | (byte(3) >> 2);
+  values[1] = ((byte(3) & 3) << 28) | (byte(4) << 20) | (byte(5) << 12) | (byte(6) << 4)
+            | (byte(7) >> 4);
+  values[2] = ((byte(7) & 0xf) << 26) | (byte(8) << 18) | (byte(9) << 10) | (byte(10) << 2)
+            | (byte(11) >> 6);
+  values[3] = ((byte(11) & 0x3f) << 24) | (byte(12) << 16) | (byte(13) << 8) | byte(14);
+  values[4] = (byte(15) << 22) | (byte(16) << 14) | (byte(17) << 6) | (byte(18) >> 2);
+  values[5] = ((byte(18) & 3) << 28) | (byte(19) << 20) | (byte(20) << 12) | (byte(21) << 4)
+            | (byte(22) >> 4);
+  values[6] = ((byte(22) & 0xf) << 26) | (byte(23) << 18) | (byte(24) << 10) | (byte(25) << 2)
+            | (byte(26) >> 6);
+  values[7] = ((byte(26) & 0x3f) << 24) | (byte(27) << 16) | (byte(28) << 8) | byte(29);
+}
+
+// Extracts eight 31-bit fields from the 31 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[30].
+static inline void unpack_bits_31(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 23) | (byte(1) << 15) | (byte(2) << 7) | (byte(3) >> 1);
+  values[1] = ((byte(3) & 1) << 30) | (byte(4) << 22) | (byte(5) << 14) | (byte(6) << 6)
+            | (byte(7) >> 2);
+  values[2] = ((byte(7) & 3) << 29) | (byte(8) << 21) | (byte(9) << 13) | (byte(10) << 5)
+            | (byte(11) >> 3);
+  values[3] = ((byte(11) & 7) << 28) | (byte(12) << 20) | (byte(13) << 12) | (byte(14) << 4)
+            | (byte(15) >> 4);
+  values[4] = ((byte(15) & 0xf) << 27) | (byte(16) << 19) | (byte(17) << 11) | (byte(18) << 3)
+            | (byte(19) >> 5);
+  values[5] = ((byte(19) & 0x1f) << 26) | (byte(20) << 18) | (byte(21) << 10) | (byte(22) << 2)
+            | (byte(23) >> 6);
+  values[6] = ((byte(23) & 0x3f) << 25) | (byte(24) << 17) | (byte(25) << 9) | (byte(26) << 1)
+            | (byte(27) >> 7);
+  values[7] = ((byte(27) & 0x7f) << 24) | (byte(28) << 16) | (byte(29) << 8) | byte(30);
+}
+
+// Extracts eight 32-bit fields from the 32 bytes at ptr into values[0..7].
+// Each field is four whole bytes, most significant byte first.
+static inline void unpack_bits_32(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 24) | (byte(1) << 16) | (byte(2) << 8) | byte(3);
+  values[1] = (byte(4) << 24) | (byte(5) << 16) | (byte(6) << 8) | byte(7);
+  values[2] = (byte(8) << 24) | (byte(9) << 16) | (byte(10) << 8) | byte(11);
+  values[3] = (byte(12) << 24) | (byte(13) << 16) | (byte(14) << 8) | byte(15);
+  values[4] = (byte(16) << 24) | (byte(17) << 16) | (byte(18) << 8) | byte(19);
+  values[5] = (byte(20) << 24) | (byte(21) << 16) | (byte(22) << 8) | byte(23);
+  values[6] = (byte(24) << 24) | (byte(25) << 16) | (byte(26) << 8) | byte(27);
+  values[7] = (byte(28) << 24) | (byte(29) << 16) | (byte(30) << 8) | byte(31);
+}
+
+// Extracts eight 33-bit fields from the 33 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[32].
+static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) {
+  // Every byte is widened to uint64_t BEFORE shifting. A uint8_t operand is
+  // otherwise promoted to int, and shifting a byte >= 0x80 left by 24 yields a
+  // negative int whose sign extension would set bits 32..63 of the result
+  // (this previously corrupted values[7]).
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 25) | (byte(1) << 17) | (byte(2) << 9) | (byte(3) << 1)
+            | (byte(4) >> 7);
+  values[1] = ((byte(4) & 0x7f) << 26) | (byte(5) << 18) | (byte(6) << 10) | (byte(7) << 2)
+            | (byte(8) >> 6);
+  values[2] = ((byte(8) & 0x3f) << 27) | (byte(9) << 19) | (byte(10) << 11) | (byte(11) << 3)
+            | (byte(12) >> 5);
+  values[3] = ((byte(12) & 0x1f) << 28) | (byte(13) << 20) | (byte(14) << 12) | (byte(15) << 4)
+            | (byte(16) >> 4);
+  values[4] = ((byte(16) & 0xf) << 29) | (byte(17) << 21) | (byte(18) << 13) | (byte(19) << 5)
+            | (byte(20) >> 3);
+  values[5] = ((byte(20) & 7) << 30) | (byte(21) << 22) | (byte(22) << 14) | (byte(23) << 6)
+            | (byte(24) >> 2);
+  values[6] = ((byte(24) & 3) << 31) | (byte(25) << 23) | (byte(26) << 15) | (byte(27) << 7)
+            | (byte(28) >> 1);
+  values[7] = ((byte(28) & 1) << 32) | (byte(29) << 24) | (byte(30) << 16) | (byte(31) << 8)
+            | byte(32);
+}
+
+// Extracts eight 34-bit fields from the 34 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first; every group of
+// four fields occupies exactly seventeen bytes.
+static inline void unpack_bits_34(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 26) | (byte(1) << 18) | (byte(2) << 10) | (byte(3) << 2)
+            | (byte(4) >> 6);
+  values[1] = ((byte(4) & 0x3f) << 28) | (byte(5) << 20) | (byte(6) << 12) | (byte(7) << 4)
+            | (byte(8) >> 4);
+  values[2] = ((byte(8) & 0xf) << 30) | (byte(9) << 22) | (byte(10) << 14) | (byte(11) << 6)
+            | (byte(12) >> 2);
+  values[3] = ((byte(12) & 3) << 32) | (byte(13) << 24) | (byte(14) << 16) | (byte(15) << 8)
+            | byte(16);
+  values[4] = (byte(17) << 26) | (byte(18) << 18) | (byte(19) << 10) | (byte(20) << 2)
+            | (byte(21) >> 6);
+  values[5] = ((byte(21) & 0x3f) << 28) | (byte(22) << 20) | (byte(23) << 12) | (byte(24) << 4)
+            | (byte(25) >> 4);
+  values[6] = ((byte(25) & 0xf) << 30) | (byte(26) << 22) | (byte(27) << 14) | (byte(28) << 6)
+            | (byte(29) >> 2);
+  values[7] = ((byte(29) & 3) << 32) | (byte(30) << 24) | (byte(31) << 16) | (byte(32) << 8)
+            | byte(33);
+}
+
+// Extracts eight 35-bit fields from the 35 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[34].
+static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 27) | (byte(1) << 19) | (byte(2) << 11) | (byte(3) << 3)
+            | (byte(4) >> 5);
+  values[1] = ((byte(4) & 0x1f) << 30) | (byte(5) << 22) | (byte(6) << 14) | (byte(7) << 6)
+            | (byte(8) >> 2);
+  // values[1] consumed the upper six bits of byte 8, so BOTH of its low bits
+  // (mask 3) belong to values[2]. The mask used to be 2, which dropped bit 33.
+  values[2] = ((byte(8) & 3) << 33) | (byte(9) << 25) | (byte(10) << 17) | (byte(11) << 9)
+            | (byte(12) << 1) | (byte(13) >> 7);
+  values[3] = ((byte(13) & 0x7f) << 28) | (byte(14) << 20) | (byte(15) << 12) | (byte(16) << 4)
+            | (byte(17) >> 4);
+  values[4] = ((byte(17) & 0xf) << 31) | (byte(18) << 23) | (byte(19) << 15) | (byte(20) << 7)
+            | (byte(21) >> 1);
+  values[5] = ((byte(21) & 1) << 34) | (byte(22) << 26) | (byte(23) << 18) | (byte(24) << 10)
+            | (byte(25) << 2) | (byte(26) >> 6);
+  values[6] = ((byte(26) & 0x3f) << 29) | (byte(27) << 21) | (byte(28) << 13) | (byte(29) << 5)
+            | (byte(30) >> 3);
+  values[7] = ((byte(30) & 7) << 32) | (byte(31) << 24) | (byte(32) << 16) | (byte(33) << 8)
+            | byte(34);
+}
+
+// Extracts eight 36-bit fields from the 36 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first; every pair of
+// fields occupies exactly nine bytes.
+static inline void unpack_bits_36(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 28) | (byte(1) << 20) | (byte(2) << 12) | (byte(3) << 4)
+            | (byte(4) >> 4);
+  values[1] = ((byte(4) & 0xf) << 32) | (byte(5) << 24) | (byte(6) << 16) | (byte(7) << 8)
+            | byte(8);
+  values[2] = (byte(9) << 28) | (byte(10) << 20) | (byte(11) << 12) | (byte(12) << 4)
+            | (byte(13) >> 4);
+  values[3] = ((byte(13) & 0xf) << 32) | (byte(14) << 24) | (byte(15) << 16) | (byte(16) << 8)
+            | byte(17);
+  values[4] = (byte(18) << 28) | (byte(19) << 20) | (byte(20) << 12) | (byte(21) << 4)
+            | (byte(22) >> 4);
+  values[5] = ((byte(22) & 0xf) << 32) | (byte(23) << 24) | (byte(24) << 16) | (byte(25) << 8)
+            | byte(26);
+  values[6] = (byte(27) << 28) | (byte(28) << 20) | (byte(29) << 12) | (byte(30) << 4)
+            | (byte(31) >> 4);
+  values[7] = ((byte(31) & 0xf) << 32) | (byte(32) << 24) | (byte(33) << 16) | (byte(34) << 8)
+            | byte(35);
+}
+
+// Extracts eight 37-bit fields from the 37 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[36].
+static inline void unpack_bits_37(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 29) | (byte(1) << 21) | (byte(2) << 13) | (byte(3) << 5)
+            | (byte(4) >> 3);
+  values[1] = ((byte(4) & 7) << 34) | (byte(5) << 26) | (byte(6) << 18) | (byte(7) << 10)
+            | (byte(8) << 2) | (byte(9) >> 6);
+  values[2] = ((byte(9) & 0x3f) << 31) | (byte(10) << 23) | (byte(11) << 15) | (byte(12) << 7)
+            | (byte(13) >> 1);
+  values[3] = ((byte(13) & 1) << 36) | (byte(14) << 28) | (byte(15) << 20) | (byte(16) << 12)
+            | (byte(17) << 4) | (byte(18) >> 4);
+  values[4] = ((byte(18) & 0xf) << 33) | (byte(19) << 25) | (byte(20) << 17) | (byte(21) << 9)
+            | (byte(22) << 1) | (byte(23) >> 7);
+  values[5] = ((byte(23) & 0x7f) << 30) | (byte(24) << 22) | (byte(25) << 14) | (byte(26) << 6)
+            | (byte(27) >> 2);
+  values[6] = ((byte(27) & 3) << 35) | (byte(28) << 27) | (byte(29) << 19) | (byte(30) << 11)
+            | (byte(31) << 3) | (byte(32) >> 5);
+  values[7] = ((byte(32) & 0x1f) << 32) | (byte(33) << 24) | (byte(34) << 16) | (byte(35) << 8)
+            | byte(36);
+}
+
+// Extracts eight 38-bit fields from the 38 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first; every group of
+// four fields occupies exactly nineteen bytes.
+static inline void unpack_bits_38(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 30) | (byte(1) << 22) | (byte(2) << 14) | (byte(3) << 6)
+            | (byte(4) >> 2);
+  values[1] = ((byte(4) & 3) << 36) | (byte(5) << 28) | (byte(6) << 20) | (byte(7) << 12)
+            | (byte(8) << 4) | (byte(9) >> 4);
+  values[2] = ((byte(9) & 0xf) << 34) | (byte(10) << 26) | (byte(11) << 18) | (byte(12) << 10)
+            | (byte(13) << 2) | (byte(14) >> 6);
+  values[3] = ((byte(14) & 0x3f) << 32) | (byte(15) << 24) | (byte(16) << 16) | (byte(17) << 8)
+            | byte(18);
+  values[4] = (byte(19) << 30) | (byte(20) << 22) | (byte(21) << 14) | (byte(22) << 6)
+            | (byte(23) >> 2);
+  values[5] = ((byte(23) & 3) << 36) | (byte(24) << 28) | (byte(25) << 20) | (byte(26) << 12)
+            | (byte(27) << 4) | (byte(28) >> 4);
+  values[6] = ((byte(28) & 0xf) << 34) | (byte(29) << 26) | (byte(30) << 18) | (byte(31) << 10)
+            | (byte(32) << 2) | (byte(33) >> 6);
+  values[7] = ((byte(33) & 0x3f) << 32) | (byte(34) << 24) | (byte(35) << 16) | (byte(36) << 8)
+            | byte(37);
+}
+
+// Extracts eight 39-bit fields from the 39 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[38].
+static inline void unpack_bits_39(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 31) | (byte(1) << 23) | (byte(2) << 15) | (byte(3) << 7)
+            | (byte(4) >> 1);
+  values[1] = ((byte(4) & 1) << 38) | (byte(5) << 30) | (byte(6) << 22) | (byte(7) << 14)
+            | (byte(8) << 6) | (byte(9) >> 2);
+  values[2] = ((byte(9) & 3) << 37) | (byte(10) << 29) | (byte(11) << 21) | (byte(12) << 13)
+            | (byte(13) << 5) | (byte(14) >> 3);
+  values[3] = ((byte(14) & 7) << 36) | (byte(15) << 28) | (byte(16) << 20) | (byte(17) << 12)
+            | (byte(18) << 4) | (byte(19) >> 4);
+  values[4] = ((byte(19) & 0xf) << 35) | (byte(20) << 27) | (byte(21) << 19) | (byte(22) << 11)
+            | (byte(23) << 3) | (byte(24) >> 5);
+  values[5] = ((byte(24) & 0x1f) << 34) | (byte(25) << 26) | (byte(26) << 18) | (byte(27) << 10)
+            | (byte(28) << 2) | (byte(29) >> 6);
+  values[6] = ((byte(29) & 0x3f) << 33) | (byte(30) << 25) | (byte(31) << 17) | (byte(32) << 9)
+            | (byte(33) << 1) | (byte(34) >> 7);
+  values[7] = ((byte(34) & 0x7f) << 32) | (byte(35) << 24) | (byte(36) << 16) | (byte(37) << 8)
+            | byte(38);
+}
+
+// Extracts eight 40-bit fields from the 40 bytes at ptr into values[0..7].
+// Each field is five whole bytes, most significant byte first.
+static inline void unpack_bits_40(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 32) | (byte(1) << 24) | (byte(2) << 16) | (byte(3) << 8)
+            | byte(4);
+  values[1] = (byte(5) << 32) | (byte(6) << 24) | (byte(7) << 16) | (byte(8) << 8)
+            | byte(9);
+  values[2] = (byte(10) << 32) | (byte(11) << 24) | (byte(12) << 16) | (byte(13) << 8)
+            | byte(14);
+  values[3] = (byte(15) << 32) | (byte(16) << 24) | (byte(17) << 16) | (byte(18) << 8)
+            | byte(19);
+  values[4] = (byte(20) << 32) | (byte(21) << 24) | (byte(22) << 16) | (byte(23) << 8)
+            | byte(24);
+  values[5] = (byte(25) << 32) | (byte(26) << 24) | (byte(27) << 16) | (byte(28) << 8)
+            | byte(29);
+  values[6] = (byte(30) << 32) | (byte(31) << 24) | (byte(32) << 16) | (byte(33) << 8)
+            | byte(34);
+  values[7] = (byte(35) << 32) | (byte(36) << 24) | (byte(37) << 16) | (byte(38) << 8)
+            | byte(39);
+}
+
+// Extracts eight 41-bit fields from the 41 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[40].
+static inline void unpack_bits_41(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 33) | (byte(1) << 25) | (byte(2) << 17) | (byte(3) << 9)
+            | (byte(4) << 1) | (byte(5) >> 7);
+  values[1] = ((byte(5) & 0x7f) << 34) | (byte(6) << 26) | (byte(7) << 18) | (byte(8) << 10)
+            | (byte(9) << 2) | (byte(10) >> 6);
+  values[2] = ((byte(10) & 0x3f) << 35) | (byte(11) << 27) | (byte(12) << 19) | (byte(13) << 11)
+            | (byte(14) << 3) | (byte(15) >> 5);
+  values[3] = ((byte(15) & 0x1f) << 36) | (byte(16) << 28) | (byte(17) << 20) | (byte(18) << 12)
+            | (byte(19) << 4) | (byte(20) >> 4);
+  values[4] = ((byte(20) & 0xf) << 37) | (byte(21) << 29) | (byte(22) << 21) | (byte(23) << 13)
+            | (byte(24) << 5) | (byte(25) >> 3);
+  values[5] = ((byte(25) & 7) << 38) | (byte(26) << 30) | (byte(27) << 22) | (byte(28) << 14)
+            | (byte(29) << 6) | (byte(30) >> 2);
+  values[6] = ((byte(30) & 3) << 39) | (byte(31) << 31) | (byte(32) << 23) | (byte(33) << 15)
+            | (byte(34) << 7) | (byte(35) >> 1);
+  values[7] = ((byte(35) & 1) << 40) | (byte(36) << 32) | (byte(37) << 24) | (byte(38) << 16)
+            | (byte(39) << 8) | byte(40);
+}
+
+// Extracts eight 42-bit fields from the 42 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first; every group of
+// four fields occupies exactly twenty-one bytes.
+static inline void unpack_bits_42(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 34) | (byte(1) << 26) | (byte(2) << 18) | (byte(3) << 10)
+            | (byte(4) << 2) | (byte(5) >> 6);
+  values[1] = ((byte(5) & 0x3f) << 36) | (byte(6) << 28) | (byte(7) << 20) | (byte(8) << 12)
+            | (byte(9) << 4) | (byte(10) >> 4);
+  values[2] = ((byte(10) & 0xf) << 38) | (byte(11) << 30) | (byte(12) << 22) | (byte(13) << 14)
+            | (byte(14) << 6) | (byte(15) >> 2);
+  values[3] = ((byte(15) & 3) << 40) | (byte(16) << 32) | (byte(17) << 24) | (byte(18) << 16)
+            | (byte(19) << 8) | byte(20);
+  values[4] = (byte(21) << 34) | (byte(22) << 26) | (byte(23) << 18) | (byte(24) << 10)
+            | (byte(25) << 2) | (byte(26) >> 6);
+  values[5] = ((byte(26) & 0x3f) << 36) | (byte(27) << 28) | (byte(28) << 20) | (byte(29) << 12)
+            | (byte(30) << 4) | (byte(31) >> 4);
+  values[6] = ((byte(31) & 0xf) << 38) | (byte(32) << 30) | (byte(33) << 22) | (byte(34) << 14)
+            | (byte(35) << 6) | (byte(36) >> 2);
+  values[7] = ((byte(36) & 3) << 40) | (byte(37) << 32) | (byte(38) << 24) | (byte(39) << 16)
+            | (byte(40) << 8) | byte(41);
+}
+
+// Extracts eight 43-bit fields from the 43 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[42].
+static inline void unpack_bits_43(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 35) | (byte(1) << 27) | (byte(2) << 19) | (byte(3) << 11)
+            | (byte(4) << 3) | (byte(5) >> 5);
+  values[1] = ((byte(5) & 0x1f) << 38) | (byte(6) << 30) | (byte(7) << 22) | (byte(8) << 14)
+            | (byte(9) << 6) | (byte(10) >> 2);
+  values[2] = ((byte(10) & 3) << 41) | (byte(11) << 33) | (byte(12) << 25) | (byte(13) << 17)
+            | (byte(14) << 9) | (byte(15) << 1) | (byte(16) >> 7);
+  values[3] = ((byte(16) & 0x7f) << 36) | (byte(17) << 28) | (byte(18) << 20) | (byte(19) << 12)
+            | (byte(20) << 4) | (byte(21) >> 4);
+  values[4] = ((byte(21) & 0xf) << 39) | (byte(22) << 31) | (byte(23) << 23) | (byte(24) << 15)
+            | (byte(25) << 7) | (byte(26) >> 1);
+  values[5] = ((byte(26) & 1) << 42) | (byte(27) << 34) | (byte(28) << 26) | (byte(29) << 18)
+            | (byte(30) << 10) | (byte(31) << 2) | (byte(32) >> 6);
+  values[6] = ((byte(32) & 0x3f) << 37) | (byte(33) << 29) | (byte(34) << 21) | (byte(35) << 13)
+            | (byte(36) << 5) | (byte(37) >> 3);
+  values[7] = ((byte(37) & 7) << 40) | (byte(38) << 32) | (byte(39) << 24) | (byte(40) << 16)
+            | (byte(41) << 8) | byte(42);
+}
+
+// Extracts eight 44-bit fields from the 44 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first; every pair of
+// fields occupies exactly eleven bytes.
+static inline void unpack_bits_44(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 36) | (byte(1) << 28) | (byte(2) << 20) | (byte(3) << 12)
+            | (byte(4) << 4) | (byte(5) >> 4);
+  values[1] = ((byte(5) & 0xf) << 40) | (byte(6) << 32) | (byte(7) << 24) | (byte(8) << 16)
+            | (byte(9) << 8) | byte(10);
+  values[2] = (byte(11) << 36) | (byte(12) << 28) | (byte(13) << 20) | (byte(14) << 12)
+            | (byte(15) << 4) | (byte(16) >> 4);
+  values[3] = ((byte(16) & 0xf) << 40) | (byte(17) << 32) | (byte(18) << 24) | (byte(19) << 16)
+            | (byte(20) << 8) | byte(21);
+  values[4] = (byte(22) << 36) | (byte(23) << 28) | (byte(24) << 20) | (byte(25) << 12)
+            | (byte(26) << 4) | (byte(27) >> 4);
+  values[5] = ((byte(27) & 0xf) << 40) | (byte(28) << 32) | (byte(29) << 24) | (byte(30) << 16)
+            | (byte(31) << 8) | byte(32);
+  values[6] = (byte(33) << 36) | (byte(34) << 28) | (byte(35) << 20) | (byte(36) << 12)
+            | (byte(37) << 4) | (byte(38) >> 4);
+  values[7] = ((byte(38) & 0xf) << 40) | (byte(39) << 32) | (byte(40) << 24) | (byte(41) << 16)
+            | (byte(42) << 8) | byte(43);
+}
+
+// Extracts eight 45-bit fields from the 45 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[44].
+static inline void unpack_bits_45(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 37) | (byte(1) << 29) | (byte(2) << 21) | (byte(3) << 13)
+            | (byte(4) << 5) | (byte(5) >> 3);
+  values[1] = ((byte(5) & 7) << 42) | (byte(6) << 34) | (byte(7) << 26) | (byte(8) << 18)
+            | (byte(9) << 10) | (byte(10) << 2) | (byte(11) >> 6);
+  values[2] = ((byte(11) & 0x3f) << 39) | (byte(12) << 31) | (byte(13) << 23) | (byte(14) << 15)
+            | (byte(15) << 7) | (byte(16) >> 1);
+  values[3] = ((byte(16) & 1) << 44) | (byte(17) << 36) | (byte(18) << 28) | (byte(19) << 20)
+            | (byte(20) << 12) | (byte(21) << 4) | (byte(22) >> 4);
+  values[4] = ((byte(22) & 0xf) << 41) | (byte(23) << 33) | (byte(24) << 25) | (byte(25) << 17)
+            | (byte(26) << 9) | (byte(27) << 1) | (byte(28) >> 7);
+  values[5] = ((byte(28) & 0x7f) << 38) | (byte(29) << 30) | (byte(30) << 22) | (byte(31) << 14)
+            | (byte(32) << 6) | (byte(33) >> 2);
+  values[6] = ((byte(33) & 3) << 43) | (byte(34) << 35) | (byte(35) << 27) | (byte(36) << 19)
+            | (byte(37) << 11) | (byte(38) << 3) | (byte(39) >> 5);
+  values[7] = ((byte(39) & 0x1f) << 40) | (byte(40) << 32) | (byte(41) << 24) | (byte(42) << 16)
+            | (byte(43) << 8) | byte(44);
+}
+
+// Extracts eight 46-bit fields from the 46 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first; every group of
+// four fields occupies exactly twenty-three bytes.
+static inline void unpack_bits_46(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 38) | (byte(1) << 30) | (byte(2) << 22) | (byte(3) << 14)
+            | (byte(4) << 6) | (byte(5) >> 2);
+  values[1] = ((byte(5) & 3) << 44) | (byte(6) << 36) | (byte(7) << 28) | (byte(8) << 20)
+            | (byte(9) << 12) | (byte(10) << 4) | (byte(11) >> 4);
+  values[2] = ((byte(11) & 0xf) << 42) | (byte(12) << 34) | (byte(13) << 26) | (byte(14) << 18)
+            | (byte(15) << 10) | (byte(16) << 2) | (byte(17) >> 6);
+  values[3] = ((byte(17) & 0x3f) << 40) | (byte(18) << 32) | (byte(19) << 24) | (byte(20) << 16)
+            | (byte(21) << 8) | byte(22);
+  values[4] = (byte(23) << 38) | (byte(24) << 30) | (byte(25) << 22) | (byte(26) << 14)
+            | (byte(27) << 6) | (byte(28) >> 2);
+  values[5] = ((byte(28) & 3) << 44) | (byte(29) << 36) | (byte(30) << 28) | (byte(31) << 20)
+            | (byte(32) << 12) | (byte(33) << 4) | (byte(34) >> 4);
+  values[6] = ((byte(34) & 0xf) << 42) | (byte(35) << 34) | (byte(36) << 26) | (byte(37) << 18)
+            | (byte(38) << 10) | (byte(39) << 2) | (byte(40) >> 6);
+  values[7] = ((byte(40) & 0x3f) << 40) | (byte(41) << 32) | (byte(42) << 24) | (byte(43) << 16)
+            | (byte(44) << 8) | byte(45);
+}
+
+// Extracts eight 47-bit fields from the 47 bytes at ptr into values[0..7].
+// Fields are packed back to back, most significant bit first: values[0]
+// starts at the top bit of ptr[0] and values[7] ends at the low bit of ptr[46].
+static inline void unpack_bits_47(uint64_t* values, const uint8_t* ptr) {
+  // Widen each input byte to 64 bits before it is shifted into position.
+  const auto byte = [ptr](int i) -> uint64_t { return ptr[i]; };
+
+  values[0] = (byte(0) << 39) | (byte(1) << 31) | (byte(2) << 23) | (byte(3) << 15)
+            | (byte(4) << 7) | (byte(5) >> 1);
+  values[1] = ((byte(5) & 1) << 46) | (byte(6) << 38) | (byte(7) << 30) | (byte(8) << 22)
+            | (byte(9) << 14) | (byte(10) << 6) | (byte(11) >> 2);
+  values[2] = ((byte(11) & 3) << 45) | (byte(12) << 37) | (byte(13) << 29) | (byte(14) << 21)
+            | (byte(15) << 13) | (byte(16) << 5) | (byte(17) >> 3);
+  values[3] = ((byte(17) & 7) << 44) | (byte(18) << 36) | (byte(19) << 28) | (byte(20) << 20)
+            | (byte(21) << 12) | (byte(22) << 4) | (byte(23) >> 4);
+  values[4] = ((byte(23) & 0xf) << 43) | (byte(24) << 35) | (byte(25) << 27) | (byte(26) << 19)
+            | (byte(27) << 11) | (byte(28) << 3) | (byte(29) >> 5);
+  values[5] = ((byte(29) & 0x1f) << 42) | (byte(30) << 34) | (byte(31) << 26) | (byte(32) << 18)
+            | (byte(33) << 10) | (byte(34) << 2) | (byte(35) >> 6);
+  values[6] = ((byte(35) & 0x3f) << 41) | (byte(36) << 33) | (byte(37) << 25) | (byte(38) << 17)
+            | (byte(39) << 9) | (byte(40) << 1) | (byte(41) >> 7);
+  values[7] = ((byte(41) & 0x7f) << 40) | (byte(42) << 32) | (byte(43) << 24) | (byte(44) << 16)
+            | (byte(45) << 8) | byte(46);
+}
+
+static inline void unpack_bits_48(uint64_t* values, const uint8_t* ptr) { // unpacks eight 48-bit values, most significant byte first, from the 48 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 40; // every value is exactly 6 whole bytes, so no byte is shared between values
+  values[0] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[0] |= *ptr++ << 16; // uncast shifts here never exceed 16, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 8;
+  values[0] |= *ptr++;
+  values[1] = static_cast<uint64_t>(*ptr++) << 40;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[1] |= *ptr++ << 16;
+  values[1] |= *ptr++ << 8;
+  values[1] |= *ptr++;
+  values[2] = static_cast<uint64_t>(*ptr++) << 40;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[2] |= *ptr++ << 16;
+  values[2] |= *ptr++ << 8;
+  values[2] |= *ptr++;
+  values[3] = static_cast<uint64_t>(*ptr++) << 40;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[3] |= *ptr++ << 16;
+  values[3] |= *ptr++ << 8;
+  values[3] |= *ptr++;
+  values[4] = static_cast<uint64_t>(*ptr++) << 40;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[4] |= *ptr++ << 16;
+  values[4] |= *ptr++ << 8;
+  values[4] |= *ptr++;
+  values[5] = static_cast<uint64_t>(*ptr++) << 40;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[5] |= *ptr++ << 16;
+  values[5] |= *ptr++ << 8;
+  values[5] |= *ptr++;
+  values[6] = static_cast<uint64_t>(*ptr++) << 40;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[6] |= *ptr++ << 16;
+  values[6] |= *ptr++ << 8;
+  values[6] |= *ptr++;
+  values[7] = static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr; // 48th and last byte read
+}
+
+static inline void unpack_bits_49(uint64_t* values, const uint8_t* ptr) { // unpacks eight 49-bit values, most significant bits first, from the 49 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 41; // values[0]: 6 whole bytes + 1 high bit of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 33;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 25;
+  values[0] |= *ptr++ << 17; // uncast shifts here never exceed 23, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 9;
+  values[0] |= *ptr++ << 1;
+  values[0] |= *ptr >> 7; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 7 low bits of the shared byte + 5 whole bytes + 2 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 0x7f) << 42;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[1] |= *ptr++ << 18;
+  values[1] |= *ptr++ << 10;
+  values[1] |= *ptr++ << 2;
+  values[1] |= *ptr >> 6;
+  // values[2]: 6 low bits of the shared byte + 5 whole bytes + 3 high bits of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 0x3f) << 43;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 35;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 27;
+  values[2] |= *ptr++ << 19;
+  values[2] |= *ptr++ << 11;
+  values[2] |= *ptr++ << 3;
+  values[2] |= *ptr >> 5;
+  // values[3]: 5 low bits of the shared byte + 5 whole bytes + 4 high bits of the next byte
+  values[3] = static_cast<uint64_t>(*ptr++ & 0x1f) << 44;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[3] |= *ptr++ << 20;
+  values[3] |= *ptr++ << 12;
+  values[3] |= *ptr++ << 4;
+  values[3] |= *ptr >> 4;
+  // values[4]: 4 low bits of the shared byte + 5 whole bytes + 5 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++ & 0xf) << 45;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 37;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 29;
+  values[4] |= *ptr++ << 21;
+  values[4] |= *ptr++ << 13;
+  values[4] |= *ptr++ << 5;
+  values[4] |= *ptr >> 3;
+  // values[5]: 3 low bits of the shared byte + 5 whole bytes + 6 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 7) << 46;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[5] |= *ptr++ << 22;
+  values[5] |= *ptr++ << 14;
+  values[5] |= *ptr++ << 6;
+  values[5] |= *ptr >> 2;
+  // values[6]: 2 low bits of the shared byte + 5 whole bytes + 7 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 3) << 47;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 39;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 31;
+  values[6] |= *ptr++ << 23;
+  values[6] |= *ptr++ << 15;
+  values[6] |= *ptr++ << 7;
+  values[6] |= *ptr >> 1;
+  // values[7]: 1 low bit of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 1) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr;
+}
+
+static inline void unpack_bits_50(uint64_t* values, const uint8_t* ptr) { // unpacks eight 50-bit values, most significant bits first, from the 50 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 42; // values[0]: 6 whole bytes + 2 high bits of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[0] |= *ptr++ << 18; // uncast shifts here never exceed 22, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 10;
+  values[0] |= *ptr++ << 2;
+  values[0] |= *ptr >> 6; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 6 low bits of the shared byte + 5 whole bytes + 4 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 0x3f) << 44;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[1] |= *ptr++ << 20;
+  values[1] |= *ptr++ << 12;
+  values[1] |= *ptr++ << 4;
+  values[1] |= *ptr >> 4;
+  // values[2]: 4 low bits of the shared byte + 5 whole bytes + 6 high bits of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 0xf) << 46;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[2] |= *ptr++ << 22;
+  values[2] |= *ptr++ << 14;
+  values[2] |= *ptr++ << 6;
+  values[2] |= *ptr >> 2;
+  // values[3]: 2 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[3] = static_cast<uint64_t>(*ptr++ & 3) << 48;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[3] |= *ptr++ << 16;
+  values[3] |= *ptr++ << 8;
+  values[3] |= *ptr++;
+  // values[4]: starts on a byte boundary; the layout of values[0..3] repeats: 6 whole bytes + 2 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++) << 42;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[4] |= *ptr++ << 18;
+  values[4] |= *ptr++ << 10;
+  values[4] |= *ptr++ << 2;
+  values[4] |= *ptr >> 6;
+  // values[5]: 6 low bits of the shared byte + 5 whole bytes + 4 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 0x3f) << 44;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[5] |= *ptr++ << 20;
+  values[5] |= *ptr++ << 12;
+  values[5] |= *ptr++ << 4;
+  values[5] |= *ptr >> 4;
+  // values[6]: 4 low bits of the shared byte + 5 whole bytes + 6 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 0xf) << 46;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[6] |= *ptr++ << 22;
+  values[6] |= *ptr++ << 14;
+  values[6] |= *ptr++ << 6;
+  values[6] |= *ptr >> 2;
+  // values[7]: 2 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 3) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr;
+}
+
+static inline void unpack_bits_51(uint64_t* values, const uint8_t* ptr) { // unpacks eight 51-bit values, most significant bits first, from the 51 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 43; // values[0]: 6 whole bytes + 3 high bits of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 35;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 27;
+  values[0] |= *ptr++ << 19; // uncast shifts here never exceed 23, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 11;
+  values[0] |= *ptr++ << 3;
+  values[0] |= *ptr >> 5; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 5 low bits of the shared byte + 5 whole bytes + 6 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 0x1f) << 46;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[1] |= *ptr++ << 22;
+  values[1] |= *ptr++ << 14;
+  values[1] |= *ptr++ << 6;
+  values[1] |= *ptr >> 2;
+  // values[2]: 2 low bits of the shared byte + 6 whole bytes + 1 high bit of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 3) << 49;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 41;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 33;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 25;
+  values[2] |= *ptr++ << 17;
+  values[2] |= *ptr++ << 9;
+  values[2] |= *ptr++ << 1;
+  values[2] |= *ptr >> 7;
+  // values[3]: 7 low bits of the shared byte + 5 whole bytes + 4 high bits of the next byte
+  values[3] = static_cast<uint64_t>(*ptr++ & 0x7f) << 44;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[3] |= *ptr++ << 20;
+  values[3] |= *ptr++ << 12;
+  values[3] |= *ptr++ << 4;
+  values[3] |= *ptr >> 4;
+  // values[4]: 4 low bits of the shared byte + 5 whole bytes + 7 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++ & 0xf) << 47;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 39;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 31;
+  values[4] |= *ptr++ << 23;
+  values[4] |= *ptr++ << 15;
+  values[4] |= *ptr++ << 7;
+  values[4] |= *ptr >> 1;
+  // values[5]: 1 low bit of the shared byte + 6 whole bytes + 2 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 1) << 50;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[5] |= *ptr++ << 18;
+  values[5] |= *ptr++ << 10;
+  values[5] |= *ptr++ << 2;
+  values[5] |= *ptr >> 6;
+  // values[6]: 6 low bits of the shared byte + 5 whole bytes + 5 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 0x3f) << 45;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 37;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 29;
+  values[6] |= *ptr++ << 21;
+  values[6] |= *ptr++ << 13;
+  values[6] |= *ptr++ << 5;
+  values[6] |= *ptr >> 3;
+  // values[7]: 3 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 7) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr;
+}
+
+static inline void unpack_bits_52(uint64_t* values, const uint8_t* ptr) { // unpacks eight 52-bit values, most significant bits first, from the 52 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 44; // values[0]: 6 whole bytes + 4 high bits of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[0] |= *ptr++ << 20; // uncast shifts here never exceed 20, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 12;
+  values[0] |= *ptr++ << 4;
+  values[0] |= *ptr >> 4; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 4 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[1] = static_cast<uint64_t>(*ptr++ & 0xf) << 48;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[1] |= *ptr++ << 16;
+  values[1] |= *ptr++ << 8;
+  values[1] |= *ptr++;
+  // values[2]: starts on a byte boundary; the two-value layout repeats: 6 whole bytes + 4 high bits of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++) << 44;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[2] |= *ptr++ << 20;
+  values[2] |= *ptr++ << 12;
+  values[2] |= *ptr++ << 4;
+  values[2] |= *ptr >> 4;
+  // values[3]: 4 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[3] = static_cast<uint64_t>(*ptr++ & 0xf) << 48;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[3] |= *ptr++ << 16;
+  values[3] |= *ptr++ << 8;
+  values[3] |= *ptr++;
+  // values[4]: starts on a byte boundary: 6 whole bytes + 4 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++) << 44;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[4] |= *ptr++ << 20;
+  values[4] |= *ptr++ << 12;
+  values[4] |= *ptr++ << 4;
+  values[4] |= *ptr >> 4;
+  // values[5]: 4 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[5] = static_cast<uint64_t>(*ptr++ & 0xf) << 48;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[5] |= *ptr++ << 16;
+  values[5] |= *ptr++ << 8;
+  values[5] |= *ptr++;
+  // values[6]: starts on a byte boundary: 6 whole bytes + 4 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++) << 44;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[6] |= *ptr++ << 20;
+  values[6] |= *ptr++ << 12;
+  values[6] |= *ptr++ << 4;
+  values[6] |= *ptr >> 4;
+  // values[7]: 4 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 0xf) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr;
+}
+
+static inline void unpack_bits_53(uint64_t* values, const uint8_t* ptr) { // unpacks eight 53-bit values, most significant bits first, from the 53 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 45; // values[0]: 6 whole bytes + 5 high bits of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 37;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 29;
+  values[0] |= *ptr++ << 21; // uncast shifts here never exceed 22, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 13;
+  values[0] |= *ptr++ << 5;
+  values[0] |= *ptr >> 3; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 3 low bits of the shared byte + 6 whole bytes + 2 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 7) << 50;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[1] |= *ptr++ << 18;
+  values[1] |= *ptr++ << 10;
+  values[1] |= *ptr++ << 2;
+  values[1] |= *ptr >> 6;
+  // values[2]: 6 low bits of the shared byte + 5 whole bytes + 7 high bits of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 0x3f) << 47;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 39;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 31;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 23;
+  values[2] |= *ptr++ << 15;
+  values[2] |= *ptr++ << 7;
+  values[2] |= *ptr >> 1;
+  // values[3]: 1 low bit of the shared byte + 6 whole bytes + 4 high bits of the next byte
+  values[3] = static_cast<uint64_t>(*ptr++ & 1) << 52;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[3] |= *ptr++ << 20;
+  values[3] |= *ptr++ << 12;
+  values[3] |= *ptr++ << 4;
+  values[3] |= *ptr >> 4;
+  // values[4]: 4 low bits of the shared byte + 6 whole bytes + 1 high bit of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++ & 0xf) << 49;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 41;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 33;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 25;
+  values[4] |= *ptr++ << 17;
+  values[4] |= *ptr++ << 9;
+  values[4] |= *ptr++ << 1;
+  values[4] |= *ptr >> 7;
+  // values[5]: 7 low bits of the shared byte + 5 whole bytes + 6 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 0x7f) << 46;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[5] |= *ptr++ << 22;
+  values[5] |= *ptr++ << 14;
+  values[5] |= *ptr++ << 6;
+  values[5] |= *ptr >> 2;
+  // values[6]: 2 low bits of the shared byte + 6 whole bytes + 3 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 3) << 51;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 43;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 35;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 27;
+  values[6] |= *ptr++ << 19;
+  values[6] |= *ptr++ << 11;
+  values[6] |= *ptr++ << 3;
+  values[6] |= *ptr >> 5;
+  // values[7]: 5 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 0x1f) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr;
+}
+
+static inline void unpack_bits_54(uint64_t* values, const uint8_t* ptr) { // unpacks eight 54-bit values, most significant bits first, from the 54 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 46; // values[0]: 6 whole bytes + 6 high bits of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[0] |= *ptr++ << 22; // uncast shifts here never exceed 22, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 14;
+  values[0] |= *ptr++ << 6;
+  values[0] |= *ptr >> 2; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 2 low bits of the shared byte + 6 whole bytes + 4 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 3) << 52;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[1] |= *ptr++ << 20;
+  values[1] |= *ptr++ << 12;
+  values[1] |= *ptr++ << 4;
+  values[1] |= *ptr >> 4;
+  // values[2]: 4 low bits of the shared byte + 6 whole bytes + 2 high bits of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 0xf) << 50;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[2] |= *ptr++ << 18;
+  values[2] |= *ptr++ << 10;
+  values[2] |= *ptr++ << 2;
+  values[2] |= *ptr >> 6;
+  // values[3]: 6 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[3] = static_cast<uint64_t>(*ptr++ & 0x3f) << 48;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[3] |= *ptr++ << 16;
+  values[3] |= *ptr++ << 8;
+  values[3] |= *ptr++;
+  // values[4]: starts on a byte boundary; the layout of values[0..3] repeats: 6 whole bytes + 6 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++) << 46;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[4] |= *ptr++ << 22;
+  values[4] |= *ptr++ << 14;
+  values[4] |= *ptr++ << 6;
+  values[4] |= *ptr >> 2;
+  // values[5]: 2 low bits of the shared byte + 6 whole bytes + 4 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 3) << 52;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[5] |= *ptr++ << 20;
+  values[5] |= *ptr++ << 12;
+  values[5] |= *ptr++ << 4;
+  values[5] |= *ptr >> 4;
+  // values[6]: 4 low bits of the shared byte + 6 whole bytes + 2 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 0xf) << 50;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[6] |= *ptr++ << 18;
+  values[6] |= *ptr++ << 10;
+  values[6] |= *ptr++ << 2;
+  values[6] |= *ptr >> 6;
+  // values[7]: 6 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 0x3f) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr++; // this final increment only advances the by-value parameter; nothing reads ptr afterwards
+}
+
+static inline void unpack_bits_55(uint64_t* values, const uint8_t* ptr) { // unpacks eight 55-bit values, most significant bits first, from the 55 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 47; // values[0]: 6 whole bytes + 7 high bits of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 39;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 31;
+  values[0] |= *ptr++ << 23; // uncast shifts here never exceed 23, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 15;
+  values[0] |= *ptr++ << 7;
+  values[0] |= *ptr >> 1; // ptr is not advanced: the remaining low bit of this byte belongs to values[1]
+  // values[1]: 1 low bit of the shared byte + 6 whole bytes + 6 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 1) << 54;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[1] |= *ptr++ << 22;
+  values[1] |= *ptr++ << 14;
+  values[1] |= *ptr++ << 6;
+  values[1] |= *ptr >> 2;
+  // values[2]: 2 low bits of the shared byte + 6 whole bytes + 5 high bits of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 3) << 53;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 45;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 37;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 29;
+  values[2] |= *ptr++ << 21;
+  values[2] |= *ptr++ << 13;
+  values[2] |= *ptr++ << 5;
+  values[2] |= *ptr >> 3;
+  // values[3]: 3 low bits of the shared byte + 6 whole bytes + 4 high bits of the next byte
+  values[3] = static_cast<uint64_t>(*ptr++ & 7) << 52;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[3] |= *ptr++ << 20;
+  values[3] |= *ptr++ << 12;
+  values[3] |= *ptr++ << 4;
+  values[3] |= *ptr >> 4;
+  // values[4]: 4 low bits of the shared byte + 6 whole bytes + 3 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++ & 0xf) << 51;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 43;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 35;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 27;
+  values[4] |= *ptr++ << 19;
+  values[4] |= *ptr++ << 11;
+  values[4] |= *ptr++ << 3;
+  values[4] |= *ptr >> 5;
+  // values[5]: 5 low bits of the shared byte + 6 whole bytes + 2 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 0x1f) << 50;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[5] |= *ptr++ << 18;
+  values[5] |= *ptr++ << 10;
+  values[5] |= *ptr++ << 2;
+  values[5] |= *ptr >> 6;
+  // values[6]: 6 low bits of the shared byte + 6 whole bytes + 1 high bit of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 0x3f) << 49;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 41;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 33;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 25;
+  values[6] |= *ptr++ << 17;
+  values[6] |= *ptr++ << 9;
+  values[6] |= *ptr++ << 1;
+  values[6] |= *ptr >> 7;
+  // values[7]: 7 low bits of the shared byte + 6 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 0x7f) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr;
+}
+
+static inline void unpack_bits_56(uint64_t* values, const uint8_t* ptr) { // unpacks eight 56-bit values, most significant byte first, from the 56 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 48; // every value is exactly 7 whole bytes, so no byte is shared between values
+  values[0] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[0] |= *ptr++ << 16; // uncast shifts here never exceed 16, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 8;
+  values[0] |= *ptr++;
+  values[1] = static_cast<uint64_t>(*ptr++) << 48;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[1] |= *ptr++ << 16;
+  values[1] |= *ptr++ << 8;
+  values[1] |= *ptr++;
+  values[2] = static_cast<uint64_t>(*ptr++) << 48;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[2] |= *ptr++ << 16;
+  values[2] |= *ptr++ << 8;
+  values[2] |= *ptr++;
+  values[3] = static_cast<uint64_t>(*ptr++) << 48;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[3] |= *ptr++ << 16;
+  values[3] |= *ptr++ << 8;
+  values[3] |= *ptr++;
+  values[4] = static_cast<uint64_t>(*ptr++) << 48;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[4] |= *ptr++ << 16;
+  values[4] |= *ptr++ << 8;
+  values[4] |= *ptr++;
+  values[5] = static_cast<uint64_t>(*ptr++) << 48;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[5] |= *ptr++ << 16;
+  values[5] |= *ptr++ << 8;
+  values[5] |= *ptr++;
+  values[6] = static_cast<uint64_t>(*ptr++) << 48;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[6] |= *ptr++ << 16;
+  values[6] |= *ptr++ << 8;
+  values[6] |= *ptr++;
+  values[7] = static_cast<uint64_t>(*ptr++) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr; // 56th and last byte read
+}
+
+static inline void unpack_bits_57(uint64_t* values, const uint8_t* ptr) { // unpacks eight 57-bit values, most significant bits first, from the 57 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 49; // values[0]: 7 whole bytes + 1 high bit of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 41;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 33;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 25;
+  values[0] |= *ptr++ << 17; // uncast shifts here never exceed 23, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 9;
+  values[0] |= *ptr++ << 1;
+  values[0] |= *ptr >> 7; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 7 low bits of the shared byte + 6 whole bytes + 2 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 0x7f) << 50;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[1] |= *ptr++ << 18;
+  values[1] |= *ptr++ << 10;
+  values[1] |= *ptr++ << 2;
+  values[1] |= *ptr >> 6;
+  // values[2]: 6 low bits of the shared byte + 6 whole bytes + 3 high bits of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 0x3f) << 51;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 43;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 35;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 27;
+  values[2] |= *ptr++ << 19;
+  values[2] |= *ptr++ << 11;
+  values[2] |= *ptr++ << 3;
+  values[2] |= *ptr >> 5;
+  // values[3]: 5 low bits of the shared byte + 6 whole bytes + 4 high bits of the next byte
+  values[3] = static_cast<uint64_t>(*ptr++ & 0x1f) << 52;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[3] |= *ptr++ << 20;
+  values[3] |= *ptr++ << 12;
+  values[3] |= *ptr++ << 4;
+  values[3] |= *ptr >> 4;
+  // values[4]: 4 low bits of the shared byte + 6 whole bytes + 5 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++ & 0xf) << 53;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 45;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 37;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 29;
+  values[4] |= *ptr++ << 21;
+  values[4] |= *ptr++ << 13;
+  values[4] |= *ptr++ << 5;
+  values[4] |= *ptr >> 3;
+  // values[5]: 3 low bits of the shared byte + 6 whole bytes + 6 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 7) << 54;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[5] |= *ptr++ << 22;
+  values[5] |= *ptr++ << 14;
+  values[5] |= *ptr++ << 6;
+  values[5] |= *ptr >> 2;
+  // values[6]: 2 low bits of the shared byte + 6 whole bytes + 7 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 3) << 55;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 47;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 39;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 31;
+  values[6] |= *ptr++ << 23;
+  values[6] |= *ptr++ << 15;
+  values[6] |= *ptr++ << 7;
+  values[6] |= *ptr >> 1;
+  // values[7]: 1 low bit of the shared byte + 7 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 1) << 56;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr;
+}
+
+static inline void unpack_bits_58(uint64_t* values, const uint8_t* ptr) { // unpacks eight 58-bit values, most significant bits first, from the 58 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 50; // values[0]: 7 whole bytes + 2 high bits of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[0] |= *ptr++ << 18; // uncast shifts here never exceed 22, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 10;
+  values[0] |= *ptr++ << 2;
+  values[0] |= *ptr >> 6; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 6 low bits of the shared byte + 6 whole bytes + 4 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 0x3f) << 52;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[1] |= *ptr++ << 20;
+  values[1] |= *ptr++ << 12;
+  values[1] |= *ptr++ << 4;
+  values[1] |= *ptr >> 4;
+  // values[2]: 4 low bits of the shared byte + 6 whole bytes + 6 high bits of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 0xf) << 54;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[2] |= *ptr++ << 22;
+  values[2] |= *ptr++ << 14;
+  values[2] |= *ptr++ << 6;
+  values[2] |= *ptr >> 2;
+  // values[3]: 2 low bits of the shared byte + 7 whole bytes (ends on a byte boundary)
+  values[3] = static_cast<uint64_t>(*ptr++ & 3) << 56;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[3] |= *ptr++ << 16;
+  values[3] |= *ptr++ << 8;
+  values[3] |= *ptr++;
+  // values[4]: starts on a byte boundary; the layout of values[0..3] repeats: 7 whole bytes + 2 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++) << 50;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[4] |= *ptr++ << 18;
+  values[4] |= *ptr++ << 10;
+  values[4] |= *ptr++ << 2;
+  values[4] |= *ptr >> 6;
+  // values[5]: 6 low bits of the shared byte + 6 whole bytes + 4 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 0x3f) << 52;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[5] |= *ptr++ << 20;
+  values[5] |= *ptr++ << 12;
+  values[5] |= *ptr++ << 4;
+  values[5] |= *ptr >> 4;
+  // values[6]: 4 low bits of the shared byte + 6 whole bytes + 6 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 0xf) << 54;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[6] |= *ptr++ << 22;
+  values[6] |= *ptr++ << 14;
+  values[6] |= *ptr++ << 6;
+  values[6] |= *ptr >> 2;
+  // values[7]: 2 low bits of the shared byte + 7 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 3) << 56;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr++; // this final increment only advances the by-value parameter; nothing reads ptr afterwards
+}
+
+static inline void unpack_bits_59(uint64_t* values, const uint8_t* ptr) { // unpacks eight 59-bit values, most significant bits first, from the 59 bytes at ptr into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 51; // values[0]: 7 whole bytes + 3 high bits of the next byte
+  values[0] |= static_cast<uint64_t>(*ptr++) << 43;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 35;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 27;
+  values[0] |= *ptr++ << 19; // uncast shifts here never exceed 23, so the int-promoted byte cannot overflow (assuming int is at least 32 bits)
+  values[0] |= *ptr++ << 11;
+  values[0] |= *ptr++ << 3;
+  values[0] |= *ptr >> 5; // ptr is not advanced: the remaining low bits of this byte belong to values[1]
+  // values[1]: 5 low bits of the shared byte + 6 whole bytes + 6 high bits of the next byte
+  values[1] = static_cast<uint64_t>(*ptr++ & 0x1f) << 54;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[1] |= *ptr++ << 22;
+  values[1] |= *ptr++ << 14;
+  values[1] |= *ptr++ << 6;
+  values[1] |= *ptr >> 2;
+  // values[2]: 2 low bits of the shared byte + 7 whole bytes + 1 high bit of the next byte
+  values[2] = static_cast<uint64_t>(*ptr++ & 3) << 57;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 49;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 41;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 33;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 25;
+  values[2] |= *ptr++ << 17;
+  values[2] |= *ptr++ << 9;
+  values[2] |= *ptr++ << 1;
+  values[2] |= *ptr >> 7;
+  // values[3]: 7 low bits of the shared byte + 6 whole bytes + 4 high bits of the next byte
+  values[3] = static_cast<uint64_t>(*ptr++ & 0x7f) << 52;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[3] |= *ptr++ << 20;
+  values[3] |= *ptr++ << 12;
+  values[3] |= *ptr++ << 4;
+  values[3] |= *ptr >> 4;
+  // values[4]: 4 low bits of the shared byte + 6 whole bytes + 7 high bits of the next byte
+  values[4] = static_cast<uint64_t>(*ptr++ & 0xf) << 55;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 47;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 39;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 31;
+  values[4] |= *ptr++ << 23;
+  values[4] |= *ptr++ << 15;
+  values[4] |= *ptr++ << 7;
+  values[4] |= *ptr >> 1;
+  // values[5]: 1 low bit of the shared byte + 7 whole bytes + 2 high bits of the next byte
+  values[5] = static_cast<uint64_t>(*ptr++ & 1) << 58;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 50;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[5] |= *ptr++ << 18;
+  values[5] |= *ptr++ << 10;
+  values[5] |= *ptr++ << 2;
+  values[5] |= *ptr >> 6;
+  // values[6]: 6 low bits of the shared byte + 6 whole bytes + 5 high bits of the next byte
+  values[6] = static_cast<uint64_t>(*ptr++ & 0x3f) << 53;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 45;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 37;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 29;
+  values[6] |= *ptr++ << 21;
+  values[6] |= *ptr++ << 13;
+  values[6] |= *ptr++ << 5;
+  values[6] |= *ptr >> 3;
+  // values[7]: 3 low bits of the shared byte + 7 whole bytes (ends on a byte boundary)
+  values[7] = static_cast<uint64_t>(*ptr++ & 7) << 56;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr;
+}
+
+static inline void unpack_bits_60(uint64_t* values, const uint8_t* ptr) { // unpacks 8 values of 60 bits each from the 60 bytes at ptr (most significant byte first) into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 52;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[0] |= *ptr++ << 20; // no cast here: the byte promotes to int, and a shift below 24 keeps the result within a 32-bit int
+  values[0] |= *ptr++ << 12;
+  values[0] |= *ptr++ << 4;
+  values[0] |= *ptr >> 4; // high 4 bits of a shared byte; ptr is not advanced because the next value takes the low 4 bits
+
+  values[1] = static_cast<uint64_t>(*ptr++ & 0xf) << 56; // low 4 bits of the shared byte become the top bits of this value
+  values[1] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[1] |= *ptr++ << 16;
+  values[1] |= *ptr++ << 8;
+  values[1] |= *ptr++; // this value ends on a byte boundary (15 bytes consumed per pair of values)
+
+  values[2] = static_cast<uint64_t>(*ptr++) << 52; // the same 15-byte pattern repeats for each following pair
+  values[2] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[2] |= *ptr++ << 20;
+  values[2] |= *ptr++ << 12;
+  values[2] |= *ptr++ << 4;
+  values[2] |= *ptr >> 4;
+
+  values[3] = static_cast<uint64_t>(*ptr++ & 0xf) << 56;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[3] |= *ptr++ << 16;
+  values[3] |= *ptr++ << 8;
+  values[3] |= *ptr++;
+
+  values[4] = static_cast<uint64_t>(*ptr++) << 52;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[4] |= *ptr++ << 20;
+  values[4] |= *ptr++ << 12;
+  values[4] |= *ptr++ << 4;
+  values[4] |= *ptr >> 4;
+
+  values[5] = static_cast<uint64_t>(*ptr++ & 0xf) << 56;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[5] |= *ptr++ << 16;
+  values[5] |= *ptr++ << 8;
+  values[5] |= *ptr++;
+
+  values[6] = static_cast<uint64_t>(*ptr++) << 52;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[6] |= *ptr++ << 20;
+  values[6] |= *ptr++ << 12;
+  values[6] |= *ptr++ << 4;
+  values[6] |= *ptr >> 4;
+
+  values[7] = static_cast<uint64_t>(*ptr++ & 0xf) << 56;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr; // 60th and last input byte; the local ptr is not advanced past it
+}
+
+static inline void unpack_bits_61(uint64_t* values, const uint8_t* ptr) { // unpacks 8 values of 61 bits each from the 61 bytes at ptr (most significant byte first) into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 53;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 45;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 37;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 29;
+  values[0] |= *ptr++ << 21; // no cast here: the byte promotes to int, and a shift below 24 keeps the result within a 32-bit int
+  values[0] |= *ptr++ << 13;
+  values[0] |= *ptr++ << 5;
+  values[0] |= *ptr >> 3; // high 5 bits of a shared byte; ptr is not advanced because the next value takes the low 3 bits
+
+  values[1] = static_cast<uint64_t>(*ptr++ & 7) << 58; // low 3 bits of the shared byte become the top bits of this value
+  values[1] |= static_cast<uint64_t>(*ptr++) << 50;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[1] |= *ptr++ << 18;
+  values[1] |= *ptr++ << 10;
+  values[1] |= *ptr++ << 2;
+  values[1] |= *ptr >> 6;
+
+  values[2] = static_cast<uint64_t>(*ptr++ & 0x3f) << 55;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 47;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 39;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 31;
+  values[2] |= *ptr++ << 23;
+  values[2] |= *ptr++ << 15;
+  values[2] |= *ptr++ << 7;
+  values[2] |= *ptr >> 1;
+
+  values[3] = static_cast<uint64_t>(*ptr++ & 1) << 60;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 52;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[3] |= *ptr++ << 20;
+  values[3] |= *ptr++ << 12;
+  values[3] |= *ptr++ << 4;
+  values[3] |= *ptr >> 4;
+
+  values[4] = static_cast<uint64_t>(*ptr++ & 0xf) << 57;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 49;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 41;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 33;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 25;
+  values[4] |= *ptr++ << 17;
+  values[4] |= *ptr++ << 9;
+  values[4] |= *ptr++ << 1;
+  values[4] |= *ptr >> 7;
+
+  values[5] = static_cast<uint64_t>(*ptr++ & 0x7f) << 54;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[5] |= *ptr++ << 22;
+  values[5] |= *ptr++ << 14;
+  values[5] |= *ptr++ << 6;
+  values[5] |= *ptr >> 2;
+
+  values[6] = static_cast<uint64_t>(*ptr++ & 3) << 59;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 51;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 43;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 35;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 27;
+  values[6] |= *ptr++ << 19;
+  values[6] |= *ptr++ << 11;
+  values[6] |= *ptr++ << 3;
+  values[6] |= *ptr >> 5;
+
+  values[7] = static_cast<uint64_t>(*ptr++ & 0x1f) << 56;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr; // 61st and last input byte; the local ptr is not advanced past it
+}
+
+static inline void unpack_bits_62(uint64_t* values, const uint8_t* ptr) { // unpacks 8 values of 62 bits each from the 62 bytes at ptr (most significant byte first) into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 54;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[0] |= *ptr++ << 22; // no cast here: the byte promotes to int, and a shift below 24 keeps the result within a 32-bit int
+  values[0] |= *ptr++ << 14;
+  values[0] |= *ptr++ << 6;
+  values[0] |= *ptr >> 2; // high 6 bits of a shared byte; ptr is not advanced because the next value takes the low 2 bits
+
+  values[1] = static_cast<uint64_t>(*ptr++ & 3) << 60; // low 2 bits of the shared byte become the top bits of this value
+  values[1] |= static_cast<uint64_t>(*ptr++) << 52;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[1] |= *ptr++ << 20;
+  values[1] |= *ptr++ << 12;
+  values[1] |= *ptr++ << 4;
+  values[1] |= *ptr >> 4;
+
+  values[2] = static_cast<uint64_t>(*ptr++ & 0xf) << 58;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 50;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[2] |= *ptr++ << 18;
+  values[2] |= *ptr++ << 10;
+  values[2] |= *ptr++ << 2;
+  values[2] |= *ptr >> 6;
+
+  values[3] = static_cast<uint64_t>(*ptr++ & 0x3f) << 56;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[3] |= *ptr++ << 16;
+  values[3] |= *ptr++ << 8;
+  values[3] |= *ptr++; // this value ends on a byte boundary (31 bytes consumed per 4 values)
+
+  values[4] = static_cast<uint64_t>(*ptr++) << 54; // the same 31-byte pattern repeats for the second group of 4 values
+  values[4] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[4] |= *ptr++ << 22;
+  values[4] |= *ptr++ << 14;
+  values[4] |= *ptr++ << 6;
+  values[4] |= *ptr >> 2;
+
+  values[5] = static_cast<uint64_t>(*ptr++ & 3) << 60;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 52;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[5] |= *ptr++ << 20;
+  values[5] |= *ptr++ << 12;
+  values[5] |= *ptr++ << 4;
+  values[5] |= *ptr >> 4;
+
+  values[6] = static_cast<uint64_t>(*ptr++ & 0xf) << 58;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 50;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[6] |= *ptr++ << 18;
+  values[6] |= *ptr++ << 10;
+  values[6] |= *ptr++ << 2;
+  values[6] |= *ptr >> 6;
+
+  values[7] = static_cast<uint64_t>(*ptr++ & 0x3f) << 56;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr; // 62nd and last input byte; the local ptr is not advanced past it
+}
+
+static inline void unpack_bits_63(uint64_t* values, const uint8_t* ptr) { // unpacks 8 values of 63 bits each from the 63 bytes at ptr (most significant byte first) into values[0..7]
+  values[0] = static_cast<uint64_t>(*ptr++) << 55;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 47;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 39;
+  values[0] |= static_cast<uint64_t>(*ptr++) << 31;
+  values[0] |= *ptr++ << 23; // no cast here: the byte promotes to int, and 0xff << 23 still fits in a 32-bit int
+  values[0] |= *ptr++ << 15;
+  values[0] |= *ptr++ << 7;
+  values[0] |= *ptr >> 1; // high 7 bits of a shared byte; ptr is not advanced because the next value takes the low bit
+
+  values[1] = static_cast<uint64_t>(*ptr++ & 1) << 62; // low bit of the shared byte becomes the top bit of this value
+  values[1] |= static_cast<uint64_t>(*ptr++) << 54;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 46;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 38;
+  values[1] |= static_cast<uint64_t>(*ptr++) << 30;
+  values[1] |= *ptr++ << 22;
+  values[1] |= *ptr++ << 14;
+  values[1] |= *ptr++ << 6;
+  values[1] |= *ptr >> 2;
+
+  values[2] = static_cast<uint64_t>(*ptr++ & 3) << 61;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 53;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 45;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 37;
+  values[2] |= static_cast<uint64_t>(*ptr++) << 29;
+  values[2] |= *ptr++ << 21;
+  values[2] |= *ptr++ << 13;
+  values[2] |= *ptr++ << 5;
+  values[2] |= *ptr >> 3;
+
+  values[3] = static_cast<uint64_t>(*ptr++ & 7) << 60;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 52;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 44;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 36;
+  values[3] |= static_cast<uint64_t>(*ptr++) << 28;
+  values[3] |= *ptr++ << 20;
+  values[3] |= *ptr++ << 12;
+  values[3] |= *ptr++ << 4;
+  values[3] |= *ptr >> 4;
+
+  values[4] = static_cast<uint64_t>(*ptr++ & 0xf) << 59;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 51;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 43;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 35;
+  values[4] |= static_cast<uint64_t>(*ptr++) << 27;
+  values[4] |= *ptr++ << 19;
+  values[4] |= *ptr++ << 11;
+  values[4] |= *ptr++ << 3;
+  values[4] |= *ptr >> 5;
+
+  values[5] = static_cast<uint64_t>(*ptr++ & 0x1f) << 58;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 50;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 42;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 34;
+  values[5] |= static_cast<uint64_t>(*ptr++) << 26;
+  values[5] |= *ptr++ << 18;
+  values[5] |= *ptr++ << 10;
+  values[5] |= *ptr++ << 2;
+  values[5] |= *ptr >> 6;
+
+  values[6] = static_cast<uint64_t>(*ptr++ & 0x3f) << 57;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 49;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 41;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 33;
+  values[6] |= static_cast<uint64_t>(*ptr++) << 25;
+  values[6] |= *ptr++ << 17;
+  values[6] |= *ptr++ << 9;
+  values[6] |= *ptr++ << 1;
+  values[6] |= *ptr >> 7;
+
+  values[7] = static_cast<uint64_t>(*ptr++ & 0x7f) << 56;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 48;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 40;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 32;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
+  values[7] |= *ptr++ << 16;
+  values[7] |= *ptr++ << 8;
+  values[7] |= *ptr; // 63rd and last input byte; the local ptr is not advanced past it
+}
+
+static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_t bits) { // forwards values and ptr to the fixed-width packer pack_bits_<bits>; presumably 8 values per call, mirroring the unpackers - confirm against pack_bits_N
+  switch (bits) { // one case per supported width, 1 through 63
+    case 1: pack_bits_1(values, ptr); break;
+    case 2: pack_bits_2(values, ptr); break;
+    case 3: pack_bits_3(values, ptr); break;
+    case 4: pack_bits_4(values, ptr); break;
+    case 5: pack_bits_5(values, ptr); break;
+    case 6: pack_bits_6(values, ptr); break;
+    case 7: pack_bits_7(values, ptr); break;
+    case 8: pack_bits_8(values, ptr); break;
+    case 9: pack_bits_9(values, ptr); break;
+    case 10: pack_bits_10(values, ptr); break;
+    case 11: pack_bits_11(values, ptr); break;
+    case 12: pack_bits_12(values, ptr); break;
+    case 13: pack_bits_13(values, ptr); break;
+    case 14: pack_bits_14(values, ptr); break;
+    case 15: pack_bits_15(values, ptr); break;
+    case 16: pack_bits_16(values, ptr); break;
+    case 17: pack_bits_17(values, ptr); break;
+    case 18: pack_bits_18(values, ptr); break;
+    case 19: pack_bits_19(values, ptr); break;
+    case 20: pack_bits_20(values, ptr); break;
+    case 21: pack_bits_21(values, ptr); break;
+    case 22: pack_bits_22(values, ptr); break;
+    case 23: pack_bits_23(values, ptr); break;
+    case 24: pack_bits_24(values, ptr); break;
+    case 25: pack_bits_25(values, ptr); break;
+    case 26: pack_bits_26(values, ptr); break;
+    case 27: pack_bits_27(values, ptr); break;
+    case 28: pack_bits_28(values, ptr); break;
+    case 29: pack_bits_29(values, ptr); break;
+    case 30: pack_bits_30(values, ptr); break;
+    case 31: pack_bits_31(values, ptr); break;
+    case 32: pack_bits_32(values, ptr); break;
+    case 33: pack_bits_33(values, ptr); break;
+    case 34: pack_bits_34(values, ptr); break;
+    case 35: pack_bits_35(values, ptr); break;
+    case 36: pack_bits_36(values, ptr); break;
+    case 37: pack_bits_37(values, ptr); break;
+    case 38: pack_bits_38(values, ptr); break;
+    case 39: pack_bits_39(values, ptr); break;
+    case 40: pack_bits_40(values, ptr); break;
+    case 41: pack_bits_41(values, ptr); break;
+    case 42: pack_bits_42(values, ptr); break;
+    case 43: pack_bits_43(values, ptr); break;
+    case 44: pack_bits_44(values, ptr); break;
+    case 45: pack_bits_45(values, ptr); break;
+    case 46: pack_bits_46(values, ptr); break;
+    case 47: pack_bits_47(values, ptr); break;
+    case 48: pack_bits_48(values, ptr); break;
+    case 49: pack_bits_49(values, ptr); break;
+    case 50: pack_bits_50(values, ptr); break;
+    case 51: pack_bits_51(values, ptr); break;
+    case 52: pack_bits_52(values, ptr); break;
+    case 53: pack_bits_53(values, ptr); break;
+    case 54: pack_bits_54(values, ptr); break;
+    case 55: pack_bits_55(values, ptr); break;
+    case 56: pack_bits_56(values, ptr); break;
+    case 57: pack_bits_57(values, ptr); break;
+    case 58: pack_bits_58(values, ptr); break;
+    case 59: pack_bits_59(values, ptr); break;
+    case 60: pack_bits_60(values, ptr); break;
+    case 61: pack_bits_61(values, ptr); break;
+    case 62: pack_bits_62(values, ptr); break;
+    case 63: pack_bits_63(values, ptr); break;
+    default: throw std::logic_error("wrong number of bits " + std::to_string(bits)); // 0 and every width above 63 (including 64) are rejected
+  }
+}
+
+static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint8_t bits) { // forwards values and ptr to the fixed-width unpacker unpack_bits_<bits>, which fills values[0..7] from the packed bytes at ptr
+  switch (bits) { // one case per supported width, 1 through 63
+    case 1: unpack_bits_1(values, ptr); break;
+    case 2: unpack_bits_2(values, ptr); break;
+    case 3: unpack_bits_3(values, ptr); break;
+    case 4: unpack_bits_4(values, ptr); break;
+    case 5: unpack_bits_5(values, ptr); break;
+    case 6: unpack_bits_6(values, ptr); break;
+    case 7: unpack_bits_7(values, ptr); break;
+    case 8: unpack_bits_8(values, ptr); break;
+    case 9: unpack_bits_9(values, ptr); break;
+    case 10: unpack_bits_10(values, ptr); break;
+    case 11: unpack_bits_11(values, ptr); break;
+    case 12: unpack_bits_12(values, ptr); break;
+    case 13: unpack_bits_13(values, ptr); break;
+    case 14: unpack_bits_14(values, ptr); break;
+    case 15: unpack_bits_15(values, ptr); break;
+    case 16: unpack_bits_16(values, ptr); break;
+    case 17: unpack_bits_17(values, ptr); break;
+    case 18: unpack_bits_18(values, ptr); break;
+    case 19: unpack_bits_19(values, ptr); break;
+    case 20: unpack_bits_20(values, ptr); break;
+    case 21: unpack_bits_21(values, ptr); break;
+    case 22: unpack_bits_22(values, ptr); break;
+    case 23: unpack_bits_23(values, ptr); break;
+    case 24: unpack_bits_24(values, ptr); break;
+    case 25: unpack_bits_25(values, ptr); break;
+    case 26: unpack_bits_26(values, ptr); break;
+    case 27: unpack_bits_27(values, ptr); break;
+    case 28: unpack_bits_28(values, ptr); break;
+    case 29: unpack_bits_29(values, ptr); break;
+    case 30: unpack_bits_30(values, ptr); break;
+    case 31: unpack_bits_31(values, ptr); break;
+    case 32: unpack_bits_32(values, ptr); break;
+    case 33: unpack_bits_33(values, ptr); break;
+    case 34: unpack_bits_34(values, ptr); break;
+    case 35: unpack_bits_35(values, ptr); break;
+    case 36: unpack_bits_36(values, ptr); break;
+    case 37: unpack_bits_37(values, ptr); break;
+    case 38: unpack_bits_38(values, ptr); break;
+    case 39: unpack_bits_39(values, ptr); break;
+    case 40: unpack_bits_40(values, ptr); break;
+    case 41: unpack_bits_41(values, ptr); break;
+    case 42: unpack_bits_42(values, ptr); break;
+    case 43: unpack_bits_43(values, ptr); break;
+    case 44: unpack_bits_44(values, ptr); break;
+    case 45: unpack_bits_45(values, ptr); break;
+    case 46: unpack_bits_46(values, ptr); break;
+    case 47: unpack_bits_47(values, ptr); break;
+    case 48: unpack_bits_48(values, ptr); break;
+    case 49: unpack_bits_49(values, ptr); break;
+    case 50: unpack_bits_50(values, ptr); break;
+    case 51: unpack_bits_51(values, ptr); break;
+    case 52: unpack_bits_52(values, ptr); break;
+    case 53: unpack_bits_53(values, ptr); break;
+    case 54: unpack_bits_54(values, ptr); break;
+    case 55: unpack_bits_55(values, ptr); break;
+    case 56: unpack_bits_56(values, ptr); break;
+    case 57: unpack_bits_57(values, ptr); break;
+    case 58: unpack_bits_58(values, ptr); break;
+    case 59: unpack_bits_59(values, ptr); break;
+    case 60: unpack_bits_60(values, ptr); break;
+    case 61: unpack_bits_61(values, ptr); break;
+    case 62: unpack_bits_62(values, ptr); break;
+    case 63: unpack_bits_63(values, ptr); break;
+    default: throw std::logic_error("wrong number of bits " + std::to_string(bits)); // 0 and every width above 63 (including 64) are rejected
+  }
+}
+
+} // namespace
+
+#endif // BIT_PACKING_HPP_
diff --git a/theta/include/compact_theta_sketch_parser.hpp b/theta/include/compact_theta_sketch_parser.hpp
index a06924e..8a88324 100644
--- a/theta/include/compact_theta_sketch_parser.hpp
+++ b/theta/include/compact_theta_sketch_parser.hpp
@@ -20,7 +20,7 @@
 #ifndef COMPACT_THETA_SKETCH_PARSER_HPP_
 #define COMPACT_THETA_SKETCH_PARSER_HPP_
 
-#include <stdint.h>
+#include <cstdint>
 
 namespace datasketches {
 
@@ -33,7 +33,8 @@ public:
     uint16_t seed_hash;
     uint32_t num_entries;
     uint64_t theta;
-    const uint64_t* entries;
+    const void* entries_start_ptr;
+    uint8_t entry_bits;
   };
 
   static compact_theta_sketch_data parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error = false);
@@ -45,18 +46,24 @@ private:
   static const size_t COMPACT_SKETCH_TYPE_BYTE = 2;
   static const size_t COMPACT_SKETCH_FLAGS_BYTE = 5;
   static const size_t COMPACT_SKETCH_SEED_HASH_U16 = 3;
-  static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2;
-  static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1;
-  static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2;
-  static const size_t COMPACT_SKETCH_THETA_U64 = 2;
-  static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3;
+  static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1; // ver 3
+  static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2; // ver 1-3
+  static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2; // ver 1-3
+  static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3; // ver 1-3
+  static const size_t COMPACT_SKETCH_THETA_U64 = 2; // ver 1-3
+  static const size_t COMPACT_SKETCH_V4_MIN_ENTRY_ZEROS_BYTE = 3;
+  static const size_t COMPACT_SKETCH_V4_THETA_U64 = 1;
+  static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_EXACT_U32 = 2;
+  static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_ESTIMATION_U32 = 4;
+  static const size_t COMPACT_SKETCH_V4_PACKED_DATA_EXACT_U8 = 12;
+  static const size_t COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_U8 = 20;
 
   static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
   static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
 
-  static const uint8_t COMPACT_SKETCH_SERIAL_VERSION = 3;
   static const uint8_t COMPACT_SKETCH_TYPE = 3;
 
+  static void check_memory_size(const void* ptr, size_t actual_bytes, size_t expected_bytes, bool dump_on_error);
   static std::string hex_dump(const uint8_t* ptr, size_t size);
 };
 
diff --git a/theta/include/compact_theta_sketch_parser_impl.hpp b/theta/include/compact_theta_sketch_parser_impl.hpp
index 4b653b8..115cdda 100644
--- a/theta/include/compact_theta_sketch_parser_impl.hpp
+++ b/theta/include/compact_theta_sketch_parser_impl.hpp
@@ -28,104 +28,107 @@ namespace datasketches {
 
 template<bool dummy>
 auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
-  if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + std::to_string(size)
-      + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
-
+  check_memory_size(ptr, size, 8, dump_on_error); // the type and serial-version bytes read next sit inside the first 8 bytes
+  checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
   uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
-
   switch(serial_version) {
-  case COMPACT_SKETCH_SERIAL_VERSION: {
-      checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+  case 4: {
+    // version 4 sketches are ordered and always have entries (single item in exact mode is v3)
+    const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
+    checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+    const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 1;
+    uint64_t theta = theta_constants::MAX_THETA;
+    if (has_theta) {
+      check_memory_size(ptr, size, 16, dump_on_error);
+      theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_V4_THETA_U64];
+    }
+    const size_t num_entries_index = has_theta ? COMPACT_SKETCH_V4_NUM_ENTRIES_ESTIMATION_U32 : COMPACT_SKETCH_V4_NUM_ENTRIES_EXACT_U32;
+    check_memory_size(ptr, size, (num_entries_index + 1) * sizeof(uint32_t), dump_on_error);
+    const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[num_entries_index];
+    const size_t entries_offset_bytes = has_theta ? COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_U8 : COMPACT_SKETCH_V4_PACKED_DATA_EXACT_U8;
+    const uint8_t min_entry_zeros = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_MIN_ENTRY_ZEROS_BYTE]; // NOTE(review): values above 64 are not rejected explicitly - confirm whether a dedicated check is wanted
+    const size_t expected_bits = static_cast<size_t>(64 - min_entry_zeros) * num_entries; // widen before multiplying: int * uint32_t is evaluated in 32 bits and would wrap at 2^32 bits, defeating the size check below
+    const size_t expected_size_bytes = entries_offset_bytes + std::ceil(expected_bits / 8.0); // packed entries are rounded up to whole bytes
+    check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
+    return {false, true, seed_hash, num_entries, theta,
+      reinterpret_cast<const uint8_t*>(ptr) + entries_offset_bytes, static_cast<uint8_t>(64 - min_entry_zeros)};
+  }
+  case 3: {
       uint64_t theta = theta_constants::MAX_THETA;
       const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
       if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
-        return {true, true, seed_hash, 0, theta, nullptr};
+        return {true, true, seed_hash, 0, theta, nullptr, 64};
       }
       checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
       const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
       if (has_theta) {
-        if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
+        check_memory_size(ptr, size, (COMPACT_SKETCH_THETA_U64 + 1) * sizeof(uint64_t), dump_on_error);
         theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
       }
       if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
-        if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
-        return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
+        check_memory_size(ptr, size, 16, dump_on_error);
+        return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64, 64};
       }
       const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
       const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
       const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
       const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
-      if (size < expected_size_bytes) {
-        throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
-            + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
-      }
+      check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
       const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
-      return {false, is_ordered, seed_hash, num_entries, theta, entries};
+      return {false, is_ordered, seed_hash, num_entries, theta, entries, 64};
   }
   case 1:  {
       uint16_t seed_hash = compute_seed_hash(seed);
-      checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
       const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
       uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
       bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
-      if (is_empty) {
-          return {true, true, seed_hash, 0, theta, nullptr};
-      }
+      if (is_empty) return {true, true, seed_hash, 0, theta, nullptr, 64};
       const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
       const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
-      if (size < expected_size_bytes) {
-        throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
-            + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
-      }
-      return {false, true, seed_hash, num_entries, theta, entries};
+      check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
+      return {false, true, seed_hash, num_entries, theta, entries, 64};
   }
   case 2:  {
-      uint8_t preamble_size =  reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
-      checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+      uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
       const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
       checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
       if (preamble_size == 1) {
-          return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
+          return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr, 64};
       } else if (preamble_size == 2) {
           const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
           if (num_entries == 0) {
-              return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
+              return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr, 64};
           } else {
               const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
-              if (size < expected_size_bytes) {
-                  throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
-                      + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
-              }
+              check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
               const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
-              return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
+              return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries, 64};
           }
       } else if (preamble_size == 3) {
           const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
           uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
           bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
-          if (is_empty) {
-              return {true, true, seed_hash, 0, theta, nullptr};
-          }
+          if (is_empty) return {true, true, seed_hash, 0, theta, nullptr, 64};
           const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
           const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
-          if (size < expected_size_bytes) {
-            throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
-                + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
-          }
-          return {false, true, seed_hash, num_entries, theta, entries};
+          check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
+          return {false, true, seed_hash, num_entries, theta, entries, 64};
       } else {
           throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
       }
   }
   default:
-      // this should always fail since the valid cases are handled above
-      checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
-      // this throw is never reached, because check_serial_version will throw an informative exception.
-      // This is only here to avoid a compiler warning about a path without a return value.
-      throw std::invalid_argument("unexpected sketch serialization version");
+    throw std::invalid_argument("unsupported serial version " + std::to_string(serial_version)); // versions other than 1-4 are rejected
   }
 }
 
+template<bool dummy>
+void compact_theta_sketch_parser<dummy>::check_memory_size(const void* ptr, size_t actual_bytes, size_t expected_bytes, bool dump_on_error) { // throws std::out_of_range when fewer than expected_bytes are available
+  if (actual_bytes < expected_bytes) throw std::out_of_range("at least " + std::to_string(expected_bytes)
+      + " bytes expected, actual " + std::to_string(actual_bytes)
+      + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), actual_bytes)) : "")); // the hex dump of the whole input is appended only when requested
+}
+
 template<bool dummy>
 std::string compact_theta_sketch_parser<dummy>::hex_dump(const uint8_t* ptr, size_t size) {
   std::stringstream s;
diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp
index fa283b8..e638f93 100644
--- a/theta/include/theta_sketch.hpp
+++ b/theta/include/theta_sketch.hpp
@@ -21,6 +21,7 @@
 #define THETA_SKETCH_HPP_
 
 #include "theta_update_sketch_base.hpp"
+#include "compact_theta_sketch_parser.hpp"
 
 namespace datasketches {
 
@@ -355,6 +356,8 @@ public:
    */
   vector_bytes serialize(unsigned header_size_bytes = 0) const;
 
+  vector_bytes serialize_compressed(unsigned header_size_bytes = 0) const;
+
   virtual iterator begin();
   virtual iterator end();
   virtual const_iterator begin() const;
@@ -391,6 +394,14 @@ private:
   uint64_t theta_;
   std::vector<uint64_t, Allocator> entries_;
 
+  vector_bytes serialize_version_4_SLZ(unsigned header_size_bytes = 0) const;
+  vector_bytes serialize_version_4_MLZ(unsigned header_size_bytes = 0) const;
+  vector_bytes serialize_version_4_FLZ(unsigned header_size_bytes = 0) const;
+  vector_bytes serialize_version_4_ULEB128(unsigned header_size_bytes = 0) const;
+
+  static compact_theta_sketch_alloc deserialize_version_4(const void* bytes, size_t size,
+      uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
+
   virtual void print_specifics(std::ostringstream& os) const;
 };
 
@@ -407,7 +418,7 @@ public:
 template<typename Allocator = std::allocator<uint64_t>>
 class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
 public:
-  using const_iterator = const uint64_t*;
+  class const_iterator;
 
   Allocator get_allocator() const;
   bool is_empty() const;
@@ -433,15 +444,32 @@ protected:
   virtual void print_items(std::ostringstream& os) const;
 
 private:
-  bool is_empty_;
-  bool is_ordered_;
-  uint16_t seed_hash_;
-  uint32_t num_entries_;
-  uint64_t theta_;
-  const uint64_t* entries_;
+  using data_type = compact_theta_sketch_parser<true>::compact_theta_sketch_data;
+  data_type data_;
+
+  wrapped_compact_theta_sketch_alloc(const data_type& data);
+};
 
-  wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
-      uint64_t theta, const uint64_t* entries);
+template<typename Allocator>
+class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t, std::ptrdiff_t, const uint64_t*, const uint64_t&> {
+public: // the base must be public, otherwise std::iterator_traits cannot see the iterator typedefs
+  const_iterator(const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index);
+  const_iterator& operator++();
+  const_iterator operator++(int);
+  bool operator==(const const_iterator& other) const;
+  bool operator!=(const const_iterator& other) const;
+  const uint64_t& operator*() const;
+  const uint64_t* operator->() const;
+private:
+  const void* ptr_; // current entry when uncompressed, otherwise read position in the packed data
+  uint8_t entry_bits_; // bits per packed delta; 64 means no compression
+  uint32_t num_entries_;
+  uint32_t index_; // position in the sequence; compared by == and != when compressed
+  uint64_t previous_; // last decoded value, added to the next delta
+  bool is_block_mode_; // true while entries are decoded 8 at a time
+  uint8_t buf_i_; // index of the current value in buffer_
+  uint8_t offset_; // bit offset carried between single-entry unpack_bits calls
+  uint64_t buffer_[8]; // decoded values; only buffer_[0] is used outside block mode
 };
 
 // aliases with default allocator for convenience
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 2e06b89..a889b57 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -27,7 +27,8 @@
 #include "serde.hpp"
 #include "binomial_bounds.hpp"
 #include "theta_helpers.hpp"
-#include "compact_theta_sketch_parser.hpp"
+#include "count_zeros.hpp"
+#include "bit_packing.hpp"
 
 namespace datasketches {
 
@@ -401,6 +402,239 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
   return bytes;
 }
 
+template<typename A>
+auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_bytes) const -> vector_bytes {
+  if (!this->is_ordered() || entries_.size() == 0 ||
+      (entries_.size() == 1 && !this->is_estimation_mode())) return serialize(header_size_bytes);
+  return serialize_version_4_MLZ(header_size_bytes);
+}
+
+// should be called for ordered sketches that are not empty and not single item
+template<typename A>
+auto compact_theta_sketch_alloc<A>::serialize_version_4_FLZ(unsigned header_size_bytes) const -> vector_bytes {
+  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
+  // compression based on leading zeros in deltas between ordered hash values
+  // assumes ordered sketch
+  uint64_t previous = 0;
+  uint8_t min_zeros = 64;
+  uint8_t max_zeros = 0;
+  std::vector<uint8_t> leading_zeros(entries_.get_allocator());
+  leading_zeros.reserve(entries_.size());
+  std::vector<uint64_t> deltas(entries_.get_allocator());
+  deltas.reserve(entries_.size());
+  size_t compressed_bits = 0;
+  for (unsigned i = 0; i < entries_.size(); ++i) {
+    const uint64_t delta = entries_[i] - previous;
+    deltas.push_back(delta);
+    previous = entries_[i];
+    const uint8_t zeros = count_leading_zeros_in_u64(delta);
+    leading_zeros.push_back(zeros);
+    min_zeros = std::min(min_zeros, zeros);
+    max_zeros = std::max(max_zeros, zeros);
+    compressed_bits += 63 - zeros; // the first 1 is understood
+  }
+  const uint8_t count_zeros_bits = 8 - byte_leading_zeros_table[max_zeros - min_zeros];
+  compressed_bits += count_zeros_bits * entries_.size();
+
+  const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
+  const uint8_t num_entries_bits = 31 - count_leading_zeros_in_u32(num_entries); // the first 1 is understood
+  compressed_bits += 5 + num_entries_bits;
+
+  const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + std::ceil(compressed_bits / 8.0);
+  vector_bytes bytes(size, 0, entries_.get_allocator());
+  uint8_t* ptr = bytes.data() + header_size_bytes;
+
+  ptr += copy_to_mem(preamble_longs, ptr);
+  const uint8_t serial_version = 4;
+  ptr += copy_to_mem(serial_version, ptr);
+  const uint8_t type = SKETCH_TYPE;
+  ptr += copy_to_mem(type, ptr);
+  ptr += copy_to_mem(min_zeros, ptr);
+  ptr += copy_to_mem(count_zeros_bits, ptr);
+  const uint8_t flags_byte(
+    (1 << flags::IS_COMPACT) |
+    (1 << flags::IS_READ_ONLY) |
+    (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+    (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
+  );
+  ptr += copy_to_mem(flags_byte, ptr);
+  const uint16_t seed_hash = get_seed_hash();
+  ptr += copy_to_mem(seed_hash, ptr);
+  if (this->is_estimation_mode()) {
+    ptr += copy_to_mem(theta_, ptr);
+  }
+
+  size_t offset_bits = pack_bits(num_entries_bits, 5, ptr, 0);
+  offset_bits = pack_bits(num_entries, num_entries_bits, ptr, offset_bits);
+  for (unsigned i = 0; i < entries_.size(); ++i) {
+    offset_bits = pack_bits(leading_zeros[i] - min_zeros, count_zeros_bits, ptr, offset_bits);
+    offset_bits = pack_bits(deltas[i], 63 - leading_zeros[i], ptr, offset_bits);
+  }
+  return bytes;
+}
+
+template<typename A>
+auto compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned header_size_bytes) const -> vector_bytes {
+  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
+  // compression based on leading zeros in deltas between ordered hash values
+  // assumes ordered sketch
+  uint64_t previous = 0;
+  uint64_t ored = 0;
+  for (const uint64_t entry: entries_) {
+    const uint64_t delta = entry - previous;
+    ored |= delta;
+    previous = entry;
+  }
+  uint8_t min_entry_zeros = count_leading_zeros_in_u64(ored);
+  size_t compressed_bits = (64 - min_entry_zeros) * entries_.size();
+
+//  const uint8_t num_entries_zeros = count_leading_zeros_in_u32(entries_.size());
+//  const uint8_t num_entries_bits = 31 - num_entries_zeros; // first 1 is understood
+//  compressed_bits += num_entries_bits * (entries_.size() > 1); // no bits for 0 or 1 entry
+
+  const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + std::ceil(compressed_bits / 8.0) + sizeof(uint32_t);
+  vector_bytes bytes(size, 0, entries_.get_allocator());
+  uint8_t* ptr = bytes.data() + header_size_bytes;
+
+  ptr += copy_to_mem(preamble_longs, ptr);
+  const uint8_t serial_version = 4;
+  ptr += copy_to_mem<uint8_t>(serial_version, ptr);
+  ptr += copy_to_mem(SKETCH_TYPE, ptr);
+  ptr += copy_to_mem(min_entry_zeros, ptr);
+  ptr += sizeof(uint8_t); // unused
+  const uint8_t flags_byte(
+    (1 << flags::IS_COMPACT) |
+    (1 << flags::IS_READ_ONLY) |
+    (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+    (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
+  );
+  ptr += copy_to_mem(flags_byte, ptr);
+  ptr += copy_to_mem<uint16_t>(get_seed_hash(), ptr);
+  if (this->is_estimation_mode()) {
+    ptr += copy_to_mem(theta_, ptr);
+  }
+//  uint8_t offset_bits = 0;
+//  if (entries_.size() > 1) {
+//    offset_bits = put_bits64(entries_.size(), num_entries_bits, ptr, offset_bits);
+//    std::cout << "writing " << std::to_string(num_entries_bits) << " bits\n";
+//  }
+
+  // uncompressed num_entries for now
+  ptr += copy_to_mem<uint32_t>(entries_.size(), ptr);
+
+  const uint8_t entry_bits = 64 - min_entry_zeros;
+
+  previous = 0;
+  uint64_t deltas[8];
+
+  unsigned i;
+  for (i = 0; i + 7 < entries_.size(); i += 8) {
+    for (unsigned j = 0; j < 8; ++j) {
+      deltas[j] = entries_[i + j] - previous;
+      previous = entries_[i + j];
+    }
+    pack_bits_block8(deltas, ptr, entry_bits);
+    ptr += entry_bits;
+  }
+
+  uint8_t offset = 0;
+  for (; i < entries_.size(); ++i) {
+    const uint64_t delta = entries_[i] - previous;
+    previous = entries_[i];
+    offset = pack_bits(delta, entry_bits, ptr, offset);
+  }
+  return bytes;
+}
+
+template<typename A>
+auto compact_theta_sketch_alloc<A>::serialize_version_4_SLZ(unsigned header_size_bytes) const -> vector_bytes {
+  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
+  // compression based on leading zeros in deltas between ordered hash values
+  // assumes ordered sketch
+  const size_t fixed_size = header_size_bytes + sizeof(uint64_t) * preamble_longs;
+  const size_t estimated_variable_size = 5 // for num_entries
+      + 9 * entries_.size();
+  vector_bytes bytes(fixed_size + estimated_variable_size, 0, entries_.get_allocator());
+  uint8_t* ptr = bytes.data() + header_size_bytes;
+
+  ptr += copy_to_mem(preamble_longs, ptr);
+  const uint8_t serial_version = 4;
+  ptr += copy_to_mem(serial_version, ptr);
+  const uint8_t type = SKETCH_TYPE;
+  ptr += copy_to_mem(type, ptr);
+  ptr += sizeof(uint16_t); // unused
+  const uint8_t flags_byte(
+    (1 << flags::IS_COMPACT) |
+    (1 << flags::IS_READ_ONLY) |
+    (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+    (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
+  );
+  ptr += copy_to_mem(flags_byte, ptr);
+  const uint16_t seed_hash = get_seed_hash();
+  ptr += copy_to_mem(seed_hash, ptr);
+  if (this->is_estimation_mode()) {
+    ptr += copy_to_mem(theta_, ptr);
+  }
+
+  ptr += ULEB128(entries_.size(), ptr);
+
+  uint8_t offset_bits = 0;
+  uint64_t previous = 0;
+  for (const uint64_t entry: entries_) {
+    const uint64_t delta = entry - previous;
+    previous = entry;
+    const uint8_t zeros = count_leading_zeros_in_u64(delta);
+    offset_bits = pack_bits(zeros, 6, ptr, offset_bits);
+    offset_bits = pack_bits(delta, 63 - zeros, ptr, offset_bits);
+  }
+  if (offset_bits > 0) ++ptr;
+  bytes.resize(ptr - bytes.data());
+  return bytes;
+}
+
+template<typename A>
+auto compact_theta_sketch_alloc<A>::serialize_version_4_ULEB128(unsigned header_size_bytes) const -> vector_bytes {
+  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
+  // compression based on ULEB128 code of deltas between ordered hash values
+  // assumes ordered sketch
+
+  const size_t fixed_size = header_size_bytes + sizeof(uint64_t) * preamble_longs;
+  const size_t estimated_variable_size = 5 // for num_entries
+      + 10 * entries_.size();
+  vector_bytes bytes(fixed_size + estimated_variable_size, 0, entries_.get_allocator());
+  uint8_t* ptr = bytes.data() + header_size_bytes;
+
+  ptr += copy_to_mem(preamble_longs, ptr);
+  const uint8_t serial_version = 4;
+  ptr += copy_to_mem(serial_version, ptr);
+  const uint8_t type = SKETCH_TYPE;
+  ptr += copy_to_mem(type, ptr);
+  ptr += sizeof(uint16_t); // unused
+  const uint8_t flags_byte(
+    (1 << flags::IS_COMPACT) |
+    (1 << flags::IS_READ_ONLY) |
+    (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+    (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
+  );
+  ptr += copy_to_mem(flags_byte, ptr);
+  const uint16_t seed_hash = get_seed_hash();
+  ptr += copy_to_mem(seed_hash, ptr);
+  if (this->is_estimation_mode()) {
+    ptr += copy_to_mem(theta_, ptr);
+  }
+
+  ptr += ULEB128(entries_.size(), ptr);
+
+  uint64_t previous = 0;
+  for (unsigned i = 0; i < entries_.size(); ++i) {
+    const uint64_t delta = entries_[i] - previous;
+    previous = entries_[i];
+    ptr += pack_ULEB128(delta, ptr);
+  }
+  bytes.resize(ptr - bytes.data());
+  return bytes;
+}
+
 template<typename A>
 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
   const auto preamble_longs = read<uint8_t>(is);
@@ -505,27 +739,93 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
 
 template<typename A>
 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
+  if (size > 1 && static_cast<const uint8_t*>(bytes)[1] == 4) return deserialize_version_4(bytes, size, seed, allocator); // peek at the version byte only when it exists
   auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
-  return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator));
+  const uint64_t* entries = reinterpret_cast<const uint64_t*>(data.entries_start_ptr);
+  return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta,
+      std::vector<uint64_t, A>(entries, entries + data.num_entries, allocator));
+}
+
+// MLZ: decodes a serial version 4 image (fixed-width bit-packed deltas); throws on malformed or truncated input
+template<typename A>
+compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_version_4(const void* bytes, size_t size,
+    uint64_t seed, const A& allocator) {
+  ensure_minimum_memory(size, 8);
+  const uint8_t* ptr = static_cast<const uint8_t*>(bytes);
+  uint8_t preamble_longs;
+  ptr += copy_from_mem(ptr, preamble_longs);
+  uint8_t serial_version;
+  ptr += copy_from_mem(ptr, serial_version);
+  uint8_t type;
+  ptr += copy_from_mem(ptr, type);
+  uint8_t min_entry_zeros;
+  ptr += copy_from_mem(ptr, min_entry_zeros);
+  ptr++; // unused
+  uint8_t flags_byte;
+  ptr += copy_from_mem(ptr, flags_byte);
+  uint16_t seed_hash;
+  ptr += copy_from_mem(ptr, seed_hash);
+  checker<true>::check_sketch_type(type, SKETCH_TYPE);
+  checker<true>::check_serial_version(serial_version, 4);
+  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
+  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+  // the writer only emits 1 or 2 preamble longs; anything else would desynchronize ptr from the size checks
+  if (preamble_longs != 1 && preamble_longs != 2) throw std::invalid_argument(
+      "unexpected preamble longs " + std::to_string(preamble_longs) + " for serial version 4");
+  // more than 64 leading zeros cannot describe a 64-bit delta and would wrap entry_bits
+  if (min_entry_zeros > 64) throw std::invalid_argument(
+      "unexpected leading zeros count " + std::to_string(min_entry_zeros));
+
+  uint64_t theta = theta_constants::MAX_THETA;
+  if (preamble_longs == 2) {
+    ensure_minimum_memory(size, sizeof(uint64_t) * 2);
+    ptr += copy_from_mem(ptr, theta);
+  }
+
+  // uncompressed num_entries for now; make sure it is present before reading it
+  const size_t fixed_size = sizeof(uint64_t) * preamble_longs + sizeof(uint32_t);
+  ensure_minimum_memory(size, fixed_size);
+  uint32_t num_entries;
+  ptr += copy_from_mem(ptr, num_entries);
+
+  // widen before multiplying: in 32-bit arithmetic the product can wrap and defeat the size check
+  const uint8_t entry_bits = 64 - min_entry_zeros;
+  const size_t expected_bits = static_cast<size_t>(entry_bits) * num_entries;
+  ensure_minimum_memory(size, fixed_size + (expected_bits + 7) / 8);
+
+  uint64_t previous = 0;
+  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
+
+  // each full group of 8 deltas is unpacked as one block and advances ptr by entry_bits bytes
+  unsigned i;
+  for (i = 0; i + 7 < num_entries; i += 8) {
+    unpack_bits_block8(&entries[i], ptr, entry_bits);
+    ptr += entry_bits;
+  }
+  uint8_t offset = 0;
+  for (; i < num_entries; ++i) {
+    offset = unpack_bits(entries[i], entry_bits, ptr, offset);
+  }
+  // undo deltas
+  for (i = 0; i < num_entries; ++i) {
+    entries[i] += previous;
+    previous = entries[i];
+  }
+
+  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
+  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
 }
 
 // wrapped compact sketch
 
 template<typename A>
-wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
-    uint64_t theta, const uint64_t* entries):
-is_empty_(is_empty),
-is_ordered_(is_ordered),
-seed_hash_(seed_hash),
-num_entries_(num_entries),
-theta_(theta),
-entries_(entries)
+wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(const data_type& data):
+data_(data)
 {}
 
 template<typename A>
 const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
-  auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
-  return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
+  return wrapped_compact_theta_sketch_alloc(compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error));
 }
 
 template<typename A>
@@ -535,37 +835,37 @@ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
 
 template<typename A>
 bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
-  return is_empty_;
+  return data_.is_empty;
 }
 
 template<typename A>
 bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
-  return is_ordered_;
+  return data_.is_ordered;
 }
 
 template<typename A>
 uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
-  return theta_;
+  return data_.theta;
 }
 
 template<typename A>
 uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
-  return static_cast<uint32_t>(num_entries_);
+  return data_.num_entries;
 }
 
 template<typename A>
 uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
-  return seed_hash_;
+  return data_.seed_hash;
 }
 
 template<typename A>
 auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
-  return entries_;
+  return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, 0);
 }
 
 template<typename A>
 auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
-  return entries_ + num_entries_;
+  return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, data_.num_entries);
 }
 
 template<typename A>
@@ -574,12 +874,109 @@ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&)
 template<typename A>
 void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
     os << "### Retained entries" << std::endl;
-    for (const auto& hash: *this) {
+    for (const auto hash: *this) {
       os << hash << std::endl;
     }
     os << "### End retained entries" << std::endl;
 }
 
+// assumes index == 0 or index == num_entries
+template<typename Allocator>
+wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::const_iterator(
+    const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index):
+ptr_(ptr),
+entry_bits_(entry_bits),
+num_entries_(num_entries),
+index_(index),
+previous_(0),
+is_block_mode_(num_entries_ >= 8),
+buf_i_(0),
+offset_(0)
+{
+  if (entry_bits == 64) { // no compression
+    ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
+  } else if (index < num_entries) {
+    if (is_block_mode_) {
+      unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
+      ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
+      for (int i = 0; i < 8; ++i) {
+        buffer_[i] += previous_;
+        previous_ = buffer_[i];
+      }
+    } else {
+      offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
+      buffer_[0] += previous_;
+      previous_ = buffer_[0];
+    }
+  }
+}
+
+template<typename Allocator>
+auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++() -> const_iterator& {
+  if (entry_bits_ == 64) { // no compression
+    ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
+    return *this;
+  }
+  ++index_;
+  if (index_ < num_entries_) {
+    if (is_block_mode_) {
+      ++buf_i_;
+      if (buf_i_ == 8) {
+        buf_i_ = 0;
+        if (index_ + 8 < num_entries_) {
+          unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
+          ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
+          for (int i = 0; i < 8; ++i) {
+            buffer_[i] += previous_;
+            previous_ = buffer_[i];
+          }
+        } else {
+          is_block_mode_ = false;
+          offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
+          buffer_[0] += previous_;
+          previous_ = buffer_[0];
+        }
+      }
+    } else {
+      offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
+      buffer_[0] += previous_;
+      previous_ = buffer_[0];
+    }
+  }
+  return *this;
+}
+
+template<typename Allocator>
+auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
+  const_iterator tmp(*this);
+  operator++();
+  return tmp;
+}
+
+template<typename Allocator>
+bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator!=(const const_iterator& other) const {
+  if (entry_bits_ == 64) return ptr_ != other.ptr_;
+  return index_ != other.index_;
+}
+
+template<typename Allocator>
+bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(const const_iterator& other) const {
+  if (entry_bits_ == 64) return ptr_ == other.ptr_;
+  return index_ == other.index_;
+}
+
+template<typename Allocator>
+const uint64_t& wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const {
+  if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
+  return buffer_[buf_i_];
+}
+
+template<typename Allocator>
+const uint64_t* wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const {
+  if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
+  return buffer_ + buf_i_;
+}
+
 } /* namespace datasketches */
 
 #endif
diff --git a/theta/test/theta_sketch_test.cpp b/theta/test/theta_sketch_test.cpp
index 4fc6651..f57e0e5 100644
--- a/theta/test/theta_sketch_test.cpp
+++ b/theta/test/theta_sketch_test.cpp
@@ -607,7 +607,7 @@ TEST_CASE("theta sketch: wrap compact estimation from java", "[theta_sketch]") {
   compact_theta_sketch compact_sketch = update_sketch.compact();
   // the sketches are ordered, so the iteration sequence must match exactly
   auto iter = sketch.begin();
-  for (const auto& key: compact_sketch) {
+  for (const auto key: compact_sketch) {
     REQUIRE(*iter == key);
     ++iter;
   }
@@ -652,7 +652,7 @@ TEST_CASE("theta sketch: wrap compact v1 estimation from java", "[theta_sketch]"
   compact_theta_sketch compact_sketch = update_sketch.compact();
   // the sketches are ordered, so the iteration sequence must match exactly
   auto iter = sketch.begin();
-  for (const auto& key: compact_sketch) {
+  for (const auto key: compact_sketch) {
     REQUIRE(*iter == key);
     ++iter;
   }
@@ -697,7 +697,20 @@ TEST_CASE("theta sketch: wrap compact v2 estimation from java", "[theta_sketch]"
   compact_theta_sketch compact_sketch = update_sketch.compact();
   // the sketches are ordered, so the iteration sequence must match exactly
   auto iter = sketch.begin();
-  for (const auto& key: compact_sketch) {
+  for (const auto key: compact_sketch) {
+    REQUIRE(*iter == key);
+    ++iter;
+  }
+}
+
+TEST_CASE("theta sketch: wrapped compressed", "[theta_sketch]") {
+  auto update_sketch = update_theta_sketch::builder().build();
+  for (int i = 0; i < 10000; i++) update_sketch.update(i);
+  auto compact_sketch = update_sketch.compact();
+  auto bytes = compact_sketch.serialize_compressed();
+  auto wrapped_compressed = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
+  auto iter = wrapped_compressed.begin();
+  for (const auto key: compact_sketch) {
     REQUIRE(*iter == key);
     ++iter;
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org