You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/06/27 12:14:04 UTC

[1/3] arrow git commit: ARROW-1154: [C++] Import miscellaneous computational utility code from parquet-cpp

Repository: arrow
Updated Branches:
  refs/heads/master cb5f2b953 -> b06522870


http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/hash-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h
new file mode 100644
index 0000000..ffe1a9d
--- /dev/null
+++ b/cpp/src/arrow/util/hash-util.h
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-02-22
+
+#ifndef ARROW_UTIL_HASH_UTIL_H
+#define ARROW_UTIL_HASH_UTIL_H
+
+#include <cstdint>
+
+#include "arrow/util/compiler-util.h"
+#include "arrow/util/cpu-info.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/sse-util.h"
+
+namespace arrow {
+
+/// Utility class to compute hash values.
+class HashUtil {
+ public:
+  /// Compute the Crc32 hash for data using SSE4 instructions.  The input hash
+  /// parameter is the current hash/seed value.
+  /// This should only be called if SSE is supported.
+  /// This is ~4x faster than Fnv/Boost Hash.
+  /// TODO: crc32 hashes with different seeds do not result in different hash functions.
+  /// The resulting hashes are correlated.
+  /// TODO: update this to also use SSE4_crc32_u64 and SSE4_crc32_u16 where appropriate.
+  static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
+    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
+    uint32_t words = bytes / sizeof(uint32_t);
+    bytes = bytes % sizeof(uint32_t);
+
+    const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
+    while (words--) {
+      hash = SSE4_crc32_u32(hash, *p);
+      ++p;
+    }
+
+    const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
+    while (bytes--) {
+      hash = SSE4_crc32_u8(hash, *s);
+      ++s;
+    }
+
+    // The lower half of the CRC hash has has poor uniformity, so swap the halves
+    // for anyone who only uses the first several bits of the hash.
+    hash = (hash << 16) | (hash >> 16);
+    return hash;
+  }
+
+  /// CrcHash() specialized for 1-byte data
+  static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
+    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
+    const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
+    hash = SSE4_crc32_u8(hash, *s);
+    hash = (hash << 16) | (hash >> 16);
+    return hash;
+  }
+
+  /// CrcHash() specialized for 2-byte data
+  static inline uint32_t CrcHash2(const void* v, uint32_t hash) {
+    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
+    const uint16_t* s = reinterpret_cast<const uint16_t*>(v);
+    hash = SSE4_crc32_u16(hash, *s);
+    hash = (hash << 16) | (hash >> 16);
+    return hash;
+  }
+
+  /// CrcHash() specialized for 4-byte data
+  static inline uint32_t CrcHash4(const void* v, uint32_t hash) {
+    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
+    const uint32_t* p = reinterpret_cast<const uint32_t*>(v);
+    hash = SSE4_crc32_u32(hash, *p);
+    hash = (hash << 16) | (hash >> 16);
+    return hash;
+  }
+
+  /// CrcHash() specialized for 8-byte data
+  static inline uint32_t CrcHash8(const void* v, uint32_t hash) {
+    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
+    const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
+    hash = SSE4_crc32_u64(hash, *p);
+    hash = (hash << 16) | (hash >> 16);
+    return hash;
+  }
+
+  /// CrcHash() specialized for 12-byte data
+  static inline uint32_t CrcHash12(const void* v, uint32_t hash) {
+    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
+    const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
+    hash = SSE4_crc32_u64(hash, *p);
+    ++p;
+    hash = SSE4_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
+    hash = (hash << 16) | (hash >> 16);
+    return hash;
+  }
+
+  /// CrcHash() specialized for 16-byte data
+  static inline uint32_t CrcHash16(const void* v, uint32_t hash) {
+    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
+    const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
+    hash = SSE4_crc32_u64(hash, *p);
+    ++p;
+    hash = SSE4_crc32_u64(hash, *p);
+    hash = (hash << 16) | (hash >> 16);
+    return hash;
+  }
+
+  static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995;
+  static const int MURMUR_R = 47;
+
+  /// Murmur2 hash implementation returning 64-bit hashes.
+  static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) {
+    uint64_t h = seed ^ (len * MURMUR_PRIME);
+
+    const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
+    const uint64_t* end = data + (len / sizeof(uint64_t));
+
+    while (data != end) {
+      uint64_t k = *data++;
+      k *= MURMUR_PRIME;
+      k ^= k >> MURMUR_R;
+      k *= MURMUR_PRIME;
+      h ^= k;
+      h *= MURMUR_PRIME;
+    }
+
+    const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
+    switch (len & 7) {
+      case 7:
+        h ^= uint64_t(data2[6]) << 48;
+      case 6:
+        h ^= uint64_t(data2[5]) << 40;
+      case 5:
+        h ^= uint64_t(data2[4]) << 32;
+      case 4:
+        h ^= uint64_t(data2[3]) << 24;
+      case 3:
+        h ^= uint64_t(data2[2]) << 16;
+      case 2:
+        h ^= uint64_t(data2[1]) << 8;
+      case 1:
+        h ^= uint64_t(data2[0]);
+        h *= MURMUR_PRIME;
+    }
+
+    h ^= h >> MURMUR_R;
+    h *= MURMUR_PRIME;
+    h ^= h >> MURMUR_R;
+    return h;
+  }
+
+  /// default values recommended by http://isthe.com/chongo/tech/comp/fnv/
+  static const uint32_t FNV_PRIME = 0x01000193;  //   16777619
+  static const uint32_t FNV_SEED = 0x811C9DC5;   // 2166136261
+  static const uint64_t FNV64_PRIME = 1099511628211UL;
+  static const uint64_t FNV64_SEED = 14695981039346656037UL;
+
+  /// Implementation of the Fowler-Noll-Vo hash function. This is not as performant
+  /// as boost's hash on int types (2x slower) but has bit entropy.
+  /// For ints, boost just returns the value of the int which can be pathological.
+  /// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000
+  /// is taken on the hash, all values will collide to the same bucket.
+  /// For string values, Fnv is slightly faster than boost.
+  /// IMPORTANT: FNV hash suffers from poor diffusion of the least significant bit,
+  /// which can lead to poor results when input bytes are duplicated.
+  /// See FnvHash64to32() for how this can be mitigated.
+  static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) {
+    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
+    while (bytes--) {
+      hash = (*ptr ^ hash) * FNV64_PRIME;
+      ++ptr;
+    }
+    return hash;
+  }
+
+  /// Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits.
+  /// This technique is recommended instead of FNV-32 since the LSB of an FNV hash is the
+  /// XOR of the LSBs of its input bytes, leading to poor results for duplicate inputs.
+  /// The input seed 'hash' is duplicated so the top half of the seed is not all zero.
+  /// Data length must be at least 1 byte: zero-length data should be handled separately,
+  /// for example using CombineHash with a unique constant value to avoid returning the
+  /// hash argument. Zero-length data gives terrible results: the initial hash value is
+  /// xored with itself cancelling all bits.
+  static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) {
+    // IMPALA-2270: this function should never be used for zero-byte inputs.
+    DCHECK_GT(bytes, 0);
+    uint64_t hash_u64 = hash | ((uint64_t)hash << 32);
+    hash_u64 = FnvHash64(data, bytes, hash_u64);
+    return (hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF);
+  }
+
+  /// Computes the hash value for data.  Will call either CrcHash or MurmurHash
+  /// depending on hardware capabilities.
+  /// Seed values for different steps of the query execution should use different seeds
+  /// to prevent accidental key collisions. (See IMPALA-219 for more details).
+  static uint32_t Hash(const void* data, int32_t bytes, uint32_t seed) {
+#ifdef ARROW_USE_SSE
+    if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSE4_2))) {
+      return CrcHash(data, bytes, seed);
+    } else {
+      return MurmurHash2_64(data, bytes, seed);
+    }
+#else
+    return static_cast<uint32_t>(MurmurHash2_64(data, bytes, seed));
+#endif
+  }
+
+  /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
+  static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
+
+  /// Combine hashes 'value' and 'seed' to get a new hash value.  Similar to
+  /// boost::hash_combine(), but for uint32_t. This function should be used with a
+  /// constant first argument to update the hash value for zero-length values such as
+  /// NULL, boolean, and empty strings.
+  static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) {
+    return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2));
+  }
+
+  // Get 32 more bits of randomness from a 32-bit hash:
+  static inline uint32_t Rehash32to32(const uint32_t hash) {
+    // Constants generated by uuidgen(1) with the -r flag
+    static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull;
+    // This is strongly universal hashing following Dietzfelbinger's "Universal hashing
+    // and k-wise independent random variables via integer arithmetic without primes". As
+    // such, for any two distinct uint32_t's hash1 and hash2, the probability (over the
+    // randomness of the constants) that any subset of bit positions of
+    // Rehash32to32(hash1) is equal to the same subset of bit positions
+    // Rehash32to32(hash2) is minimal.
+    return (static_cast<uint64_t>(hash) * m + a) >> 32;
+  }
+
+  static inline uint64_t Rehash32to64(const uint32_t hash) {
+    static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull,
+                          a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull;
+    const uint64_t hash1 = (static_cast<uint64_t>(hash) * m1 + a1) >> 32;
+    const uint64_t hash2 = (static_cast<uint64_t>(hash) * m2 + a2) >> 32;
+    return hash1 | (hash2 << 32);
+  }
+};
+
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_HASH_UTIL_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/logging.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h
index 8a929da..b618121 100644
--- a/cpp/src/arrow/util/logging.h
+++ b/cpp/src/arrow/util/logging.h
@@ -38,6 +38,7 @@ namespace arrow {
 
 #define ARROW_LOG_INTERNAL(level) ::arrow::internal::CerrLog(level)
 #define ARROW_LOG(level) ARROW_LOG_INTERNAL(ARROW_##level)
+#define ARROW_IGNORE_EXPR(expr) ((void)(expr));
 
 #define ARROW_CHECK(condition)                           \
   (condition) ? 0                                        \
@@ -47,25 +48,32 @@ namespace arrow {
 #ifdef NDEBUG
 #define ARROW_DFATAL ARROW_WARNING
 
-#define DCHECK(condition) \
-  while (false)           \
+#define DCHECK(condition)      \
+  ARROW_IGNORE_EXPR(condition) \
+  while (false)                \
   ::arrow::internal::NullLog()
 #define DCHECK_EQ(val1, val2) \
+  ARROW_IGNORE_EXPR(val1)     \
   while (false)               \
   ::arrow::internal::NullLog()
 #define DCHECK_NE(val1, val2) \
+  ARROW_IGNORE_EXPR(val1)     \
   while (false)               \
   ::arrow::internal::NullLog()
 #define DCHECK_LE(val1, val2) \
+  ARROW_IGNORE_EXPR(val1)     \
   while (false)               \
   ::arrow::internal::NullLog()
 #define DCHECK_LT(val1, val2) \
+  ARROW_IGNORE_EXPR(val1)     \
   while (false)               \
   ::arrow::internal::NullLog()
 #define DCHECK_GE(val1, val2) \
+  ARROW_IGNORE_EXPR(val1)     \
   while (false)               \
   ::arrow::internal::NullLog()
 #define DCHECK_GT(val1, val2) \
+  ARROW_IGNORE_EXPR(val1)     \
   while (false)               \
   ::arrow::internal::NullLog()
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/rle-encoding-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/rle-encoding-test.cc b/cpp/src/arrow/util/rle-encoding-test.cc
new file mode 100644
index 0000000..7c9b33c
--- /dev/null
+++ b/cpp/src/arrow/util/rle-encoding-test.cc
@@ -0,0 +1,460 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala as of 2016-01-29
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <boost/utility.hpp>
+
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "arrow/util/bit-stream-utils.h"
+#include "arrow/util/rle-encoding.h"
+
+using std::vector;
+
+namespace arrow {
+
+const int MAX_WIDTH = 32;
+
+TEST(BitArray, TestBool) {
+  const int len = 8;
+  uint8_t buffer[len];
+
+  BitWriter writer(buffer, len);
+
+  // Write alternating 0's and 1's
+  for (int i = 0; i < 8; ++i) {
+    bool result = writer.PutValue(i % 2, 1);
+    EXPECT_TRUE(result);
+  }
+  writer.Flush();
+  EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0));
+
+  // Write 00110011
+  for (int i = 0; i < 8; ++i) {
+    bool result = false;
+    switch (i) {
+      case 0:
+      case 1:
+      case 4:
+      case 5:
+        result = writer.PutValue(false, 1);
+        break;
+      default:
+        result = writer.PutValue(true, 1);
+        break;
+    }
+    EXPECT_TRUE(result);
+  }
+  writer.Flush();
+
+  // Validate the exact bit value
+  EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0));
+  EXPECT_EQ((int)buffer[1], BOOST_BINARY(1 1 0 0 1 1 0 0));
+
+  // Use the reader and validate
+  BitReader reader(buffer, len);
+  for (int i = 0; i < 8; ++i) {
+    bool val = false;
+    bool result = reader.GetValue(1, &val);
+    EXPECT_TRUE(result);
+    EXPECT_EQ(val, (i % 2) != 0);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    bool val = false;
+    bool result = reader.GetValue(1, &val);
+    EXPECT_TRUE(result);
+    switch (i) {
+      case 0:
+      case 1:
+      case 4:
+      case 5:
+        EXPECT_EQ(val, false);
+        break;
+      default:
+        EXPECT_EQ(val, true);
+        break;
+    }
+  }
+}
+
+// Writes 'num_vals' values with width 'bit_width' and reads them back.
+void TestBitArrayValues(int bit_width, int num_vals) {
+  int len = static_cast<int>(BitUtil::Ceil(bit_width * num_vals, 8));
+  EXPECT_GT(len, 0);
+  const uint64_t mod = bit_width == 64 ? 1 : 1LL << bit_width;
+
+  std::vector<uint8_t> buffer(len);
+  BitWriter writer(buffer.data(), len);
+  for (int i = 0; i < num_vals; ++i) {
+    bool result = writer.PutValue(i % mod, bit_width);
+    EXPECT_TRUE(result);
+  }
+  writer.Flush();
+  EXPECT_EQ(writer.bytes_written(), len);
+
+  BitReader reader(buffer.data(), len);
+  for (int i = 0; i < num_vals; ++i) {
+    int64_t val = 0;
+    bool result = reader.GetValue(bit_width, &val);
+    EXPECT_TRUE(result);
+    EXPECT_EQ(val, i % mod);
+  }
+  EXPECT_EQ(reader.bytes_left(), 0);
+}
+
+TEST(BitArray, TestValues) {
+  for (int width = 1; width <= MAX_WIDTH; ++width) {
+    TestBitArrayValues(width, 1);
+    TestBitArrayValues(width, 2);
+    // Don't write too many values
+    TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096);
+    TestBitArrayValues(width, 1024);
+  }
+}
+
+// Test some mixed values
+TEST(BitArray, TestMixed) {
+  const int len = 1024;
+  uint8_t buffer[len];
+  bool parity = true;
+
+  BitWriter writer(buffer, len);
+  for (int i = 0; i < len; ++i) {
+    bool result;
+    if (i % 2 == 0) {
+      result = writer.PutValue(parity, 1);
+      parity = !parity;
+    } else {
+      result = writer.PutValue(i, 10);
+    }
+    EXPECT_TRUE(result);
+  }
+  writer.Flush();
+
+  parity = true;
+  BitReader reader(buffer, len);
+  for (int i = 0; i < len; ++i) {
+    bool result;
+    if (i % 2 == 0) {
+      bool val;
+      result = reader.GetValue(1, &val);
+      EXPECT_EQ(val, parity);
+      parity = !parity;
+    } else {
+      int val;
+      result = reader.GetValue(10, &val);
+      EXPECT_EQ(val, i);
+    }
+    EXPECT_TRUE(result);
+  }
+}
+
+// Validates encoding of values by encoding and decoding them.  If
+// expected_encoding != NULL, also validates that the encoded buffer is
+// exactly 'expected_encoding'.
+// if expected_len is not -1, it will validate the encoded size is correct.
+void ValidateRle(const vector<int>& values, int bit_width, uint8_t* expected_encoding,
+    int expected_len) {
+  const int len = 64 * 1024;
+  uint8_t buffer[len];
+  EXPECT_LE(expected_len, len);
+
+  RleEncoder encoder(buffer, len, bit_width);
+  for (size_t i = 0; i < values.size(); ++i) {
+    bool result = encoder.Put(values[i]);
+    EXPECT_TRUE(result);
+  }
+  int encoded_len = encoder.Flush();
+
+  if (expected_len != -1) { EXPECT_EQ(encoded_len, expected_len); }
+  if (expected_encoding != NULL) {
+    EXPECT_EQ(memcmp(buffer, expected_encoding, expected_len), 0);
+  }
+
+  // Verify read
+  {
+    RleDecoder decoder(buffer, len, bit_width);
+    for (size_t i = 0; i < values.size(); ++i) {
+      uint64_t val;
+      bool result = decoder.Get(&val);
+      EXPECT_TRUE(result);
+      EXPECT_EQ(values[i], val);
+    }
+  }
+
+  // Verify batch read
+  {
+    RleDecoder decoder(buffer, len, bit_width);
+    vector<int> values_read(values.size());
+    ASSERT_EQ(values.size(),
+        decoder.GetBatch(values_read.data(), static_cast<int>(values.size())));
+    EXPECT_EQ(values, values_read);
+  }
+}
+
+// A version of ValidateRle that round-trips the values and returns false if
+// the returned values are not all the same
+bool CheckRoundTrip(const vector<int>& values, int bit_width) {
+  const int len = 64 * 1024;
+  uint8_t buffer[len];
+  RleEncoder encoder(buffer, len, bit_width);
+  for (size_t i = 0; i < values.size(); ++i) {
+    bool result = encoder.Put(values[i]);
+    if (!result) { return false; }
+  }
+  int encoded_len = encoder.Flush();
+  int out = 0;
+
+  {
+    RleDecoder decoder(buffer, encoded_len, bit_width);
+    for (size_t i = 0; i < values.size(); ++i) {
+      EXPECT_TRUE(decoder.Get(&out));
+      if (values[i] != out) { return false; }
+    }
+  }
+
+  // Verify batch read
+  {
+    RleDecoder decoder(buffer, len, bit_width);
+    vector<int> values_read(values.size());
+    if (static_cast<int>(values.size()) !=
+        decoder.GetBatch(values_read.data(), static_cast<int>(values.size()))) {
+      return false;
+    }
+    if (values != values_read) { return false; }
+  }
+
+  return true;
+}
+
+TEST(Rle, SpecificSequences) {
+  const int len = 1024;
+  uint8_t expected_buffer[len];
+  vector<int> values;
+
+  // Test 50 0' followed by 50 1's
+  values.resize(100);
+  for (int i = 0; i < 50; ++i) {
+    values[i] = 0;
+  }
+  for (int i = 50; i < 100; ++i) {
+    values[i] = 1;
+  }
+
+  // expected_buffer valid for bit width <= 1 byte
+  expected_buffer[0] = (50 << 1);
+  expected_buffer[1] = 0;
+  expected_buffer[2] = (50 << 1);
+  expected_buffer[3] = 1;
+  for (int width = 1; width <= 8; ++width) {
+    ValidateRle(values, width, expected_buffer, 4);
+  }
+
+  for (int width = 9; width <= MAX_WIDTH; ++width) {
+    ValidateRle(values, width, NULL, 2 * (1 + static_cast<int>(BitUtil::Ceil(width, 8))));
+  }
+
+  // Test 100 0's and 1's alternating
+  for (int i = 0; i < 100; ++i) {
+    values[i] = i % 2;
+  }
+  int num_groups = static_cast<int>(BitUtil::Ceil(100, 8));
+  expected_buffer[0] = static_cast<uint8_t>((num_groups << 1) | 1);
+  for (int i = 1; i <= 100 / 8; ++i) {
+    expected_buffer[i] = BOOST_BINARY(1 0 1 0 1 0 1 0);
+  }
+  // Values for the last 4 0 and 1's. The upper 4 bits should be padded to 0.
+  expected_buffer[100 / 8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0);
+
+  // num_groups and expected_buffer only valid for bit width = 1
+  ValidateRle(values, 1, expected_buffer, 1 + num_groups);
+  for (int width = 2; width <= MAX_WIDTH; ++width) {
+    int num_values = static_cast<int>(BitUtil::Ceil(100, 8)) * 8;
+    ValidateRle(
+        values, width, NULL, 1 + static_cast<int>(BitUtil::Ceil(width * num_values, 8)));
+  }
+}
+
+// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value
+// is used, otherwise alternating values are used.
+void TestRleValues(int bit_width, int num_vals, int value = -1) {
+  const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width;
+  vector<int> values;
+  for (int v = 0; v < num_vals; ++v) {
+    values.push_back((value != -1) ? value : static_cast<int>(v % mod));
+  }
+  ValidateRle(values, bit_width, NULL, -1);
+}
+
+TEST(Rle, TestValues) {
+  for (int width = 1; width <= MAX_WIDTH; ++width) {
+    TestRleValues(width, 1);
+    TestRleValues(width, 1024);
+    TestRleValues(width, 1024, 0);
+    TestRleValues(width, 1024, 1);
+  }
+}
+
+TEST(Rle, BitWidthZeroRepeated) {
+  uint8_t buffer[1];
+  const int num_values = 15;
+  buffer[0] = num_values << 1;  // repeated indicator byte
+  RleDecoder decoder(buffer, sizeof(buffer), 0);
+  uint8_t val;
+  for (int i = 0; i < num_values; ++i) {
+    bool result = decoder.Get(&val);
+    EXPECT_TRUE(result);
+    EXPECT_EQ(val, 0);  // can only encode 0s with bit width 0
+  }
+  EXPECT_FALSE(decoder.Get(&val));
+}
+
+TEST(Rle, BitWidthZeroLiteral) {
+  uint8_t buffer[1];
+  const int num_groups = 4;
+  buffer[0] = num_groups << 1 | 1;  // literal indicator byte
+  RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0);
+  const int num_values = num_groups * 8;
+  uint8_t val;
+  for (int i = 0; i < num_values; ++i) {
+    bool result = decoder.Get(&val);
+    EXPECT_TRUE(result);
+    EXPECT_EQ(val, 0);  // can only encode 0s with bit width 0
+  }
+  EXPECT_FALSE(decoder.Get(&val));
+}
+
+// Test that writes out a repeated group and then a literal
+// group but flush before finishing.
+TEST(BitRle, Flush) {
+  vector<int> values;
+  for (int i = 0; i < 16; ++i)
+    values.push_back(1);
+  values.push_back(0);
+  ValidateRle(values, 1, NULL, -1);
+  values.push_back(1);
+  ValidateRle(values, 1, NULL, -1);
+  values.push_back(1);
+  ValidateRle(values, 1, NULL, -1);
+  values.push_back(1);
+  ValidateRle(values, 1, NULL, -1);
+}
+
+// Test some random sequences.
+TEST(BitRle, Random) {
+  int niters = 50;
+  int ngroups = 1000;
+  int max_group_size = 16;
+  vector<int> values(ngroups + max_group_size);
+
+  // prng setup
+  std::random_device rd;
+  std::uniform_int_distribution<int> dist(1, 20);
+
+  for (int iter = 0; iter < niters; ++iter) {
+    // generate a seed with device entropy
+    uint32_t seed = rd();
+    std::mt19937 gen(seed);
+
+    bool parity = 0;
+    values.resize(0);
+
+    for (int i = 0; i < ngroups; ++i) {
+      int group_size = dist(gen);
+      if (group_size > max_group_size) { group_size = 1; }
+      for (int i = 0; i < group_size; ++i) {
+        values.push_back(parity);
+      }
+      parity = !parity;
+    }
+    if (!CheckRoundTrip(values, BitUtil::NumRequiredBits(values.size()))) {
+      FAIL() << "failing seed: " << seed;
+    }
+  }
+}
+
+// Test a sequence of 1 0's, 2 1's, 3 0's. etc
+// e.g. 011000111100000
+TEST(BitRle, RepeatedPattern) {
+  vector<int> values;
+  const int min_run = 1;
+  const int max_run = 32;
+
+  for (int i = min_run; i <= max_run; ++i) {
+    int v = i % 2;
+    for (int j = 0; j < i; ++j) {
+      values.push_back(v);
+    }
+  }
+
+  // And go back down again
+  for (int i = max_run; i >= min_run; --i) {
+    int v = i % 2;
+    for (int j = 0; j < i; ++j) {
+      values.push_back(v);
+    }
+  }
+
+  ValidateRle(values, 1, NULL, -1);
+}
+
+TEST(BitRle, Overflow) {
+  for (int bit_width = 1; bit_width < 32; bit_width += 3) {
+    int len = RleEncoder::MinBufferSize(bit_width);
+    std::vector<uint8_t> buffer(len);
+    int num_added = 0;
+    bool parity = true;
+
+    RleEncoder encoder(buffer.data(), len, bit_width);
+    // Insert alternating true/false until there is no space left
+    while (true) {
+      bool result = encoder.Put(parity);
+      parity = !parity;
+      if (!result) break;
+      ++num_added;
+    }
+
+    int bytes_written = encoder.Flush();
+    EXPECT_LE(bytes_written, len);
+    EXPECT_GT(num_added, 0);
+
+    RleDecoder decoder(buffer.data(), bytes_written, bit_width);
+    parity = true;
+    uint32_t v;
+    for (int i = 0; i < num_added; ++i) {
+      bool result = decoder.Get(&v);
+      EXPECT_TRUE(result);
+      EXPECT_EQ(v != 0, parity);
+      parity = !parity;
+    }
+    // Make sure we get false when reading past end a couple times.
+    EXPECT_FALSE(decoder.Get(&v));
+    EXPECT_FALSE(decoder.Get(&v));
+  }
+}
+
+}  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h
new file mode 100644
index 0000000..9ec6235
--- /dev/null
+++ b/cpp/src/arrow/util/rle-encoding.h
@@ -0,0 +1,598 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Imported from Apache Impala (incubating) on 2016-01-29 and modified for use
+// in parquet-cpp, Arrow
+
+#ifndef ARROW_UTIL_RLE_ENCODING_H
+#define ARROW_UTIL_RLE_ENCODING_H
+
+#include <algorithm>
+#include <math.h>
+
+#include "arrow/util/bit-stream-utils.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/compiler-util.h"
+
+namespace arrow {
+
+/// Utility classes to do run length encoding (RLE) for fixed bit width values.  If runs
+/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
+/// (literal encoding).
+/// For both types of runs, there is a byte-aligned indicator which encodes the length
+/// of the run and the type of the run.
+/// This encoding has the benefit that when there aren't any long enough runs, values
+/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
+/// the run length are byte aligned. This allows for very efficient decoding
+/// implementations.
+/// The encoding is:
+///    encoded-block := run*
+///    run := literal-run | repeated-run
+///    literal-run := literal-indicator < literal bytes >
+///    repeated-run := repeated-indicator < repeated value. padded to byte boundary >
+///    literal-indicator := varint_encode( number_of_groups << 1 | 1)
+///    repeated-indicator := varint_encode( number_of_repetitions << 1 )
+//
+/// Each run is preceded by a varint. The varint's least significant bit is
+/// used to indicate whether the run is a literal run or a repeated run. The rest
+/// of the varint is used to determine the length of the run (eg how many times the
+/// value repeats).
+//
+/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
+/// in groups of 8), so that no matter the bit-width of the value, the sequence will end
+/// on a byte boundary without padding.
+/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
+/// the actual number of encoded ints. (This means that the total number of encoded values
+/// can not be determined from the encoded data, since the number of values in the last
+/// group may not be a multiple of 8). For the last group of literal runs, we pad
+/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
+/// without the need for additional checks.
+//
+/// There is a break-even point when it is more storage efficient to do run length
+/// encoding.  For 1 bit-width values, that point is 8 values.  They require 2 bytes
+/// for both the repeated encoding or the literal encoding.  This value can always
+/// be computed based on the bit-width.
+/// TODO: think about how to use this for strings.  The bit packing isn't quite the same.
+//
+/// Examples with bit-width 1 (eg encoding booleans):
+/// ----------------------------------------
+/// 100 1s followed by 100 0s:
+/// <varint(100 << 1)> <1, padded to 1 byte>  <varint(100 << 1)> <0, padded to 1 byte>
+///  - (total 4 bytes)
+//
+/// alternating 1s and 0s (200 total):
+/// 200 ints = 25 groups of 8
+/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
+/// (total 26 bytes, 1 byte overhead)
+//
+
+/// Decoder class for RLE encoded data.
+class RleDecoder {
+ public:
+  /// Create a decoder object. buffer/buffer_len is the decoded data.
+  /// bit_width is the width of each value (before encoding).
+  RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
+      : bit_reader_(buffer, buffer_len),
+        bit_width_(bit_width),
+        current_value_(0),
+        repeat_count_(0),
+        literal_count_(0) {
+    DCHECK_GE(bit_width_, 0);
+    DCHECK_LE(bit_width_, 64);
+  }
+
+  RleDecoder() : bit_width_(-1) {}
+
+  void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
+    DCHECK_GE(bit_width, 0);
+    DCHECK_LE(bit_width, 64);
+    bit_reader_.Reset(buffer, buffer_len);
+    bit_width_ = bit_width;
+    current_value_ = 0;
+    repeat_count_ = 0;
+    literal_count_ = 0;
+  }
+
+  /// Gets the next value.  Returns false if there are no more.
+  template <typename T>
+  bool Get(T* val);
+
+  /// Gets a batch of values.  Returns the number of decoded elements.
+  template <typename T>
+  int GetBatch(T* values, int batch_size);
+
+  /// Like GetBatch but the values are then decoded using the provided dictionary
+  template <typename T>
+  int GetBatchWithDict(const T* dictionary, T* values, int batch_size);
+
+  /// Like GetBatchWithDict but add spacing for null entries
+  template <typename T>
+  int GetBatchWithDictSpaced(const T* dictionary, T* values, int batch_size,
+      int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset);
+
+ protected:
+  BitReader bit_reader_;
+  /// Number of bits needed to encode the value. Must be between 0 and 64.
+  int bit_width_;
+  uint64_t current_value_;
+  uint32_t repeat_count_;
+  uint32_t literal_count_;
+
+ private:
+  /// Fills literal_count_ and repeat_count_ with next values. Returns false if there
+  /// are no more.
+  template <typename T>
+  bool NextCounts();
+};
+
+/// Class to incrementally build the rle data.   This class does not allocate any memory.
+/// The encoding has two modes: encoding repeated runs and literal runs.
+/// If the run is sufficiently short, it is more efficient to encode as a literal run.
+/// This class does so by buffering 8 values at a time.  If they are not all the same
+/// they are added to the literal run.  If they are the same, they are added to the
+/// repeated run.  When we switch modes, the previous run is flushed out.
+class RleEncoder {
+ public:
+  /// buffer/buffer_len: preallocated output buffer.
+  /// bit_width: max number of bits for value.
+  /// TODO: consider adding a min_repeated_run_length so the caller can control
+  /// when values should be encoded as repeated runs.  Currently this is derived
+  /// based on the bit_width, which can determine a storage optimal choice.
+  /// TODO: allow 0 bit_width (and have dict encoder use it)
+  RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
+      : bit_width_(bit_width), bit_writer_(buffer, buffer_len) {
+    DCHECK_GE(bit_width_, 0);
+    DCHECK_LE(bit_width_, 64);
+    max_run_byte_size_ = MinBufferSize(bit_width);
+    DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
+    Clear();
+  }
+
+  /// Returns the minimum buffer size needed to use the encoder for 'bit_width'
+  /// This is the maximum length of a single run for 'bit_width'.
+  /// It is not valid to pass a buffer less than this length.
+  static int MinBufferSize(int bit_width) {
+    /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
+    int max_literal_run_size =
+        1 + static_cast<int>(BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8));
+    /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value.
+    int max_repeated_run_size =
+        BitReader::MAX_VLQ_BYTE_LEN + static_cast<int>(BitUtil::Ceil(bit_width, 8));
+    return std::max(max_literal_run_size, max_repeated_run_size);
+  }
+
+  /// Returns the maximum byte size it could take to encode 'num_values'.
+  static int MaxBufferSize(int bit_width, int num_values) {
+    // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
+    // and then a repeated run of length 8".
+    // 8 values per smallest run, 8 bits per byte
+    // int bytes_per_run = BitUtil::Ceil(bit_width * 8, 8);
+    int bytes_per_run = bit_width;
+    int num_runs = static_cast<int>(BitUtil::Ceil(num_values, 8));
+    int literal_max_size = num_runs + num_runs * bytes_per_run;
+
+    // In the very worst case scenario, the data is a concatenation of repeated
+    // runs of 8 values. Repeated run has a 1 byte varint followed by the
+    // bit-packed repeated value
+    int min_repeated_run_size = 1 + static_cast<int>(BitUtil::Ceil(bit_width, 8));
+    int repeated_max_size =
+        static_cast<int>(BitUtil::Ceil(num_values, 8)) * min_repeated_run_size;
+
+    return std::max(literal_max_size, repeated_max_size);
+  }
+
+  /// Encode value.  Returns true if the value fits in buffer, false otherwise.
+  /// This value must be representable with bit_width_ bits.
+  bool Put(uint64_t value);
+
+  /// Flushes any pending values to the underlying buffer.
+  /// Returns the total number of bytes written
+  int Flush();
+
+  /// Resets all the state in the encoder.
+  void Clear();
+
+  /// Returns pointer to underlying buffer
+  uint8_t* buffer() { return bit_writer_.buffer(); }
+  int32_t len() { return bit_writer_.bytes_written(); }
+
+ private:
+  /// Flushes any buffered values.  If this is part of a repeated run, this is largely
+  /// a no-op.
+  /// If it is part of a literal run, this will call FlushLiteralRun, which writes
+  /// out the buffered literal values.
+  /// If 'done' is true, the current run would be written even if it would normally
+  /// have been buffered more.  This should only be called at the end, when the
+  /// encoder has received all values even if it would normally continue to be
+  /// buffered.
+  void FlushBufferedValues(bool done);
+
+  /// Flushes literal values to the underlying buffer.  If update_indicator_byte,
+  /// then the current literal run is complete and the indicator byte is updated.
+  void FlushLiteralRun(bool update_indicator_byte);
+
+  /// Flushes a repeated run to the underlying buffer.
+  void FlushRepeatedRun();
+
+  /// Checks and sets buffer_full_. This must be called after flushing a run to
+  /// make sure there are enough bytes remaining to encode the next run.
+  void CheckBufferFull();
+
+  /// The maximum number of values in a single literal run
+  /// (number of groups encodable by a 1-byte indicator * 8)
+  static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
+
+  /// Number of bits needed to encode the value. Must be between 0 and 64.
+  const int bit_width_;
+
+  /// Underlying buffer.
+  BitWriter bit_writer_;
+
+  /// If true, the buffer is full and subsequent Put()'s will fail.
+  bool buffer_full_;
+
+  /// The maximum byte size a single run can take.
+  int max_run_byte_size_;
+
+  /// We need to buffer at most 8 values for literals.  This happens when the
+  /// bit_width is 1 (so 8 values fit in one byte).
+  /// TODO: generalize this to other bit widths
+  int64_t buffered_values_[8];
+
+  /// Number of values in buffered_values_
+  int num_buffered_values_;
+
+  /// The current (also last) value that was written and the count of how
+  /// many times in a row that value has been seen.  This is maintained even
+  /// if we are in a literal run.  If the repeat_count_ get high enough, we switch
+  /// to encoding repeated runs.
+  uint64_t current_value_;
+  int repeat_count_;
+
+  /// Number of literals in the current run.  This does not include the literals
+  /// that might be in buffered_values_.  Only after we've got a group big enough
+  /// can we decide if they should part of the literal_count_ or repeat_count_
+  int literal_count_;
+
+  /// Pointer to a byte in the underlying buffer that stores the indicator byte.
+  /// This is reserved as soon as we need a literal run but the value is written
+  /// when the literal run is complete.
+  uint8_t* literal_indicator_byte_;
+};
+
+template <typename T>
+inline bool RleDecoder::Get(T* val) {
+  return GetBatch(val, 1) == 1;
+}
+
+template <typename T>
+inline int RleDecoder::GetBatch(T* values, int batch_size) {
+  DCHECK_GE(bit_width_, 0);
+  int values_read = 0;
+
+  while (values_read < batch_size) {
+    if (repeat_count_ > 0) {
+      int repeat_batch =
+          std::min(batch_size - values_read, static_cast<int>(repeat_count_));
+      std::fill(values + values_read, values + values_read + repeat_batch,
+          static_cast<T>(current_value_));
+      repeat_count_ -= repeat_batch;
+      values_read += repeat_batch;
+    } else if (literal_count_ > 0) {
+      int literal_batch =
+          std::min(batch_size - values_read, static_cast<int>(literal_count_));
+      int actual_read =
+          bit_reader_.GetBatch(bit_width_, values + values_read, literal_batch);
+      DCHECK_EQ(actual_read, literal_batch);
+      literal_count_ -= literal_batch;
+      values_read += literal_batch;
+    } else {
+      if (!NextCounts<T>()) return values_read;
+    }
+  }
+
+  return values_read;
+}
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDict(const T* dictionary, T* values, int batch_size) {
+  DCHECK_GE(bit_width_, 0);
+  int values_read = 0;
+
+  while (values_read < batch_size) {
+    if (repeat_count_ > 0) {
+      int repeat_batch =
+          std::min(batch_size - values_read, static_cast<int>(repeat_count_));
+      std::fill(values + values_read, values + values_read + repeat_batch,
+          dictionary[current_value_]);
+      repeat_count_ -= repeat_batch;
+      values_read += repeat_batch;
+    } else if (literal_count_ > 0) {
+      int literal_batch =
+          std::min(batch_size - values_read, static_cast<int>(literal_count_));
+
+      const int buffer_size = 1024;
+      int indices[buffer_size];
+      literal_batch = std::min(literal_batch, buffer_size);
+      int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch);
+      DCHECK_EQ(actual_read, literal_batch);
+      for (int i = 0; i < literal_batch; ++i) {
+        values[values_read + i] = dictionary[indices[i]];
+      }
+      literal_count_ -= literal_batch;
+      values_read += literal_batch;
+    } else {
+      if (!NextCounts<T>()) return values_read;
+    }
+  }
+
+  return values_read;
+}
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values,
+    int batch_size, int null_count, const uint8_t* valid_bits,
+    int64_t valid_bits_offset) {
+  DCHECK_GE(bit_width_, 0);
+  int values_read = 0;
+  int remaining_nulls = null_count;
+  INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
+
+  while (values_read < batch_size) {
+    bool is_valid = (bitset_valid_bits & (1 << bit_offset_valid_bits)) != 0;
+    READ_NEXT_BITSET(valid_bits);
+
+    if (is_valid) {
+      if ((repeat_count_ == 0) && (literal_count_ == 0)) {
+        if (!NextCounts<T>()) return values_read;
+      }
+      if (repeat_count_ > 0) {
+        T value = dictionary[current_value_];
+        // The current index is already valid, we don't need to check that again
+        int repeat_batch = 1;
+        repeat_count_--;
+
+        while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
+          if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+            repeat_count_--;
+          } else {
+            remaining_nulls--;
+          }
+          repeat_batch++;
+
+          READ_NEXT_BITSET(valid_bits);
+        }
+        std::fill(values + values_read, values + values_read + repeat_batch, value);
+        values_read += repeat_batch;
+      } else if (literal_count_ > 0) {
+        int literal_batch = std::min(
+            batch_size - values_read - remaining_nulls, static_cast<int>(literal_count_));
+
+        // Decode the literals
+        constexpr int kBufferSize = 1024;
+        int indices[kBufferSize];
+        literal_batch = std::min(literal_batch, kBufferSize);
+        int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch);
+        DCHECK_EQ(actual_read, literal_batch);
+
+        int skipped = 0;
+        int literals_read = 1;
+        values[values_read] = dictionary[indices[0]];
+
+        // Read the first bitset to the end
+        while (literals_read < literal_batch) {
+          if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+            values[values_read + literals_read + skipped] =
+                dictionary[indices[literals_read]];
+            literals_read++;
+          } else {
+            skipped++;
+          }
+
+          READ_NEXT_BITSET(valid_bits);
+        }
+        literal_count_ -= literal_batch;
+        values_read += literal_batch + skipped;
+        remaining_nulls -= skipped;
+      }
+    } else {
+      values_read++;
+      remaining_nulls--;
+    }
+  }
+
+  return values_read;
+}
+
+template <typename T>
+bool RleDecoder::NextCounts() {
+  // Read the next run's indicator int, it could be a literal or repeated run.
+  // The int is encoded as a vlq-encoded value.
+  int32_t indicator_value = 0;
+  bool result = bit_reader_.GetVlqInt(&indicator_value);
+  if (!result) return false;
+
+  // lsb indicates if it is a literal run or repeated run
+  bool is_literal = indicator_value & 1;
+  if (is_literal) {
+    literal_count_ = (indicator_value >> 1) * 8;
+  } else {
+    repeat_count_ = indicator_value >> 1;
+    bool result =
+        bit_reader_.GetAligned<T>(static_cast<int>(BitUtil::Ceil(bit_width_, 8)),
+            reinterpret_cast<T*>(&current_value_));
+    DCHECK(result);
+  }
+  return true;
+}
+
+/// This function buffers input values 8 at a time.  After seeing all 8 values,
+/// it decides whether they should be encoded as a literal or repeated run.
+inline bool RleEncoder::Put(uint64_t value) {
+  DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_));
+  if (UNLIKELY(buffer_full_)) return false;
+
+  if (LIKELY(current_value_ == value)) {
+    ++repeat_count_;
+    if (repeat_count_ > 8) {
+      // This is just a continuation of the current run, no need to buffer the
+      // values.
+      // Note that this is the fast path for long repeated runs.
+      return true;
+    }
+  } else {
+    if (repeat_count_ >= 8) {
+      // We had a run that was long enough but it has ended.  Flush the
+      // current repeated run.
+      DCHECK_EQ(literal_count_, 0);
+      FlushRepeatedRun();
+    }
+    repeat_count_ = 1;
+    current_value_ = value;
+  }
+
+  buffered_values_[num_buffered_values_] = value;
+  if (++num_buffered_values_ == 8) {
+    DCHECK_EQ(literal_count_ % 8, 0);
+    FlushBufferedValues(false);
+  }
+  return true;
+}
+
+inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) {
+  if (literal_indicator_byte_ == NULL) {
+    // The literal indicator byte has not been reserved yet, get one now.
+    literal_indicator_byte_ = bit_writer_.GetNextBytePtr();
+    DCHECK(literal_indicator_byte_ != NULL);
+  }
+
+  // Write all the buffered values as bit packed literals
+  for (int i = 0; i < num_buffered_values_; ++i) {
+    bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_);
+    DCHECK(success) << "There is a bug in using CheckBufferFull()";
+  }
+  num_buffered_values_ = 0;
+
+  if (update_indicator_byte) {
+    // At this point we need to write the indicator byte for the literal run.
+    // We only reserve one byte, to allow for streaming writes of literal values.
+    // The logic makes sure we flush literal runs often enough to not overrun
+    // the 1 byte.
+    DCHECK_EQ(literal_count_ % 8, 0);
+    int num_groups = literal_count_ / 8;
+    int32_t indicator_value = (num_groups << 1) | 1;
+    DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);
+    *literal_indicator_byte_ = static_cast<uint8_t>(indicator_value);
+    literal_indicator_byte_ = NULL;
+    literal_count_ = 0;
+    CheckBufferFull();
+  }
+}
+
+inline void RleEncoder::FlushRepeatedRun() {
+  DCHECK_GT(repeat_count_, 0);
+  bool result = true;
+  // The lsb of 0 indicates this is a repeated run
+  int32_t indicator_value = repeat_count_ << 1 | 0;
+  result &= bit_writer_.PutVlqInt(indicator_value);
+  result &= bit_writer_.PutAligned(
+      current_value_, static_cast<int>(BitUtil::Ceil(bit_width_, 8)));
+  DCHECK(result);
+  num_buffered_values_ = 0;
+  repeat_count_ = 0;
+  CheckBufferFull();
+}
+
+/// Flush the values that have been buffered.  At this point we decide whether
+/// we need to switch between the run types or continue the current one.
+inline void RleEncoder::FlushBufferedValues(bool done) {
+  if (repeat_count_ >= 8) {
+    // Clear the buffered values.  They are part of the repeated run now and we
+    // don't want to flush them out as literals.
+    num_buffered_values_ = 0;
+    if (literal_count_ != 0) {
+      // There was a current literal run.  All the values in it have been flushed
+      // but we still need to update the indicator byte.
+      DCHECK_EQ(literal_count_ % 8, 0);
+      DCHECK_EQ(repeat_count_, 8);
+      FlushLiteralRun(true);
+    }
+    DCHECK_EQ(literal_count_, 0);
+    return;
+  }
+
+  literal_count_ += num_buffered_values_;
+  DCHECK_EQ(literal_count_ % 8, 0);
+  int num_groups = literal_count_ / 8;
+  if (num_groups + 1 >= (1 << 6)) {
+    // We need to start a new literal run because the indicator byte we've reserved
+    // cannot store more values.
+    DCHECK(literal_indicator_byte_ != NULL);
+    FlushLiteralRun(true);
+  } else {
+    FlushLiteralRun(done);
+  }
+  repeat_count_ = 0;
+}
+
+inline int RleEncoder::Flush() {
+  if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) {
+    bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ ||
+                                                 num_buffered_values_ == 0);
+    // There is something pending, figure out if it's a repeated or literal run
+    if (repeat_count_ > 0 && all_repeat) {
+      FlushRepeatedRun();
+    } else {
+      DCHECK_EQ(literal_count_ % 8, 0);
+      // Buffer the last group of literals to 8 by padding with 0s.
+      for (; num_buffered_values_ != 0 && num_buffered_values_ < 8;
+           ++num_buffered_values_) {
+        buffered_values_[num_buffered_values_] = 0;
+      }
+      literal_count_ += num_buffered_values_;
+      FlushLiteralRun(true);
+      repeat_count_ = 0;
+    }
+  }
+  bit_writer_.Flush();
+  DCHECK_EQ(num_buffered_values_, 0);
+  DCHECK_EQ(literal_count_, 0);
+  DCHECK_EQ(repeat_count_, 0);
+
+  return bit_writer_.bytes_written();
+}
+
+inline void RleEncoder::CheckBufferFull() {
+  int bytes_written = bit_writer_.bytes_written();
+  if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) {
+    buffer_full_ = true;
+  }
+}
+
+inline void RleEncoder::Clear() {
+  buffer_full_ = false;
+  current_value_ = 0;
+  repeat_count_ = 0;
+  num_buffered_values_ = 0;
+  literal_count_ = 0;
+  literal_indicator_byte_ = NULL;
+  bit_writer_.Clear();
+}
+
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_RLE_ENCODING_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/sse-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/sse-util.h b/cpp/src/arrow/util/sse-util.h
new file mode 100644
index 0000000..570c405
--- /dev/null
+++ b/cpp/src/arrow/util/sse-util.h
@@ -0,0 +1,237 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
+// functions needed for parquet-cpp
+
+#ifndef ARROW_UTIL_SSE_UTIL_H
+#define ARROW_UTIL_SSE_UTIL_H
+
+#ifdef ARROW_USE_SSE
+#include <emmintrin.h>
+#endif
+
+namespace arrow {
+
+/// This class contains constants useful for text processing with SSE4.2 intrinsics.
+namespace SSEUtil {
+/// Number of characters that fit in 64/128 bit register.  SSE provides instructions
+/// for loading 64 or 128 bits into a register at a time.
+static const int CHARS_PER_64_BIT_REGISTER = 8;
+static const int CHARS_PER_128_BIT_REGISTER = 16;
+
+/// SSE4.2 adds instructions for text processing.  The instructions have a control
+/// byte that determines some of functionality of the instruction.  (Equivalent to
+/// GCC's _SIDD_CMP_EQUAL_ANY, etc).
+static const int PCMPSTR_EQUAL_ANY = 0x00;     // strchr
+static const int PCMPSTR_EQUAL_EACH = 0x08;    // strcmp
+static const int PCMPSTR_UBYTE_OPS = 0x00;     // unsigned char (8-bits, rather than 16)
+static const int PCMPSTR_NEG_POLARITY = 0x10;  // see Intel SDM chapter 4.1.4.
+
+/// In this mode, SSE text processing functions will return a mask of all the
+/// characters that matched.
+static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS;
+
+/// In this mode, SSE text processing functions will return the number of
+/// bytes that match consecutively from the beginning.
+static const int STRCMP_MODE =
+    PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | PCMPSTR_NEG_POLARITY;
+
+/// Precomputed mask values up to 16 bits.
+static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
+    1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9,
+    1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,
+};
+}  // namespace SSEUtil
+
+#ifdef ARROW_USE_SSE
+
+/// Define the SSE 4.2 intrinsics.  The caller must first verify at runtime (or codegen
+/// IR load time) that the processor supports SSE 4.2 before calling these.  These are
+/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros.
+#ifndef IR_COMPILE
+/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler
+/// flag.  Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime
+/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2
+/// (IMPALA-1399/1646).  The compiler intrinsics cannot be used without -msse4.2, so we
+/// define our own implementations of the intrinsics instead.
+
+/// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an
+/// immediate.  So, those need to be always inlined in order to always propagate the
+/// mode constant into the inline asm.
+#define SSE_ALWAYS_INLINE inline __attribute__((__always_inline__))
+
+template <int MODE>
+static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
+#ifdef __clang__
+  /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 -
+  /// clang doesn't support Y-prefixed asm constraints.
+  register volatile __m128i result asm("xmm0");
+  __asm__ volatile("pcmpestrm %5, %2, %1"
+                   : "=x"(result)
+                   : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
+                   : "cc");
+#else
+  __m128i result;
+  __asm__ volatile("pcmpestrm %5, %2, %1"
+                   : "=Yz"(result)
+                   : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
+                   : "cc");
+#endif
+  return result;
+}
+
+template <int MODE>
+static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
+  int result;
+  __asm__("pcmpestri %5, %2, %1"
+          : "=c"(result)
+          : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
+          : "cc");
+  return result;
+}
+
+static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
+  __asm__("crc32b %1, %0" : "+r"(crc) : "rm"(v));
+  return crc;
+}
+
+static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
+  __asm__("crc32w %1, %0" : "+r"(crc) : "rm"(v));
+  return crc;
+}
+
+static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
+  __asm__("crc32l %1, %0" : "+r"(crc) : "rm"(v));
+  return crc;
+}
+
+static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
+  uint64_t result = crc;
+  __asm__("crc32q %1, %0" : "+r"(result) : "rm"(v));
+  return result;
+}
+
+static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
+  int64_t result;
+  __asm__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc");
+  return result;
+}
+
+#undef SSE_ALWAYS_INLINE
+
+#elif defined(__SSE4_2__)  // IR_COMPILE for SSE 4.2.
+/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not
+/// support it.  However, the cross-compiled IR is compiled twice: with and without
+/// -msse4.2.  When -msse4.2 is enabled in the cross-compile, we can just use the
+/// compiler intrinsics.
+
+#include <smmintrin.h>
+
+template <int MODE>
+static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
+  return _mm_cmpestrm(str1, len1, str2, len2, MODE);
+}
+
+template <int MODE>
+static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
+  return _mm_cmpestri(str1, len1, str2, len2, MODE);
+}
+
+#define SSE4_crc32_u8 _mm_crc32_u8
+#define SSE4_crc32_u16 _mm_crc32_u16
+#define SSE4_crc32_u32 _mm_crc32_u32
+#define SSE4_crc32_u64 _mm_crc32_u64
+#define POPCNT_popcnt_u64 _mm_popcnt_u64
+
+#else  // IR_COMPILE without SSE 4.2.
+/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot use
+/// SSE 4.2 instructions.  Otherwise, the IR loading will fail on CPUs that don't
+/// support SSE 4.2.  However, because the caller isn't allowed to call these routines
+/// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case.
+
+template <int MODE>
+static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return (__m128i){0};  // NOLINT
+}
+
+template <int MODE>
+static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+#endif  // IR_COMPILE
+
+#else
+
+static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
+  DCHECK(false) << "SSE support is not enabled";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
+  DCHECK(false) << "SSE support is not enabled";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
+  DCHECK(false) << "SSE support is not enabled";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
+  DCHECK(false) << "SSE support is not enabled";
+  return 0;
+}
+
+static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
+  DCHECK(false) << "SSE support is not enabled";
+  return 0;
+}
+
+#endif  // ARROW_USE_SSE
+
+}  // namespace arrow
+
+#endif  //  ARROW_UTIL_SSE_UTIL_H


[3/3] arrow git commit: ARROW-1154: [C++] Import miscellaneous computational utility code from parquet-cpp

Posted by we...@apache.org.
ARROW-1154: [C++] Import miscellaneous computational utility code from parquet-cpp

I will make a corresponding PR to parquet-cpp to ensure that this code migration is complete enough.

Author: Wes McKinney <we...@twosigma.com>

Closes #785 from wesm/ARROW-1154 and squashes the following commits:

08b54c98 [Wes McKinney] Fix variety of compiler warnings
ddc7354b [Wes McKinney] Fixes to get PARQUET-1045 working
f5cd0259 [Wes McKinney] Import miscellaneous computational utility code from parquet-cpp


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/b0652287
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/b0652287
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/b0652287

Branch: refs/heads/master
Commit: b06522870cb212989dc619b3c50e080b05772bce
Parents: cb5f2b9
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Jun 27 08:13:59 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Jun 27 08:13:59 2017 -0400

----------------------------------------------------------------------
 LICENSE.txt                             |   11 +
 cpp/CMakeLists.txt                      |    5 +
 cpp/src/arrow/builder-benchmark.cc      |    8 +-
 cpp/src/arrow/compare.cc                |    4 +-
 cpp/src/arrow/ipc/reader.cc             |    4 +-
 cpp/src/arrow/status.h                  |   16 +-
 cpp/src/arrow/util/CMakeLists.txt       |    8 +
 cpp/src/arrow/util/bit-stream-utils.h   |  397 +++
 cpp/src/arrow/util/bit-util-test.cc     |  166 +-
 cpp/src/arrow/util/bit-util.h           |  329 ++-
 cpp/src/arrow/util/bpacking.h           | 3342 ++++++++++++++++++++++++++
 cpp/src/arrow/util/compiler-util.h      |   59 +
 cpp/src/arrow/util/cpu-info.cc          |  206 ++
 cpp/src/arrow/util/cpu-info.h           |   92 +
 cpp/src/arrow/util/hash-util.h          |  258 ++
 cpp/src/arrow/util/logging.h            |   12 +-
 cpp/src/arrow/util/rle-encoding-test.cc |  460 ++++
 cpp/src/arrow/util/rle-encoding.h       |  598 +++++
 cpp/src/arrow/util/sse-util.h           |  237 ++
 19 files changed, 6191 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/LICENSE.txt
----------------------------------------------------------------------
diff --git a/LICENSE.txt b/LICENSE.txt
index 95c506f..55823cb 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -330,3 +330,14 @@ Apache 2.0 License or the under the 3-clause BSD license:
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from Daniel Lemire's FrameOfReference project.
+
+https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp
+
+Copyright: 2013 Daniel Lemire
+Home page: http://lemire.me/en/
+Project page: https://github.com/lemire/FrameOfReference
+License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ca34101..1d1ffbe 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -137,6 +137,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
     "Build the plasma object store along with Arrow"
     OFF)
 
+  option(ARROW_USE_SSE
+    "Build with SSE4 optimizations"
+    OFF)
+
   option(ARROW_ZLIB_VENDORED
     "Build our own zlib (some libz.a aren't configured for static linking)"
     ON)
@@ -650,6 +654,7 @@ set(ARROW_SRCS
 
   src/arrow/util/bit-util.cc
   src/arrow/util/compression.cc
+  src/arrow/util/cpu-info.cc
   src/arrow/util/decimal.cc
   src/arrow/util/key_value_metadata.cc
 )

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/builder-benchmark.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc
index 3c49c63..bdfba8b 100644
--- a/cpp/src/arrow/builder-benchmark.cc
+++ b/cpp/src/arrow/builder-benchmark.cc
@@ -25,10 +25,10 @@ namespace arrow {
 
 constexpr int64_t kFinalSize = 256;
 
-#define ABORT_NOT_OK(s)                                 \
-  do {                                                  \
-    ::arrow::Status _s = (s);                           \
-    if (ARROW_PREDICT_FALSE(!_s.ok())) { exit(-1); }    \
+#define ABORT_NOT_OK(s)                              \
+  do {                                               \
+    ::arrow::Status _s = (s);                        \
+    if (ARROW_PREDICT_FALSE(!_s.ok())) { exit(-1); } \
   } while (0);
 
 static void BM_BuildPrimitiveArrayNoNulls(

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/compare.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index c2f4f84..390a406 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -323,8 +323,8 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor {
       : RangeEqualsVisitor(right, 0, right.length(), 0) {}
 
   Status Visit(const NullArray& left) {
-      result_ = true;
-      return Status::OK();
+    result_ = true;
+    return Status::OK();
   }
 
   Status Visit(const BooleanArray& left) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/ipc/reader.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 7fef847..cb747b6 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -188,8 +188,8 @@ class RecordBatchStreamReader::RecordBatchStreamReaderImpl {
     return ReadSchema();
   }
 
-  Status ReadNextMessage(Message::Type expected_type, bool allow_null,
-      std::shared_ptr<Message>* message) {
+  Status ReadNextMessage(
+      Message::Type expected_type, bool allow_null, std::shared_ptr<Message>* message) {
     RETURN_NOT_OK(ReadMessage(stream_.get(), message));
 
     if (!(*message) && !allow_null) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/status.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h
index 9a75a58..bfb945c 100644
--- a/cpp/src/arrow/status.h
+++ b/cpp/src/arrow/status.h
@@ -23,10 +23,10 @@
 #include "arrow/util/visibility.h"
 
 // Return the given status if it is not OK.
-#define ARROW_RETURN_NOT_OK(s)                          \
-  do {                                                  \
-    ::arrow::Status _s = (s);                           \
-    if (ARROW_PREDICT_FALSE(!_s.ok())) { return _s; }   \
+#define ARROW_RETURN_NOT_OK(s)                        \
+  do {                                                \
+    ::arrow::Status _s = (s);                         \
+    if (ARROW_PREDICT_FALSE(!_s.ok())) { return _s; } \
   } while (0);
 
 // If 'to_call' returns a bad status, CHECK immediately with a logged message
@@ -43,10 +43,10 @@
 
 namespace arrow {
 
-#define RETURN_NOT_OK(s)                                \
-  do {                                                  \
-    Status _s = (s);                                    \
-    if (ARROW_PREDICT_FALSE(!_s.ok())) { return _s; }   \
+#define RETURN_NOT_OK(s)                              \
+  do {                                                \
+    Status _s = (s);                                  \
+    if (ARROW_PREDICT_FALSE(!_s.ok())) { return _s; } \
   } while (0);
 
 #define RETURN_NOT_OK_ELSE(s, else_) \

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 1abcce4..bc1eeb2 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -22,11 +22,18 @@
 # Headers: top level
 install(FILES
   bit-util.h
+  bit-stream-utils.h
+  bpacking.h
+  compiler-util.h
   compression.h
+  cpu-info.h
   key_value_metadata.h
+  hash-util.h
   logging.h
   macros.h
   random.h
+  rle-encoding.h
+  sse-util.h
   stl.h
   visibility.h
   DESTINATION include/arrow/util)
@@ -56,4 +63,5 @@ ADD_ARROW_TEST(bit-util-test)
 ADD_ARROW_TEST(compression-test)
 ADD_ARROW_TEST(decimal-test)
 ADD_ARROW_TEST(key-value-metadata-test)
+ADD_ARROW_TEST(rle-encoding-test)
 ADD_ARROW_TEST(stl-util-test)

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/bit-stream-utils.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-stream-utils.h b/cpp/src/arrow/util/bit-stream-utils.h
new file mode 100644
index 0000000..537fdc3
--- /dev/null
+++ b/cpp/src/arrow/util/bit-stream-utils.h
@@ -0,0 +1,397 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29
+
+#ifndef ARROW_UTIL_BIT_STREAM_UTILS_H
+#define ARROW_UTIL_BIT_STREAM_UTILS_H
+
+#include <algorithm>
+#include <cstdint>
+#include <string.h>
+
+#include "arrow/util/bit-util.h"
+#include "arrow/util/bpacking.h"
+#include "arrow/util/compiler-util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+/// Utility class to write bit/byte streams.  This class can write data to either be
+/// bit packed or byte aligned (and a single stream that has a mix of both).
+/// This class does not allocate memory.
+class BitWriter {
+ public:
+  /// buffer: buffer to write bits to.  Buffer should be preallocated with
+  /// 'buffer_len' bytes.
+  BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
+    Clear();
+  }
+
+  void Clear() {
+    buffered_values_ = 0;
+    byte_offset_ = 0;
+    bit_offset_ = 0;
+  }
+
+  /// The number of current bytes written, including the current byte (i.e. may include a
+  /// fraction of a byte). Includes buffered values.
+  int bytes_written() const {
+    return byte_offset_ + static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
+  }
+  uint8_t* buffer() const { return buffer_; }
+  int buffer_len() const { return max_bytes_; }
+
+  /// Writes a value to buffered_values_, flushing to buffer_ if necessary.  This is bit
+  /// packed.  Returns false if there was not enough space. num_bits must be <= 32.
+  bool PutValue(uint64_t v, int num_bits);
+
+  /// Writes v to the next aligned byte using num_bytes. If T is larger than
+  /// num_bytes, the extra high-order bytes will be ignored. Returns false if
+  /// there was not enough space.
+  template <typename T>
+  bool PutAligned(T v, int num_bytes);
+
+  /// Write a Vlq encoded int to the buffer.  Returns false if there was not enough
+  /// room.  The value is written byte aligned.
+  /// For more details on vlq:
+  /// en.wikipedia.org/wiki/Variable-length_quantity
+  bool PutVlqInt(uint32_t v);
+
+  // Writes an int zigzag encoded.
+  bool PutZigZagVlqInt(int32_t v);
+
+  /// Get a pointer to the next aligned byte and advance the underlying buffer
+  /// by num_bytes.
+  /// Returns NULL if there was not enough space.
+  uint8_t* GetNextBytePtr(int num_bytes = 1);
+
+  /// Flushes all buffered values to the buffer. Call this when done writing to
+  /// the buffer.  If 'align' is true, buffered_values_ is reset and any future
+  /// writes will be written to the next byte boundary.
+  void Flush(bool align = false);
+
+ private:
+  uint8_t* buffer_;
+  int max_bytes_;
+
+  /// Bit-packed values are initially written to this variable before being memcpy'd to
+  /// buffer_. This is faster than writing values byte by byte directly to buffer_.
+  uint64_t buffered_values_;
+
+  int byte_offset_;  // Offset in buffer_
+  int bit_offset_;   // Offset in buffered_values_
+};
+
+/// Utility class to read bit/byte stream.  This class can read bits or bytes
+/// that are either byte aligned or not.  It also has utilities to read multiple
+/// bytes in one read (e.g. encoded int).
+class BitReader {
+ public:
+  /// 'buffer' is the buffer to read from.  The buffer's length is 'buffer_len'.
+  BitReader(const uint8_t* buffer, int buffer_len)
+      : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
+    int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+    memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+  }
+
+  BitReader() : buffer_(NULL), max_bytes_(0) {}
+
+  void Reset(const uint8_t* buffer, int buffer_len) {
+    buffer_ = buffer;
+    max_bytes_ = buffer_len;
+    byte_offset_ = 0;
+    bit_offset_ = 0;
+    int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+    memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+  }
+
+  /// Gets the next value from the buffer.  Returns true if 'v' could be read or false if
+  /// there are not enough bytes left. num_bits must be <= 32.
+  template <typename T>
+  bool GetValue(int num_bits, T* v);
+
+  /// Get a number of values from the buffer. Return the number of values actually read.
+  template <typename T>
+  int GetBatch(int num_bits, T* v, int batch_size);
+
+  /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
+  /// needs to be a little-endian native type and big enough to store
+  /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
+  /// be advanced to the start of the next byte before 'v' is read. Returns
+  /// false if there are not enough bytes left.
+  template <typename T>
+  bool GetAligned(int num_bytes, T* v);
+
+  /// Reads a vlq encoded int from the stream.  The encoded int must start at
+  /// the beginning of a byte. Return false if there were not enough bytes in
+  /// the buffer.
+  bool GetVlqInt(int32_t* v);
+
+  // Reads a zigzag encoded int `into` v.
+  bool GetZigZagVlqInt(int32_t* v);
+
+  /// Returns the number of bytes left in the stream, not including the current
+  /// byte (i.e., there may be an additional fraction of a byte).
+  int bytes_left() {
+    return max_bytes_ - (byte_offset_ + static_cast<int>(BitUtil::Ceil(bit_offset_, 8)));
+  }
+
+  /// Maximum byte length of a vlq encoded int
+  static const int MAX_VLQ_BYTE_LEN = 5;
+
+ private:
+  const uint8_t* buffer_;
+  int max_bytes_;
+
+  /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
+  /// faster than reading values byte by byte directly from buffer_.
+  uint64_t buffered_values_;
+
+  int byte_offset_;  // Offset in buffer_
+  int bit_offset_;   // Offset in buffered_values_
+};
+
+inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
+  // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
+  DCHECK_LE(num_bits, 32);
+  DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
+
+  if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false;
+
+  buffered_values_ |= v << bit_offset_;
+  bit_offset_ += num_bits;
+
+  if (UNLIKELY(bit_offset_ >= 64)) {
+    // Flush buffered_values_ and write out bits of v that did not fit
+    memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
+    buffered_values_ = 0;
+    byte_offset_ += 8;
+    bit_offset_ -= 64;
+    buffered_values_ = v >> (num_bits - bit_offset_);
+  }
+  DCHECK_LT(bit_offset_, 64);
+  return true;
+}
+
+inline void BitWriter::Flush(bool align) {
+  int num_bytes = static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
+  DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
+  memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes);
+
+  if (align) {
+    buffered_values_ = 0;
+    byte_offset_ += num_bytes;
+    bit_offset_ = 0;
+  }
+}
+
+inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
+  Flush(/* align */ true);
+  DCHECK_LE(byte_offset_, max_bytes_);
+  if (byte_offset_ + num_bytes > max_bytes_) return NULL;
+  uint8_t* ptr = buffer_ + byte_offset_;
+  byte_offset_ += num_bytes;
+  return ptr;
+}
+
+template <typename T>
+inline bool BitWriter::PutAligned(T val, int num_bytes) {
+  uint8_t* ptr = GetNextBytePtr(num_bytes);
+  if (ptr == NULL) return false;
+  memcpy(ptr, &val, num_bytes);
+  return true;
+}
+
+inline bool BitWriter::PutVlqInt(uint32_t v) {
+  bool result = true;
+  while ((v & 0xFFFFFF80) != 0L) {
+    result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
+    v >>= 7;
+  }
+  result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
+  return result;
+}
+
+template <typename T>
+inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
+    int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+  *v = static_cast<T>(
+      BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >> *bit_offset);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+  *bit_offset += num_bits;
+  if (*bit_offset >= 64) {
+    *byte_offset += 8;
+    *bit_offset -= 64;
+
+    int bytes_remaining = max_bytes - *byte_offset;
+    if (LIKELY(bytes_remaining >= 8)) {
+      memcpy(buffered_values, buffer + *byte_offset, 8);
+    } else {
+      memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
+    }
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800 4805)
+#endif
+    // Read bits of v that crossed into new buffered_values_
+    *v = *v | static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset)
+                             << (num_bits - *bit_offset));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+    DCHECK_LE(*bit_offset, 64);
+  }
+}
+
+template <typename T>
+inline bool BitReader::GetValue(int num_bits, T* v) {
+  return GetBatch(num_bits, v, 1) == 1;
+}
+
+template <typename T>
+inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
+  DCHECK(buffer_ != NULL);
+  // TODO: revisit this limit if necessary
+  DCHECK_LE(num_bits, 32);
+  DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
+
+  int bit_offset = bit_offset_;
+  int byte_offset = byte_offset_;
+  uint64_t buffered_values = buffered_values_;
+  int max_bytes = max_bytes_;
+  const uint8_t* buffer = buffer_;
+
+  uint64_t needed_bits = num_bits * batch_size;
+  uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset;
+  if (remaining_bits < needed_bits) {
+    batch_size = static_cast<int>(remaining_bits) / num_bits;
+  }
+
+  int i = 0;
+  if (UNLIKELY(bit_offset != 0)) {
+    for (; i < batch_size && bit_offset != 0; ++i) {
+      GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
+          &buffered_values);
+    }
+  }
+
+  if (sizeof(T) == 4) {
+    int num_unpacked = unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+        reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
+    i += num_unpacked;
+    byte_offset += num_unpacked * num_bits / 8;
+  } else {
+    const int buffer_size = 1024;
+    uint32_t unpack_buffer[buffer_size];
+    while (i < batch_size) {
+      int unpack_size = std::min(buffer_size, batch_size - i);
+      int num_unpacked = unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+          unpack_buffer, unpack_size, num_bits);
+      if (num_unpacked == 0) { break; }
+      for (int k = 0; k < num_unpacked; ++k) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+        v[i + k] = static_cast<T>(unpack_buffer[k]);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+      }
+      i += num_unpacked;
+      byte_offset += num_unpacked * num_bits / 8;
+    }
+  }
+
+  int bytes_remaining = max_bytes - byte_offset;
+  if (bytes_remaining >= 8) {
+    memcpy(&buffered_values, buffer + byte_offset, 8);
+  } else {
+    memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
+  }
+
+  for (; i < batch_size; ++i) {
+    GetValue_(
+        num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, &buffered_values);
+  }
+
+  bit_offset_ = bit_offset;
+  byte_offset_ = byte_offset;
+  buffered_values_ = buffered_values;
+
+  return batch_size;
+}
+
+template <typename T>
+inline bool BitReader::GetAligned(int num_bytes, T* v) {
+  DCHECK_LE(num_bytes, static_cast<int>(sizeof(T)));
+  int bytes_read = static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
+  if (UNLIKELY(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false;
+
+  // Advance byte_offset to next unread byte and read num_bytes
+  byte_offset_ += bytes_read;
+  memcpy(v, buffer_ + byte_offset_, num_bytes);
+  byte_offset_ += num_bytes;
+
+  // Reset buffered_values_
+  bit_offset_ = 0;
+  int bytes_remaining = max_bytes_ - byte_offset_;
+  if (LIKELY(bytes_remaining >= 8)) {
+    memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
+  } else {
+    memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
+  }
+  return true;
+}
+
+inline bool BitReader::GetVlqInt(int32_t* v) {
+  *v = 0;
+  int shift = 0;
+  int num_bytes = 0;
+  uint8_t byte = 0;
+  do {
+    if (!GetAligned<uint8_t>(1, &byte)) return false;
+    *v |= (byte & 0x7F) << shift;
+    shift += 7;
+    DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN);
+  } while ((byte & 0x80) != 0);
+  return true;
+}
+
+inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
+  uint32_t u = (v << 1) ^ (v >> 31);
+  return PutVlqInt(u);
+}
+
+inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
+  int32_t u_signed;
+  if (!GetVlqInt(&u_signed)) return false;
+  uint32_t u = static_cast<uint32_t>(u_signed);
+  *reinterpret_cast<uint32_t*>(v) = (u >> 1) ^ -(static_cast<int32_t>(u & 1));
+  return true;
+}
+
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_BIT_STREAM_UTILS_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc
index cb2fd1a..cd94558 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -15,18 +15,29 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "arrow/util/bit-util.h"
-
 #include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
 #include <vector>
 
 #include "gtest/gtest.h"
 
+#include <boost/utility.hpp>
+
 #include "arrow/buffer.h"
 #include "arrow/test-util.h"
+#include "arrow/util/bit-stream-utils.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/cpu-info.h"
 
 namespace arrow {
 
+static void EnsureCpuInfoInitialized() {
+  if (!CpuInfo::initialized()) { CpuInfo::Init(); }
+}
+
 TEST(BitUtilTests, TestIsMultipleOf64) {
   using BitUtil::IsMultipleOf64;
   EXPECT_TRUE(IsMultipleOf64(64));
@@ -109,4 +120,155 @@ TEST(BitUtilTests, TestCopyBitmap) {
   }
 }
 
+TEST(BitUtil, Ceil) {
+  EXPECT_EQ(BitUtil::Ceil(0, 1), 0);
+  EXPECT_EQ(BitUtil::Ceil(1, 1), 1);
+  EXPECT_EQ(BitUtil::Ceil(1, 2), 1);
+  EXPECT_EQ(BitUtil::Ceil(1, 8), 1);
+  EXPECT_EQ(BitUtil::Ceil(7, 8), 1);
+  EXPECT_EQ(BitUtil::Ceil(8, 8), 1);
+  EXPECT_EQ(BitUtil::Ceil(9, 8), 2);
+  EXPECT_EQ(BitUtil::Ceil(9, 9), 1);
+  EXPECT_EQ(BitUtil::Ceil(10000000000, 10), 1000000000);
+  EXPECT_EQ(BitUtil::Ceil(10, 10000000000), 1);
+  EXPECT_EQ(BitUtil::Ceil(100000000000, 10000000000), 10);
+}
+
+TEST(BitUtil, RoundUp) {
+  EXPECT_EQ(BitUtil::RoundUp(0, 1), 0);
+  EXPECT_EQ(BitUtil::RoundUp(1, 1), 1);
+  EXPECT_EQ(BitUtil::RoundUp(1, 2), 2);
+  EXPECT_EQ(BitUtil::RoundUp(6, 2), 6);
+  EXPECT_EQ(BitUtil::RoundUp(7, 3), 9);
+  EXPECT_EQ(BitUtil::RoundUp(9, 9), 9);
+  EXPECT_EQ(BitUtil::RoundUp(10000000001, 10), 10000000010);
+  EXPECT_EQ(BitUtil::RoundUp(10, 10000000000), 10000000000);
+  EXPECT_EQ(BitUtil::RoundUp(100000000000, 10000000000), 100000000000);
+}
+
+TEST(BitUtil, RoundDown) {
+  EXPECT_EQ(BitUtil::RoundDown(0, 1), 0);
+  EXPECT_EQ(BitUtil::RoundDown(1, 1), 1);
+  EXPECT_EQ(BitUtil::RoundDown(1, 2), 0);
+  EXPECT_EQ(BitUtil::RoundDown(6, 2), 6);
+  EXPECT_EQ(BitUtil::RoundDown(7, 3), 6);
+  EXPECT_EQ(BitUtil::RoundDown(9, 9), 9);
+  EXPECT_EQ(BitUtil::RoundDown(10000000001, 10), 10000000000);
+  EXPECT_EQ(BitUtil::RoundDown(10, 10000000000), 0);
+  EXPECT_EQ(BitUtil::RoundDown(100000000000, 10000000000), 100000000000);
+}
+
+TEST(BitUtil, Popcount) {
+  EnsureCpuInfoInitialized();
+
+  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4);
+  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4);
+  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6);
+  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6);
+  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8);
+  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8);
+  EXPECT_EQ(BitUtil::Popcount(0), 0);
+  EXPECT_EQ(BitUtil::PopcountNoHw(0), 0);
+}
+
+TEST(BitUtil, TrailingBits) {
+  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1);
+  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64),
+      BOOST_BINARY(1 1 1 1 1 1 1 1));
+  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100),
+      BOOST_BINARY(1 1 1 1 1 1 1 1));
+  EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 63), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 64), 1LL << 63);
+}
+
+TEST(BitUtil, ByteSwap) {
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0x11223344)), 0x44332211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int32_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int32_t>(0x11223344)), 0x44332211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint64_t>(0)), 0);
+  EXPECT_EQ(
+      BitUtil::ByteSwap(static_cast<uint64_t>(0x1122334455667788)), 0x8877665544332211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int64_t>(0)), 0);
+  EXPECT_EQ(
+      BitUtil::ByteSwap(static_cast<int64_t>(0x1122334455667788)), 0x8877665544332211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int16_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int16_t>(0x1122)), 0x2211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0x1122)), 0x2211);
+}
+
+TEST(BitUtil, Log2) {
+  EXPECT_EQ(BitUtil::Log2(1), 0);
+  EXPECT_EQ(BitUtil::Log2(2), 1);
+  EXPECT_EQ(BitUtil::Log2(3), 2);
+  EXPECT_EQ(BitUtil::Log2(4), 2);
+  EXPECT_EQ(BitUtil::Log2(5), 3);
+  EXPECT_EQ(BitUtil::Log2(INT_MAX), 31);
+  EXPECT_EQ(BitUtil::Log2(UINT_MAX), 32);
+  EXPECT_EQ(BitUtil::Log2(ULLONG_MAX), 64);
+}
+
+TEST(BitUtil, RoundUpToPowerOf2) {
+  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(7, 8), 8);
+  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(8, 8), 8);
+  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(9, 8), 16);
+}
+
+TEST(BitUtil, RoundDownToPowerOf2) {
+  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(7, 8), 0);
+  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(8, 8), 8);
+  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(9, 8), 8);
+}
+
+TEST(BitUtil, RoundUpDown) {
+  EXPECT_EQ(BitUtil::RoundUpNumBytes(7), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumBytes(8), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumBytes(9), 2);
+  EXPECT_EQ(BitUtil::RoundDownNumBytes(7), 0);
+  EXPECT_EQ(BitUtil::RoundDownNumBytes(8), 1);
+  EXPECT_EQ(BitUtil::RoundDownNumBytes(9), 1);
+
+  EXPECT_EQ(BitUtil::RoundUpNumi32(31), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumi32(32), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumi32(33), 2);
+  EXPECT_EQ(BitUtil::RoundDownNumi32(31), 0);
+  EXPECT_EQ(BitUtil::RoundDownNumi32(32), 1);
+  EXPECT_EQ(BitUtil::RoundDownNumi32(33), 1);
+
+  EXPECT_EQ(BitUtil::RoundUpNumi64(63), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumi64(64), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumi64(65), 2);
+  EXPECT_EQ(BitUtil::RoundDownNumi64(63), 0);
+  EXPECT_EQ(BitUtil::RoundDownNumi64(64), 1);
+  EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1);
+}
+
+void TestZigZag(int32_t v) {
+  uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN];
+  BitWriter writer(buffer, sizeof(buffer));
+  BitReader reader(buffer, sizeof(buffer));
+  writer.PutZigZagVlqInt(v);
+  int32_t result;
+  EXPECT_TRUE(reader.GetZigZagVlqInt(&result));
+  EXPECT_EQ(v, result);
+}
+
+TEST(BitStreamUtil, ZigZag) {
+  TestZigZag(0);
+  TestZigZag(1);
+  TestZigZag(-1);
+  TestZigZag(std::numeric_limits<int32_t>::max());
+  TestZigZag(-std::numeric_limits<int32_t>::max());
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/bit-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 90a1c3e..bba9d2d 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -18,15 +18,77 @@
 #ifndef ARROW_UTIL_BIT_UTIL_H
 #define ARROW_UTIL_BIT_UTIL_H
 
+#if defined(__APPLE__)
+#include <machine/endian.h>
+#elif defined(_WIN32)
+#define __LITTLE_ENDIAN 1
+#else
+#include <endian.h>
+#endif
+
+#if defined(_MSC_VER)
+#define ARROW_BYTE_SWAP64 _byteswap_uint64
+#define ARROW_BYTE_SWAP32 _byteswap_ulong
+#else
+#define ARROW_BYTE_SWAP64 __builtin_bswap64
+#define ARROW_BYTE_SWAP32 __builtin_bswap32
+#endif
+
 #include <cstdint>
 #include <limits>
 #include <memory>
 #include <vector>
 
+#include "arrow/util/compiler-util.h"
 #include "arrow/util/visibility.h"
 
+#ifdef ARROW_USE_SSE
+#include "arrow/util/cpu-info.h"
+#include "arrow/util/sse-util.h"
+#endif
+
 namespace arrow {
 
+#define INIT_BITSET(valid_bits_vector, valid_bits_index)        \
+  int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
+  int bit_offset_##valid_bits_vector = (valid_bits_index) % 8;  \
+  uint8_t bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector];
+
+#define READ_NEXT_BITSET(valid_bits_vector)                                          \
+  bit_offset_##valid_bits_vector++;                                                  \
+  if (bit_offset_##valid_bits_vector == 8) {                                         \
+    bit_offset_##valid_bits_vector = 0;                                              \
+    byte_offset_##valid_bits_vector++;                                               \
+    bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; \
+  }
+
+// TODO(wesm): The source from Impala was depending on boost::make_unsigned
+//
+// We add a partial stub implementation here
+
+template <typename T>
+struct make_unsigned {};
+
+template <>
+struct make_unsigned<int8_t> {
+  typedef uint8_t type;
+};
+
+template <>
+struct make_unsigned<int16_t> {
+  typedef uint16_t type;
+};
+
+template <>
+struct make_unsigned<int32_t> {
+  typedef uint32_t type;
+};
+
+template <>
+struct make_unsigned<int64_t> {
+  typedef uint64_t type;
+};
+
 class Buffer;
 class MemoryPool;
 class MutableBuffer;
@@ -67,6 +129,11 @@ static inline void SetBit(uint8_t* bits, int64_t i) {
   bits[i / 8] |= kBitmask[i % 8];
 }
 
+/// Set bit if is_set is true, but cannot clear bit
+static inline void SetArrayBit(uint8_t* bits, int i, bool is_set) {
+  if (is_set) { SetBit(bits, i); }
+}
+
 static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
   // TODO: speed up. See https://graphics.stanford.edu/~seander/bithacks.html
   // "Conditionally set or clear bits without branching"
@@ -77,6 +144,18 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
   }
 }
 
+// Returns the minimum number of bits needed to represent the value of 'x'
+static inline int NumRequiredBits(uint64_t x) {
+  for (int i = 63; i >= 0; --i) {
+    if (x & (UINT64_C(1) << i)) return i + 1;
+  }
+  return 0;
+}
+
+/// Returns the smallest power of two that contains v. Taken from
+/// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+/// TODO: Pick a better name, as it is not clear what happens when the input is
+/// already a power of two.
 static inline int64_t NextPower2(int64_t n) {
   n--;
   n |= n >> 1;
@@ -97,12 +176,66 @@ static inline bool IsMultipleOf8(int64_t n) {
   return (n & 7) == 0;
 }
 
+/// Returns the ceil of value/divisor
+static inline int64_t Ceil(int64_t value, int64_t divisor) {
+  return value / divisor + (value % divisor != 0);
+}
+
 /// Returns 'value' rounded up to the nearest multiple of 'factor'
 inline int64_t RoundUp(int64_t value, int64_t factor) {
   return (value + (factor - 1)) / factor * factor;
 }
 
-inline int64_t RoundUpToMultipleOf64(int64_t num) {
+/// Returns 'value' rounded down to the nearest multiple of 'factor'
+static inline int64_t RoundDown(int64_t value, int64_t factor) {
+  return (value / factor) * factor;
+}
+
+/// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
+/// a power of two
+static inline int RoundUpToPowerOf2(int value, int factor) {
+  // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
+  return (value + (factor - 1)) & ~(factor - 1);
+}
+
+static inline int RoundDownToPowerOf2(int value, int factor) {
+  // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
+  return value & ~(factor - 1);
+}
+
+/// Specialized round up and down functions for frequently used factors,
+/// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64).
+/// Returns the rounded up number of bytes that fit the number of bits.
+static inline uint32_t RoundUpNumBytes(uint32_t bits) {
+  return (bits + 7) >> 3;
+}
+
+/// Returns the rounded down number of bytes that fit the number of bits.
+static inline uint32_t RoundDownNumBytes(uint32_t bits) {
+  return bits >> 3;
+}
+
+/// Returns the rounded up to 32 multiple. Used for conversions of bits to i32.
+static inline uint32_t RoundUpNumi32(uint32_t bits) {
+  return (bits + 31) >> 5;
+}
+
+/// Returns the rounded up 32 multiple.
+static inline uint32_t RoundDownNumi32(uint32_t bits) {
+  return bits >> 5;
+}
+
+/// Returns the rounded up to 64 multiple. Used for conversions of bits to i64.
+static inline uint32_t RoundUpNumi64(uint32_t bits) {
+  return (bits + 63) >> 6;
+}
+
+/// Returns the rounded down to 64 multiple.
+static inline uint32_t RoundDownNumi64(uint32_t bits) {
+  return bits >> 6;
+}
+
+static inline int64_t RoundUpToMultipleOf64(int64_t num) {
   // TODO(wesm): is this definitely needed?
   // DCHECK_GE(num, 0);
   constexpr int64_t round_to = 64;
@@ -114,6 +247,200 @@ inline int64_t RoundUpToMultipleOf64(int64_t num) {
   return num;
 }
 
+/// Non hw accelerated pop count.
+/// TODO: we don't use this in any perf sensitive code paths currently.  There
+/// might be a much faster way to implement this.
+static inline int PopcountNoHw(uint64_t x) {
+  int count = 0;
+  for (; x != 0; ++count)
+    x &= x - 1;
+  return count;
+}
+
+/// Returns the number of set bits in x
+static inline int Popcount(uint64_t x) {
+#ifdef ARROW_USE_SSE
+  if (LIKELY(CpuInfo::IsSupported(CpuInfo::POPCNT))) {
+    return POPCNT_popcnt_u64(x);
+  } else {
+    return PopcountNoHw(x);
+  }
+#else
+  return PopcountNoHw(x);
+#endif
+}
+
+// Compute correct population count for various-width signed integers
+template <typename T>
+static inline int PopcountSigned(T v) {
+  // Converting to same-width unsigned then extending preserves the bit pattern.
+  return BitUtil::Popcount(static_cast<typename make_unsigned<T>::type>(v));
+}
+
+/// Returns the 'num_bits' least-significant bits of 'v'.
+static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
+  if (UNLIKELY(num_bits == 0)) return 0;
+  if (UNLIKELY(num_bits >= 64)) return v;
+  int n = 64 - num_bits;
+  return (v << n) >> n;
+}
+
+/// Returns ceil(log2(x)).
+/// TODO: this could be faster if we use __builtin_clz.  Fix this if this ever shows up
+/// in a hot path.
+static inline int Log2(uint64_t x) {
+  // DCHECK_GT(x, 0);
+  if (x == 1) return 0;
+  // Compute result = ceil(log2(x))
+  //                = floor(log2(x - 1)) + 1, for x > 1
+  // by finding the position of the most significant bit (1-indexed) of x - 1
+  // (floor(log2(n)) = MSB(n) (0-indexed))
+  --x;
+  int result = 1;
+  while (x >>= 1)
+    ++result;
+  return result;
+}
+
+/// Swaps the byte order (i.e. endianess)
+static inline int64_t ByteSwap(int64_t value) {
+  return ARROW_BYTE_SWAP64(value);
+}
+static inline uint64_t ByteSwap(uint64_t value) {
+  return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
+}
+static inline int32_t ByteSwap(int32_t value) {
+  return ARROW_BYTE_SWAP32(value);
+}
+static inline uint32_t ByteSwap(uint32_t value) {
+  return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
+}
+static inline int16_t ByteSwap(int16_t value) {
+  constexpr int16_t m = static_cast<int16_t>(0xff);
+  return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
+}
+static inline uint16_t ByteSwap(uint16_t value) {
+  return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
+}
+
+/// Write the swapped bytes into dst. Src and st cannot overlap.
+static inline void ByteSwap(void* dst, const void* src, int len) {
+  switch (len) {
+    case 1:
+      *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
+      return;
+    case 2:
+      *reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
+      return;
+    case 4:
+      *reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
+      return;
+    case 8:
+      *reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
+      return;
+    default:
+      break;
+  }
+
+  uint8_t* d = reinterpret_cast<uint8_t*>(dst);
+  const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
+  for (int i = 0; i < len; ++i) {
+    d[i] = s[len - i - 1];
+  }
+}
+
+/// Converts to big endian format (if not already in big endian) from the
+/// machine's native endian format.
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static inline int64_t ToBigEndian(int64_t value) {
+  return ByteSwap(value);
+}
+static inline uint64_t ToBigEndian(uint64_t value) {
+  return ByteSwap(value);
+}
+static inline int32_t ToBigEndian(int32_t value) {
+  return ByteSwap(value);
+}
+static inline uint32_t ToBigEndian(uint32_t value) {
+  return ByteSwap(value);
+}
+static inline int16_t ToBigEndian(int16_t value) {
+  return ByteSwap(value);
+}
+static inline uint16_t ToBigEndian(uint16_t value) {
+  return ByteSwap(value);
+}
+#else
+static inline int64_t ToBigEndian(int64_t val) {
+  return val;
+}
+static inline uint64_t ToBigEndian(uint64_t val) {
+  return val;
+}
+static inline int32_t ToBigEndian(int32_t val) {
+  return val;
+}
+static inline uint32_t ToBigEndian(uint32_t val) {
+  return val;
+}
+static inline int16_t ToBigEndian(int16_t val) {
+  return val;
+}
+static inline uint16_t ToBigEndian(uint16_t val) {
+  return val;
+}
+#endif
+
+/// Converts from big endian format to the machine's native endian format.
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static inline int64_t FromBigEndian(int64_t value) {
+  return ByteSwap(value);
+}
+static inline uint64_t FromBigEndian(uint64_t value) {
+  return ByteSwap(value);
+}
+static inline int32_t FromBigEndian(int32_t value) {
+  return ByteSwap(value);
+}
+static inline uint32_t FromBigEndian(uint32_t value) {
+  return ByteSwap(value);
+}
+static inline int16_t FromBigEndian(int16_t value) {
+  return ByteSwap(value);
+}
+static inline uint16_t FromBigEndian(uint16_t value) {
+  return ByteSwap(value);
+}
+#else
+static inline int64_t FromBigEndian(int64_t val) {
+  return val;
+}
+static inline uint64_t FromBigEndian(uint64_t val) {
+  return val;
+}
+static inline int32_t FromBigEndian(int32_t val) {
+  return val;
+}
+static inline uint32_t FromBigEndian(uint32_t val) {
+  return val;
+}
+static inline int16_t FromBigEndian(int16_t val) {
+  return val;
+}
+static inline uint16_t FromBigEndian(uint16_t val) {
+  return val;
+}
+#endif
+
+// Logical right shift for signed integer types
+// This is needed because the C >> operator does arithmetic right shift
+// Negative shift amounts lead to undefined behavior
+template <typename T>
+static T ShiftRightLogical(T v, int shift) {
+  // Conversion to unsigned ensures most significant bits always filled with 0's
+  return static_cast<typename make_unsigned<T>::type>(v) >> shift;
+}
+
 void BytesToBits(const std::vector<uint8_t>& bytes, uint8_t* bits);
 ARROW_EXPORT Status BytesToBits(const std::vector<uint8_t>&, std::shared_ptr<Buffer>*);
 


[2/3] arrow git commit: ARROW-1154: [C++] Import miscellaneous computational utility code from parquet-cpp

Posted by we...@apache.org.
http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/bpacking.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bpacking.h b/cpp/src/arrow/util/bpacking.h
new file mode 100644
index 0000000..fce5f55
--- /dev/null
+++ b/cpp/src/arrow/util/bpacking.h
@@ -0,0 +1,3342 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file was modified from its original version for inclusion in parquet-cpp.
+// Original source:
+// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp
+// The original copyright notice follows.
+
+/**
+*
+* This code is released under the
+* Apache License Version 2.0 http://www.apache.org/licenses/.
+* (c) Daniel Lemire 2013
+*/
+
+#ifndef ARROW_UTIL_BPACKING_H
+#define ARROW_UTIL_BPACKING_H
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) & 1;
+  out++;
+  *out = ((*in) >> 1) & 1;
+  out++;
+  *out = ((*in) >> 2) & 1;
+  out++;
+  *out = ((*in) >> 3) & 1;
+  out++;
+  *out = ((*in) >> 4) & 1;
+  out++;
+  *out = ((*in) >> 5) & 1;
+  out++;
+  *out = ((*in) >> 6) & 1;
+  out++;
+  *out = ((*in) >> 7) & 1;
+  out++;
+  *out = ((*in) >> 8) & 1;
+  out++;
+  *out = ((*in) >> 9) & 1;
+  out++;
+  *out = ((*in) >> 10) & 1;
+  out++;
+  *out = ((*in) >> 11) & 1;
+  out++;
+  *out = ((*in) >> 12) & 1;
+  out++;
+  *out = ((*in) >> 13) & 1;
+  out++;
+  *out = ((*in) >> 14) & 1;
+  out++;
+  *out = ((*in) >> 15) & 1;
+  out++;
+  *out = ((*in) >> 16) & 1;
+  out++;
+  *out = ((*in) >> 17) & 1;
+  out++;
+  *out = ((*in) >> 18) & 1;
+  out++;
+  *out = ((*in) >> 19) & 1;
+  out++;
+  *out = ((*in) >> 20) & 1;
+  out++;
+  *out = ((*in) >> 21) & 1;
+  out++;
+  *out = ((*in) >> 22) & 1;
+  out++;
+  *out = ((*in) >> 23) & 1;
+  out++;
+  *out = ((*in) >> 24) & 1;
+  out++;
+  *out = ((*in) >> 25) & 1;
+  out++;
+  *out = ((*in) >> 26) & 1;
+  out++;
+  *out = ((*in) >> 27) & 1;
+  out++;
+  *out = ((*in) >> 28) & 1;
+  out++;
+  *out = ((*in) >> 29) & 1;
+  out++;
+  *out = ((*in) >> 30) & 1;
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 2);
+  out++;
+  *out = ((*in) >> 4) % (1U << 2);
+  out++;
+  *out = ((*in) >> 6) % (1U << 2);
+  out++;
+  *out = ((*in) >> 8) % (1U << 2);
+  out++;
+  *out = ((*in) >> 10) % (1U << 2);
+  out++;
+  *out = ((*in) >> 12) % (1U << 2);
+  out++;
+  *out = ((*in) >> 14) % (1U << 2);
+  out++;
+  *out = ((*in) >> 16) % (1U << 2);
+  out++;
+  *out = ((*in) >> 18) % (1U << 2);
+  out++;
+  *out = ((*in) >> 20) % (1U << 2);
+  out++;
+  *out = ((*in) >> 22) % (1U << 2);
+  out++;
+  *out = ((*in) >> 24) % (1U << 2);
+  out++;
+  *out = ((*in) >> 26) % (1U << 2);
+  out++;
+  *out = ((*in) >> 28) % (1U << 2);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 2);
+  out++;
+  *out = ((*in) >> 4) % (1U << 2);
+  out++;
+  *out = ((*in) >> 6) % (1U << 2);
+  out++;
+  *out = ((*in) >> 8) % (1U << 2);
+  out++;
+  *out = ((*in) >> 10) % (1U << 2);
+  out++;
+  *out = ((*in) >> 12) % (1U << 2);
+  out++;
+  *out = ((*in) >> 14) % (1U << 2);
+  out++;
+  *out = ((*in) >> 16) % (1U << 2);
+  out++;
+  *out = ((*in) >> 18) % (1U << 2);
+  out++;
+  *out = ((*in) >> 20) % (1U << 2);
+  out++;
+  *out = ((*in) >> 22) % (1U << 2);
+  out++;
+  *out = ((*in) >> 24) % (1U << 2);
+  out++;
+  *out = ((*in) >> 26) % (1U << 2);
+  out++;
+  *out = ((*in) >> 28) % (1U << 2);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 3);
+  out++;
+  *out = ((*in) >> 6) % (1U << 3);
+  out++;
+  *out = ((*in) >> 9) % (1U << 3);
+  out++;
+  *out = ((*in) >> 12) % (1U << 3);
+  out++;
+  *out = ((*in) >> 15) % (1U << 3);
+  out++;
+  *out = ((*in) >> 18) % (1U << 3);
+  out++;
+  *out = ((*in) >> 21) % (1U << 3);
+  out++;
+  *out = ((*in) >> 24) % (1U << 3);
+  out++;
+  *out = ((*in) >> 27) % (1U << 3);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (3 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 3);
+  out++;
+  *out = ((*in) >> 4) % (1U << 3);
+  out++;
+  *out = ((*in) >> 7) % (1U << 3);
+  out++;
+  *out = ((*in) >> 10) % (1U << 3);
+  out++;
+  *out = ((*in) >> 13) % (1U << 3);
+  out++;
+  *out = ((*in) >> 16) % (1U << 3);
+  out++;
+  *out = ((*in) >> 19) % (1U << 3);
+  out++;
+  *out = ((*in) >> 22) % (1U << 3);
+  out++;
+  *out = ((*in) >> 25) % (1U << 3);
+  out++;
+  *out = ((*in) >> 28) % (1U << 3);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (3 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 3);
+  out++;
+  *out = ((*in) >> 5) % (1U << 3);
+  out++;
+  *out = ((*in) >> 8) % (1U << 3);
+  out++;
+  *out = ((*in) >> 11) % (1U << 3);
+  out++;
+  *out = ((*in) >> 14) % (1U << 3);
+  out++;
+  *out = ((*in) >> 17) % (1U << 3);
+  out++;
+  *out = ((*in) >> 20) % (1U << 3);
+  out++;
+  *out = ((*in) >> 23) % (1U << 3);
+  out++;
+  *out = ((*in) >> 26) % (1U << 3);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 4);
+  out++;
+  *out = ((*in) >> 8) % (1U << 4);
+  out++;
+  *out = ((*in) >> 12) % (1U << 4);
+  out++;
+  *out = ((*in) >> 16) % (1U << 4);
+  out++;
+  *out = ((*in) >> 20) % (1U << 4);
+  out++;
+  *out = ((*in) >> 24) % (1U << 4);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 4);
+  out++;
+  *out = ((*in) >> 8) % (1U << 4);
+  out++;
+  *out = ((*in) >> 12) % (1U << 4);
+  out++;
+  *out = ((*in) >> 16) % (1U << 4);
+  out++;
+  *out = ((*in) >> 20) % (1U << 4);
+  out++;
+  *out = ((*in) >> 24) % (1U << 4);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 4);
+  out++;
+  *out = ((*in) >> 8) % (1U << 4);
+  out++;
+  *out = ((*in) >> 12) % (1U << 4);
+  out++;
+  *out = ((*in) >> 16) % (1U << 4);
+  out++;
+  *out = ((*in) >> 20) % (1U << 4);
+  out++;
+  *out = ((*in) >> 24) % (1U << 4);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 4);
+  out++;
+  *out = ((*in) >> 8) % (1U << 4);
+  out++;
+  *out = ((*in) >> 12) % (1U << 4);
+  out++;
+  *out = ((*in) >> 16) % (1U << 4);
+  out++;
+  *out = ((*in) >> 20) % (1U << 4);
+  out++;
+  *out = ((*in) >> 24) % (1U << 4);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 5);
+  out++;
+  *out = ((*in) >> 10) % (1U << 5);
+  out++;
+  *out = ((*in) >> 15) % (1U << 5);
+  out++;
+  *out = ((*in) >> 20) % (1U << 5);
+  out++;
+  *out = ((*in) >> 25) % (1U << 5);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (5 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 5);
+  out++;
+  *out = ((*in) >> 8) % (1U << 5);
+  out++;
+  *out = ((*in) >> 13) % (1U << 5);
+  out++;
+  *out = ((*in) >> 18) % (1U << 5);
+  out++;
+  *out = ((*in) >> 23) % (1U << 5);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (5 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 5);
+  out++;
+  *out = ((*in) >> 6) % (1U << 5);
+  out++;
+  *out = ((*in) >> 11) % (1U << 5);
+  out++;
+  *out = ((*in) >> 16) % (1U << 5);
+  out++;
+  *out = ((*in) >> 21) % (1U << 5);
+  out++;
+  *out = ((*in) >> 26) % (1U << 5);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (5 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 5);
+  out++;
+  *out = ((*in) >> 9) % (1U << 5);
+  out++;
+  *out = ((*in) >> 14) % (1U << 5);
+  out++;
+  *out = ((*in) >> 19) % (1U << 5);
+  out++;
+  *out = ((*in) >> 24) % (1U << 5);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (5 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 5);
+  out++;
+  *out = ((*in) >> 7) % (1U << 5);
+  out++;
+  *out = ((*in) >> 12) % (1U << 5);
+  out++;
+  *out = ((*in) >> 17) % (1U << 5);
+  out++;
+  *out = ((*in) >> 22) % (1U << 5);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 6);
+  out++;
+  *out = ((*in) >> 12) % (1U << 6);
+  out++;
+  *out = ((*in) >> 18) % (1U << 6);
+  out++;
+  *out = ((*in) >> 24) % (1U << 6);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (6 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 6);
+  out++;
+  *out = ((*in) >> 10) % (1U << 6);
+  out++;
+  *out = ((*in) >> 16) % (1U << 6);
+  out++;
+  *out = ((*in) >> 22) % (1U << 6);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (6 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 6);
+  out++;
+  *out = ((*in) >> 8) % (1U << 6);
+  out++;
+  *out = ((*in) >> 14) % (1U << 6);
+  out++;
+  *out = ((*in) >> 20) % (1U << 6);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 6);
+  out++;
+  *out = ((*in) >> 12) % (1U << 6);
+  out++;
+  *out = ((*in) >> 18) % (1U << 6);
+  out++;
+  *out = ((*in) >> 24) % (1U << 6);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (6 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 6);
+  out++;
+  *out = ((*in) >> 10) % (1U << 6);
+  out++;
+  *out = ((*in) >> 16) % (1U << 6);
+  out++;
+  *out = ((*in) >> 22) % (1U << 6);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (6 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 6);
+  out++;
+  *out = ((*in) >> 8) % (1U << 6);
+  out++;
+  *out = ((*in) >> 14) % (1U << 6);
+  out++;
+  *out = ((*in) >> 20) % (1U << 6);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 7);
+  out++;
+  *out = ((*in) >> 14) % (1U << 7);
+  out++;
+  *out = ((*in) >> 21) % (1U << 7);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (7 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 7);
+  out++;
+  *out = ((*in) >> 10) % (1U << 7);
+  out++;
+  *out = ((*in) >> 17) % (1U << 7);
+  out++;
+  *out = ((*in) >> 24) % (1U << 7);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (7 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 7);
+  out++;
+  *out = ((*in) >> 13) % (1U << 7);
+  out++;
+  *out = ((*in) >> 20) % (1U << 7);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (7 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 7);
+  out++;
+  *out = ((*in) >> 9) % (1U << 7);
+  out++;
+  *out = ((*in) >> 16) % (1U << 7);
+  out++;
+  *out = ((*in) >> 23) % (1U << 7);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (7 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 7);
+  out++;
+  *out = ((*in) >> 12) % (1U << 7);
+  out++;
+  *out = ((*in) >> 19) % (1U << 7);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (7 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 7);
+  out++;
+  *out = ((*in) >> 8) % (1U << 7);
+  out++;
+  *out = ((*in) >> 15) % (1U << 7);
+  out++;
+  *out = ((*in) >> 22) % (1U << 7);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (7 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 7);
+  out++;
+  *out = ((*in) >> 11) % (1U << 7);
+  out++;
+  *out = ((*in) >> 18) % (1U << 7);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 8);
+  out++;
+  *out = ((*in) >> 16) % (1U << 8);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 9);
+  out++;
+  *out = ((*in) >> 18) % (1U << 9);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (9 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 9);
+  out++;
+  *out = ((*in) >> 13) % (1U << 9);
+  out++;
+  *out = ((*in) >> 22) % (1U << 9);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (9 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 9);
+  out++;
+  *out = ((*in) >> 17) % (1U << 9);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (9 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 9);
+  out++;
+  *out = ((*in) >> 12) % (1U << 9);
+  out++;
+  *out = ((*in) >> 21) % (1U << 9);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (9 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 9);
+  out++;
+  *out = ((*in) >> 16) % (1U << 9);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (9 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 9);
+  out++;
+  *out = ((*in) >> 11) % (1U << 9);
+  out++;
+  *out = ((*in) >> 20) % (1U << 9);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (9 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 9);
+  out++;
+  *out = ((*in) >> 15) % (1U << 9);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (9 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 9);
+  out++;
+  *out = ((*in) >> 10) % (1U << 9);
+  out++;
+  *out = ((*in) >> 19) % (1U << 9);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (9 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 9);
+  out++;
+  *out = ((*in) >> 14) % (1U << 9);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 10);
+  out++;
+  *out = ((*in) >> 20) % (1U << 10);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (10 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 10);
+  out++;
+  *out = ((*in) >> 18) % (1U << 10);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (10 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 10);
+  out++;
+  *out = ((*in) >> 16) % (1U << 10);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (10 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 10);
+  out++;
+  *out = ((*in) >> 14) % (1U << 10);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (10 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 10);
+  out++;
+  *out = ((*in) >> 12) % (1U << 10);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 10);
+  out++;
+  *out = ((*in) >> 20) % (1U << 10);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (10 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 10);
+  out++;
+  *out = ((*in) >> 18) % (1U << 10);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (10 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 10);
+  out++;
+  *out = ((*in) >> 16) % (1U << 10);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (10 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 10);
+  out++;
+  *out = ((*in) >> 14) % (1U << 10);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (10 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 10);
+  out++;
+  *out = ((*in) >> 12) % (1U << 10);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 11);
+  out++;
+  *out = ((*in) >> 11) % (1U << 11);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (11 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 11);
+  out++;
+  *out = ((*in) >> 12) % (1U << 11);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (11 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 11);
+  out++;
+  *out = ((*in) >> 13) % (1U << 11);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (11 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 11);
+  out++;
+  *out = ((*in) >> 14) % (1U << 11);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (11 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 11);
+  out++;
+  *out = ((*in) >> 15) % (1U << 11);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (11 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 11);
+  out++;
+  *out = ((*in) >> 16) % (1U << 11);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (11 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 11);
+  out++;
+  *out = ((*in) >> 17) % (1U << 11);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (11 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 11);
+  out++;
+  *out = ((*in) >> 18) % (1U << 11);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (11 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 11);
+  out++;
+  *out = ((*in) >> 19) % (1U << 11);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (11 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 11);
+  out++;
+  *out = ((*in) >> 20) % (1U << 11);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (11 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 11);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 12);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (12 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 12);
+  out++;
+  *out = ((*in) >> 16) % (1U << 12);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (12 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 12);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 12);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (12 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 12);
+  out++;
+  *out = ((*in) >> 16) % (1U << 12);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (12 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 12);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 12);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (12 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 12);
+  out++;
+  *out = ((*in) >> 16) % (1U << 12);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (12 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 12);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 12);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (12 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 12);
+  out++;
+  *out = ((*in) >> 16) % (1U << 12);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (12 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 12);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 13);
+  out++;
+  *out = ((*in) >> 13) % (1U << 13);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (13 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 13);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (13 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 13);
+  out++;
+  *out = ((*in) >> 14) % (1U << 13);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (13 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 13);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (13 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 13);
+  out++;
+  *out = ((*in) >> 15) % (1U << 13);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (13 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 13);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (13 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 13);
+  out++;
+  *out = ((*in) >> 16) % (1U << 13);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (13 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 13);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (13 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 13);
+  out++;
+  *out = ((*in) >> 17) % (1U << 13);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (13 - 11);
+  out++;
+  *out = ((*in) >> 11) % (1U << 13);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (13 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 13);
+  out++;
+  *out = ((*in) >> 18) % (1U << 13);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (13 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 13);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (13 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 13);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 14);
+  out++;
+  *out = ((*in) >> 14) % (1U << 14);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (14 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 14);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (14 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 14);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (14 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 14);
+  out++;
+  *out = ((*in) >> 16) % (1U << 14);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (14 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 14);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (14 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 14);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (14 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 14);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 14);
+  out++;
+  *out = ((*in) >> 14) % (1U << 14);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (14 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 14);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (14 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 14);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (14 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 14);
+  out++;
+  *out = ((*in) >> 16) % (1U << 14);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (14 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 14);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (14 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 14);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (14 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 14);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 15);
+  out++;
+  *out = ((*in) >> 15) % (1U << 15);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (15 - 13);
+  out++;
+  *out = ((*in) >> 13) % (1U << 15);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (15 - 11);
+  out++;
+  *out = ((*in) >> 11) % (1U << 15);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (15 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 15);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (15 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 15);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (15 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 15);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (15 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 15);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (15 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 15);
+  out++;
+  *out = ((*in) >> 16) % (1U << 15);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (15 - 14);
+  out++;
+  *out = ((*in) >> 14) % (1U << 15);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (15 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 15);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (15 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 15);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (15 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 15);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (15 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 15);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (15 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 15);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (15 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 15);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 17);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (17 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 17);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (17 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 17);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (17 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 17);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (17 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 17);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (17 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 17);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (17 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 17);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (17 - 14);
+  out++;
+  *out = ((*in) >> 14) % (1U << 17);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (17 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (17 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 17);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (17 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 17);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (17 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 17);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (17 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 17);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (17 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 17);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (17 - 11);
+  out++;
+  *out = ((*in) >> 11) % (1U << 17);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (17 - 13);
+  out++;
+  *out = ((*in) >> 13) % (1U << 17);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (17 - 15);
+  out++;
+  *out = ((*in) >> 15);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (18 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 18);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (18 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 18);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (18 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 18);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (18 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (18 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 18);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (18 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 18);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (18 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 18);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (18 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (18 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 18);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (18 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 18);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (18 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 18);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (18 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (18 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 18);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (18 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 18);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (18 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 18);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (18 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 19);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (19 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 19);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (19 - 12);
+  out++;
+  *out = ((*in) >> 12) % (1U << 19);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (19 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (19 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 19);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (19 - 11);
+  out++;
+  *out = ((*in) >> 11) % (1U << 19);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 17)) << (19 - 17);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (19 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 19);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (19 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 19);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (19 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (19 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 19);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (19 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 19);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (19 - 15);
+  out++;
+  *out = ((*in) >> 15);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (19 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 19);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (19 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 19);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (19 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (19 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 19);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (19 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 19);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (19 - 13);
+  out++;
+  *out = ((*in) >> 13);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (20 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 20);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (20 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (20 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 20);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (20 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (20 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 20);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (20 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (20 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 20);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (20 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (20 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 20);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (20 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (20 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 20);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (20 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (20 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 20);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (20 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (20 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 20);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (20 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 21);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (21 - 10);
+  out++;
+  *out = ((*in) >> 10) % (1U << 21);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (21 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (21 - 9);
+  out++;
+  *out = ((*in) >> 9) % (1U << 21);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 19)) << (21 - 19);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (21 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 21);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (21 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (21 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 21);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 17)) << (21 - 17);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (21 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 21);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (21 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (21 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 21);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (21 - 15);
+  out++;
+  *out = ((*in) >> 15);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (21 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 21);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (21 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (21 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 21);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (21 - 13);
+  out++;
+  *out = ((*in) >> 13);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (21 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 21);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (21 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (21 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 21);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (21 - 11);
+  out++;
+  *out = ((*in) >> 11);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (22 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (22 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 22);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (22 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (22 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 22);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (22 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (22 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 22);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (22 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (22 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 22);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (22 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (22 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (22 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (22 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 22);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (22 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (22 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 22);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (22 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (22 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 22);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (22 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (22 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 22);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (22 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (22 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 23);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (23 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (23 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 23);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 19)) << (23 - 19);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (23 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (23 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 23);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (23 - 15);
+  out++;
+  *out = ((*in) >> 15);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (23 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 23);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (23 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (23 - 11);
+  out++;
+  *out = ((*in) >> 11);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (23 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 23);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (23 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (23 - 7);
+  out++;
+  *out = ((*in) >> 7) % (1U << 23);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 21)) << (23 - 21);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (23 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (23 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 23);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 17)) << (23 - 17);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (23 - 8);
+  out++;
+  *out = ((*in) >> 8) % (1U << 23);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (23 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (23 - 13);
+  out++;
+  *out = ((*in) >> 13);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (23 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 23);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (23 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (23 - 9);
+  out++;
+  *out = ((*in) >> 9);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (24 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (24 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (24 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (24 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (24 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (24 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (24 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (24 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (24 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (24 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (24 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (24 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (24 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (24 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (24 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (24 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 25);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (25 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (25 - 11);
+  out++;
+  *out = ((*in) >> 11);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (25 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 25);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (25 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (25 - 15);
+  out++;
+  *out = ((*in) >> 15);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (25 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (25 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 25);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 19)) << (25 - 19);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (25 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (25 - 5);
+  out++;
+  *out = ((*in) >> 5) % (1U << 25);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 23)) << (25 - 23);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (25 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (25 - 9);
+  out++;
+  *out = ((*in) >> 9);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (25 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 25);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (25 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (25 - 13);
+  out++;
+  *out = ((*in) >> 13);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (25 - 6);
+  out++;
+  *out = ((*in) >> 6) % (1U << 25);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (25 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 17)) << (25 - 17);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (25 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (25 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 25);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 21)) << (25 - 21);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (25 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (25 - 7);
+  out++;
+  *out = ((*in) >> 7);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 26);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (26 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (26 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (26 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (26 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 26);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (26 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (26 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (26 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (26 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 26);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (26 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (26 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (26 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (26 - 6);
+  out++;
+  *out = ((*in) >> 6);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 26);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (26 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (26 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (26 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (26 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 26);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (26 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (26 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (26 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (26 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 26);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (26 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (26 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (26 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (26 - 6);
+  out++;
+  *out = ((*in) >> 6);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 27);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (27 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 17)) << (27 - 17);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (27 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (27 - 7);
+  out++;
+  *out = ((*in) >> 7);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (27 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 27);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (27 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 19)) << (27 - 19);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (27 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (27 - 9);
+  out++;
+  *out = ((*in) >> 9);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (27 - 4);
+  out++;
+  *out = ((*in) >> 4) % (1U << 27);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 26)) << (27 - 26);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 21)) << (27 - 21);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (27 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (27 - 11);
+  out++;
+  *out = ((*in) >> 11);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (27 - 6);
+  out++;
+  *out = ((*in) >> 6);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (27 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 27);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 23)) << (27 - 23);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (27 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (27 - 13);
+  out++;
+  *out = ((*in) >> 13);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (27 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (27 - 3);
+  out++;
+  *out = ((*in) >> 3) % (1U << 27);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 25)) << (27 - 25);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (27 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (27 - 15);
+  out++;
+  *out = ((*in) >> 15);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (27 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (27 - 5);
+  out++;
+  *out = ((*in) >> 5);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 28);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (28 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (28 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (28 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (28 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (28 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (28 - 4);
+  out++;
+  *out = ((*in) >> 4);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 28);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (28 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (28 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (28 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (28 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (28 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (28 - 4);
+  out++;
+  *out = ((*in) >> 4);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 28);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (28 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (28 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (28 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (28 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (28 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (28 - 4);
+  out++;
+  *out = ((*in) >> 4);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 28);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (28 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (28 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (28 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (28 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (28 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (28 - 4);
+  out++;
+  *out = ((*in) >> 4);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 29);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 26)) << (29 - 26);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 23)) << (29 - 23);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (29 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 17)) << (29 - 17);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (29 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (29 - 11);
+  out++;
+  *out = ((*in) >> 11);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (29 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (29 - 5);
+  out++;
+  *out = ((*in) >> 5);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (29 - 2);
+  out++;
+  *out = ((*in) >> 2) % (1U << 29);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 28)) << (29 - 28);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 25)) << (29 - 25);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (29 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 19)) << (29 - 19);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (29 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (29 - 13);
+  out++;
+  *out = ((*in) >> 13);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (29 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (29 - 7);
+  out++;
+  *out = ((*in) >> 7);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (29 - 4);
+  out++;
+  *out = ((*in) >> 4);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (29 - 1);
+  out++;
+  *out = ((*in) >> 1) % (1U << 29);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 27)) << (29 - 27);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (29 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 21)) << (29 - 21);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (29 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (29 - 15);
+  out++;
+  *out = ((*in) >> 15);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (29 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (29 - 9);
+  out++;
+  *out = ((*in) >> 9);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (29 - 6);
+  out++;
+  *out = ((*in) >> 6);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (29 - 3);
+  out++;
+  *out = ((*in) >> 3);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 30);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 28)) << (30 - 28);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 26)) << (30 - 26);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (30 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (30 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (30 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (30 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (30 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (30 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (30 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (30 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (30 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (30 - 6);
+  out++;
+  *out = ((*in) >> 6);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (30 - 4);
+  out++;
+  *out = ((*in) >> 4);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (30 - 2);
+  out++;
+  *out = ((*in) >> 2);
+  ++in;
+  out++;
+  *out = ((*in) >> 0) % (1U << 30);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 28)) << (30 - 28);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 26)) << (30 - 26);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (30 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (30 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (30 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (30 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (30 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (30 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (30 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (30 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (30 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (30 - 6);
+  out++;
+  *out = ((*in) >> 6);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (30 - 4);
+  out++;
+  *out = ((*in) >> 4);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (30 - 2);
+  out++;
+  *out = ((*in) >> 2);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0) % (1U << 31);
+  out++;
+  *out = ((*in) >> 31);
+  ++in;
+  *out |= ((*in) % (1U << 30)) << (31 - 30);
+  out++;
+  *out = ((*in) >> 30);
+  ++in;
+  *out |= ((*in) % (1U << 29)) << (31 - 29);
+  out++;
+  *out = ((*in) >> 29);
+  ++in;
+  *out |= ((*in) % (1U << 28)) << (31 - 28);
+  out++;
+  *out = ((*in) >> 28);
+  ++in;
+  *out |= ((*in) % (1U << 27)) << (31 - 27);
+  out++;
+  *out = ((*in) >> 27);
+  ++in;
+  *out |= ((*in) % (1U << 26)) << (31 - 26);
+  out++;
+  *out = ((*in) >> 26);
+  ++in;
+  *out |= ((*in) % (1U << 25)) << (31 - 25);
+  out++;
+  *out = ((*in) >> 25);
+  ++in;
+  *out |= ((*in) % (1U << 24)) << (31 - 24);
+  out++;
+  *out = ((*in) >> 24);
+  ++in;
+  *out |= ((*in) % (1U << 23)) << (31 - 23);
+  out++;
+  *out = ((*in) >> 23);
+  ++in;
+  *out |= ((*in) % (1U << 22)) << (31 - 22);
+  out++;
+  *out = ((*in) >> 22);
+  ++in;
+  *out |= ((*in) % (1U << 21)) << (31 - 21);
+  out++;
+  *out = ((*in) >> 21);
+  ++in;
+  *out |= ((*in) % (1U << 20)) << (31 - 20);
+  out++;
+  *out = ((*in) >> 20);
+  ++in;
+  *out |= ((*in) % (1U << 19)) << (31 - 19);
+  out++;
+  *out = ((*in) >> 19);
+  ++in;
+  *out |= ((*in) % (1U << 18)) << (31 - 18);
+  out++;
+  *out = ((*in) >> 18);
+  ++in;
+  *out |= ((*in) % (1U << 17)) << (31 - 17);
+  out++;
+  *out = ((*in) >> 17);
+  ++in;
+  *out |= ((*in) % (1U << 16)) << (31 - 16);
+  out++;
+  *out = ((*in) >> 16);
+  ++in;
+  *out |= ((*in) % (1U << 15)) << (31 - 15);
+  out++;
+  *out = ((*in) >> 15);
+  ++in;
+  *out |= ((*in) % (1U << 14)) << (31 - 14);
+  out++;
+  *out = ((*in) >> 14);
+  ++in;
+  *out |= ((*in) % (1U << 13)) << (31 - 13);
+  out++;
+  *out = ((*in) >> 13);
+  ++in;
+  *out |= ((*in) % (1U << 12)) << (31 - 12);
+  out++;
+  *out = ((*in) >> 12);
+  ++in;
+  *out |= ((*in) % (1U << 11)) << (31 - 11);
+  out++;
+  *out = ((*in) >> 11);
+  ++in;
+  *out |= ((*in) % (1U << 10)) << (31 - 10);
+  out++;
+  *out = ((*in) >> 10);
+  ++in;
+  *out |= ((*in) % (1U << 9)) << (31 - 9);
+  out++;
+  *out = ((*in) >> 9);
+  ++in;
+  *out |= ((*in) % (1U << 8)) << (31 - 8);
+  out++;
+  *out = ((*in) >> 8);
+  ++in;
+  *out |= ((*in) % (1U << 7)) << (31 - 7);
+  out++;
+  *out = ((*in) >> 7);
+  ++in;
+  *out |= ((*in) % (1U << 6)) << (31 - 6);
+  out++;
+  *out = ((*in) >> 6);
+  ++in;
+  *out |= ((*in) % (1U << 5)) << (31 - 5);
+  out++;
+  *out = ((*in) >> 5);
+  ++in;
+  *out |= ((*in) % (1U << 4)) << (31 - 4);
+  out++;
+  *out = ((*in) >> 4);
+  ++in;
+  *out |= ((*in) % (1U << 3)) << (31 - 3);
+  out++;
+  *out = ((*in) >> 3);
+  ++in;
+  *out |= ((*in) % (1U << 2)) << (31 - 2);
+  out++;
+  *out = ((*in) >> 2);
+  ++in;
+  *out |= ((*in) % (1U << 1)) << (31 - 1);
+  out++;
+  *out = ((*in) >> 1);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+  *out = ((*in) >> 0);
+  ++in;
+  out++;
+
+  return in;
+}
+
+inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) {
+  for (int k = 0; k < 32; ++k) {
+    out[k] = 0;
+  }
+  return in;
+}
+
+inline int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
+  batch_size = batch_size / 32 * 32;
+  int num_loops = batch_size / 32;
+
+  switch (num_bits) {
+    case 0:
+      for (int i = 0; i < num_loops; ++i)
+        in = nullunpacker32(in, out + i * 32);
+      break;
+    case 1:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack1_32(in, out + i * 32);
+      break;
+    case 2:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack2_32(in, out + i * 32);
+      break;
+    case 3:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack3_32(in, out + i * 32);
+      break;
+    case 4:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack4_32(in, out + i * 32);
+      break;
+    case 5:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack5_32(in, out + i * 32);
+      break;
+    case 6:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack6_32(in, out + i * 32);
+      break;
+    case 7:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack7_32(in, out + i * 32);
+      break;
+    case 8:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack8_32(in, out + i * 32);
+      break;
+    case 9:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack9_32(in, out + i * 32);
+      break;
+    case 10:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack10_32(in, out + i * 32);
+      break;
+    case 11:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack11_32(in, out + i * 32);
+      break;
+    case 12:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack12_32(in, out + i * 32);
+      break;
+    case 13:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack13_32(in, out + i * 32);
+      break;
+    case 14:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack14_32(in, out + i * 32);
+      break;
+    case 15:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack15_32(in, out + i * 32);
+      break;
+    case 16:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack16_32(in, out + i * 32);
+      break;
+    case 17:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack17_32(in, out + i * 32);
+      break;
+    case 18:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack18_32(in, out + i * 32);
+      break;
+    case 19:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack19_32(in, out + i * 32);
+      break;
+    case 20:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack20_32(in, out + i * 32);
+      break;
+    case 21:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack21_32(in, out + i * 32);
+      break;
+    case 22:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack22_32(in, out + i * 32);
+      break;
+    case 23:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack23_32(in, out + i * 32);
+      break;
+    case 24:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack24_32(in, out + i * 32);
+      break;
+    case 25:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack25_32(in, out + i * 32);
+      break;
+    case 26:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack26_32(in, out + i * 32);
+      break;
+    case 27:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack27_32(in, out + i * 32);
+      break;
+    case 28:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack28_32(in, out + i * 32);
+      break;
+    case 29:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack29_32(in, out + i * 32);
+      break;
+    case 30:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack30_32(in, out + i * 32);
+      break;
+    case 31:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack31_32(in, out + i * 32);
+      break;
+    case 32:
+      for (int i = 0; i < num_loops; ++i)
+        in = unpack32_32(in, out + i * 32);
+      break;
+    default:
+      DCHECK(false) << "Unsupported num_bits";
+  }
+
+  return batch_size;
+}
+
+};  // namespace arrow
+
+#endif  // ARROW_UTIL_BPACKING_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/compiler-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/compiler-util.h b/cpp/src/arrow/util/compiler-util.h
new file mode 100644
index 0000000..ccbe545
--- /dev/null
+++ b/cpp/src/arrow/util/compiler-util.h
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_COMPILER_UTIL_H
+#define ARROW_UTIL_COMPILER_UTIL_H
+
+// Branch prediction macro hints for GCC
+#ifdef LIKELY
+#undef LIKELY
+#endif
+
+#ifdef UNLIKELY
+#undef UNLIKELY
+#endif
+
+#ifdef _MSC_VER
+#define LIKELY(expr) expr
+#define UNLIKELY(expr) expr
+#else
+#define LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+#define PREFETCH(addr) __builtin_prefetch(addr)
+
+// macros to disable padding
+// these macros are portable across different compilers and platforms
+//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
+#if defined(_MSC_VER)
+#define MANUALLY_ALIGNED_STRUCT(alignment) \
+  __pragma(pack(1));                       \
+  struct __declspec(align(alignment))
+#define STRUCT_END(name, size) \
+  __pragma(pack());            \
+  static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#elif defined(__GNUC__) || defined(__clang__)
+#define MANUALLY_ALIGNED_STRUCT(alignment) \
+  _Pragma("pack(1)") struct __attribute__((aligned(alignment)))
+#define STRUCT_END(name, size) \
+  _Pragma("pack()") static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#else
+#error Unknown compiler, please define structure alignment macros
+#endif
+
+#endif  // ARROW_UTIL_COMPILER_UTIL_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/cpu-info.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/cpu-info.cc b/cpp/src/arrow/util/cpu-info.cc
new file mode 100644
index 0000000..c0fc8bd
--- /dev/null
+++ b/cpp/src/arrow/util/cpu-info.cc
@@ -0,0 +1,206 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29.
+
+#include "arrow/util/cpu-info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef _MSC_VER
+#include <unistd.h>
+#endif
+
+#include <boost/algorithm/string.hpp>
+
+#include <algorithm>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <sstream>
+#include <string>
+
+#include "arrow/util/logging.h"
+
+using boost::algorithm::contains;
+using boost::algorithm::trim;
+using std::max;
+using std::string;
+
+namespace arrow {
+
+bool CpuInfo::initialized_ = false;
+int64_t CpuInfo::hardware_flags_ = 0;
+int64_t CpuInfo::original_hardware_flags_;
+int64_t CpuInfo::cache_sizes_[L3_CACHE + 1];
+int64_t CpuInfo::cycles_per_ms_;
+int CpuInfo::num_cores_ = 1;
+string CpuInfo::model_name_ = "unknown";  // NOLINT
+static std::mutex cpuinfo_mutex;
+
+static struct {
+  string name;
+  int64_t flag;
+} flag_mappings[] = {
+    {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1}, {"sse4_2", CpuInfo::SSE4_2},
+    {"popcnt", CpuInfo::POPCNT},
+};
+static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
+
+// Helper function to parse for hardware flags.
+// values contains a list of space-seperated flags.  check to see if the flags we
+// care about are present.
+// Returns a bitmap of flags.
+int64_t ParseCPUFlags(const string& values) {
+  int64_t flags = 0;
+  for (int i = 0; i < num_flags; ++i) {
+    if (contains(values, flag_mappings[i].name)) { flags |= flag_mappings[i].flag; }
+  }
+  return flags;
+}
+
+void CpuInfo::Init() {
+  std::lock_guard<std::mutex> cpuinfo_lock(cpuinfo_mutex);
+
+  if (initialized()) { return; }
+
+  string line;
+  string name;
+  string value;
+
+  float max_mhz = 0;
+  int num_cores = 0;
+
+  memset(&cache_sizes_, 0, sizeof(cache_sizes_));
+
+  // Read from /proc/cpuinfo
+  std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
+  while (cpuinfo) {
+    getline(cpuinfo, line);
+    size_t colon = line.find(':');
+    if (colon != string::npos) {
+      name = line.substr(0, colon - 1);
+      value = line.substr(colon + 1, string::npos);
+      trim(name);
+      trim(value);
+      if (name.compare("flags") == 0) {
+        hardware_flags_ |= ParseCPUFlags(value);
+      } else if (name.compare("cpu MHz") == 0) {
+        // Every core will report a different speed.  We'll take the max, assuming
+        // that when impala is running, the core will not be in a lower power state.
+        // TODO: is there a more robust way to do this, such as
+        // Window's QueryPerformanceFrequency()
+        float mhz = static_cast<float>(atof(value.c_str()));
+        max_mhz = max(mhz, max_mhz);
+      } else if (name.compare("processor") == 0) {
+        ++num_cores;
+      } else if (name.compare("model name") == 0) {
+        model_name_ = value;
+      }
+    }
+  }
+  if (cpuinfo.is_open()) cpuinfo.close();
+
+#ifdef __APPLE__
+  // On Mac OS X use sysctl() to get the cache sizes
+  size_t len = 0;
+  sysctlbyname("hw.cachesize", NULL, &len, NULL, 0);
+  uint64_t* data = static_cast<uint64_t*>(malloc(len));
+  sysctlbyname("hw.cachesize", data, &len, NULL, 0);
+  DCHECK_GE(len / sizeof(uint64_t), 3);
+  for (size_t i = 0; i < 3; ++i) {
+    cache_sizes_[i] = data[i];
+  }
+#else
+#ifndef _SC_LEVEL1_DCACHE_SIZE
+  // Provide reasonable default values if no info
+  cache_sizes_[0] = 32 * 1024;    // Level 1: 32k
+  cache_sizes_[1] = 256 * 1024;   // Level 2: 256k
+  cache_sizes_[2] = 3072 * 1024;  // Level 3: 3M
+#else
+  // Call sysconf to query for the cache sizes
+  cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
+  cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE);
+  cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE);
+#endif
+#endif
+
+  if (max_mhz != 0) {
+    cycles_per_ms_ = static_cast<int64_t>(max_mhz) * 1000;
+  } else {
+    cycles_per_ms_ = 1000000;
+  }
+  original_hardware_flags_ = hardware_flags_;
+
+  if (num_cores > 0) {
+    num_cores_ = num_cores;
+  } else {
+    num_cores_ = 1;
+  }
+
+  initialized_ = true;
+}
+
+void CpuInfo::VerifyCpuRequirements() {
+  if (!CpuInfo::IsSupported(CpuInfo::SSSE3)) {
+    DCHECK(false) << "CPU does not support the Supplemental SSE3 instruction set";
+  }
+}
+
+void CpuInfo::EnableFeature(int64_t flag, bool enable) {
+  DCHECK(initialized_);
+  if (!enable) {
+    hardware_flags_ &= ~flag;
+  } else {
+    // Can't turn something on that can't be supported
+    DCHECK_NE(original_hardware_flags_ & flag, 0);
+    hardware_flags_ |= flag;
+  }
+}
+
+int64_t CpuInfo::hardware_flags() {
+  DCHECK(initialized_);
+  return hardware_flags_;
+}
+
+int64_t CpuInfo::CacheSize(CacheLevel level) {
+  DCHECK(initialized_);
+  return cache_sizes_[level];
+}
+
+int64_t CpuInfo::cycles_per_ms() {
+  DCHECK(initialized_);
+  return cycles_per_ms_;
+}
+
+int CpuInfo::num_cores() {
+  DCHECK(initialized_);
+  return num_cores_;
+}
+
+std::string CpuInfo::model_name() {
+  DCHECK(initialized_);
+  return model_name_;
+}
+
+}  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/b0652287/cpp/src/arrow/util/cpu-info.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/cpu-info.h b/cpp/src/arrow/util/cpu-info.h
new file mode 100644
index 0000000..06800fc
--- /dev/null
+++ b/cpp/src/arrow/util/cpu-info.h
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29. Pared down to a minimal
+// set of functions needed for Apache Arrow / Apache parquet-cpp
+
+#ifndef ARROW_UTIL_CPU_INFO_H
+#define ARROW_UTIL_CPU_INFO_H
+
+#include <cstdint>
+#include <string>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// CpuInfo is an interface to query for cpu information at runtime.  The caller can
+/// ask for the sizes of the caches and what hardware features are supported.
+/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
+/// /sys/devices)
+class ARROW_EXPORT CpuInfo {
+ public:
+  static const int64_t SSSE3 = (1 << 1);
+  static const int64_t SSE4_1 = (1 << 2);
+  static const int64_t SSE4_2 = (1 << 3);
+  static const int64_t POPCNT = (1 << 4);
+
+  /// Cache enums for L1 (data), L2 and L3
+  enum CacheLevel {
+    L1_CACHE = 0,
+    L2_CACHE = 1,
+    L3_CACHE = 2,
+  };
+
+  /// Initialize CpuInfo.
+  static void Init();
+
+  /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error
+  /// and terminate.
+  static void VerifyCpuRequirements();
+
+  /// Returns all the flags for this cpu
+  static int64_t hardware_flags();
+
+  /// Returns whether of not the cpu supports this flag
+  inline static bool IsSupported(int64_t flag) { return (hardware_flags_ & flag) != 0; }
+
+  /// Toggle a hardware feature on and off.  It is not valid to turn on a feature
+  /// that the underlying hardware cannot support. This is useful for testing.
+  static void EnableFeature(int64_t flag, bool enable);
+
+  /// Returns the size of the cache in KB at this cache level
+  static int64_t CacheSize(CacheLevel level);
+
+  /// Returns the number of cpu cycles per millisecond
+  static int64_t cycles_per_ms();
+
+  /// Returns the number of cores (including hyper-threaded) on this machine.
+  static int num_cores();
+
+  /// Returns the model name of the cpu (e.g. Intel i7-2600)
+  static std::string model_name();
+
+  static bool initialized() { return initialized_; }
+
+ private:
+  static bool initialized_;
+  static int64_t hardware_flags_;
+  static int64_t original_hardware_flags_;
+  static int64_t cache_sizes_[L3_CACHE + 1];
+  static int64_t cycles_per_ms_;
+  static int num_cores_;
+  static std::string model_name_;  // NOLINT
+};
+
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_CPU_INFO_H