You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2020/03/27 05:13:18 UTC
[kudu] 01/02: Add core algorithms for columnar serialization

This is an automated email from the ASF dual-hosted git repository.

todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 0ba6cb8d6b38a992786e5027528349a43802fd31
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Tue Mar 24 14:59:54 2020 -0700

    Add core algorithms for columnar serialization
    
    This adds the core of the columnar serialization code paths. Even though
    we internally scan in a columnar fashion in the tablet server, sending
    those columns across the wire isn't straightforward. We have two bits of
    necessary processing:
    
    1) the selection vector needs to be taken into account so we only send
    back selected rows. This means we need to copy out the selected cells
    and also copy out the selected bits from the null bitmap where relevant.
    Doing the null bitmap portion efficiently with wide platform support
    makes up a lot of this patch.
    
    2) for the case of null values, we want to make sure we don't send
    uninitialized memory (which might include secrets!) to the client. So we
    need to zero out any cells where the corresponding non-null bitmap bit
    is unset.
    
    To keep the review manageable, this just adds some unit tests and all
    the new code is initially "dead". Later commits will add the parts that
    construct the full block of columns to be sent on the wire, hook this
    into the tserver, etc.
    
    Change-Id: I16f2993081aac54609aab4d8219ef0bf6c7708c2
    Reviewed-on: http://gerrit.cloudera.org:8080/15556
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
 LICENSE.txt                                    |  25 ++
 src/kudu/common/CMakeLists.txt                 |   5 +-
 src/kudu/common/columnar_serialization-test.cc | 179 ++++++++++++
 src/kudu/common/columnar_serialization.cc      | 365 +++++++++++++++++++++++++
 src/kudu/common/columnar_serialization.h       |  60 ++++
 src/kudu/common/zp7.cc                         | 173 ++++++++++++
 src/kudu/common/zp7.h                          |  36 +++
 7 files changed, 842 insertions(+), 1 deletion(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index 0b8f344..2ab810f 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -350,6 +350,31 @@ src/kudu/util/array_view.h: 3-clause BSD license with patent grant
   for this implementation of the WebRTC code package shall terminate as
   of the date such litigation is filed.
 
+--------------------------------------------------------------------------------
+
+src/kudu/common/zp7.cc: MIT license
+
+  ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
+
+  Copyright (c) 2020 Zach Wegner
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
 
 --------------------------------------------------------------------------------
 
diff --git a/src/kudu/common/CMakeLists.txt b/src/kudu/common/CMakeLists.txt
index 2d30d77..0a1814e 100644
--- a/src/kudu/common/CMakeLists.txt
+++ b/src/kudu/common/CMakeLists.txt
@@ -42,6 +42,7 @@ ADD_EXPORTABLE_LIBRARY(wire_protocol_proto
 set(COMMON_SRCS
   columnblock.cc
   column_predicate.cc
+  columnar_serialization.cc
   encoded_key.cc
   generic_iterators.cc
   id_mapping.cc
@@ -60,7 +61,8 @@ set(COMMON_SRCS
   table_util.cc
   timestamp.cc
   types.cc
-  wire_protocol.cc)
+  wire_protocol.cc
+  zp7.cc)
 
 # Workaround for clang bug https://llvm.org/bugs/show_bug.cgi?id=23757
 # in which it incorrectly optimizes key_util.cc and causes incorrect results.
@@ -80,6 +82,7 @@ ADD_EXPORTABLE_LIBRARY(kudu_common
   DEPS ${COMMON_LIBS})
 
 SET_KUDU_TEST_LINK_LIBS(kudu_common)
+ADD_KUDU_TEST(columnar_serialization-test)
 ADD_KUDU_TEST(columnblock-test)
 ADD_KUDU_TEST(column_predicate-test NUM_SHARDS 4)
 ADD_KUDU_TEST(encoded_key-test)
diff --git a/src/kudu/common/columnar_serialization-test.cc b/src/kudu/common/columnar_serialization-test.cc
new file mode 100644
index 0000000..9cff425
--- /dev/null
+++ b/src/kudu/common/columnar_serialization-test.cc
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "kudu/common/columnar_serialization.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "kudu/util/bitmap.h"
+#include "kudu/util/faststring.h"
+#include "kudu/util/random.h"
+#include "kudu/util/scoped_cleanup.h"
+#include "kudu/util/test_util.h"
+
+using std::vector;
+
+namespace kudu {
+
+class ColumnarSerializationTest : public KuduTest {
+ protected:
+  ColumnarSerializationTest() : rng_(SeedRandom()) {
+  }
+
+  // TODO(todd): templatize this test for other types once we have specialized
+  // implementations.
+  using DataType = uint32_t;
+  static constexpr int kTypeSize = sizeof(DataType);
+
+  struct RandomCellsAndNulls {
+    vector<DataType> vals;
+    faststring non_nulls;
+
+    void VerifyNullsAreZeroed() {
+      for (int i = 0; i < vals.size(); i++) {
+        SCOPED_TRACE(i);
+        if (BitmapTest(non_nulls.data(), i)) {
+          EXPECT_EQ(0xdeadbeef, vals[i]);
+        } else {
+          EXPECT_EQ(0, vals[i]);
+        }
+      }
+    }
+  };
+
+  // Generate a random bitmap with the given number of bits.
+  faststring RandomBitmap(int n_bits) {
+    faststring bm;
+    bm.resize(BitmapSize(n_bits));
+
+    for (int i = 0; i < n_bits; i++) {
+      BitmapChange(bm.data(), i, rng_.OneIn(3));
+    }
+    return bm;
+  }
+
+  // Create an array of 0xdeadbeef values and a corresponding
+  // null bitmap with random entries set to null.
+  RandomCellsAndNulls CreateDeadBeefsWithRandomNulls() {
+    auto num_rows = rng_.Uniform(1000) + 1;
+    vector<uint32_t> vals(num_rows, 0xdeadbeef);
+    faststring non_nulls = RandomBitmap(num_rows);
+    return { std::move(vals), std::move(non_nulls) };
+  }
+
+  Random rng_;
+};
+
+
+// Simple test of ZeroNullValues for a whole array.
+TEST_F(ColumnarSerializationTest, TestZeroNullValues) {
+  auto data = CreateDeadBeefsWithRandomNulls();
+
+  internal::ZeroNullValues(
+      kTypeSize, /* dst_idx= */0,
+      data.vals.size(),
+      reinterpret_cast<uint8_t*>(data.vals.data()),
+      data.non_nulls.data());
+
+  ASSERT_NO_FATAL_FAILURE(data.VerifyNullsAreZeroed());
+}
+
+// More complex test test of ZeroNullValues which runs on sub-ranges
+// of an array.
+TEST_F(ColumnarSerializationTest, TestZeroNullValuesWithOffset) {
+  auto data = CreateDeadBeefsWithRandomNulls();
+  int dst_idx = 0;
+  while (dst_idx < data.vals.size()) {
+    auto rem = data.vals.size() - dst_idx;
+    auto n = rng_.Uniform(rem) + 1;
+    internal::ZeroNullValues(
+        kTypeSize, dst_idx, n,
+        reinterpret_cast<uint8_t*>(data.vals.data()),
+        data.non_nulls.data());
+    dst_idx += n;
+  }
+  ASSERT_NO_FATAL_FAILURE(data.VerifyNullsAreZeroed());
+}
+
+TEST_F(ColumnarSerializationTest, TestCopyNonNullBitmap) {
+  auto save_method = internal::g_pext_method;
+  SCOPED_CLEANUP({ internal::g_pext_method = save_method; });
+  // Test using all available methods. Depending on the machine where
+  // the test is running we might miss some, but we typically run this
+  // test on relatively recent hardware that would support BMI2 (Haswell
+  // or later).
+  auto available_methods = internal::GetAvailablePextMethods();
+  for (auto m : available_methods) {
+    SCOPED_TRACE(static_cast<int>(m));
+    internal::g_pext_method = m;
+    auto n_rows = 1 + rng_.Uniform(200);
+    faststring non_null_bitmap = RandomBitmap(n_rows);
+    faststring sel_bitmap = RandomBitmap(n_rows);
+    faststring dst_bitmap;
+    dst_bitmap.resize(BitmapSize(n_rows));
+
+    internal::CopyNonNullBitmap(
+        non_null_bitmap.data(), sel_bitmap.data(),
+        /*dst_idx=*/0, n_rows,
+        dst_bitmap.data());
+
+    vector<bool> expected;
+    ForEachSetBit(sel_bitmap.data(), n_rows,
+                  [&](size_t bit) {
+                    expected.push_back(BitmapTest(non_null_bitmap.data(), bit));
+                  });
+    LOG(INFO) << "non-null:  " << BitmapToString(non_null_bitmap.data(), n_rows);
+    LOG(INFO) << "selection: " << BitmapToString(sel_bitmap.data(), n_rows);
+    LOG(INFO) << "result:    " << BitmapToString(dst_bitmap.data(), expected.size());
+    for (int i = 0; i < expected.size(); i++) {
+      EXPECT_EQ(expected[i], BitmapTest(dst_bitmap.data(), i));
+    }
+  }
+}
+
+TEST_F(ColumnarSerializationTest, TestCopySelectedRows) {
+  auto num_rows = rng_.Uniform(1000) + 1;
+  vector<uint32_t> vals;
+  for (int i = 0; i < num_rows; i++) {
+    vals.push_back(rng_.Next());
+  }
+
+  vector<uint32_t> expected;
+  vector<uint16_t> sel_indexes;
+  for (int i = 0; i < num_rows; i++) {
+    if (rng_.OneIn(3)) {
+      sel_indexes.push_back(i);
+      expected.push_back(vals[i]);
+    }
+  }
+
+  vector<uint32_t> ret(expected.size());
+  internal::CopySelectedRows(sel_indexes, kTypeSize,
+                             reinterpret_cast<const uint8_t*>(vals.data()),
+                             reinterpret_cast<uint8_t*>(ret.data()));
+  ASSERT_EQ(expected, ret);
+}
+
+} // namespace kudu
diff --git a/src/kudu/common/columnar_serialization.cc b/src/kudu/common/columnar_serialization.cc
new file mode 100644
index 0000000..69e8da9
--- /dev/null
+++ b/src/kudu/common/columnar_serialization.cc
@@ -0,0 +1,365 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "kudu/common/columnar_serialization.h"
+
+#include <immintrin.h>
+
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include <glog/logging.h>
+
+#include "kudu/common/zp7.h"
+#include "kudu/gutil/cpu.h"
+#include "kudu/gutil/port.h"
+#include "kudu/util/alignment.h"
+#include "kudu/util/bitmap.h"
+
+using std::vector;
+
+namespace kudu {
+
+namespace {
+
+// Utility to write variable bit-length values to a pre-allocated buffer.
+//
+// This is similar to the BitWriter class in util/bit-stream-utils.h except that
+// the other implementation manages growing an underlying 'faststring' rather
+// than writing to existing memory.
+struct BitWriter {
+
+  // Start writing data to 'dst', but skip over the first 'skip_initial_bits'
+  // bits.
+  BitWriter(uint8_t* dst, int skip_initial_bits) : dst_(dst) {
+    DCHECK_GE(skip_initial_bits, 0);
+    dst_ += skip_initial_bits / 8;
+
+    // The "skip" may place us in the middle of a byte. To simplify this,
+    // we just position ourselves at the start of that byte and buffer the
+    // pre-existing bits, thus positioning ourselves at the right offset.
+    int preexisting_bits = skip_initial_bits % 8;
+    uint8_t preexisting_val = *dst_ & ((1 << preexisting_bits) - 1);
+    Put(preexisting_val, preexisting_bits);
+  }
+
+  ~BitWriter() {
+    CHECK(flushed_) << "must flush";
+  }
+
+  void Put(uint64_t v, int num_bits) {
+    DCHECK(!flushed_);
+    DCHECK_LE(num_bits, 64);
+    buffered_values_ |= v << num_buffered_bits_;
+    num_buffered_bits_ += num_bits;
+
+    if (PREDICT_FALSE(num_buffered_bits_ >= 64)) {
+      memcpy(dst_, &buffered_values_, 8);
+      buffered_values_ = 0;
+      num_buffered_bits_ -= 64;
+      int shift = num_bits - num_buffered_bits_;
+      buffered_values_ = (shift >= 64) ? 0 : v >> shift;
+      dst_ += 8;
+    }
+    DCHECK_LT(num_buffered_bits_, 64);
+  }
+
+  void Flush() {
+    CHECK(!flushed_) << "must only flush once";
+    while (num_buffered_bits_ > 0) {
+      *dst_++ = buffered_values_ & 0xff;
+      buffered_values_ >>= 8;
+      num_buffered_bits_ -= 8;
+    }
+    flushed_ = true;
+  }
+
+  uint8_t* dst_;
+
+  // Accumulated bits that haven't been flushed to the destination buffer yet.
+  uint64_t buffered_values_ = 0;
+
+  // The number of accumulated bits in buffered_values_.
+  int num_buffered_bits_ = 0;
+
+  bool flushed_ = false;
+};
+
+} // anonymous namespace
+
+////////////////////////////////////////////////////////////
+// ZeroNullValues
+////////////////////////////////////////////////////////////
+
+namespace internal {
+
+namespace {
+// Implementation of ZeroNullValues, specialized for a particular type size.
+template<int sizeof_type>
+ATTRIBUTE_NOINLINE
+void ZeroNullValuesImpl(int dst_idx,
+                        int n_rows,
+                        uint8_t* __restrict__ dst_values_buf,
+                        uint8_t* __restrict__ non_null_bitmap) {
+  int aligned_dst_idx = KUDU_ALIGN_DOWN(dst_idx, 8);
+  int aligned_n_sel = n_rows + (dst_idx - aligned_dst_idx);
+
+  uint8_t* aligned_values_base = dst_values_buf + aligned_dst_idx * sizeof_type;
+
+  // TODO(todd): this code path benefits from the BMI instruction set. We should
+  // compile it twice, once with BMI supported.
+  ForEachUnsetBit(non_null_bitmap + aligned_dst_idx/8,
+                  aligned_n_sel,
+                  [&](int position) {
+                    // The position here is relative to our aligned bitmap.
+                    memset(aligned_values_base + position * sizeof_type, 0, sizeof_type);
+                  });
+}
+
+} // anonymous namespace
+
+// Zero out any values in 'dst_values_buf' which are indicated as null in 'non_null_bitmap'.
+//
+// 'n_rows' cells are processed, starting at index 'dst_idx' within the buffers.
+// 'sizeof_type' indicates the size of each cell in bytes.
+//
+// NOTE: this assumes that dst_values_buf and non_null_bitmap are valid for the full range
+// of indices [0, dst_idx + n_rows). The implementation may redundantly re-zero cells
+// at indexes less than dst_idx.
+void ZeroNullValues(int sizeof_type,
+                    int dst_idx,
+                    int n_rows,
+                    uint8_t* dst_values_buf,
+                    uint8_t* dst_non_null_bitmap) {
+  // Delegate to specialized implementations for each type size.
+  // This changes variable-length memsets into inlinable single instructions.
+  switch (sizeof_type) {
+#define CASE(size)                                                      \
+    case size:                                                          \
+      ZeroNullValuesImpl<size>(dst_idx, n_rows, dst_values_buf, dst_non_null_bitmap); \
+      break;
+    CASE(1);
+    CASE(2);
+    CASE(4);
+    CASE(8);
+    CASE(16);
+#undef CASE
+    default:
+      LOG(FATAL) << "bad size: " << sizeof_type;
+  }
+}
+
+
+////////////////////////////////////////////////////////////
+// CopyNonNullBitmap
+////////////////////////////////////////////////////////////
+
+namespace {
+template<class PextImpl>
+void CopyNonNullBitmapImpl(
+    const uint8_t* __restrict__ non_null_bitmap,
+    const uint8_t* __restrict__ sel_bitmap,
+    int dst_idx,
+    int n_rows,
+    uint8_t* __restrict__ dst_non_null_bitmap) {
+  BitWriter bw(dst_non_null_bitmap, dst_idx);
+
+  int num_64bit_words = n_rows / 64;
+  for (int i = 0; i < num_64bit_words; i++) {
+    uint64_t sel_mask = UnalignedLoad<uint64_t>(sel_bitmap + i * 8);
+    int num_bits = __builtin_popcountll(sel_mask);
+
+    uint64_t non_nulls = UnalignedLoad<uint64_t>(non_null_bitmap + i * 8);
+    uint64_t extracted = PextImpl::call(non_nulls, sel_mask);
+    bw.Put(extracted, num_bits);
+  }
+
+  int rem_rows = n_rows % 64;
+  non_null_bitmap += num_64bit_words * 8;
+  sel_bitmap += num_64bit_words * 8;
+  while (rem_rows > 0) {
+    uint8_t non_nulls = *non_null_bitmap;
+    uint8_t sel_mask = *sel_bitmap;
+
+    uint64_t extracted = PextImpl::call(non_nulls, sel_mask);
+    int num_bits = __builtin_popcountl(sel_mask);
+    bw.Put(extracted, num_bits);
+
+    sel_bitmap++;
+    non_null_bitmap++;
+    rem_rows -= 8;
+  }
+  bw.Flush();
+}
+
+struct PextZp7Clmul {
+  inline static uint64_t call(uint64_t val, uint64_t mask) {
+    return zp7_pext_64_clmul(val, mask);
+  }
+};
+struct PextZp7Simple {
+  inline static uint64_t call(uint64_t val, uint64_t mask) {
+    return zp7_pext_64_simple(val, mask);
+  }
+};
+
+#ifdef __x86_64__
+struct PextInstruction {
+  __attribute__((target("bmi2")))
+  inline static uint64_t call(uint64_t val, uint64_t mask) {
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5
+    // GCC <5 doesn't properly handle the _pext_u64 intrinsic inside
+    // a function with a specified target attribute. So, use inline
+    // assembly instead.
+    //
+    // Though this assembly works on clang as well, it has two downsides:
+    // - the "multiple constraint" 'rm' for 'mask' is supposed to indicate to
+    //   the compiler that the mask could either be in memory or in a register,
+    //   but clang doesn't support this, and will always spill it to memory
+    //   even if the value is already in a register. That results in an extra couple
+    //   cycles.
+    // - using the intrinsic means that clang optimization passes have some opportunity
+    //   to better understand what's going on and make appropriate downstream optimizations.
+    uint64_t dst;
+    asm ("pextq %[mask], %[val], %[dst]"
+        : [dst] "=r" (dst)
+        : [val] "r" (val),
+          [mask] "rm" (mask));
+    return dst;
+#else
+    return _pext_u64(val, mask);
+#endif // compiler check
+  }
+};
+// Explicit instantiation of the template for the PextInstruction case
+// allows us to apply the 'bmi2' target attribute for just this version.
+template
+__attribute__((target("bmi2")))
+void CopyNonNullBitmapImpl<PextInstruction>(
+    const uint8_t* __restrict__ non_null_bitmap,
+    const uint8_t* __restrict__ sel_bitmap,
+    int dst_idx,
+    int n_rows,
+    uint8_t* __restrict__ dst_non_null_bitmap);
+#endif // __x86_64__
+
+} // anonymous namespace
+
+// Return a prioritized list of methods that can be used for extracting bits from the non-null
+// bitmap.
+vector<PextMethod> GetAvailablePextMethods() {
+  vector<PextMethod> ret;
+#ifdef __x86_64__
+  base::CPU cpu;
+  // Even though recent AMD chips support pext, it's extremely slow,
+  // so only use BMI2 on Intel, and instead use the 'zp7' software
+  // implementation on AMD.
+  if (cpu.has_bmi2() && cpu.vendor_name() == "GenuineIntel") {
+    ret.push_back(PextMethod::kPextInstruction);
+  }
+  if (cpu.has_pclmulqdq()) {
+    ret.push_back(PextMethod::kClmul);
+  }
+#endif
+  ret.push_back(PextMethod::kSimple);
+  return ret;
+}
+
+PextMethod g_pext_method = GetAvailablePextMethods()[0];
+
+void CopyNonNullBitmap(const uint8_t* non_null_bitmap,
+                       const uint8_t* sel_bitmap,
+                       int dst_idx,
+                       int n_rows,
+                       uint8_t* dst_non_null_bitmap) {
+  switch (g_pext_method) {
+#ifdef __x86_64__
+    case PextMethod::kPextInstruction:
+      CopyNonNullBitmapImpl<PextInstruction>(
+          non_null_bitmap, sel_bitmap, dst_idx, n_rows, dst_non_null_bitmap);
+      break;
+    case PextMethod::kClmul:
+      CopyNonNullBitmapImpl<PextZp7Clmul>(
+          non_null_bitmap, sel_bitmap, dst_idx, n_rows, dst_non_null_bitmap);
+      break;
+#endif
+    case PextMethod::kSimple:
+      CopyNonNullBitmapImpl<PextZp7Simple>(
+          non_null_bitmap,  sel_bitmap, dst_idx, n_rows, dst_non_null_bitmap);
+      break;
+    default:
+      __builtin_unreachable();
+  }
+}
+
+////////////////////////////////////////////////////////////
+// CopySelectedRows
+////////////////////////////////////////////////////////////
+
+namespace {
+template<int sizeof_type>
+ATTRIBUTE_NOINLINE
+void CopySelectedRowsImpl(const uint16_t* __restrict__ sel_rows,
+                          int n_sel_rows,
+                          const uint8_t* __restrict__ src_buf,
+                          uint8_t* __restrict__ dst_buf) {
+  int rem = n_sel_rows;
+  while (rem--) {
+    int idx = *sel_rows++;
+    memcpy(dst_buf, src_buf + idx * sizeof_type, sizeof_type);
+    dst_buf += sizeof_type;
+  }
+  // TODO(todd): should we zero out nulls first or otherwise avoid
+  // copying them?
+}
+
+template<int sizeof_type>
+ATTRIBUTE_NOINLINE
+void CopySelectedRowsImpl(const vector<uint16_t>& sel_rows,
+                          const uint8_t* __restrict__ src_buf,
+                          uint8_t* __restrict__ dst_buf) {
+  CopySelectedRowsImpl<sizeof_type>(sel_rows.data(), sel_rows.size(), src_buf, dst_buf);
+}
+
+} // anonymous namespace
+
+void CopySelectedRows(const vector<uint16_t>& sel_rows,
+                      int sizeof_type,
+                      const uint8_t* __restrict__ src_buf,
+                      uint8_t* __restrict__ dst_buf) {
+  switch (sizeof_type) {
+#define CASE(size)                                            \
+    case size:                                                \
+      CopySelectedRowsImpl<size>(sel_rows, src_buf, dst_buf); \
+      break;
+    CASE(1);
+    CASE(2);
+    CASE(4);
+    CASE(8);
+    CASE(16);
+#undef CASE
+    default:
+      LOG(FATAL) << "unexpected type size: " << sizeof_type;
+  }
+}
+
+} // namespace internal
+
+} // namespace kudu
diff --git a/src/kudu/common/columnar_serialization.h b/src/kudu/common/columnar_serialization.h
new file mode 100644
index 0000000..04025b7
--- /dev/null
+++ b/src/kudu/common/columnar_serialization.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace kudu {
+
+////////////////////////////////////////////////////////////
+// Expose these internal functions for unit testing.
+// Do not call them outside of tests!
+// See .cc file for docs.
+////////////////////////////////////////////////////////////
+namespace internal {
+void ZeroNullValues(int type_size,
+                    int dst_idx,
+                    int n_rows,
+                    uint8_t* dst_values_buf,
+                    uint8_t* non_null_bitmap);
+
+void CopyNonNullBitmap(const uint8_t* non_null_bitmap,
+                       const uint8_t* sel_bitmap,
+                       int dst_idx,
+                       int n_rows,
+                       uint8_t* dst_non_null_bitmap);
+
+void CopySelectedRows(const std::vector<uint16_t>& sel_rows,
+                      int type_size,
+                      const uint8_t* __restrict__ src_buf,
+                      uint8_t* __restrict__ dst_buf);
+
+
+enum class PextMethod {
+#ifdef __x86_64__
+  kPextInstruction,
+  kClmul,
+#endif
+  kSimple
+};
+extern PextMethod g_pext_method;
+
+std::vector<PextMethod> GetAvailablePextMethods();
+
+} // namespace internal
+} // namespace kudu
diff --git a/src/kudu/common/zp7.cc b/src/kudu/common/zp7.cc
new file mode 100644
index 0000000..de817d7
--- /dev/null
+++ b/src/kudu/common/zp7.cc
@@ -0,0 +1,173 @@
+// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
+//
+// Copyright (c) 2020 Zach Wegner
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Based on https://github.com/zwegner/zp7 as of
+// a1ed4e5ace07f7d69cb50af1cbd37df4fa3d87af
+//
+// This is imported rather than included via thirdparty since the upstream
+// project has no header file. It has been modified as follows:
+//
+// - remove 'pdep' implementations since we only need 'pext'.
+// - separate the clmul and non-accelerated variant into separate
+//   functions so they can be switched at runtime.
+// - put inside a kudu namespace
+// - disable UBSAN for some undefined integer casts
+
+#include "kudu/common/zp7.h"
+
+#include "kudu/gutil/port.h"
+
+#ifdef __x86_64__
+#include <emmintrin.h>
+#endif
+
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5
+#define USE_INLINE_ASM_CLMUL
+#else
+#include <wmmintrin.h>
+#endif
+
+#define N_BITS      (6)
+
+namespace kudu {
+
+typedef struct {
+  uint64_t mask;
+  uint64_t ppp_bit[N_BITS];
+} zp7_masks_64_t;
+
+// If we don't have access to the CLMUL instruction, emulate it with
+// shifts and XORs
+static inline uint64_t prefix_sum(uint64_t x) {
+  for (int i = 0; i < N_BITS; i++)
+    x ^= x << (1 << i);
+  return x;
+}
+
+// GCC <5 doesn't properly handle the _pext_u64 intrinsic inside
+// a function with a specified target attribute. So, use inline
+// assembly instead.
+#ifdef USE_INLINE_ASM_CLMUL
+static inline __m128i asm_mm_clmulepi64_si128(__m128i a, __m128i b) {
+  asm ("pclmulqdq $0, %1, %0"
+       : "+x" (a)
+       : "xm" (b));
+  return a;
+}
+#define CLMUL asm_mm_clmulepi64_si128
+#else
+#define CLMUL(a, b) (_mm_clmulepi64_si128(a, b, 0))
+#endif
+
+
+// Parallel-prefix-popcount. This is used by both the PEXT/PDEP polyfills.
+// It can also be called separately and cached, if the mask values will be used
+// more than once (these can be shared across PEXT and PDEP calls if they use
+// the same masks).
+//
+// This variant depends on the CLMUL instruction.
+__attribute__((target("pclmul")))
+ATTRIBUTE_NO_SANITIZE_INTEGER
+static zp7_masks_64_t zp7_ppp_64_clmul(uint64_t mask) {
+  zp7_masks_64_t r;
+  r.mask = mask;
+
+  // Count *unset* bits
+  mask = ~mask;
+
+  // Move the mask and -2 to XMM registers for CLMUL
+  __m128i m = _mm_cvtsi64_si128(mask);
+  __m128i neg_2 = _mm_cvtsi64_si128(-2LL);
+  for (int i = 0; i < N_BITS - 1; i++) {
+    // Do a 1-bit parallel prefix popcount, shifted left by 1,
+    // in one carry-less multiply by -2.
+    __m128i bit = CLMUL(m, neg_2);
+    r.ppp_bit[i] = _mm_cvtsi128_si64(bit);
+
+    // Get the carry bit of the 1-bit parallel prefix popcount. On
+    // the next iteration, we will sum this bit to get the next mask
+    m = _mm_and_si128(m, bit);
+  }
+  // For the last iteration, we can use a regular multiply by -2 instead of a
+  // carry-less one (or rather, a strength reduction of that, with
+  // neg/add/etc), since there can't be any carries anyways. That is because
+  // the last value of m (which has one bit set for every 32nd unset mask bit)
+  // has at most two bits set in it, when mask is zero and thus there are 64
+  // bits set in ~mask. If two bits are set, one of them is the top bit, which
+  // gets shifted out, since we're counting bits below each mask bit.
+  r.ppp_bit[N_BITS - 1] = -_mm_cvtsi128_si64(m) << 1;
+
+  return r;
+}
+
+// Implementation that doesn't depend on CLMUL
+ATTRIBUTE_NO_SANITIZE_INTEGER
+static zp7_masks_64_t zp7_ppp_64_simple(uint64_t mask) {
+  zp7_masks_64_t r;
+  r.mask = mask;
+
+  // Count *unset* bits
+  mask = ~mask;
+  for (int i = 0; i < N_BITS - 1; i++) {
+    // Do a 1-bit parallel prefix popcount, shifted left by 1
+    uint64_t bit = prefix_sum(mask << 1);
+    r.ppp_bit[i] = bit;
+
+    // Get the carry bit of the 1-bit parallel prefix popcount. On
+    // the next iteration, we will sum this bit to get the next mask
+    mask &= bit;
+  }
+  // The last iteration won't carry, so just use neg/shift. See the CLMUL
+  // case above for justification.
+  r.ppp_bit[N_BITS - 1] = -mask << 1;
+  return r;
+}
+
+// PEXT
+
+static uint64_t zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks) {
+  // Mask only the bits that are set in the input mask. Otherwise they collide
+  // with input bits and screw everything up
+  a &= masks->mask;
+
+  // For each bit in the PPP, shift right only those bits that are set in
+  // that bit's mask
+  for (int i = 0; i < N_BITS; i++) {
+    uint64_t shift = 1 << i;
+    uint64_t bit = masks->ppp_bit[i];
+    // Shift only the input bits that are set in
+    a = (a & ~bit) | ((a & bit) >> shift);
+  }
+  return a;
+}
+
+uint64_t zp7_pext_64_simple(uint64_t a, uint64_t mask) {
+  zp7_masks_64_t masks = zp7_ppp_64_simple(mask);
+  return zp7_pext_pre_64(a, &masks);
+}
+
+uint64_t zp7_pext_64_clmul(uint64_t a, uint64_t mask) {
+  zp7_masks_64_t masks = zp7_ppp_64_clmul(mask);
+  return zp7_pext_pre_64(a, &masks);
+}
+
+} // namespace kudu
diff --git a/src/kudu/common/zp7.h b/src/kudu/common/zp7.h
new file mode 100644
index 0000000..800e4e4
--- /dev/null
+++ b/src/kudu/common/zp7.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+
+namespace kudu {
+
+// Extract bits from 'a' corresponding to 'mask'.
+//
+// This is a software implementation of the 'PEXT' instruction
+// from the BMI2 instruction set.
+//
+// This implementation uses the CLMUL instruction set. Callers should
+// verify that the instruction is present (eg using base::CPU) before
+// calling.
+uint64_t zp7_pext_64_clmul(uint64_t a, uint64_t mask);
+
+// This implementation is slower but doesn't require any special instructions.
+uint64_t zp7_pext_64_simple(uint64_t a, uint64_t mask);
+
+} // namespace kudu