You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2020/04/06 18:19:41 UTC
[kudu] branch master updated: columnar_serialization: use AVX2 for
int32 and int64 copying
This is an automated email from the ASF dual-hosted git repository.
todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new aef3d42 columnar_serialization: use AVX2 for int32 and int64 copying
aef3d42 is described below
commit aef3d4291b0051184249d03bd3ee828ec7739189
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Wed Apr 1 13:51:47 2020 -0700
columnar_serialization: use AVX2 for int32 and int64 copying
This uses the AVX2 "gather" instructions to do the copying of selected
int32s and int64s. The following improvements were observed:
Int32:
Converting 10_int32_non_null to PB (method columnar) row select rate 1: 0.8829691 cycles/cell -> 0.8386091 cycles/cell
Converting 10_int32_non_null to PB (method columnar) row select rate 0.8: 1.86863074 cycles/cell -> 1.61456746 cycles/cell
Converting 10_int32_non_null to PB (method columnar) row select rate 0.5: 2.3829623 cycles/cell -> 2.05157198 cycles/cell
Converting 10_int32_non_null to PB (method columnar) row select rate 0.2: 4.15909214 cycles/cell -> 3.82449024 cycles/cell
Converting 10_int32_0pct_null to PB (method columnar) row select rate 1: 1.04652828 cycles/cell -> 1.01822806 cycles/cell
Converting 10_int32_0pct_null to PB (method columnar) row select rate 0.8: 2.10860372 cycles/cell -> 1.85333702 cycles/cell
Converting 10_int32_0pct_null to PB (method columnar) row select rate 0.5: 2.75141002 cycles/cell -> 2.39638206 cycles/cell
Converting 10_int32_0pct_null to PB (method columnar) row select rate 0.2: 4.6968821 cycles/cell -> 4.40193506 cycles/cell
Converting 10_int32_10pct_null to PB (method columnar) row select rate 1: 1.31809924 cycles/cell -> 1.31851512 cycles/cell
Converting 10_int32_10pct_null to PB (method columnar) row select rate 0.8: 2.36648378 cycles/cell -> 2.12030662 cycles/cell
Converting 10_int32_10pct_null to PB (method columnar) row select rate 0.5: 2.98480266 cycles/cell -> 2.7476185 cycles/cell
Converting 10_int32_10pct_null to PB (method columnar) row select rate 0.2: 5.0439634 cycles/cell -> 4.5842071 cycles/cell
Int64:
Converting 10_int64_non_null to PB (method columnar) row select rate 1: 1.32330358 cycles/cell -> 1.24855148 cycles/cell
Converting 10_int64_non_null to PB (method columnar) row select rate 0.8: 2.04848734 cycles/cell -> 2.12979712 cycles/cell
Converting 10_int64_non_null to PB (method columnar) row select rate 0.5: 2.50150968 cycles/cell -> 2.5724664 cycles/cell
Converting 10_int64_non_null to PB (method columnar) row select rate 0.2: 4.4513395 cycles/cell -> 4.35936382 cycles/cell
Converting 10_int64_0pct_null to PB (method columnar) row select rate 1: 1.5080423 cycles/cell -> 1.51448434 cycles/cell
Converting 10_int64_0pct_null to PB (method columnar) row select rate 0.8: 2.34286302 cycles/cell -> 2.26529584 cycles/cell
Converting 10_int64_0pct_null to PB (method columnar) row select rate 0.5: 2.99375316 cycles/cell -> 2.7263687 cycles/cell
Converting 10_int64_0pct_null to PB (method columnar) row select rate 0.2: 5.01722324 cycles/cell -> 4.71793008 cycles/cell
Converting 10_int64_10pct_null to PB (method columnar) row select rate 1: 1.7227708 cycles/cell -> 1.67661726 cycles/cell
Converting 10_int64_10pct_null to PB (method columnar) row select rate 0.8: 2.68160422 cycles/cell -> 2.50480846 cycles/cell
Converting 10_int64_10pct_null to PB (method columnar) row select rate 0.5: 3.29833934 cycles/cell -> 3.05940708 cycles/cell
Converting 10_int64_10pct_null to PB (method columnar) row select rate 0.2: 5.42127834 cycles/cell -> 4.99359244 cycles/cell
In the few places that the above indicates a regression, I looped that
same test case and found that the "after" was indeed either
indistinguishable or slightly faster. The test results just have a
little bit of noise.
Change-Id: I6c9a536b78a524e8178f5d4a0d2dea04deedbd78
Reviewed-on: http://gerrit.cloudera.org:8080/15634
Tested-by: Todd Lipcon <to...@apache.org>
Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
src/kudu/common/columnar_serialization.cc | 102 +++++++++++++++++++++++++++---
1 file changed, 93 insertions(+), 9 deletions(-)
diff --git a/src/kudu/common/columnar_serialization.cc b/src/kudu/common/columnar_serialization.cc
index 5e647d6..47060de 100644
--- a/src/kudu/common/columnar_serialization.cc
+++ b/src/kudu/common/columnar_serialization.cc
@@ -17,6 +17,7 @@
#include "kudu/common/columnar_serialization.h"
+#include <emmintrin.h>
#include <immintrin.h>
#include <cstring>
@@ -319,6 +320,79 @@ void CopyNonNullBitmap(const uint8_t* non_null_bitmap,
////////////////////////////////////////////////////////////
namespace {
+
+const bool kHasAvx2 = base::CPU().has_avx2();
+
+// Return the number of rows copied through an AVX-optimized implementation.
+// These implementations leave a "tail" of non-vectorizable rows that get
+// handled by the scalar implementation.
+template<int sizeof_type>
+int CopySelectedRowsAvx(
+ const uint16_t* __restrict__ /* sel_rows */,
+ int /* n_sel_rows */,
+ const uint8_t* __restrict__ /* src_buf */,
+ uint8_t* __restrict__ /* dst_buf */) {
+ return 0;
+}
+
+// Define AVX2-optimized variants where possible.
+// These are disabled on GCC4 because it doesn't support per-function
+// enabling of intrinsics.
+#if __x86_64__ && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5))
+template<>
+__attribute__((target("avx2")))
+int CopySelectedRowsAvx<4>(
+ const uint16_t* __restrict__ sel_rows,
+ int n_sel_rows,
+ const uint8_t* __restrict__ src_buf,
+ uint8_t* __restrict__ dst_buf) {
+
+ static constexpr int sizeof_type = 4;
+ static constexpr int ints_per_vector = sizeof(__m256i)/sizeof_type;
+ int iters = n_sel_rows / ints_per_vector;
+ while (iters--) {
+ // Load 8x16-bit indexes from sel_rows, zero-extending them to 8x32-bit integers
+ // since the 'gather' instructions don't support 16-bit indexes.
+ __m256i indexes = _mm256_cvtepu16_epi32(*reinterpret_cast<const __m128i*>(sel_rows));
+ // Gather 8x32-bit elements from src_buf[index*sizeof_type] for each index.
+ __m256i elems = _mm256_i32gather_epi32(reinterpret_cast<const int*>(src_buf), indexes, sizeof_type);
+ // Store the 8x32-bit elements into the destination.
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst_buf), elems);
+ dst_buf += ints_per_vector * sizeof_type;
+ sel_rows += ints_per_vector;
+ }
+ return KUDU_ALIGN_DOWN(n_sel_rows, ints_per_vector);
+}
+
+template<>
+__attribute__((target("avx2")))
+int CopySelectedRowsAvx<8>(
+ const uint16_t* __restrict__ sel_rows,
+ int n_sel_rows,
+ const uint8_t* __restrict__ src_buf,
+ uint8_t* __restrict__ dst_buf) {
+
+ static constexpr int sizeof_type = 8;
+ static constexpr int ints_per_vector = sizeof(__m256i)/sizeof_type;
+ int iters = n_sel_rows / ints_per_vector;
+ while (iters--) {
+ // Load 4x16-bit indexes from sel_rows into 'indexes'. This compiles down
+ // into a single vpmovzxwd instruction despite looking like four separate loads.
+ __m128i indexes = _mm_set_epi32(sel_rows[3],
+ sel_rows[2],
+ sel_rows[1],
+ sel_rows[0]);
+ // Load 4x64-bit integers from src_buf[index * sizeof_type] for each index.
+ __m256i elems = _mm256_i32gather_epi64(reinterpret_cast<const long long*>(src_buf), indexes, sizeof_type);
+ // Store the 4x64-bit integers in the destination.
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst_buf), elems);
+ dst_buf += ints_per_vector * sizeof_type;
+ sel_rows += ints_per_vector;
+ }
+ return KUDU_ALIGN_DOWN(n_sel_rows, ints_per_vector);
+}
+#endif
+
template<int sizeof_type>
ATTRIBUTE_NOINLINE
void CopySelectedRowsImpl(const uint16_t* __restrict__ sel_rows,
@@ -326,6 +400,13 @@ void CopySelectedRowsImpl(const uint16_t* __restrict__ sel_rows,
const uint8_t* __restrict__ src_buf,
uint8_t* __restrict__ dst_buf) {
int rem = n_sel_rows;
+ if (kHasAvx2) {
+ int copied = CopySelectedRowsAvx<sizeof_type>(sel_rows, n_sel_rows, src_buf, dst_buf);
+ rem -= copied;
+ dst_buf += copied * sizeof_type;
+ sel_rows += copied;
+ }
+
while (rem--) {
int idx = *sel_rows++;
memcpy(dst_buf, src_buf + idx * sizeof_type, sizeof_type);
@@ -345,6 +426,8 @@ void CopySelectedRowsImpl(const vector<uint16_t>& sel_rows,
} // anonymous namespace
+// Copy the selected cells from the column data 'src_buf' into 'dst_buf' as indicated by
+// the indices in 'sel_rows'. 'sizeof_type' is the size in bytes of each cell.
void CopySelectedRows(const vector<uint16_t>& sel_rows,
int sizeof_type,
const uint8_t* __restrict__ src_buf,
@@ -395,7 +478,7 @@ void RelocateSlicesToIndirect(uint8_t* __restrict__ cells_buf, int n_rows,
// Specialized division for the known type sizes. Despite having some branching here,
// this is faster than a 'div' instruction which has a 20+ cycle latency.
-size_t div_type_size(size_t s, size_t divisor) {
+size_t div_sizeof_type(size_t s, size_t divisor) {
switch (divisor) {
case 1: return s;
case 2: return s/2;
@@ -412,22 +495,22 @@ size_t div_type_size(size_t s, size_t divisor) {
void CopySelectedCellsFromColumn(const ColumnBlock& cblock,
const SelectedRows& sel_rows,
ColumnarSerializedBatch::Column* dst) {
- size_t type_size = cblock.type_info()->size();
+ size_t sizeof_type = cblock.type_info()->size();
int n_sel = sel_rows.num_selected();
// Number of initial rows in the dst values and null_bitmap.
- DCHECK_EQ(dst->data.size() % type_size, 0);
- size_t initial_rows = div_type_size(dst->data.size(), type_size);
+ DCHECK_EQ(dst->data.size() % sizeof_type, 0);
+ size_t initial_rows = div_sizeof_type(dst->data.size(), sizeof_type);
size_t new_num_rows = initial_rows + n_sel;
- dst->data.resize_with_extra_capacity(type_size * new_num_rows);
- uint8_t* dst_buf = dst->data.data() + type_size * initial_rows;
+ dst->data.resize_with_extra_capacity(sizeof_type * new_num_rows);
+ uint8_t* dst_buf = dst->data.data() + sizeof_type * initial_rows;
const uint8_t* src_buf = cblock.cell_ptr(0);
if (sel_rows.all_selected()) {
- memcpy(dst_buf, src_buf, type_size * n_sel);
+ memcpy(dst_buf, src_buf, sizeof_type * n_sel);
} else {
- CopySelectedRows(sel_rows.indexes(), type_size, src_buf, dst_buf);
+ CopySelectedRows(sel_rows.indexes(), sizeof_type, src_buf, dst_buf);
}
if (cblock.is_nullable()) {
@@ -437,7 +520,8 @@ void CopySelectedCellsFromColumn(const ColumnBlock& cblock,
sel_rows.bitmap(),
initial_rows, cblock.nrows(),
dst->non_null_bitmap->data());
- ZeroNullValues(type_size, initial_rows, n_sel, dst->data.data(), dst->non_null_bitmap->data());
+ ZeroNullValues(sizeof_type, initial_rows, n_sel,
+ dst->data.data(), dst->non_null_bitmap->data());
}
if (cblock.type_info()->physical_type() == BINARY) {