Posted to commits@impala.apache.org by jb...@apache.org on 2016/09/07 03:28:57 UTC

[2/2] incubator-impala git commit: Add FNV, Zobrist, and SIMD hash functions to the int hash benchmark.

Add FNV, Zobrist, and SIMD hash functions to the int hash benchmark.

Additionally, change the shift parameter of RotateRight to a
compile-time template constant, and mark functions "inline",
dramatically increasing performance. The compiler can't inline the
SIMD versions, because they use Intel intrinsics; added a TODO to add
these intrinsics to sse-util.h.
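
As a condensed sketch of the rotate change (not the committed code;
the committed version, which uses std::numeric_limits, appears in the
diff below): making the shift a template parameter turns the runtime
DCHECKs into static_asserts and lets GCC and Clang emit a single
'ror' instruction with an immediate operand.

    // Before: shift is a runtime value, validated on every call.
    uint32_t RotateRight(uint32_t x, int shift) {
      return (x << (32 - shift)) | (x >> shift);
    }

    // After: shift is a compile-time constant; out-of-range shifts
    // fail to compile instead of invoking undefined behavior at runtime.
    template <int SHIFT>
    inline uint32_t RotateRight(uint32_t x) {
      static_assert(SHIFT > 0 && SHIFT < 32, "shift out of range");
      return (x << (32 - SHIFT)) | (x >> SHIFT);
    }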

Change-Id: I11d48f8816d5b129858a1f773015e51049dd1d61
Reviewed-on: http://gerrit.cloudera.org:8080/4313
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/39e01abc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/39e01abc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/39e01abc

Branch: refs/heads/master
Commit: 39e01abcf38bafaa03810fdbfd743a1558db92d8
Parents: b35689d
Author: Jim Apple <jb...@cloudera.com>
Authored: Mon Sep 5 14:15:06 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Wed Sep 7 01:35:30 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/int-hash-benchmark.cc | 302 ++++++++++++++++++++++-----
 1 file changed, 245 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/39e01abc/be/src/benchmarks/int-hash-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/int-hash-benchmark.cc b/be/src/benchmarks/int-hash-benchmark.cc
index a0787f1..9bed3c4 100644
--- a/be/src/benchmarks/int-hash-benchmark.cc
+++ b/be/src/benchmarks/int-hash-benchmark.cc
@@ -19,10 +19,14 @@
 
 #include <iostream>
 #include <limits>
+#include <memory>
 #include <vector>
 
+#include <immintrin.h>
+
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
+#include "util/hash-util.h"
 #include "util/sse-util.h"
 
 using namespace std;
@@ -31,22 +35,49 @@ using namespace impala;
 // Test hash functions that take integers as arguments and produce integers as the result.
 //
 // Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
-// 32 -> 32:             Function     Rate (iters/ms)          Comparison
-// ----------------------------------------------------------------------
-//                       Jenkins2               14.06                  1X
-//                       Jenkins1               16.97              1.207X
-//                        MultRot               19.14              1.361X
-//               MultiplyAddShift               21.21              1.509X
-//                  MultiplyShift                25.3              1.799X
-//                            CRC               25.53              1.816X
+// 32 -> 32:                  Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                                 FNV               25.7       26     26.4         1X         1X         1X
+//                             Zobrist               30.2     30.7     30.9      1.18X      1.18X      1.17X
+//                             MultRot               42.6     43.4     43.4      1.66X      1.67X      1.64X
+//                    MultiplyAddShift               42.4     43.2     43.2      1.65X      1.66X      1.64X
+//                            Jenkins1               51.8       54       54      2.02X      2.08X      2.05X
+//                            Jenkins2               66.2     67.4     67.5      2.58X      2.59X      2.56X
+//                                 CRC               98.6      100      101      3.84X      3.85X      3.84X
+//                       MultiplyShift                150      152      153      5.84X      5.86X      5.79X
+//
+// 32 x 4 -> 32 x 4:          Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//              (Multiple<Zobrist, 4>)               30.8     31.2     31.4         1X         1X         1X
+//     (Multiple<MultiplyAddShift, 4>)               44.2       45       45      1.43X      1.44X      1.43X
+//                  (Multiple<CRC, 4>)                118      120      121      3.84X      3.86X      3.85X
+//        (Multiple<MultiplyShift, 4>)                156      159      159      5.07X       5.1X      5.08X
+//                 MultiplyAddShift128               75.7     77.2     77.2      2.46X      2.48X      2.46X
+//                    MultiplyShift128                128      131      133      4.16X      4.21X      4.23X
+//
+// 32 x 8 -> 32 x 8:          Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//              (Multiple<Zobrist, 8>)                 31     31.5     31.8         1X         1X         1X
+//     (Multiple<MultiplyAddShift, 8>)               44.3     44.5     45.2      1.43X      1.41X      1.42X
+//                  (Multiple<CRC, 8>)                121      123      123       3.9X       3.9X      3.88X
+//        (Multiple<MultiplyShift, 8>)                158      159      160      5.11X      5.05X      5.04X
+//                    Zobrist256simple               16.5     16.5     16.6     0.533X     0.524X     0.522X
+//                    Zobrist256gather               18.8     19.2     19.4     0.608X     0.608X      0.61X
+//                 MultiplyAddShift256                151      154      154      4.88X      4.88X      4.84X
+//                    MultiplyShift256                209      212      212      6.73X      6.71X      6.67X
 
 // Rotate 32 bits right by 'shift'. This is _rotr in the Intel intrinsics, but that isn't
 // usable on Clang yet. Fortunately, both GCC and Clang can optimize this to use the 'ror'
 // instruction.
-uint32_t RotateRight(uint32_t x, int shift) {
-  DCHECK_GT(shift, 0);
-  DCHECK_LT(shift, std::numeric_limits<decltype(x)>::digits);
-  return (x << (std::numeric_limits<decltype(x)>::digits - shift)) | (x >> shift);
+template<int SHIFT>
+inline uint32_t RotateRight(uint32_t x) {
+  static_assert(SHIFT > 0, "Only positive shifts are defined behavior and useful");
+  static_assert(
+      SHIFT < std::numeric_limits<decltype(x)>::digits, "This much shift is just 0");
+  return (x << (std::numeric_limits<decltype(x)>::digits - SHIFT)) | (x >> SHIFT);
 }
 
 // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
@@ -60,61 +91,178 @@ uint32_t MakeRandU32() {
 
 // Almost universal hashing, M. Dietzfelbinger, T. Hagerup, J. Katajainen, and M.
 // Penttonen, "A reliable randomized algorithm for the closest-pair problem".
-uint32_t MultiplyShift(uint32_t x) {
+inline void MultiplyShift(uint32_t* x) {
   static const uint32_t m = 0x61eaf8e9u;
-  return x*m;
+  *x = *x * m;
+}
+
+// Like MultiplyShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+//
+// TODO: Add the Intel intrinsics used in this function and the other functions in this
+// file to sse-util.h so that these functions can be inlined.
+inline void MultiplyShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyShift128(__m128i* x) {
+  const __m128i m = _mm_set1_epi32(0x61eaf8e9);
+  _mm_storeu_si128(x, _mm_mullo_epi32(_mm_loadu_si128(x), m));
+}
+
+// Like MultiplyShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+//
+// Not marked inline, because inlining degrades performance for unknown reasons.
+void MultiplyShift256(__m256i* x) __attribute__((__target__("avx2")));
+void MultiplyShift256(__m256i* x) {
+  const __m256i m = _mm256_set1_epi32(0x61eaf8e9);
+  _mm256_storeu_si256(x, _mm256_mullo_epi32(_mm256_loadu_si256(x), m));
 }
 
 // 2-independent hashing. M. Dietzfelbinger, "Universal hashing and k-wise independent
 // random variables via integer arithmetic without primes"
-uint32_t MultiplyAddShift(uint32_t x) {
+inline void MultiplyAddShift(uint32_t* x) {
   static const uint64_t m = 0xa1f1bd3e020b4be0ull, a = 0x86b0426193d86e66ull;
-  return (static_cast<uint64_t>(x) * m + a) >> 32;
+  *x = (static_cast<uint64_t>(*x) * m + a) >> 32;
+}
+
+// Like MultiplyAddShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+inline void MultiplyAddShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyAddShift128(__m128i* x) {
+  const auto m = _mm_set1_epi64x(0xa1f1bd3e020b4be0ull),
+                mhi = _mm_set1_epi32(0xa1f1bd3e),
+                a = _mm_set1_epi64x(0x86b0426193d86e66ull);
+  auto input = _mm_loadu_si128(x);
+  auto prod32easy = _mm_mullo_epi32(input, mhi);
+  auto input_odds = _mm_srli_epi64(input, 32);
+  auto prod64_evens = _mm_mul_epu32(input, m),
+          prod64_odds = _mm_mul_epu32(input_odds, m);
+  prod64_evens = _mm_add_epi64(a, prod64_evens);
+  prod64_odds = _mm_add_epi64(a, prod64_odds);
+  auto prod32hard = _mm_unpackhi_epi32(prod64_evens, prod64_odds);
+  _mm_storeu_si128(x, _mm_add_epi32(prod32easy, prod32hard));
+}
+
+// Like MultiplyAddShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+inline void MultiplyAddShift256(__m256i* x) __attribute__((__target__("avx2")));
+inline void MultiplyAddShift256(__m256i* x) {
+  const __m256i m = _mm256_set1_epi64x(0xa1f1bd3e020b4be0ull),
+                mhi = _mm256_set1_epi32(0xa1f1bd3e),
+                a = _mm256_set1_epi64x(0x86b0426193d86e66ull);
+  __m256i input = _mm256_loadu_si256(x);
+  __m256i prod32easy = _mm256_mullo_epi32(input, mhi);
+  __m256i input_odds = _mm256_srli_epi64(input, 32);
+  __m256i prod64_evens = _mm256_mul_epu32(input, m),
+          prod64_odds = _mm256_mul_epu32(input_odds, m);
+  prod64_evens = _mm256_add_epi64(a, prod64_evens);
+  prod64_odds = _mm256_add_epi64(a, prod64_odds);
+  __m256i prod32hard = _mm256_unpackhi_epi32(prod64_evens, prod64_odds);
+  _mm256_storeu_si256(x, _mm256_add_epi32(prod32easy, prod32hard));
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t Jenkins1(int32_t x) {
-  x = ~x + (x << 15);  // x = (x << 15) - x - 1;
-  x = x ^ RotateRight(x, 12);
-  x = x + (x << 2);
-  x = x ^ RotateRight(x, 4);
-  x = x * 2057;  // x = (x + (x << 3)) + (x << 11);
-  x = x ^ RotateRight(x, 16);
-  return x;
+inline void Jenkins1(int32_t* x) {
+  *x = ~*x + (*x << 15); // x = (x << 15) - x - 1;
+  *x = *x ^ RotateRight<12>(*x);
+  *x = *x + (*x << 2);
+  *x = *x ^ RotateRight<4>(*x);
+  *x = *x * 2057; // x = (x + (x << 3)) + (x << 11);
+  *x = *x ^ RotateRight<16>(*x);
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-uint32_t Jenkins2(uint32_t a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+inline void Jenkins2(uint32_t* a) {
+  *a = (*a + 0x7ed55d16) + (*a << 12);
+  *a = (*a ^ 0xc761c23c) ^ (*a >> 19);
+  *a = (*a + 0x165667b1) + (*a << 5);
+  *a = (*a + 0xd3a2646c) ^ (*a << 9);
+  *a = (*a + 0xfd7046c5) + (*a << 3);
+  *a = (*a ^ 0xb55a4f09) ^ (*a >> 16);
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t MultRot(int32_t key) {
+inline void MultRot(int32_t* key) {
   static const int32_t c2 = 0x27d4eb2d;  // a prime or an odd constant
-  key = (key ^ 61) ^ RotateRight(key, 16);
-  key = key + (key << 3);
-  key = key ^ RotateRight(key, 4);
-  key = key * c2;
-  key = key ^ RotateRight(key, 15);
-  return key;
+  *key = (*key ^ 61) ^ RotateRight<16>(*key);
+  *key = *key + (*key << 3);
+  *key = *key ^ RotateRight<4>(*key);
+  *key = *key * c2;
+  *key = *key ^ RotateRight<15>(*key);
+}
+
+inline void CRC(uint32_t* x) {
+  *x = SSE4_crc32_u32(*x, 0xab8ce2abu);
+}
+
+inline void FNV(uint32_t* key) {
+  *key = HashUtil::FnvHash64to32(key, sizeof(*key), HashUtil::FNV_SEED);
+}
+
+// Zobrist hashing, also known as tabulation hashing or simple tabulation hashing, is an
+// old technique that has been recently analyzed and found to be very good for a number of
+// applications. See "The Power of Simple Tabulation Hashing", by Mihai Patrascu and
+// Mikkel Thorup.
+
+uint32_t ZOBRIST_DATA[4][256];
+
+inline void Zobrist(uint32_t* key) {
+  const uint8_t* key_chars = reinterpret_cast<const uint8_t*>(key);
+  *key = ZOBRIST_DATA[0][key_chars[0]] ^ ZOBRIST_DATA[1][key_chars[1]]
+      ^ ZOBRIST_DATA[2][key_chars[2]] ^ ZOBRIST_DATA[3][key_chars[3]];
+}
+
+// Like Zobrist, but uses AVX2's "gather" primitives to hash 8 values at once.
+inline void Zobrist256gather(__m256i* key) __attribute__((__target__("avx2")));
+inline void Zobrist256gather(__m256i* key) {
+  const auto k = _mm256_loadu_si256(key);
+  const auto low_mask = _mm256_set1_epi32(0xff);
+  auto k0 = _mm256_and_si256(low_mask, k),
+       k1 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 8)),
+       k2 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 16)),
+       k3 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 24));
+  k0 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[0]), k0, 1);
+  k1 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[1]), k1, 1);
+  k2 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[2]), k2, 1);
+  k3 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[3]), k3, 1);
+  auto k01 = _mm256_xor_si256(k0, k1), k23 = _mm256_xor_si256(k2, k3);
+  _mm256_storeu_si256(key, _mm256_xor_si256(k01, k23));
 }
 
-uint32_t CRC(uint32_t x) {
-  return SSE4_crc32_u32(x,0xab8ce2abu);
+// Like Zobrist256gather, but only uses AVX2's SIMD xor, not its gather.
+inline void Zobrist256simple(uint32_t (*key)[8]) __attribute__((__target__("avx2")));
+inline void Zobrist256simple(uint32_t (*key)[8]) {
+  uint32_t row[4][8];
+  const uint8_t (*key_chars)[8][4] = reinterpret_cast<const uint8_t (*)[8][4]>(key);
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      row[i][j] = ZOBRIST_DATA[i][(*key_chars)[j][i]];
+    }
+  }
+  auto result0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[0])),
+       result1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[1])),
+       result2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[2])),
+       result3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[3]));
+  auto k01 = _mm256_xor_si256(result0, result1), k23 = _mm256_xor_si256(result2, result3);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(*key), _mm256_xor_si256(k01, k23));
+}
+
+// Apply one hash function to each of N values. This can sometimes auto-vectorize.
+//
+// TODO: We could also test the costs of running on non-contiguous uint32_t's. For
+// instance, ExprValuesCache.expr_values_array_ might have multiple values to hash per
+// input row.
+template <void (*F)(uint32_t*), size_t N>
+inline void Multiple(uint32_t (*x)[N]) {
+  for (int i = 0; i < N; ++i) {
+    F((*x) + i);
+  }
 }
 
-template<typename T, T (*HASH)(T)>
+// The size of the test data we run each hash function on:
+static const size_t DATA_SIZE = 1 << 15;
+
+template<typename T, void (*HASH)(T*)>
 void Run(int batch_size, void* data) {
-  vector<T>* d = reinterpret_cast<vector<T>*>(data);
+  T* d = reinterpret_cast<T*>(data);
   for (int i = 0; i < batch_size; ++i) {
-    for (int j = 0; j < d->size(); ++j) {
-      (*d)[j] = HASH((*d)[j]);
+    for (int j = 0; j < ((sizeof(uint32_t)) * DATA_SIZE) / sizeof(T); ++j) {
+      HASH(&d[j]);
     }
   }
 }
@@ -124,21 +272,61 @@ int main() {
   cout << endl
        << Benchmark::GetMachineInfo() << endl;
 
-  vector<uint32_t> ud(1 << 15);
-  for (size_t i = 0; i < ud.size(); ++i) {
+  unique_ptr<uint32_t[]> ud(new uint32_t[DATA_SIZE]);
+  for (size_t i = 0; i < (DATA_SIZE); ++i) {
     ud[i] = MakeRandU32();
   }
 
-  Benchmark suite("32 -> 32");
+  for (size_t i = 0; i < 4; ++i) {
+    for (size_t j = 0; j < 256; ++j) {
+      ZOBRIST_DATA[i][j] = MakeRandU32();
+    }
+  }
+
+  Benchmark suite32("32 -> 32");
 
-#define BENCH(T,x) suite.AddBenchmark(#x, Run<T, x>, &ud)
-  BENCH(uint32_t, Jenkins2);
-  BENCH(int32_t, Jenkins1);
-  BENCH(int32_t, MultRot);
-  BENCH(uint32_t, MultiplyAddShift);
-  BENCH(uint32_t, MultiplyShift);
-  BENCH(uint32_t, CRC);
-#undef BENCH
+#define BENCH(T,x) AddBenchmark(#x, Run<T, x>, ud.get())
+
+  suite32.BENCH(uint32_t, FNV);
+  suite32.BENCH(uint32_t, Zobrist);
+  suite32.BENCH(int32_t, MultRot);
+  suite32.BENCH(uint32_t, MultiplyAddShift);
+  suite32.BENCH(int32_t, Jenkins1);
+  suite32.BENCH(uint32_t, Jenkins2);
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) suite32.BENCH(uint32_t, CRC);
+  suite32.BENCH(uint32_t, MultiplyShift);
 
-  cout << suite.Measure() << endl;
+  cout << suite32.Measure() << endl;
+
+  Benchmark suite32x4("32 x 4 -> 32 x 4");
+
+  suite32x4.BENCH(uint32_t[4], (Multiple<Zobrist, 4>));
+  suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyAddShift, 4>));
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+    suite32x4.BENCH(uint32_t[4], (Multiple<CRC, 4>));
+  }
+  suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyShift, 4>));
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_1)) {
+    suite32x4.BENCH(__m128i, MultiplyAddShift128);
+    suite32x4.BENCH(__m128i, MultiplyShift128);
+  }
+
+  cout << suite32x4.Measure() << endl;
+
+  Benchmark suite32x8("32 x 8 -> 32 x 8");
+
+  suite32x8.BENCH(uint32_t[8], (Multiple<Zobrist, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyAddShift, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<CRC, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyShift, 8>));
+  if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+    suite32x8.BENCH(uint32_t[8], Zobrist256simple);
+    suite32x8.BENCH(__m256i, Zobrist256gather);
+    suite32x8.BENCH(__m256i, MultiplyAddShift256);
+    suite32x8.BENCH(__m256i, MultiplyShift256);
+  }
+
+  cout << suite32x8.Measure() << endl;
+
+#undef BENCH
 }