You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by jb...@apache.org on 2016/09/07 03:28:56 UTC

[1/2] incubator-impala git commit: Minor enhancements to helper scripts.

Repository: incubator-impala
Updated Branches:
  refs/heads/master 1909ce2d7 -> 39e01abcf


Minor enhancements to helper scripts.

- run-all-tests.sh: survive non-fatal failures when calling ulimit.
- copy-udfs-udas.sh: respect $MAKE_CMD (falling back to make) and $IMPALA_BUILD_THREADS (falling back to 4 jobs) instead of blindly using make -j$CORES.

Change-Id: Ic90bd0048786c799a8ac435de4303ed399ac1223
Reviewed-on: http://gerrit.cloudera.org:8080/4304
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b35689d7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b35689d7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b35689d7

Branch: refs/heads/master
Commit: b35689d7d9a32029fd0766c5a140dbaf58177f8e
Parents: 1909ce2
Author: Zoltan Ivanfi <zi...@cloudera.com>
Authored: Fri Sep 2 20:50:15 2016 +0200
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Mon Sep 5 15:17:22 2016 +0000

----------------------------------------------------------------------
 bin/run-all-tests.sh           | 2 +-
 testdata/bin/copy-udfs-udas.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b35689d7/bin/run-all-tests.sh
----------------------------------------------------------------------
diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh
index 7d68216..728c0e5 100755
--- a/bin/run-all-tests.sh
+++ b/bin/run-all-tests.sh
@@ -101,7 +101,7 @@ fi
 LOG_DIR="${IMPALA_EE_TEST_LOGS_DIR}"
 
 # Enable core dumps
-ulimit -c unlimited
+ulimit -c unlimited || true
 
 if [[ "${TARGET_FILESYSTEM}" == "hdfs" ]]; then
   # To properly test HBase integeration, HBase regions are split and assigned by this

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b35689d7/testdata/bin/copy-udfs-udas.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/copy-udfs-udas.sh b/testdata/bin/copy-udfs-udas.sh
index 814973a..4440db1 100755
--- a/testdata/bin/copy-udfs-udas.sh
+++ b/testdata/bin/copy-udfs-udas.sh
@@ -48,7 +48,7 @@ done
 if [ $BUILD -eq 1 ]
 then
   pushd $IMPALA_HOME
-  make -j$CORES \
+  "${MAKE_CMD:-make}" "-j${IMPALA_BUILD_THREADS:-4}" \
       TestUdas TestUdfs test-udfs-ir udfsample udasample udf-sample-ir uda-sample-ir
   cd $IMPALA_HOME/tests/test-hive-udfs
   ${IMPALA_HOME}/bin/mvn-quiet.sh package

[2/2] incubator-impala git commit: Add FNV, Zobrist, and SIMD hash functions to the int hash benchmark.

Posted by jb...@apache.org.

Add FNV, Zobrist, and SIMD hash functions to the int hash benchmark.

Additionally, change the shift amount of RotateRight from a run-time
argument to a compile-time template constant, and add "inline" to the
hash functions, increasing the performance dramatically. The compiler
can't inline the SIMD versions, because they use Intel intrinsics --
added a TODO to add these intrinsics to sse-util.h.

Change-Id: I11d48f8816d5b129858a1f773015e51049dd1d61
Reviewed-on: http://gerrit.cloudera.org:8080/4313
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/39e01abc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/39e01abc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/39e01abc

Branch: refs/heads/master
Commit: 39e01abcf38bafaa03810fdbfd743a1558db92d8
Parents: b35689d
Author: Jim Apple <jb...@cloudera.com>
Authored: Mon Sep 5 14:15:06 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Wed Sep 7 01:35:30 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/int-hash-benchmark.cc | 302 ++++++++++++++++++++++-----
 1 file changed, 245 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/39e01abc/be/src/benchmarks/int-hash-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/int-hash-benchmark.cc b/be/src/benchmarks/int-hash-benchmark.cc
index a0787f1..9bed3c4 100644
--- a/be/src/benchmarks/int-hash-benchmark.cc
+++ b/be/src/benchmarks/int-hash-benchmark.cc
@@ -19,10 +19,14 @@
 
 #include <iostream>
 #include <limits>
+#include <memory>
 #include <vector>
 
+#include <immintrin.h>
+
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
+#include "util/hash-util.h"
 #include "util/sse-util.h"
 
 using namespace std;
@@ -31,22 +35,49 @@ using namespace impala;
 // Test hash functions that take integers as arguments and produce integers as the result.
 //
 // Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
-// 32 -> 32:             Function     Rate (iters/ms)          Comparison
-// ----------------------------------------------------------------------
-//                       Jenkins2               14.06                  1X
-//                       Jenkins1               16.97              1.207X
-//                        MultRot               19.14              1.361X
-//               MultiplyAddShift               21.21              1.509X
-//                  MultiplyShift                25.3              1.799X
-//                            CRC               25.53              1.816X
+// 32 -> 32:                  Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                                 FNV               25.7       26     26.4         1X         1X         1X
+//                             Zobrist               30.2     30.7     30.9      1.18X      1.18X      1.17X
+//                             MultRot               42.6     43.4     43.4      1.66X      1.67X      1.64X
+//                    MultiplyAddShift               42.4     43.2     43.2      1.65X      1.66X      1.64X
+//                            Jenkins1               51.8       54       54      2.02X      2.08X      2.05X
+//                            Jenkins2               66.2     67.4     67.5      2.58X      2.59X      2.56X
+//                                 CRC               98.6      100      101      3.84X      3.85X      3.84X
+//                       MultiplyShift                150      152      153      5.84X      5.86X      5.79X
+//
+// 32 x 4 -> 32 x 4:          Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//              (Multiple<Zobrist, 4>)               30.8     31.2     31.4         1X         1X         1X
+//     (Multiple<MultiplyAddShift, 4>)               44.2       45       45      1.43X      1.44X      1.43X
+//                  (Multiple<CRC, 4>)                118      120      121      3.84X      3.86X      3.85X
+//        (Multiple<MultiplyShift, 4>)                156      159      159      5.07X       5.1X      5.08X
+//                 MultiplyAddShift128               75.7     77.2     77.2      2.46X      2.48X      2.46X
+//                    MultiplyShift128                128      131      133      4.16X      4.21X      4.23X
+//
+// 32 x 8 -> 32 x 8:          Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//              (Multiple<Zobrist, 8>)                 31     31.5     31.8         1X         1X         1X
+//     (Multiple<MultiplyAddShift, 8>)               44.3     44.5     45.2      1.43X      1.41X      1.42X
+//                  (Multiple<CRC, 8>)                121      123      123       3.9X       3.9X      3.88X
+//        (Multiple<MultiplyShift, 8>)                158      159      160      5.11X      5.05X      5.04X
+//                    Zobrist256simple               16.5     16.5     16.6     0.533X     0.524X     0.522X
+//                    Zobrist256gather               18.8     19.2     19.4     0.608X     0.608X      0.61X
+//                 MultiplyAddShift256                151      154      154      4.88X      4.88X      4.84X
+//                    MultiplyShift256                209      212      212      6.73X      6.71X      6.67X
 
 // Rotate 32 bits right by 'shift'. This is _rotr in the intel instrinsics, but that isn't
 // usable on Clang yet. Fortunately, both GCC and Clang can optimize this to use the 'ror'
 // instruction.
-uint32_t RotateRight(uint32_t x, int shift) {
-  DCHECK_GT(shift, 0);
-  DCHECK_LT(shift, std::numeric_limits<decltype(x)>::digits);
-  return (x << (std::numeric_limits<decltype(x)>::digits - shift)) | (x >> shift);
+template<int SHIFT>
+inline uint32_t RotateRight(uint32_t x) {
+  static_assert(SHIFT > 0, "Only positive shifts are defined behavior and useful");
+  static_assert(
+      SHIFT < std::numeric_limits<decltype(x)>::digits, "This much shift is just 0");
+  return (x << (std::numeric_limits<decltype(x)>::digits - SHIFT)) | (x >> SHIFT);
 }
 
 // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
@@ -60,61 +91,178 @@ uint32_t MakeRandU32() {
 
 // Almost universal hashing, M. Dietzfelbinger, T. Hagerup, J. Katajainen, and M.
 // Penttonen, "A reliable randomized algorithm for the closest-pair problem".
-uint32_t MultiplyShift(uint32_t x) {
+inline void MultiplyShift(uint32_t* x) {
   static const uint32_t m = 0x61eaf8e9u;
-  return x*m;
+  *x = *x * m;
+}
+
+// Like MultiplyShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+//
+// TODO: Add the Intel intrinsics used in this function and the other functions in this
+// file to sse-util.h so that these functions can be inlined.
+inline void MultiplyShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyShift128(__m128i* x) {
+  const __m128i m = _mm_set1_epi32(0x61eaf8e9);
+  _mm_storeu_si128(x, _mm_mullo_epi32(_mm_loadu_si128(x), m));
+}
+
+// Like MultiplyShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+//
+// Not inline, because it degrades the performance for unknown reasons.
+void MultiplyShift256(__m256i* x) __attribute__((__target__("avx2")));
+void MultiplyShift256(__m256i* x) {
+  const __m256i m = _mm256_set1_epi32(0x61eaf8e9);
+  _mm256_storeu_si256(x, _mm256_mullo_epi32(_mm256_loadu_si256(x), m));
 }
 
 // 2-independent hashing. M. Dietzfelbinger, "Universal hashing and k-wise independent
 // random variables via integer arithmetic without primes"
-uint32_t MultiplyAddShift(uint32_t x) {
+inline void MultiplyAddShift(uint32_t* x) {
   static const uint64_t m = 0xa1f1bd3e020b4be0ull, a = 0x86b0426193d86e66ull;
-  return (static_cast<uint64_t>(x) * m + a) >> 32;
+  *x = (static_cast<uint64_t>(*x) * m + a) >> 32;
+}
+
+// Like MultiplyAddShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+inline void MultiplyAddShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyAddShift128(__m128i* x) {
+  const auto m = _mm_set1_epi64x(0xa1f1bd3e020b4be0ull),
+                mhi = _mm_set1_epi32(0xa1f1bd3e),
+                a = _mm_set1_epi64x(0x86b0426193d86e66ull);
+  auto input = _mm_loadu_si128(x);
+  auto prod32easy = _mm_mullo_epi32(input, mhi);
+  auto input_odds = _mm_srli_epi64(input, 32);
+  auto prod64_evens = _mm_mul_epu32(input, m),
+          prod64_odds = _mm_mul_epu32(input_odds, m);
+  prod64_evens = _mm_add_epi64(a, prod64_evens);
+  prod64_odds = _mm_add_epi64(a, prod64_odds);
+  auto prod32hard = _mm_unpackhi_epi32(prod64_evens, prod64_odds);
+  _mm_storeu_si128(x, _mm_add_epi32(prod32easy, prod32hard));
+}
+
+// Like MultiplyAddShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+inline void MultiplyAddShift256(__m256i* x) __attribute__((__target__("avx2")));
+inline void MultiplyAddShift256(__m256i* x) {
+  const __m256i m = _mm256_set1_epi64x(0xa1f1bd3e020b4be0ull),
+                mhi = _mm256_set1_epi32(0xa1f1bd3e),
+                a = _mm256_set1_epi64x(0x86b0426193d86e66ull);
+  __m256i input = _mm256_loadu_si256(x);
+  __m256i prod32easy = _mm256_mullo_epi32(input, mhi);
+  __m256i input_odds = _mm256_srli_epi64(input, 32);
+  __m256i prod64_evens = _mm256_mul_epu32(input, m),
+          prod64_odds = _mm256_mul_epu32(input_odds, m);
+  prod64_evens = _mm256_add_epi64(a, prod64_evens);
+  prod64_odds = _mm256_add_epi64(a, prod64_odds);
+  __m256i prod32hard = _mm256_unpackhi_epi32(prod64_evens, prod64_odds);
+  _mm256_storeu_si256(x, _mm256_add_epi32(prod32easy, prod32hard));
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t Jenkins1(int32_t x) {
-  x = ~x + (x << 15);  // x = (x << 15) - x - 1;
-  x = x ^ RotateRight(x, 12);
-  x = x + (x << 2);
-  x = x ^ RotateRight(x, 4);
-  x = x * 2057;  // x = (x + (x << 3)) + (x << 11);
-  x = x ^ RotateRight(x, 16);
-  return x;
+inline void Jenkins1(int32_t* x) {
+  *x = ~*x + (*x << 15); // x = (x << 15) - x - 1;
+  *x = *x ^ RotateRight<12>(*x);
+  *x = *x + (*x << 2);
+  *x = *x ^ RotateRight<4>(*x);
+  *x = *x * 2057; // x = (x + (x << 3)) + (x << 11);
+  *x = *x ^ RotateRight<16>(*x);
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-uint32_t Jenkins2(uint32_t a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+inline void Jenkins2(uint32_t* a) {
+  *a = (*a + 0x7ed55d16) + (*a << 12);
+  *a = (*a ^ 0xc761c23c) ^ (*a >> 19);
+  *a = (*a + 0x165667b1) + (*a << 5);
+  *a = (*a + 0xd3a2646c) ^ (*a << 9);
+  *a = (*a + 0xfd7046c5) + (*a << 3);
+  *a = (*a ^ 0xb55a4f09) ^ (*a >> 16);
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t MultRot(int32_t key) {
+inline void MultRot(int32_t* key) {
   static const int32_t c2 = 0x27d4eb2d;  // a prime or an odd constant
-  key = (key ^ 61) ^ RotateRight(key, 16);
-  key = key + (key << 3);
-  key = key ^ RotateRight(key, 4);
-  key = key * c2;
-  key = key ^ RotateRight(key, 15);
-  return key;
+  *key = (*key ^ 61) ^ RotateRight<16>(*key);
+  *key = *key + (*key << 3);
+  *key = *key ^ RotateRight<4>(*key);
+  *key = *key * c2;
+  *key = *key ^ RotateRight<15>(*key);
+}
+
+inline void CRC(uint32_t* x) {
+  *x = SSE4_crc32_u32(*x, 0xab8ce2abu);
+}
+
+inline void FNV(uint32_t* key) {
+  *key = HashUtil::FnvHash64to32(key, sizeof(*key), HashUtil::FNV_SEED);
+}
+
+// Zobrist hashing, also known as tabulation hashing or simple tabulation hashing, is an
+// old technique that has been recently analyzed and found to be very good for a number of
+// applications. See "The Power of Simple Tabulation Hashing", by Mihai Patrascu and
+// Mikkel Thorup.
+
+uint32_t ZOBRIST_DATA[4][256];
+
+inline void Zobrist(uint32_t* key) {
+  const uint8_t* key_chars = reinterpret_cast<const uint8_t*>(key);
+  *key = ZOBRIST_DATA[0][key_chars[0]] ^ ZOBRIST_DATA[1][key_chars[1]]
+      ^ ZOBRIST_DATA[2][key_chars[2]] ^ ZOBRIST_DATA[3][key_chars[3]];
+}
+
+// Like Zobrist, but uses AVX2's "gather" primitives to hash 8 values at once.
+inline void Zobrist256gather(__m256i* key) __attribute__((__target__("avx2")));
+inline void Zobrist256gather(__m256i* key) {
+  const auto k = _mm256_loadu_si256(key);
+  const auto low_mask = _mm256_set1_epi32(0xff);
+  auto k0 = _mm256_and_si256(low_mask, k),
+       k1 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 8)),
+       k2 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 16)),
+       k3 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 24));
+  k0 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[0]), k0, 1);
+  k1 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[1]), k1, 1);
+  k2 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[2]), k2, 1);
+  k3 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[3]), k3, 1);
+  auto k01 = _mm256_xor_si256(k0, k1), k23 = _mm256_xor_si256(k2, k3);
+  _mm256_storeu_si256(key, _mm256_xor_si256(k01, k23));
 }
 
-uint32_t CRC(uint32_t x) {
-  return SSE4_crc32_u32(x,0xab8ce2abu);
+// Like Zobrist256gather, but only uses AVX2's SIMD xor, not its gather.
+inline void Zobrist256simple(uint32_t (*key)[8]) __attribute__((__target__("avx2")));
+inline void Zobrist256simple(uint32_t (*key)[8]) {
+  uint32_t row[4][8];
+  const uint8_t (*key_chars)[8][4] = reinterpret_cast<const uint8_t (*)[8][4]>(key);
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      row[i][j] = ZOBRIST_DATA[i][(*key_chars)[j][i]];
+    }
+  }
+  auto result0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[0])),
+       result1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[1])),
+       result2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[2])),
+       result3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[3]));
+  auto k01 = _mm256_xor_si256(result0, result1), k23 = _mm256_xor_si256(result2, result3);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(*key), _mm256_xor_si256(k01, k23));
+}
+
+// Perform one hash function the given number of times. This can sometimes auto-vectorize.
+//
+// TODO: We could also test the costs of running on non-contiguous uint32_t's. For
+// instance, ExprValuesCache.expr_values_array_ might have multiple values to hash per
+// input row.
+template <void (*F)(uint32_t*), size_t N>
+inline void Multiple(uint32_t (*x)[N]) {
+  for (int i = 0; i < N; ++i) {
+    F((*x) + i);
+  }
 }
 
-template<typename T, T (*HASH)(T)>
+// The size of the test data we run each hash function on:
+static const size_t DATA_SIZE = 1 << 15;
+
+template<typename T, void (*HASH)(T*)>
 void Run(int batch_size, void* data) {
-  vector<T>* d = reinterpret_cast<vector<T>*>(data);
+  T* d = reinterpret_cast<T*>(data);
   for (int i = 0; i < batch_size; ++i) {
-    for (int j = 0; j < d->size(); ++j) {
-      (*d)[j] = HASH((*d)[j]);
+    for (int j = 0; j < ((sizeof(uint32_t)) * DATA_SIZE) / sizeof(T); ++j) {
+      HASH(&d[j]);
     }
   }
 }
@@ -124,21 +272,61 @@ int main() {
   cout << endl
        << Benchmark::GetMachineInfo() << endl;
 
-  vector<uint32_t> ud(1 << 15);
-  for (size_t i = 0; i < ud.size(); ++i) {
+  unique_ptr<uint32_t[]> ud(new uint32_t[DATA_SIZE]);
+  for (size_t i = 0; i < (DATA_SIZE); ++i) {
     ud[i] = MakeRandU32();
   }
 
-  Benchmark suite("32 -> 32");
+  for (size_t i = 0; i < 4; ++i) {
+    for (size_t j = 0; j < 256; ++j) {
+      ZOBRIST_DATA[i][j] = MakeRandU32();
+    }
+  }
+
+  Benchmark suite32("32 -> 32");
 
-#define BENCH(T,x) suite.AddBenchmark(#x, Run<T, x>, &ud)
-  BENCH(uint32_t, Jenkins2);
-  BENCH(int32_t, Jenkins1);
-  BENCH(int32_t, MultRot);
-  BENCH(uint32_t, MultiplyAddShift);
-  BENCH(uint32_t, MultiplyShift);
-  BENCH(uint32_t, CRC);
-#undef BENCH
+#define BENCH(T,x) AddBenchmark(#x, Run<T, x>, ud.get())
+
+  suite32.BENCH(uint32_t, FNV);
+  suite32.BENCH(uint32_t, Zobrist);
+  suite32.BENCH(int32_t, MultRot);
+  suite32.BENCH(uint32_t, MultiplyAddShift);
+  suite32.BENCH(int32_t, Jenkins1);
+  suite32.BENCH(uint32_t, Jenkins2);
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) suite32.BENCH(uint32_t, CRC);
+  suite32.BENCH(uint32_t, MultiplyShift);
 
-  cout << suite.Measure() << endl;
+  cout << suite32.Measure() << endl;
+
+  Benchmark suite32x4("32 x 4 -> 32 x 4");
+
+  suite32x4.BENCH(uint32_t[4], (Multiple<Zobrist, 4>));
+  suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyAddShift, 4>));
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+    suite32x4.BENCH(uint32_t[4], (Multiple<CRC, 4>));
+  }
+  suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyShift, 4>));
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_1)) {
+    suite32x4.BENCH(__m128i, MultiplyAddShift128);
+    suite32x4.BENCH(__m128i, MultiplyShift128);
+  }
+
+  cout << suite32x4.Measure() << endl;
+
+  Benchmark suite32x8("32 x 8 -> 32 x 8");
+
+  suite32x8.BENCH(uint32_t[8], (Multiple<Zobrist, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyAddShift, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<CRC, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyShift, 8>));
+  if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+    suite32x8.BENCH(uint32_t[8], Zobrist256simple);
+    suite32x8.BENCH(__m256i, Zobrist256gather);
+    suite32x8.BENCH(__m256i, MultiplyAddShift256);
+    suite32x8.BENCH(__m256i, MultiplyShift256);
+  }
+
+  cout << suite32x8.Measure() << endl;
+
+#undef BENCH
 }