You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jb...@apache.org on 2016/09/07 03:28:56 UTC
[1/2] incubator-impala git commit: Minor enhancements to helper scripts.
Repository: incubator-impala
Updated Branches:
refs/heads/master 1909ce2d7 -> 39e01abcf
Minor enhancements to helper scripts.
- run-all-tests.sh: survive non-fatal failures when calling ulimit.
- copy-udfs-udas.sh: respect $MAKE_CMD instead of blindly using make.
Change-Id: Ic90bd0048786c799a8ac435de4303ed399ac1223
Reviewed-on: http://gerrit.cloudera.org:8080/4304
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b35689d7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b35689d7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b35689d7
Branch: refs/heads/master
Commit: b35689d7d9a32029fd0766c5a140dbaf58177f8e
Parents: 1909ce2
Author: Zoltan Ivanfi <zi...@cloudera.com>
Authored: Fri Sep 2 20:50:15 2016 +0200
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Mon Sep 5 15:17:22 2016 +0000
----------------------------------------------------------------------
bin/run-all-tests.sh | 2 +-
testdata/bin/copy-udfs-udas.sh | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b35689d7/bin/run-all-tests.sh
----------------------------------------------------------------------
diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh
index 7d68216..728c0e5 100755
--- a/bin/run-all-tests.sh
+++ b/bin/run-all-tests.sh
@@ -101,7 +101,7 @@ fi
LOG_DIR="${IMPALA_EE_TEST_LOGS_DIR}"
# Enable core dumps
-ulimit -c unlimited
+ulimit -c unlimited || true
if [[ "${TARGET_FILESYSTEM}" == "hdfs" ]]; then
# To properly test HBase integration, HBase regions are split and assigned by this
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b35689d7/testdata/bin/copy-udfs-udas.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/copy-udfs-udas.sh b/testdata/bin/copy-udfs-udas.sh
index 814973a..4440db1 100755
--- a/testdata/bin/copy-udfs-udas.sh
+++ b/testdata/bin/copy-udfs-udas.sh
@@ -48,7 +48,7 @@ done
if [ $BUILD -eq 1 ]
then
pushd $IMPALA_HOME
- make -j$CORES \
+ "${MAKE_CMD:-make}" "-j${IMPALA_BUILD_THREADS:-4}" \
TestUdas TestUdfs test-udfs-ir udfsample udasample udf-sample-ir uda-sample-ir
cd $IMPALA_HOME/tests/test-hive-udfs
${IMPALA_HOME}/bin/mvn-quiet.sh package
[2/2] incubator-impala git commit: Add FNV, Zobrist, and SIMD hash functions to the int hash benchmark.
Posted by jb...@apache.org.
Add FNV, Zobrist, and SIMD hash functions to the int hash benchmark.
Additionally, change the parameter of rotate to a compile-time
constant, and add "inline" to functions, increasing the performance
dramatically. The compiler can't inline the SIMD versions, because
they use Intel intrinsics -- added a TODO to add these intrinsics to
sse-util.h.
Change-Id: I11d48f8816d5b129858a1f773015e51049dd1d61
Reviewed-on: http://gerrit.cloudera.org:8080/4313
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/39e01abc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/39e01abc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/39e01abc
Branch: refs/heads/master
Commit: 39e01abcf38bafaa03810fdbfd743a1558db92d8
Parents: b35689d
Author: Jim Apple <jb...@cloudera.com>
Authored: Mon Sep 5 14:15:06 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Wed Sep 7 01:35:30 2016 +0000
----------------------------------------------------------------------
be/src/benchmarks/int-hash-benchmark.cc | 302 ++++++++++++++++++++++-----
1 file changed, 245 insertions(+), 57 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/39e01abc/be/src/benchmarks/int-hash-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/int-hash-benchmark.cc b/be/src/benchmarks/int-hash-benchmark.cc
index a0787f1..9bed3c4 100644
--- a/be/src/benchmarks/int-hash-benchmark.cc
+++ b/be/src/benchmarks/int-hash-benchmark.cc
@@ -19,10 +19,14 @@
#include <iostream>
#include <limits>
+#include <memory>
#include <vector>
+#include <immintrin.h>
+
#include "util/benchmark.h"
#include "util/cpu-info.h"
+#include "util/hash-util.h"
#include "util/sse-util.h"
using namespace std;
@@ -31,22 +35,49 @@ using namespace impala;
// Test hash functions that take integers as arguments and produce integers as the result.
//
// Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
-// 32 -> 32: Function Rate (iters/ms) Comparison
-// ----------------------------------------------------------------------
-// Jenkins2 14.06 1X
-// Jenkins1 16.97 1.207X
-// MultRot 19.14 1.361X
-// MultiplyAddShift 21.21 1.509X
-// MultiplyShift 25.3 1.799X
-// CRC 25.53 1.816X
+// 32 -> 32: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+// FNV 25.7 26 26.4 1X 1X 1X
+// Zobrist 30.2 30.7 30.9 1.18X 1.18X 1.17X
+// MultRot 42.6 43.4 43.4 1.66X 1.67X 1.64X
+// MultiplyAddShift 42.4 43.2 43.2 1.65X 1.66X 1.64X
+// Jenkins1 51.8 54 54 2.02X 2.08X 2.05X
+// Jenkins2 66.2 67.4 67.5 2.58X 2.59X 2.56X
+// CRC 98.6 100 101 3.84X 3.85X 3.84X
+// MultiplyShift 150 152 153 5.84X 5.86X 5.79X
+//
+// 32 x 4 -> 32 x 4: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+// (Multiple<Zobrist, 4>) 30.8 31.2 31.4 1X 1X 1X
+// (Multiple<MultiplyAddShift, 4>) 44.2 45 45 1.43X 1.44X 1.43X
+// (Multiple<CRC, 4>) 118 120 121 3.84X 3.86X 3.85X
+// (Multiple<MultiplyShift, 4>) 156 159 159 5.07X 5.1X 5.08X
+// MultiplyAddShift128 75.7 77.2 77.2 2.46X 2.48X 2.46X
+// MultiplyShift128 128 131 133 4.16X 4.21X 4.23X
+//
+// 32 x 8 -> 32 x 8: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+// (Multiple<Zobrist, 8>) 31 31.5 31.8 1X 1X 1X
+// (Multiple<MultiplyAddShift, 8>) 44.3 44.5 45.2 1.43X 1.41X 1.42X
+// (Multiple<CRC, 8>) 121 123 123 3.9X 3.9X 3.88X
+// (Multiple<MultiplyShift, 8>) 158 159 160 5.11X 5.05X 5.04X
+// Zobrist256simple 16.5 16.5 16.6 0.533X 0.524X 0.522X
+// Zobrist256gather 18.8 19.2 19.4 0.608X 0.608X 0.61X
+// MultiplyAddShift256 151 154 154 4.88X 4.88X 4.84X
+// MultiplyShift256 209 212 212 6.73X 6.71X 6.67X
// Rotate 32 bits right by 'shift'. This is _rotr in the Intel intrinsics, but that isn't
// usable on Clang yet. Fortunately, both GCC and Clang can optimize this to use the 'ror'
// instruction.
-uint32_t RotateRight(uint32_t x, int shift) {
- DCHECK_GT(shift, 0);
- DCHECK_LT(shift, std::numeric_limits<decltype(x)>::digits);
- return (x << (std::numeric_limits<decltype(x)>::digits - shift)) | (x >> shift);
+template<int SHIFT>
+inline uint32_t RotateRight(uint32_t x) {
+ static_assert(SHIFT > 0, "Only positive shifts are defined behavior and useful");
+ static_assert(
+ SHIFT < std::numeric_limits<decltype(x)>::digits, "This much shift is just 0");
+ return (x << (std::numeric_limits<decltype(x)>::digits - SHIFT)) | (x >> SHIFT);
}
// Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
@@ -60,61 +91,178 @@ uint32_t MakeRandU32() {
// Almost universal hashing, M. Dietzfelbinger, T. Hagerup, J. Katajainen, and M.
// Penttonen, "A reliable randomized algorithm for the closest-pair problem".
-uint32_t MultiplyShift(uint32_t x) {
+inline void MultiplyShift(uint32_t* x) {
static const uint32_t m = 0x61eaf8e9u;
- return x*m;
+ *x = *x * m;
+}
+
+// Like MultiplyShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+//
+// TODO: Add the Intel intrinsics used in this function and the other functions in this
+// file to sse-util.h so that these functions can be inlined.
+inline void MultiplyShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyShift128(__m128i* x) {
+ const __m128i m = _mm_set1_epi32(0x61eaf8e9);
+ _mm_storeu_si128(x, _mm_mullo_epi32(_mm_loadu_si128(x), m));
+}
+
+// Like MultiplyShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+//
+// Not inline, because it degrades the performance for unknown reasons.
+void MultiplyShift256(__m256i* x) __attribute__((__target__("avx2")));
+void MultiplyShift256(__m256i* x) {
+ const __m256i m = _mm256_set1_epi32(0x61eaf8e9);
+ _mm256_storeu_si256(x, _mm256_mullo_epi32(_mm256_loadu_si256(x), m));
}
// 2-independent hashing. M. Dietzfelbinger, "Universal hashing and k-wise independent
// random variables via integer arithmetic without primes"
-uint32_t MultiplyAddShift(uint32_t x) {
+inline void MultiplyAddShift(uint32_t* x) {
static const uint64_t m = 0xa1f1bd3e020b4be0ull, a = 0x86b0426193d86e66ull;
- return (static_cast<uint64_t>(x) * m + a) >> 32;
+ *x = (static_cast<uint64_t>(*x) * m + a) >> 32;
+}
+
+// Like MultiplyAddShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+inline void MultiplyAddShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyAddShift128(__m128i* x) {
+ const auto m = _mm_set1_epi64x(0xa1f1bd3e020b4be0ull),
+ mhi = _mm_set1_epi32(0xa1f1bd3e),
+ a = _mm_set1_epi64x(0x86b0426193d86e66ull);
+ auto input = _mm_loadu_si128(x);
+ auto prod32easy = _mm_mullo_epi32(input, mhi);
+ auto input_odds = _mm_srli_epi64(input, 32);
+ auto prod64_evens = _mm_mul_epu32(input, m),
+ prod64_odds = _mm_mul_epu32(input_odds, m);
+ prod64_evens = _mm_add_epi64(a, prod64_evens);
+ prod64_odds = _mm_add_epi64(a, prod64_odds);
+ auto prod32hard = _mm_unpackhi_epi32(prod64_evens, prod64_odds);
+ _mm_storeu_si128(x, _mm_add_epi32(prod32easy, prod32hard));
+}
+
+// Like MultiplyAddShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+inline void MultiplyAddShift256(__m256i* x) __attribute__((__target__("avx2")));
+inline void MultiplyAddShift256(__m256i* x) {
+ const __m256i m = _mm256_set1_epi64x(0xa1f1bd3e020b4be0ull),
+ mhi = _mm256_set1_epi32(0xa1f1bd3e),
+ a = _mm256_set1_epi64x(0x86b0426193d86e66ull);
+ __m256i input = _mm256_loadu_si256(x);
+ __m256i prod32easy = _mm256_mullo_epi32(input, mhi);
+ __m256i input_odds = _mm256_srli_epi64(input, 32);
+ __m256i prod64_evens = _mm256_mul_epu32(input, m),
+ prod64_odds = _mm256_mul_epu32(input_odds, m);
+ prod64_evens = _mm256_add_epi64(a, prod64_evens);
+ prod64_odds = _mm256_add_epi64(a, prod64_odds);
+ __m256i prod32hard = _mm256_unpackhi_epi32(prod64_evens, prod64_odds);
+ _mm256_storeu_si256(x, _mm256_add_epi32(prod32easy, prod32hard));
}
// From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t Jenkins1(int32_t x) {
- x = ~x + (x << 15); // x = (x << 15) - x - 1;
- x = x ^ RotateRight(x, 12);
- x = x + (x << 2);
- x = x ^ RotateRight(x, 4);
- x = x * 2057; // x = (x + (x << 3)) + (x << 11);
- x = x ^ RotateRight(x, 16);
- return x;
+inline void Jenkins1(int32_t* x) {
+ *x = ~*x + (*x << 15); // x = (x << 15) - x - 1;
+ *x = *x ^ RotateRight<12>(*x);
+ *x = *x + (*x << 2);
+ *x = *x ^ RotateRight<4>(*x);
+ *x = *x * 2057; // x = (x + (x << 3)) + (x << 11);
+ *x = *x ^ RotateRight<16>(*x);
}
// From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-uint32_t Jenkins2(uint32_t a) {
- a = (a + 0x7ed55d16) + (a << 12);
- a = (a ^ 0xc761c23c) ^ (a >> 19);
- a = (a + 0x165667b1) + (a << 5);
- a = (a + 0xd3a2646c) ^ (a << 9);
- a = (a + 0xfd7046c5) + (a << 3);
- a = (a ^ 0xb55a4f09) ^ (a >> 16);
- return a;
+inline void Jenkins2(uint32_t* a) {
+ *a = (*a + 0x7ed55d16) + (*a << 12);
+ *a = (*a ^ 0xc761c23c) ^ (*a >> 19);
+ *a = (*a + 0x165667b1) + (*a << 5);
+ *a = (*a + 0xd3a2646c) ^ (*a << 9);
+ *a = (*a + 0xfd7046c5) + (*a << 3);
+ *a = (*a ^ 0xb55a4f09) ^ (*a >> 16);
}
// From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t MultRot(int32_t key) {
+inline void MultRot(int32_t* key) {
static const int32_t c2 = 0x27d4eb2d; // a prime or an odd constant
- key = (key ^ 61) ^ RotateRight(key, 16);
- key = key + (key << 3);
- key = key ^ RotateRight(key, 4);
- key = key * c2;
- key = key ^ RotateRight(key, 15);
- return key;
+ *key = (*key ^ 61) ^ RotateRight<16>(*key);
+ *key = *key + (*key << 3);
+ *key = *key ^ RotateRight<4>(*key);
+ *key = *key * c2;
+ *key = *key ^ RotateRight<15>(*key);
+}
+
+inline void CRC(uint32_t* x) {
+ *x = SSE4_crc32_u32(*x, 0xab8ce2abu);
+}
+
+inline void FNV(uint32_t* key) {
+ *key = HashUtil::FnvHash64to32(key, sizeof(*key), HashUtil::FNV_SEED);
+}
+
+// Zobrist hashing, also known as tabulation hashing or simple tabulation hashing, is an
+// old technique that has been recently analyzed and found to be very good for a number of
+// applications. See "The Power of Simple Tabulation Hashing", by Mihai Patrascu and
+// Mikkel Thorup.
+
+uint32_t ZOBRIST_DATA[4][256];
+
+inline void Zobrist(uint32_t* key) {
+ const uint8_t* key_chars = reinterpret_cast<const uint8_t*>(key);
+ *key = ZOBRIST_DATA[0][key_chars[0]] ^ ZOBRIST_DATA[1][key_chars[1]]
+ ^ ZOBRIST_DATA[2][key_chars[2]] ^ ZOBRIST_DATA[3][key_chars[3]];
+}
+
+// Like Zobrist, but uses AVX2's "gather" primitives to hash 8 values at once.
+inline void Zobrist256gather(__m256i* key) __attribute__((__target__("avx2")));
+inline void Zobrist256gather(__m256i* key) {
+ const auto k = _mm256_loadu_si256(key);
+ const auto low_mask = _mm256_set1_epi32(0xff);
+ auto k0 = _mm256_and_si256(low_mask, k),
+ k1 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 8)),
+ k2 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 16)),
+ k3 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 24));
+ k0 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[0]), k0, 1);
+ k1 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[1]), k1, 1);
+ k2 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[2]), k2, 1);
+ k3 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[3]), k3, 1);
+ auto k01 = _mm256_xor_si256(k0, k1), k23 = _mm256_xor_si256(k2, k3);
+ _mm256_storeu_si256(key, _mm256_xor_si256(k01, k23));
}
-uint32_t CRC(uint32_t x) {
- return SSE4_crc32_u32(x,0xab8ce2abu);
+// Like Zobrist256gather, but only uses AVX2's SIMD xor, not its gather.
+inline void Zobrist256simple(uint32_t (*key)[8]) __attribute__((__target__("avx2")));
+inline void Zobrist256simple(uint32_t (*key)[8]) {
+ uint32_t row[4][8];
+ const uint8_t (*key_chars)[8][4] = reinterpret_cast<const uint8_t (*)[8][4]>(key);
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ row[i][j] = ZOBRIST_DATA[i][(*key_chars)[j][i]];
+ }
+ }
+ auto result0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[0])),
+ result1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[1])),
+ result2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[2])),
+ result3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[3]));
+ auto k01 = _mm256_xor_si256(result0, result1), k23 = _mm256_xor_si256(result2, result3);
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(*key), _mm256_xor_si256(k01, k23));
+}
+
+// Perform one hash function the given number of times. This can sometimes auto-vectorize.
+//
+// TODO: We could also test the costs of running on non-contiguous uint32_t's. For
+// instance, ExprValuesCache.expr_values_array_ might have multiple values to hash per
+// input row.
+template <void (*F)(uint32_t*), size_t N>
+inline void Multiple(uint32_t (*x)[N]) {
+ for (int i = 0; i < N; ++i) {
+ F((*x) + i);
+ }
}
-template<typename T, T (*HASH)(T)>
+// The size of the test data we run each hash function on:
+static const size_t DATA_SIZE = 1 << 15;
+
+template<typename T, void (*HASH)(T*)>
void Run(int batch_size, void* data) {
- vector<T>* d = reinterpret_cast<vector<T>*>(data);
+ T* d = reinterpret_cast<T*>(data);
for (int i = 0; i < batch_size; ++i) {
- for (int j = 0; j < d->size(); ++j) {
- (*d)[j] = HASH((*d)[j]);
+ for (int j = 0; j < ((sizeof(uint32_t)) * DATA_SIZE) / sizeof(T); ++j) {
+ HASH(&d[j]);
}
}
}
@@ -124,21 +272,61 @@ int main() {
cout << endl
<< Benchmark::GetMachineInfo() << endl;
- vector<uint32_t> ud(1 << 15);
- for (size_t i = 0; i < ud.size(); ++i) {
+ unique_ptr<uint32_t[]> ud(new uint32_t[DATA_SIZE]);
+ for (size_t i = 0; i < (DATA_SIZE); ++i) {
ud[i] = MakeRandU32();
}
- Benchmark suite("32 -> 32");
+ for (size_t i = 0; i < 4; ++i) {
+ for (size_t j = 0; j < 256; ++j) {
+ ZOBRIST_DATA[i][j] = MakeRandU32();
+ }
+ }
+
+ Benchmark suite32("32 -> 32");
-#define BENCH(T,x) suite.AddBenchmark(#x, Run<T, x>, &ud)
- BENCH(uint32_t, Jenkins2);
- BENCH(int32_t, Jenkins1);
- BENCH(int32_t, MultRot);
- BENCH(uint32_t, MultiplyAddShift);
- BENCH(uint32_t, MultiplyShift);
- BENCH(uint32_t, CRC);
-#undef BENCH
+#define BENCH(T,x) AddBenchmark(#x, Run<T, x>, ud.get())
+
+ suite32.BENCH(uint32_t, FNV);
+ suite32.BENCH(uint32_t, Zobrist);
+ suite32.BENCH(int32_t, MultRot);
+ suite32.BENCH(uint32_t, MultiplyAddShift);
+ suite32.BENCH(int32_t, Jenkins1);
+ suite32.BENCH(uint32_t, Jenkins2);
+ if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) suite32.BENCH(uint32_t, CRC);
+ suite32.BENCH(uint32_t, MultiplyShift);
- cout << suite.Measure() << endl;
+ cout << suite32.Measure() << endl;
+
+ Benchmark suite32x4("32 x 4 -> 32 x 4");
+
+ suite32x4.BENCH(uint32_t[4], (Multiple<Zobrist, 4>));
+ suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyAddShift, 4>));
+ if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+ suite32x4.BENCH(uint32_t[4], (Multiple<CRC, 4>));
+ }
+ suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyShift, 4>));
+ if (CpuInfo::IsSupported(CpuInfo::SSE4_1)) {
+ suite32x4.BENCH(__m128i, MultiplyAddShift128);
+ suite32x4.BENCH(__m128i, MultiplyShift128);
+ }
+
+ cout << suite32x4.Measure() << endl;
+
+ Benchmark suite32x8("32 x 8 -> 32 x 8");
+
+ suite32x8.BENCH(uint32_t[8], (Multiple<Zobrist, 8>));
+ suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyAddShift, 8>));
+ suite32x8.BENCH(uint32_t[8], (Multiple<CRC, 8>));
+ suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyShift, 8>));
+ if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+ suite32x8.BENCH(uint32_t[8], Zobrist256simple);
+ suite32x8.BENCH(__m256i, Zobrist256gather);
+ suite32x8.BENCH(__m256i, MultiplyAddShift256);
+ suite32x8.BENCH(__m256i, MultiplyShift256);
+ }
+
+ cout << suite32x8.Measure() << endl;
+
+#undef BENCH
}