You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by he...@apache.org on 2016/10/24 22:48:04 UTC
[2/2] incubator-impala git commit: IMPALA-4300: Speed up
BloomFilter::Or with SIMD
IMPALA-4300: Speed up BloomFilter::Or with SIMD
The previous code was not written in a way that GCC could
auto-vectorize it. Manually vectorizing speeds up BloomFilter::Or by
up to 184x.
Change-Id: I840799d9cfb81285c796e2abfe2029bb869b0f67
Reviewed-on: http://gerrit.cloudera.org:8080/4813
Reviewed-by: Jim Apple <jb...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/61fcb489
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/61fcb489
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/61fcb489
Branch: refs/heads/master
Commit: 61fcb489745f3f0b3f1abbf9fbf666a29a6363de
Parents: 0fbb5b7
Author: Jim Apple <jb...@cloudera.com>
Authored: Fri Oct 21 07:46:42 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Mon Oct 24 18:07:25 2016 +0000
----------------------------------------------------------------------
be/src/benchmarks/bloom-filter-benchmark.cc | 198 +++++++++++++++--------
be/src/testutil/mem-util.h | 2 +
be/src/util/bloom-filter.cc | 49 +++++-
be/src/util/cpu-info.cc | 1 +
be/src/util/cpu-info.h | 3 +-
5 files changed, 178 insertions(+), 75 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/benchmarks/bloom-filter-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc b/be/src/benchmarks/bloom-filter-benchmark.cc
index 1aa0619..d9019c8 100644
--- a/be/src/benchmarks/bloom-filter-benchmark.cc
+++ b/be/src/benchmarks/bloom-filter-benchmark.cc
@@ -30,13 +30,14 @@
using namespace std;
using namespace impala;
-// Tests Bloom filter performance on four tasks:
+// Tests Bloom filter performance on:
//
// 1. Construct/destruct pairs
// 2. Inserts
// 3. Lookups when the item is present
// 4. Lookups when the item is absent (this is theoretically faster than when the item is
// present in some Bloom filter variants)
+// 5. Unions
//
// As in bloom-filter.h, ndv refers to the number of unique items inserted into a filter
// and fpp is the probability of false positives.
@@ -46,89 +47,116 @@ using namespace impala;
// initialize: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
// (relative) (relative) (relative)
// ---------------------------------------------------------------------------------------------------------
-// ndv 10k fpp 10.0% 7.05e+03 7.27e+03 7.34e+03 1X 1X 1X
-// ndv 10k fpp 1.0% 3.79e+03 3.93e+03 3.96e+03 0.538X 0.541X 0.54X
-// ndv 10k fpp 0.1% 1.39e+03 1.42e+03 1.44e+03 0.198X 0.196X 0.196X
-// ndv 1000k fpp 10.0% 4.62 4.78 4.81 0.000655X 0.000658X 0.000655X
-// ndv 1000k fpp 1.0% 2.49 2.55 2.6 0.000354X 0.000351X 0.000354X
-// ndv 1000k fpp 0.1% 2.45 2.55 2.6 0.000347X 0.000351X 0.000354X
-// ndv 100000k fpp 10.0% 0.035 0.0358 0.037 4.96e-06X 4.93e-06X 5.04e-06X
-// ndv 100000k fpp 1.0% 0.0347 0.0361 0.0372 4.93e-06X 4.96e-06X 5.06e-06X
-// ndv 100000k fpp 0.1% 0.0176 0.0181 0.0186 2.5e-06X 2.49e-06X 2.53e-06X
+// ndv 10k fpp 10.0% 5.89e+03 5.98e+03 6.03e+03 1X 1X 1X
+// ndv 10k fpp 1.0% 3.22e+03 3.25e+03 3.27e+03 0.546X 0.543X 0.542X
+// ndv 10k fpp 0.1% 1.13e+03 1.17e+03 1.18e+03 0.191X 0.195X 0.195X
+// ndv 1000k fpp 10.0% 3.85 3.93 3.93 0.000654X 0.000657X 0.000652X
+// ndv 1000k fpp 1.0% 2.04 2.12 2.12 0.000346X 0.000354X 0.000351X
+// ndv 1000k fpp 0.1% 2.04 2.12 2.12 0.000346X 0.000354X 0.000351X
+// ndv 100000k fpp 10.0% 0.0281 0.029 0.0294 4.77e-06X 4.85e-06X 4.87e-06X
+// ndv 100000k fpp 1.0% 0.0284 0.029 0.0298 4.82e-06X 4.85e-06X 4.93e-06X
+// ndv 100000k fpp 0.1% 0.0144 0.0147 0.0149 2.44e-06X 2.47e-06X 2.47e-06X
//
// With AVX2:
//
// insert: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
// (relative) (relative) (relative)
// ---------------------------------------------------------------------------------------------------------
-// ndv 10k fpp 10.0% 2.03e+05 2.05e+05 2.08e+05 1X 1X 1X
-// ndv 10k fpp 1.0% 2.03e+05 2.06e+05 2.08e+05 0.997X 1X 1X
-// ndv 10k fpp 0.1% 2.03e+05 2.05e+05 2.07e+05 0.997X 0.998X 0.997X
-// ndv 1000k fpp 10.0% 1.82e+05 1.87e+05 1.89e+05 0.896X 0.91X 0.907X
-// ndv 1000k fpp 1.0% 1.49e+05 1.53e+05 1.56e+05 0.731X 0.747X 0.75X
-// ndv 1000k fpp 0.1% 1.79e+05 1.82e+05 1.83e+05 0.881X 0.886X 0.882X
-// ndv 100000k fpp 10.0% 4.08e+04 4.49e+04 5.44e+04 0.201X 0.219X 0.262X
-// ndv 100000k fpp 1.0% 3.94e+04 4.4e+04 5.04e+04 0.194X 0.214X 0.242X
-// ndv 100000k fpp 0.1% 4.08e+04 4.48e+04 5.68e+04 0.201X 0.218X 0.273X
+// ndv 10k fpp 10.0% 1.17e+05 1.23e+05 1.25e+05 1X 1X 1X
+// ndv 10k fpp 1.0% 1.17e+05 1.24e+05 1.25e+05 1X 1X 1X
+// ndv 10k fpp 0.1% 1.2e+05 1.23e+05 1.24e+05 1.02X 0.996X 0.991X
+// ndv 1000k fpp 10.0% 1.1e+05 1.18e+05 1.2e+05 0.944X 0.959X 0.96X
+// ndv 1000k fpp 1.0% 1.11e+05 1.16e+05 1.17e+05 0.954X 0.938X 0.934X
+// ndv 1000k fpp 0.1% 9.73e+04 1.16e+05 1.17e+05 0.834X 0.937X 0.936X
+// ndv 100000k fpp 10.0% 2.96e+04 4.19e+04 5.44e+04 0.254X 0.34X 0.436X
+// ndv 100000k fpp 1.0% 2.92e+04 3.81e+04 4.89e+04 0.25X 0.308X 0.391X
+// ndv 100000k fpp 0.1% 2.44e+04 3.28e+04 4.31e+04 0.209X 0.266X 0.345X
//
// find: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
// (relative) (relative) (relative)
// ---------------------------------------------------------------------------------------------------------
-// present ndv 10k fpp 10.0% 2.48e+05 2.51e+05 2.53e+05 1X 1X 1X
-// absent ndv 10k fpp 10.0% 2.47e+05 2.52e+05 2.55e+05 0.995X 1X 1.01X
-// present ndv 10k fpp 1.0% 2.49e+05 2.52e+05 2.55e+05 1X 1.01X 1.01X
-// absent ndv 10k fpp 1.0% 2.47e+05 2.53e+05 2.56e+05 0.997X 1.01X 1.01X
-// present ndv 10k fpp 0.1% 2.49e+05 2.53e+05 2.54e+05 1X 1.01X 1.01X
-// absent ndv 10k fpp 0.1% 2.47e+05 2.53e+05 2.56e+05 0.997X 1.01X 1.01X
-// present ndv 1000k fpp 10.0% 1.98e+05 2.04e+05 2.06e+05 0.8X 0.814X 0.812X
-// absent ndv 1000k fpp 10.0% 2.01e+05 2.07e+05 2.1e+05 0.808X 0.826X 0.829X
-// present ndv 1000k fpp 1.0% 1.83e+05 1.95e+05 2.02e+05 0.737X 0.78X 0.798X
-// absent ndv 1000k fpp 1.0% 2.01e+05 2.04e+05 2.08e+05 0.808X 0.815X 0.82X
-// present ndv 1000k fpp 0.1% 1.96e+05 2.01e+05 2.03e+05 0.788X 0.8X 0.801X
-// absent ndv 1000k fpp 0.1% 2e+05 2.05e+05 2.07e+05 0.808X 0.817X 0.818X
-// present ndv 100000k fpp 10.0% 4.6e+04 5.09e+04 6.08e+04 0.185X 0.203X 0.24X
-// absent ndv 100000k fpp 10.0% 4.11e+04 4.36e+04 4.53e+04 0.166X 0.174X 0.179X
-// present ndv 100000k fpp 1.0% 4.55e+04 4.96e+04 6.19e+04 0.184X 0.198X 0.245X
-// absent ndv 100000k fpp 1.0% 3.83e+04 4.15e+04 4.69e+04 0.154X 0.166X 0.186X
-// present ndv 100000k fpp 0.1% 4.73e+04 5.43e+04 6.58e+04 0.191X 0.217X 0.26X
-// absent ndv 100000k fpp 0.1% 3.77e+04 4.07e+04 4.37e+04 0.152X 0.163X 0.173X
+// present ndv 10k fpp 10.0% 1.16e+05 1.17e+05 1.18e+05 1X 1X 1X
+// absent ndv 10k fpp 10.0% 1.16e+05 1.17e+05 1.18e+05 1X 1X 0.998X
+// present ndv 10k fpp 1.0% 1.15e+05 1.17e+05 1.18e+05 0.988X 1X 0.999X
+// absent ndv 10k fpp 1.0% 1.14e+05 1.17e+05 1.19e+05 0.978X 1X 1X
+// present ndv 10k fpp 0.1% 1.09e+05 1.17e+05 1.18e+05 0.939X 1X 1X
+// absent ndv 10k fpp 0.1% 1.13e+05 1.17e+05 1.18e+05 0.97X 1X 1X
+// present ndv 1000k fpp 10.0% 1.09e+05 1.13e+05 1.15e+05 0.942X 0.968X 0.97X
+// absent ndv 1000k fpp 10.0% 1.09e+05 1.15e+05 1.16e+05 0.937X 0.982X 0.982X
+// present ndv 1000k fpp 1.0% 9.44e+04 1.12e+05 1.13e+05 0.814X 0.952X 0.951X
+// absent ndv 1000k fpp 1.0% 1.02e+05 1.14e+05 1.15e+05 0.877X 0.973X 0.972X
+// present ndv 1000k fpp 0.1% 1.01e+05 1.11e+05 1.12e+05 0.868X 0.951X 0.949X
+// absent ndv 1000k fpp 0.1% 1.08e+05 1.14e+05 1.15e+05 0.927X 0.975X 0.975X
+// present ndv 100000k fpp 10.0% 3.18e+04 3.94e+04 5.18e+04 0.274X 0.336X 0.437X
+// absent ndv 100000k fpp 10.0% 2.74e+04 3.07e+04 3.49e+04 0.236X 0.262X 0.294X
+// present ndv 100000k fpp 1.0% 3.07e+04 4.29e+04 5.51e+04 0.265X 0.366X 0.465X
+// absent ndv 100000k fpp 1.0% 2.67e+04 2.9e+04 3.25e+04 0.23X 0.248X 0.274X
+// present ndv 100000k fpp 0.1% 2.78e+04 3.88e+04 4.9e+04 0.24X 0.331X 0.413X
+// absent ndv 100000k fpp 0.1% 2.44e+04 2.84e+04 3.02e+04 0.211X 0.242X 0.255X
//
-// Without AVX2:
+// union: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+// ndv 10k fpp 10.0% 1.81e+04 1.84e+04 1.86e+04 1X 1X 1X
+// ndv 10k fpp 1.0% 8.25e+03 8.39e+03 8.47e+03 0.455X 0.455X 0.455X
+// ndv 10k fpp 0.1% 4.02e+03 4.31e+03 4.35e+03 0.222X 0.234X 0.234X
+// ndv 1000k fpp 10.0% 105 111 112 0.00577X 0.00603X 0.00602X
+// ndv 1000k fpp 1.0% 45.9 46.4 46.9 0.00253X 0.00252X 0.00252X
+// ndv 1000k fpp 0.1% 46.2 46.6 46.9 0.00255X 0.00253X 0.00252X
+// ndv 100000k fpp 10.0% 0.2 0.2 0.2 1.1e-05X 1.08e-05X 1.07e-05X
+// ndv 100000k fpp 1.0% 0.2 0.2 0.2 1.1e-05X 1.08e-05X 1.07e-05X
+// ndv 100000k fpp 0.1% 0.133 0.143 0.145 7.35e-06X 7.75e-06X 7.79e-06X
+//
+//
+// Without AVX or AVX2:
//
// insert: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
// (relative) (relative) (relative)
// ---------------------------------------------------------------------------------------------------------
-// ndv 10k fpp 10.0% 1.25e+05 1.27e+05 1.28e+05 1X 1X 1X
-// ndv 10k fpp 1.0% 1.27e+05 1.29e+05 1.3e+05 1.01X 1.02X 1.02X
-// ndv 10k fpp 0.1% 1.26e+05 1.28e+05 1.3e+05 1X 1.01X 1.01X
-// ndv 1000k fpp 10.0% 1.23e+05 1.25e+05 1.26e+05 0.977X 0.981X 0.985X
-// ndv 1000k fpp 1.0% 1.16e+05 1.22e+05 1.23e+05 0.925X 0.958X 0.958X
-// ndv 1000k fpp 0.1% 1.16e+05 1.22e+05 1.23e+05 0.928X 0.958X 0.957X
-// ndv 100000k fpp 10.0% 3.77e+04 4.06e+04 5.62e+04 0.301X 0.319X 0.438X
-// ndv 100000k fpp 1.0% 3.71e+04 4.06e+04 5.45e+04 0.296X 0.32X 0.425X
-// ndv 100000k fpp 0.1% 3.37e+04 3.68e+04 5.15e+04 0.269X 0.29X 0.401X
+// ndv 10k fpp 10.0% 9.27e+04 9.33e+04 9.4e+04 1X 1X 1X
+// ndv 10k fpp 1.0% 9.43e+04 9.49e+04 9.61e+04 1.02X 1.02X 1.02X
+// ndv 10k fpp 0.1% 9.36e+04 9.5e+04 9.58e+04 1.01X 1.02X 1.02X
+// ndv 1000k fpp 10.0% 8.4e+04 9.49e+04 9.61e+04 0.906X 1.02X 1.02X
+// ndv 1000k fpp 1.0% 7.64e+04 9.34e+04 9.45e+04 0.824X 1X 1.01X
+// ndv 1000k fpp 0.1% 8.24e+04 9.34e+04 9.44e+04 0.888X 1X 1X
+// ndv 100000k fpp 10.0% 3.22e+04 4e+04 5.03e+04 0.347X 0.429X 0.535X
+// ndv 100000k fpp 1.0% 2.77e+04 3.6e+04 4.8e+04 0.298X 0.386X 0.51X
+// ndv 100000k fpp 0.1% 2.54e+04 2.93e+04 4.32e+04 0.274X 0.314X 0.46X
//
// find: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
// (relative) (relative) (relative)
// ---------------------------------------------------------------------------------------------------------
-// present ndv 10k fpp 10.0% 1.6e+05 1.64e+05 1.66e+05 1X 1X 1X
-// absent ndv 10k fpp 10.0% 1.11e+05 1.14e+05 1.15e+05 0.696X 0.697X 0.695X
-// present ndv 10k fpp 1.0% 1.57e+05 1.63e+05 1.64e+05 0.982X 0.994X 0.989X
-// absent ndv 10k fpp 1.0% 1.3e+05 1.33e+05 1.35e+05 0.814X 0.813X 0.812X
-// present ndv 10k fpp 0.1% 1.55e+05 1.58e+05 1.61e+05 0.967X 0.968X 0.969X
-// absent ndv 10k fpp 0.1% 2.26e+05 2.29e+05 2.31e+05 1.41X 1.4X 1.4X
-// present ndv 1000k fpp 10.0% 1.21e+05 1.23e+05 1.25e+05 0.758X 0.753X 0.756X
-// absent ndv 1000k fpp 10.0% 7.6e+04 7.72e+04 7.81e+04 0.475X 0.472X 0.471X
-// present ndv 1000k fpp 1.0% 1.23e+05 1.27e+05 1.28e+05 0.771X 0.773X 0.77X
-// absent ndv 1000k fpp 1.0% 1.19e+05 1.21e+05 1.22e+05 0.744X 0.739X 0.738X
-// present ndv 1000k fpp 0.1% 1.17e+05 1.18e+05 1.2e+05 0.731X 0.724X 0.723X
-// absent ndv 1000k fpp 0.1% 1.13e+05 1.16e+05 1.17e+05 0.707X 0.706X 0.705X
-// present ndv 100000k fpp 10.0% 3.42e+04 3.63e+04 3.9e+04 0.214X 0.222X 0.235X
-// absent ndv 100000k fpp 10.0% 3.6e+04 3.77e+04 3.82e+04 0.225X 0.23X 0.23X
-// present ndv 100000k fpp 1.0% 3.18e+04 3.42e+04 3.57e+04 0.199X 0.209X 0.216X
-// absent ndv 100000k fpp 1.0% 3.63e+04 3.73e+04 3.79e+04 0.227X 0.228X 0.229X
-// present ndv 100000k fpp 0.1% 2.89e+04 3.2e+04 3.33e+04 0.18X 0.196X 0.201X
-// absent ndv 100000k fpp 0.1% 4.56e+04 4.78e+04 4.86e+04 0.285X 0.292X 0.293X
+// present ndv 10k fpp 10.0% 1.3e+05 1.31e+05 1.33e+05 1X 1X 1X
+// absent ndv 10k fpp 10.0% 8.74e+04 8.83e+04 8.92e+04 0.674X 0.673X 0.671X
+// present ndv 10k fpp 1.0% 1.25e+05 1.3e+05 1.31e+05 0.96X 0.991X 0.988X
+// absent ndv 10k fpp 1.0% 1.04e+05 1.06e+05 1.07e+05 0.805X 0.809X 0.807X
+// present ndv 10k fpp 0.1% 1.28e+05 1.3e+05 1.31e+05 0.986X 0.988X 0.984X
+// absent ndv 10k fpp 0.1% 1.69e+05 1.72e+05 1.74e+05 1.3X 1.31X 1.31X
+// present ndv 1000k fpp 10.0% 9.33e+04 9.6e+04 9.69e+04 0.719X 0.732X 0.729X
+// absent ndv 1000k fpp 10.0% 5.99e+04 6.07e+04 6.12e+04 0.462X 0.462X 0.461X
+// present ndv 1000k fpp 1.0% 9.48e+04 1.01e+05 1.02e+05 0.731X 0.768X 0.768X
+// absent ndv 1000k fpp 1.0% 9.49e+04 9.67e+04 9.74e+04 0.731X 0.737X 0.734X
+// present ndv 1000k fpp 0.1% 8.46e+04 9.3e+04 9.41e+04 0.652X 0.709X 0.709X
+// absent ndv 1000k fpp 0.1% 9.05e+04 9.18e+04 9.28e+04 0.697X 0.7X 0.699X
+// present ndv 100000k fpp 10.0% 2.6e+04 2.88e+04 3.11e+04 0.201X 0.22X 0.235X
+// absent ndv 100000k fpp 10.0% 2.88e+04 2.99e+04 3.08e+04 0.222X 0.228X 0.232X
+// present ndv 100000k fpp 1.0% 2.34e+04 2.76e+04 2.91e+04 0.18X 0.21X 0.219X
+// absent ndv 100000k fpp 1.0% 2.86e+04 2.97e+04 3.03e+04 0.22X 0.227X 0.228X
+// present ndv 100000k fpp 0.1% 2.34e+04 2.65e+04 2.81e+04 0.18X 0.202X 0.211X
+// absent ndv 100000k fpp 0.1% 3.73e+04 3.85e+04 3.91e+04 0.287X 0.293X 0.295X
+//
+// union: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+// ndv 10k fpp 10.0% 3.06e+03 3.1e+03 3.12e+03 1X 1X 1X
+// ndv 10k fpp 1.0% 1.51e+03 1.55e+03 1.57e+03 0.493X 0.502X 0.503X
+// ndv 10k fpp 0.1% 748 775 782 0.244X 0.25X 0.251X
+// ndv 1000k fpp 10.0% 19.6 20 20.2 0.0064X 0.00646X 0.00647X
+// ndv 1000k fpp 1.0% 9.41 10 10.1 0.00307X 0.00324X 0.00323X
+// ndv 1000k fpp 0.1% 9.9 10 10.1 0.00323X 0.00324X 0.00323X
+// ndv 100000k fpp 10.0% 0.0671 0.0714 0.0725 2.19e-05X 2.3e-05X 2.32e-05X
+// ndv 100000k fpp 1.0% 0.0676 0.0709 0.0719 2.21e-05X 2.29e-05X 2.31e-05X
+// ndv 100000k fpp 0.1% 0.0338 0.035 0.0356 1.1e-05X 1.13e-05X 1.14e-05X
// Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
// produced by rand().
@@ -221,6 +249,27 @@ void Absent(int batch_size, void* data) {
} // namespace find
+// Benchmark or
+namespace either {
+
+struct TestData {
+ explicit TestData(int log_heap_size) {
+ BloomFilter bf(log_heap_size);
+ BloomFilter::ToThrift(&bf, &tbf);
+ }
+
+ TBloomFilter tbf;
+};
+
+void Benchmark(int batch_size, void* data) {
+ TestData* d = reinterpret_cast<TestData*>(data);
+ for (int i = 0; i < batch_size; ++i) {
+ BloomFilter::Or(d->tbf, &d->tbf);
+ }
+}
+
+} // namespace either
+
void RunBenchmarks() {
char name[120];
@@ -254,6 +303,20 @@ void RunBenchmarks() {
}
cout << suite.Measure() << endl;
}
+
+ {
+ Benchmark suite("union");
+ vector<unique_ptr<either::TestData> > testdata;
+ for (int ndv = 10000; ndv <= 100 * 1000 * 1000; ndv *= 100) {
+ for (double fpp = 0.1; fpp >= 0.001; fpp /= 10) {
+ testdata.emplace_back(
+ new either::TestData(BloomFilter::MinLogSpace(ndv, fpp)));
+ snprintf(name, sizeof(name), "ndv %7dk fpp %6.1f%%", ndv/1000, fpp*100);
+ suite.AddBenchmark(name, either::Benchmark, testdata.back().get());
+ }
+ }
+ cout << suite.Measure() << endl;
+ }
}
int main(int argc, char **argv) {
@@ -277,7 +340,8 @@ int main(int argc, char **argv) {
cout << "With AVX2:" << endl << endl;
RunBenchmarks();
- cout << endl << "Without AVX2:" << endl << endl;
- CpuInfo::TempDisable t(CpuInfo::AVX2);
+ cout << endl << "Without AVX or AVX2:" << endl << endl;
+ CpuInfo::TempDisable t1(CpuInfo::AVX);
+ CpuInfo::TempDisable t2(CpuInfo::AVX2);
RunBenchmarks();
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/testutil/mem-util.h
----------------------------------------------------------------------
diff --git a/be/src/testutil/mem-util.h b/be/src/testutil/mem-util.h
index 78b7b48..b4ce9ea 100644
--- a/be/src/testutil/mem-util.h
+++ b/be/src/testutil/mem-util.h
@@ -21,6 +21,8 @@
#include <cstdint>
#include <cstdlib>
+#include <glog/logging.h>
+
#include "gutil/macros.h"
namespace impala {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/bloom-filter.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.cc b/be/src/util/bloom-filter.cc
index 6fd53f5..2aadc05 100644
--- a/be/src/util/bloom-filter.cc
+++ b/be/src/util/bloom-filter.cc
@@ -154,6 +154,21 @@ bool BloomFilter::BucketFind(
return true;
}
+namespace {
+// Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' using AVX
+// instructions. 'n' must be a multiple of 32.
+void __attribute__((target("avx"))) OrEqualArrayAvx(size_t n, const char* in, char* out) {
+ constexpr size_t REGISTER_SIZE = sizeof(__m256d);
+ DCHECK_EQ(n % REGISTER_SIZE, 0) << "Invalid Bloom Filter directory size";
+ const double* simd_in = reinterpret_cast<const double*>(in);
+ double* simd_out = reinterpret_cast<double*>(out);
+ const size_t simd_size = n / REGISTER_SIZE;
+ for (size_t i = 0; i < simd_size; i += REGISTER_SIZE / sizeof(simd_in[0])) {
+ _mm256_storeu_pd(simd_out + i,
+ _mm256_or_pd(_mm256_loadu_pd(simd_out + i), _mm256_loadu_pd(simd_in + i)));
+ }
+}
+} //namespace
void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) {
DCHECK(out != NULL);
@@ -163,8 +178,29 @@ void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) {
out->directory.resize(0);
return;
}
-
- for (int i = 0; i < in.directory.size(); ++i) out->directory[i] |= in.directory[i];
+ // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
+ // written in a way that is very friendly to auto-vectorization. Instead, we manually
+ // vectorize, increasing the speed by up to 184x.
+ //
+ // TODO: Tune gcc flags to auto-vectorize the trivial loop instead of hand-vectorizing
+ // it. This might not be possible.
+ if (CpuInfo::IsSupported(CpuInfo::AVX)) {
+ OrEqualArrayAvx(in.directory.size(), &in.directory[0], &out->directory[0]);
+ } else {
+ const __m128i* simd_in = reinterpret_cast<const __m128i*>(&in.directory[0]);
+ __m128i* simd_out = reinterpret_cast<__m128i*>(&out->directory[0]);
+ const size_t simd_size =
+ (in.directory.size() * sizeof(in.directory[0])) / sizeof(simd_in[0]);
+ // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(__m128i)
+ // == 16, we can do two _mm_or_si128's in each iteration without checking array
+ // bounds.
+ for (size_t i = 0; i < simd_size; i += 2) {
+ _mm_storeu_si128(simd_out + i,
+ _mm_or_si128(_mm_loadu_si128(simd_out + i), _mm_loadu_si128(simd_in + i)));
+ _mm_storeu_si128(simd_out + i + 1, _mm_or_si128(_mm_loadu_si128(simd_out + i + 1),
+ _mm_loadu_si128(simd_in + i + 1)));
+ }
+ }
}
// The following three methods are derived from
@@ -187,14 +223,13 @@ int BloomFilter::MinLogSpace(const size_t ndv, const double fpp) {
const double m = -k * ndv / log(1 - pow(fpp, 1.0 / k));
// Handle case where ndv == 1 => ceil(log2(m/8)) < 0.
- return max(0, static_cast<int>(ceil(log2(m/8))));
+ return max(0, static_cast<int>(ceil(log2(m / 8))));
}
double BloomFilter::FalsePositiveProb(const size_t ndv, const int log_heap_space) {
- return pow(
- 1 - exp((-1.0 * static_cast<double>(BUCKET_WORDS) * static_cast<double>(ndv)) /
- static_cast<double>(1ull << (log_heap_space + 3))),
+ return pow(1 - exp((-1.0 * static_cast<double>(BUCKET_WORDS) * static_cast<double>(ndv))
+ / static_cast<double>(1ull << (log_heap_space + 3))),
BUCKET_WORDS);
}
-} // namespace impala
+} // namespace impala
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/cpu-info.cc
----------------------------------------------------------------------
diff --git a/be/src/util/cpu-info.cc b/be/src/util/cpu-info.cc
index 532c9dd..6329ca8 100644
--- a/be/src/util/cpu-info.cc
+++ b/be/src/util/cpu-info.cc
@@ -77,6 +77,7 @@ static struct {
{ "sse4_1", CpuInfo::SSE4_1 },
{ "sse4_2", CpuInfo::SSE4_2 },
{ "popcnt", CpuInfo::POPCNT },
+ { "avx", CpuInfo::AVX },
{ "avx2", CpuInfo::AVX2 },
};
static const long num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/cpu-info.h
----------------------------------------------------------------------
diff --git a/be/src/util/cpu-info.h b/be/src/util/cpu-info.h
index cb577c2..868d2dd 100644
--- a/be/src/util/cpu-info.h
+++ b/be/src/util/cpu-info.h
@@ -36,7 +36,8 @@ class CpuInfo {
static const int64_t SSE4_1 = (1 << 2);
static const int64_t SSE4_2 = (1 << 3);
static const int64_t POPCNT = (1 << 4);
- static const int64_t AVX2 = (1 << 5);
+ static const int64_t AVX = (1 << 5);
+ static const int64_t AVX2 = (1 << 6);
/// Cache enums for L1 (data), L2 and L3
enum CacheLevel {