You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/10/25 20:16:13 UTC

[26/33] incubator-impala git commit: IMPALA-4300: Speed up BloomFilter::Or with SIMD

IMPALA-4300: Speed up BloomFilter::Or with SIMD

The previous code was not written in a way that GCC could
auto-vectorize it. Manually vectorizing speeds up BloomFilter::Or by
up to 184x.

Change-Id: I840799d9cfb81285c796e2abfe2029bb869b0f67
Reviewed-on: http://gerrit.cloudera.org:8080/4813
Reviewed-by: Jim Apple <jb...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/61fcb489
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/61fcb489
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/61fcb489

Branch: refs/heads/hadoop-next
Commit: 61fcb489745f3f0b3f1abbf9fbf666a29a6363de
Parents: 0fbb5b7
Author: Jim Apple <jb...@cloudera.com>
Authored: Fri Oct 21 07:46:42 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Mon Oct 24 18:07:25 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/bloom-filter-benchmark.cc | 198 +++++++++++++++--------
 be/src/testutil/mem-util.h                  |   2 +
 be/src/util/bloom-filter.cc                 |  49 +++++-
 be/src/util/cpu-info.cc                     |   1 +
 be/src/util/cpu-info.h                      |   3 +-
 5 files changed, 178 insertions(+), 75 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/benchmarks/bloom-filter-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc b/be/src/benchmarks/bloom-filter-benchmark.cc
index 1aa0619..d9019c8 100644
--- a/be/src/benchmarks/bloom-filter-benchmark.cc
+++ b/be/src/benchmarks/bloom-filter-benchmark.cc
@@ -30,13 +30,14 @@
 using namespace std;
 using namespace impala;
 
-// Tests Bloom filter performance on four tasks:
+// Tests Bloom filter performance on:
 //
 // 1. Construct/destruct pairs
 // 2. Inserts
 // 3. Lookups when the item is present
 // 4. Lookups when the item is absent (this is theoretically faster than when the item is
 //    present in some Bloom filter variants)
+// 5. Unions
 //
 // As in bloom-filter.h, ndv refers to the number of unique items inserted into a filter
 // and fpp is the probability of false positives.
@@ -46,89 +47,116 @@ using namespace impala;
 // initialize:                Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           7.05e+03 7.27e+03 7.34e+03         1X         1X         1X
-//            ndv      10k fpp    1.0%           3.79e+03 3.93e+03 3.96e+03     0.538X     0.541X      0.54X
-//            ndv      10k fpp    0.1%           1.39e+03 1.42e+03 1.44e+03     0.198X     0.196X     0.196X
-//            ndv    1000k fpp   10.0%               4.62     4.78     4.81  0.000655X  0.000658X  0.000655X
-//            ndv    1000k fpp    1.0%               2.49     2.55      2.6  0.000354X  0.000351X  0.000354X
-//            ndv    1000k fpp    0.1%               2.45     2.55      2.6  0.000347X  0.000351X  0.000354X
-//            ndv  100000k fpp   10.0%              0.035   0.0358    0.037  4.96e-06X  4.93e-06X  5.04e-06X
-//            ndv  100000k fpp    1.0%             0.0347   0.0361   0.0372  4.93e-06X  4.96e-06X  5.06e-06X
-//            ndv  100000k fpp    0.1%             0.0176   0.0181   0.0186   2.5e-06X  2.49e-06X  2.53e-06X
+//            ndv      10k fpp   10.0%           5.89e+03 5.98e+03 6.03e+03         1X         1X         1X
+//            ndv      10k fpp    1.0%           3.22e+03 3.25e+03 3.27e+03     0.546X     0.543X     0.542X
+//            ndv      10k fpp    0.1%           1.13e+03 1.17e+03 1.18e+03     0.191X     0.195X     0.195X
+//            ndv    1000k fpp   10.0%               3.85     3.93     3.93  0.000654X  0.000657X  0.000652X
+//            ndv    1000k fpp    1.0%               2.04     2.12     2.12  0.000346X  0.000354X  0.000351X
+//            ndv    1000k fpp    0.1%               2.04     2.12     2.12  0.000346X  0.000354X  0.000351X
+//            ndv  100000k fpp   10.0%             0.0281    0.029   0.0294  4.77e-06X  4.85e-06X  4.87e-06X
+//            ndv  100000k fpp    1.0%             0.0284    0.029   0.0298  4.82e-06X  4.85e-06X  4.93e-06X
+//            ndv  100000k fpp    0.1%             0.0144   0.0147   0.0149  2.44e-06X  2.47e-06X  2.47e-06X
 //
 // With AVX2:
 //
 // insert:                    Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           2.03e+05 2.05e+05 2.08e+05         1X         1X         1X
-//            ndv      10k fpp    1.0%           2.03e+05 2.06e+05 2.08e+05     0.997X         1X         1X
-//            ndv      10k fpp    0.1%           2.03e+05 2.05e+05 2.07e+05     0.997X     0.998X     0.997X
-//            ndv    1000k fpp   10.0%           1.82e+05 1.87e+05 1.89e+05     0.896X      0.91X     0.907X
-//            ndv    1000k fpp    1.0%           1.49e+05 1.53e+05 1.56e+05     0.731X     0.747X      0.75X
-//            ndv    1000k fpp    0.1%           1.79e+05 1.82e+05 1.83e+05     0.881X     0.886X     0.882X
-//            ndv  100000k fpp   10.0%           4.08e+04 4.49e+04 5.44e+04     0.201X     0.219X     0.262X
-//            ndv  100000k fpp    1.0%           3.94e+04  4.4e+04 5.04e+04     0.194X     0.214X     0.242X
-//            ndv  100000k fpp    0.1%           4.08e+04 4.48e+04 5.68e+04     0.201X     0.218X     0.273X
+//            ndv      10k fpp   10.0%           1.17e+05 1.23e+05 1.25e+05         1X         1X         1X
+//            ndv      10k fpp    1.0%           1.17e+05 1.24e+05 1.25e+05         1X         1X         1X
+//            ndv      10k fpp    0.1%            1.2e+05 1.23e+05 1.24e+05      1.02X     0.996X     0.991X
+//            ndv    1000k fpp   10.0%            1.1e+05 1.18e+05  1.2e+05     0.944X     0.959X      0.96X
+//            ndv    1000k fpp    1.0%           1.11e+05 1.16e+05 1.17e+05     0.954X     0.938X     0.934X
+//            ndv    1000k fpp    0.1%           9.73e+04 1.16e+05 1.17e+05     0.834X     0.937X     0.936X
+//            ndv  100000k fpp   10.0%           2.96e+04 4.19e+04 5.44e+04     0.254X      0.34X     0.436X
+//            ndv  100000k fpp    1.0%           2.92e+04 3.81e+04 4.89e+04      0.25X     0.308X     0.391X
+//            ndv  100000k fpp    0.1%           2.44e+04 3.28e+04 4.31e+04     0.209X     0.266X     0.345X
 //
 // find:                      Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//    present ndv      10k fpp   10.0%           2.48e+05 2.51e+05 2.53e+05         1X         1X         1X
-//    absent  ndv      10k fpp   10.0%           2.47e+05 2.52e+05 2.55e+05     0.995X         1X      1.01X
-//    present ndv      10k fpp    1.0%           2.49e+05 2.52e+05 2.55e+05         1X      1.01X      1.01X
-//    absent  ndv      10k fpp    1.0%           2.47e+05 2.53e+05 2.56e+05     0.997X      1.01X      1.01X
-//    present ndv      10k fpp    0.1%           2.49e+05 2.53e+05 2.54e+05         1X      1.01X      1.01X
-//    absent  ndv      10k fpp    0.1%           2.47e+05 2.53e+05 2.56e+05     0.997X      1.01X      1.01X
-//    present ndv    1000k fpp   10.0%           1.98e+05 2.04e+05 2.06e+05       0.8X     0.814X     0.812X
-//    absent  ndv    1000k fpp   10.0%           2.01e+05 2.07e+05  2.1e+05     0.808X     0.826X     0.829X
-//    present ndv    1000k fpp    1.0%           1.83e+05 1.95e+05 2.02e+05     0.737X      0.78X     0.798X
-//    absent  ndv    1000k fpp    1.0%           2.01e+05 2.04e+05 2.08e+05     0.808X     0.815X      0.82X
-//    present ndv    1000k fpp    0.1%           1.96e+05 2.01e+05 2.03e+05     0.788X       0.8X     0.801X
-//    absent  ndv    1000k fpp    0.1%              2e+05 2.05e+05 2.07e+05     0.808X     0.817X     0.818X
-//    present ndv  100000k fpp   10.0%            4.6e+04 5.09e+04 6.08e+04     0.185X     0.203X      0.24X
-//    absent  ndv  100000k fpp   10.0%           4.11e+04 4.36e+04 4.53e+04     0.166X     0.174X     0.179X
-//    present ndv  100000k fpp    1.0%           4.55e+04 4.96e+04 6.19e+04     0.184X     0.198X     0.245X
-//    absent  ndv  100000k fpp    1.0%           3.83e+04 4.15e+04 4.69e+04     0.154X     0.166X     0.186X
-//    present ndv  100000k fpp    0.1%           4.73e+04 5.43e+04 6.58e+04     0.191X     0.217X      0.26X
-//    absent  ndv  100000k fpp    0.1%           3.77e+04 4.07e+04 4.37e+04     0.152X     0.163X     0.173X
+//    present ndv      10k fpp   10.0%           1.16e+05 1.17e+05 1.18e+05         1X         1X         1X
+//    absent  ndv      10k fpp   10.0%           1.16e+05 1.17e+05 1.18e+05         1X         1X     0.998X
+//    present ndv      10k fpp    1.0%           1.15e+05 1.17e+05 1.18e+05     0.988X         1X     0.999X
+//    absent  ndv      10k fpp    1.0%           1.14e+05 1.17e+05 1.19e+05     0.978X         1X         1X
+//    present ndv      10k fpp    0.1%           1.09e+05 1.17e+05 1.18e+05     0.939X         1X         1X
+//    absent  ndv      10k fpp    0.1%           1.13e+05 1.17e+05 1.18e+05      0.97X         1X         1X
+//    present ndv    1000k fpp   10.0%           1.09e+05 1.13e+05 1.15e+05     0.942X     0.968X      0.97X
+//    absent  ndv    1000k fpp   10.0%           1.09e+05 1.15e+05 1.16e+05     0.937X     0.982X     0.982X
+//    present ndv    1000k fpp    1.0%           9.44e+04 1.12e+05 1.13e+05     0.814X     0.952X     0.951X
+//    absent  ndv    1000k fpp    1.0%           1.02e+05 1.14e+05 1.15e+05     0.877X     0.973X     0.972X
+//    present ndv    1000k fpp    0.1%           1.01e+05 1.11e+05 1.12e+05     0.868X     0.951X     0.949X
+//    absent  ndv    1000k fpp    0.1%           1.08e+05 1.14e+05 1.15e+05     0.927X     0.975X     0.975X
+//    present ndv  100000k fpp   10.0%           3.18e+04 3.94e+04 5.18e+04     0.274X     0.336X     0.437X
+//    absent  ndv  100000k fpp   10.0%           2.74e+04 3.07e+04 3.49e+04     0.236X     0.262X     0.294X
+//    present ndv  100000k fpp    1.0%           3.07e+04 4.29e+04 5.51e+04     0.265X     0.366X     0.465X
+//    absent  ndv  100000k fpp    1.0%           2.67e+04  2.9e+04 3.25e+04      0.23X     0.248X     0.274X
+//    present ndv  100000k fpp    0.1%           2.78e+04 3.88e+04  4.9e+04      0.24X     0.331X     0.413X
+//    absent  ndv  100000k fpp    0.1%           2.44e+04 2.84e+04 3.02e+04     0.211X     0.242X     0.255X
 //
-// Without AVX2:
+// union:                     Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//            ndv      10k fpp   10.0%           1.81e+04 1.84e+04 1.86e+04         1X         1X         1X
+//            ndv      10k fpp    1.0%           8.25e+03 8.39e+03 8.47e+03     0.455X     0.455X     0.455X
+//            ndv      10k fpp    0.1%           4.02e+03 4.31e+03 4.35e+03     0.222X     0.234X     0.234X
+//            ndv    1000k fpp   10.0%                105      111      112   0.00577X   0.00603X   0.00602X
+//            ndv    1000k fpp    1.0%               45.9     46.4     46.9   0.00253X   0.00252X   0.00252X
+//            ndv    1000k fpp    0.1%               46.2     46.6     46.9   0.00255X   0.00253X   0.00252X
+//            ndv  100000k fpp   10.0%                0.2      0.2      0.2   1.1e-05X  1.08e-05X  1.07e-05X
+//            ndv  100000k fpp    1.0%                0.2      0.2      0.2   1.1e-05X  1.08e-05X  1.07e-05X
+//            ndv  100000k fpp    0.1%              0.133    0.143    0.145  7.35e-06X  7.75e-06X  7.79e-06X
+//
+//
+// Without AVX or AVX2:
 //
 // insert:                    Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           1.25e+05 1.27e+05 1.28e+05         1X         1X         1X
-//            ndv      10k fpp    1.0%           1.27e+05 1.29e+05  1.3e+05      1.01X      1.02X      1.02X
-//            ndv      10k fpp    0.1%           1.26e+05 1.28e+05  1.3e+05         1X      1.01X      1.01X
-//            ndv    1000k fpp   10.0%           1.23e+05 1.25e+05 1.26e+05     0.977X     0.981X     0.985X
-//            ndv    1000k fpp    1.0%           1.16e+05 1.22e+05 1.23e+05     0.925X     0.958X     0.958X
-//            ndv    1000k fpp    0.1%           1.16e+05 1.22e+05 1.23e+05     0.928X     0.958X     0.957X
-//            ndv  100000k fpp   10.0%           3.77e+04 4.06e+04 5.62e+04     0.301X     0.319X     0.438X
-//            ndv  100000k fpp    1.0%           3.71e+04 4.06e+04 5.45e+04     0.296X      0.32X     0.425X
-//            ndv  100000k fpp    0.1%           3.37e+04 3.68e+04 5.15e+04     0.269X      0.29X     0.401X
+//            ndv      10k fpp   10.0%           9.27e+04 9.33e+04  9.4e+04         1X         1X         1X
+//            ndv      10k fpp    1.0%           9.43e+04 9.49e+04 9.61e+04      1.02X      1.02X      1.02X
+//            ndv      10k fpp    0.1%           9.36e+04  9.5e+04 9.58e+04      1.01X      1.02X      1.02X
+//            ndv    1000k fpp   10.0%            8.4e+04 9.49e+04 9.61e+04     0.906X      1.02X      1.02X
+//            ndv    1000k fpp    1.0%           7.64e+04 9.34e+04 9.45e+04     0.824X         1X      1.01X
+//            ndv    1000k fpp    0.1%           8.24e+04 9.34e+04 9.44e+04     0.888X         1X         1X
+//            ndv  100000k fpp   10.0%           3.22e+04    4e+04 5.03e+04     0.347X     0.429X     0.535X
+//            ndv  100000k fpp    1.0%           2.77e+04  3.6e+04  4.8e+04     0.298X     0.386X      0.51X
+//            ndv  100000k fpp    0.1%           2.54e+04 2.93e+04 4.32e+04     0.274X     0.314X      0.46X
 //
 // find:                      Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//    present ndv      10k fpp   10.0%            1.6e+05 1.64e+05 1.66e+05         1X         1X         1X
-//    absent  ndv      10k fpp   10.0%           1.11e+05 1.14e+05 1.15e+05     0.696X     0.697X     0.695X
-//    present ndv      10k fpp    1.0%           1.57e+05 1.63e+05 1.64e+05     0.982X     0.994X     0.989X
-//    absent  ndv      10k fpp    1.0%            1.3e+05 1.33e+05 1.35e+05     0.814X     0.813X     0.812X
-//    present ndv      10k fpp    0.1%           1.55e+05 1.58e+05 1.61e+05     0.967X     0.968X     0.969X
-//    absent  ndv      10k fpp    0.1%           2.26e+05 2.29e+05 2.31e+05      1.41X       1.4X       1.4X
-//    present ndv    1000k fpp   10.0%           1.21e+05 1.23e+05 1.25e+05     0.758X     0.753X     0.756X
-//    absent  ndv    1000k fpp   10.0%            7.6e+04 7.72e+04 7.81e+04     0.475X     0.472X     0.471X
-//    present ndv    1000k fpp    1.0%           1.23e+05 1.27e+05 1.28e+05     0.771X     0.773X      0.77X
-//    absent  ndv    1000k fpp    1.0%           1.19e+05 1.21e+05 1.22e+05     0.744X     0.739X     0.738X
-//    present ndv    1000k fpp    0.1%           1.17e+05 1.18e+05  1.2e+05     0.731X     0.724X     0.723X
-//    absent  ndv    1000k fpp    0.1%           1.13e+05 1.16e+05 1.17e+05     0.707X     0.706X     0.705X
-//    present ndv  100000k fpp   10.0%           3.42e+04 3.63e+04  3.9e+04     0.214X     0.222X     0.235X
-//    absent  ndv  100000k fpp   10.0%            3.6e+04 3.77e+04 3.82e+04     0.225X      0.23X      0.23X
-//    present ndv  100000k fpp    1.0%           3.18e+04 3.42e+04 3.57e+04     0.199X     0.209X     0.216X
-//    absent  ndv  100000k fpp    1.0%           3.63e+04 3.73e+04 3.79e+04     0.227X     0.228X     0.229X
-//    present ndv  100000k fpp    0.1%           2.89e+04  3.2e+04 3.33e+04      0.18X     0.196X     0.201X
-//    absent  ndv  100000k fpp    0.1%           4.56e+04 4.78e+04 4.86e+04     0.285X     0.292X     0.293X
+//    present ndv      10k fpp   10.0%            1.3e+05 1.31e+05 1.33e+05         1X         1X         1X
+//    absent  ndv      10k fpp   10.0%           8.74e+04 8.83e+04 8.92e+04     0.674X     0.673X     0.671X
+//    present ndv      10k fpp    1.0%           1.25e+05  1.3e+05 1.31e+05      0.96X     0.991X     0.988X
+//    absent  ndv      10k fpp    1.0%           1.04e+05 1.06e+05 1.07e+05     0.805X     0.809X     0.807X
+//    present ndv      10k fpp    0.1%           1.28e+05  1.3e+05 1.31e+05     0.986X     0.988X     0.984X
+//    absent  ndv      10k fpp    0.1%           1.69e+05 1.72e+05 1.74e+05       1.3X      1.31X      1.31X
+//    present ndv    1000k fpp   10.0%           9.33e+04  9.6e+04 9.69e+04     0.719X     0.732X     0.729X
+//    absent  ndv    1000k fpp   10.0%           5.99e+04 6.07e+04 6.12e+04     0.462X     0.462X     0.461X
+//    present ndv    1000k fpp    1.0%           9.48e+04 1.01e+05 1.02e+05     0.731X     0.768X     0.768X
+//    absent  ndv    1000k fpp    1.0%           9.49e+04 9.67e+04 9.74e+04     0.731X     0.737X     0.734X
+//    present ndv    1000k fpp    0.1%           8.46e+04  9.3e+04 9.41e+04     0.652X     0.709X     0.709X
+//    absent  ndv    1000k fpp    0.1%           9.05e+04 9.18e+04 9.28e+04     0.697X       0.7X     0.699X
+//    present ndv  100000k fpp   10.0%            2.6e+04 2.88e+04 3.11e+04     0.201X      0.22X     0.235X
+//    absent  ndv  100000k fpp   10.0%           2.88e+04 2.99e+04 3.08e+04     0.222X     0.228X     0.232X
+//    present ndv  100000k fpp    1.0%           2.34e+04 2.76e+04 2.91e+04      0.18X      0.21X     0.219X
+//    absent  ndv  100000k fpp    1.0%           2.86e+04 2.97e+04 3.03e+04      0.22X     0.227X     0.228X
+//    present ndv  100000k fpp    0.1%           2.34e+04 2.65e+04 2.81e+04      0.18X     0.202X     0.211X
+//    absent  ndv  100000k fpp    0.1%           3.73e+04 3.85e+04 3.91e+04     0.287X     0.293X     0.295X
+//
+// union:                     Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//            ndv      10k fpp   10.0%           3.06e+03  3.1e+03 3.12e+03         1X         1X         1X
+//            ndv      10k fpp    1.0%           1.51e+03 1.55e+03 1.57e+03     0.493X     0.502X     0.503X
+//            ndv      10k fpp    0.1%                748      775      782     0.244X      0.25X     0.251X
+//            ndv    1000k fpp   10.0%               19.6       20     20.2    0.0064X   0.00646X   0.00647X
+//            ndv    1000k fpp    1.0%               9.41       10     10.1   0.00307X   0.00324X   0.00323X
+//            ndv    1000k fpp    0.1%                9.9       10     10.1   0.00323X   0.00324X   0.00323X
+//            ndv  100000k fpp   10.0%             0.0671   0.0714   0.0725  2.19e-05X   2.3e-05X  2.32e-05X
+//            ndv  100000k fpp    1.0%             0.0676   0.0709   0.0719  2.21e-05X  2.29e-05X  2.31e-05X
+//            ndv  100000k fpp    0.1%             0.0338    0.035   0.0356   1.1e-05X  1.13e-05X  1.14e-05X
 
 // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
 // produced by rand().
@@ -221,6 +249,27 @@ void Absent(int batch_size, void* data) {
 
 }  // namespace find
 
+// Benchmark or
+namespace either {
+
+struct TestData {
+  explicit TestData(int log_heap_size) {
+    BloomFilter bf(log_heap_size);
+    BloomFilter::ToThrift(&bf, &tbf);
+  }
+
+  TBloomFilter tbf;
+};
+
+void Benchmark(int batch_size, void* data) {
+  TestData* d = reinterpret_cast<TestData*>(data);
+  for (int i = 0; i < batch_size; ++i) {
+    BloomFilter::Or(d->tbf, &d->tbf);
+  }
+}
+
+} // namespace either
+
 void RunBenchmarks() {
 
   char name[120];
@@ -254,6 +303,20 @@ void RunBenchmarks() {
     }
     cout << suite.Measure() << endl;
   }
+
+  {
+    Benchmark suite("union");
+    vector<unique_ptr<either::TestData> > testdata;
+    for (int ndv = 10000; ndv <= 100 * 1000 * 1000; ndv *= 100) {
+      for (double fpp = 0.1; fpp >= 0.001; fpp /= 10) {
+        testdata.emplace_back(
+            new either::TestData(BloomFilter::MinLogSpace(ndv, fpp)));
+        snprintf(name, sizeof(name), "ndv %7dk fpp %6.1f%%", ndv/1000, fpp*100);
+        suite.AddBenchmark(name, either::Benchmark, testdata.back().get());
+      }
+    }
+    cout << suite.Measure() << endl;
+  }
 }
 
 int main(int argc, char **argv) {
@@ -277,7 +340,8 @@ int main(int argc, char **argv) {
 
   cout << "With AVX2:" << endl << endl;
   RunBenchmarks();
-  cout << endl << "Without AVX2:" << endl << endl;
-  CpuInfo::TempDisable t(CpuInfo::AVX2);
+  cout << endl << "Without AVX or AVX2:" << endl << endl;
+  CpuInfo::TempDisable t1(CpuInfo::AVX);
+  CpuInfo::TempDisable t2(CpuInfo::AVX2);
   RunBenchmarks();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/testutil/mem-util.h
----------------------------------------------------------------------
diff --git a/be/src/testutil/mem-util.h b/be/src/testutil/mem-util.h
index 78b7b48..b4ce9ea 100644
--- a/be/src/testutil/mem-util.h
+++ b/be/src/testutil/mem-util.h
@@ -21,6 +21,8 @@
 #include <cstdint>
 #include <cstdlib>
 
+#include <glog/logging.h>
+
 #include "gutil/macros.h"
 
 namespace impala {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/bloom-filter.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.cc b/be/src/util/bloom-filter.cc
index 6fd53f5..2aadc05 100644
--- a/be/src/util/bloom-filter.cc
+++ b/be/src/util/bloom-filter.cc
@@ -154,6 +154,21 @@ bool BloomFilter::BucketFind(
   return true;
 }
 
+namespace {
+// Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' using AVX
+// instructions. 'n' must be a multiple of 32.
+void __attribute__((target("avx"))) OrEqualArrayAvx(size_t n, const char* in, char* out) {
+  constexpr size_t REGISTER_SIZE = sizeof(__m256d);
+  DCHECK_EQ(n % REGISTER_SIZE, 0) << "Invalid Bloom Filter directory size";
+  const double* simd_in = reinterpret_cast<const double*>(in);
+  double* simd_out = reinterpret_cast<double*>(out);
+  const size_t simd_size = n / REGISTER_SIZE;
+  for (size_t i = 0; i < simd_size; i += REGISTER_SIZE / sizeof(simd_in[0])) {
+    _mm256_storeu_pd(simd_out + i,
+        _mm256_or_pd(_mm256_loadu_pd(simd_out + i), _mm256_loadu_pd(simd_in + i)));
+  }
+}
+} //namespace
 
 void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) {
   DCHECK(out != NULL);
@@ -163,8 +178,29 @@ void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) {
     out->directory.resize(0);
     return;
   }
-
-  for (int i = 0; i < in.directory.size(); ++i) out->directory[i] |= in.directory[i];
+  // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
+  // written in a way that is very friendly to auto-vectorization. Instead, we manually
+  // vectorize, increasing the speed by up to 184x.
+  //
+  // TODO: Tune gcc flags to auto-vectorize the trivial loop instead of hand-vectorizing
+  // it. This might not be possible.
+  if (CpuInfo::IsSupported(CpuInfo::AVX)) {
+    OrEqualArrayAvx(in.directory.size(), &in.directory[0], &out->directory[0]);
+  } else {
+    const __m128i* simd_in = reinterpret_cast<const __m128i*>(&in.directory[0]);
+    __m128i* simd_out = reinterpret_cast<__m128i*>(&out->directory[0]);
+    const size_t simd_size =
+        (in.directory.size() * sizeof(in.directory[0])) / sizeof(simd_in[0]);
+    // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(__m128i)
+    // == 16, we can do two _mm_or_si128's in each iteration without checking array
+    // bounds.
+    for (size_t i = 0; i < simd_size; i += 2) {
+      _mm_storeu_si128(simd_out + i,
+          _mm_or_si128(_mm_loadu_si128(simd_out + i), _mm_loadu_si128(simd_in + i)));
+      _mm_storeu_si128(simd_out + i + 1, _mm_or_si128(_mm_loadu_si128(simd_out + i + 1),
+                                             _mm_loadu_si128(simd_in + i + 1)));
+    }
+  }
 }
 
 // The following three methods are derived from
@@ -187,14 +223,13 @@ int BloomFilter::MinLogSpace(const size_t ndv, const double fpp) {
   const double m = -k * ndv / log(1 - pow(fpp, 1.0 / k));
 
   // Handle case where ndv == 1 => ceil(log2(m/8)) < 0.
-  return max(0, static_cast<int>(ceil(log2(m/8))));
+  return max(0, static_cast<int>(ceil(log2(m / 8))));
 }
 
 double BloomFilter::FalsePositiveProb(const size_t ndv, const int log_heap_space) {
-  return pow(
-      1 - exp((-1.0 * static_cast<double>(BUCKET_WORDS) * static_cast<double>(ndv)) /
-              static_cast<double>(1ull << (log_heap_space + 3))),
+  return pow(1 - exp((-1.0 * static_cast<double>(BUCKET_WORDS) * static_cast<double>(ndv))
+                     / static_cast<double>(1ull << (log_heap_space + 3))),
       BUCKET_WORDS);
 }
 
-}  // namespace impala
+} // namespace impala

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/cpu-info.cc
----------------------------------------------------------------------
diff --git a/be/src/util/cpu-info.cc b/be/src/util/cpu-info.cc
index 532c9dd..6329ca8 100644
--- a/be/src/util/cpu-info.cc
+++ b/be/src/util/cpu-info.cc
@@ -77,6 +77,7 @@ static struct {
   { "sse4_1", CpuInfo::SSE4_1 },
   { "sse4_2", CpuInfo::SSE4_2 },
   { "popcnt", CpuInfo::POPCNT },
+  { "avx",    CpuInfo::AVX },
   { "avx2",   CpuInfo::AVX2 },
 };
 static const long num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/cpu-info.h
----------------------------------------------------------------------
diff --git a/be/src/util/cpu-info.h b/be/src/util/cpu-info.h
index cb577c2..868d2dd 100644
--- a/be/src/util/cpu-info.h
+++ b/be/src/util/cpu-info.h
@@ -36,7 +36,8 @@ class CpuInfo {
   static const int64_t SSE4_1  = (1 << 2);
   static const int64_t SSE4_2  = (1 << 3);
   static const int64_t POPCNT  = (1 << 4);
-  static const int64_t AVX2    = (1 << 5);
+  static const int64_t AVX     = (1 << 5);
+  static const int64_t AVX2    = (1 << 6);
 
   /// Cache enums for L1 (data), L2 and L3
   enum CacheLevel {