You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by he...@apache.org on 2016/10/24 22:48:03 UTC

[1/2] incubator-impala git commit: Remove unused Bitmap code.

Repository: incubator-impala
Updated Branches:
  refs/heads/master ff6b450ad -> 61fcb4897


Remove unused Bitmap code.

These methods and code paths have been made obsolete by the switch to
Bloom filters.

Change-Id: I95fcaaa40243999800c2ec2ead5b3479d66a63e7
Reviewed-on: http://gerrit.cloudera.org:8080/4801
Reviewed-by: Henry Robinson <he...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0fbb5b7e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0fbb5b7e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0fbb5b7e

Branch: refs/heads/master
Commit: 0fbb5b7e71e55346c8b97ec143854dba0088f124
Parents: ff6b450
Author: Jim Apple <jb...@cloudera.com>
Authored: Sat Oct 22 10:42:51 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Mon Oct 24 17:53:33 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/bitmap-benchmark.cc |  6 +--
 be/src/exec/hash-table.h              |  4 +-
 be/src/exec/nested-loop-join-node.cc  | 12 ++---
 be/src/util/bitmap-test.cc            | 82 +++++-------------------------
 be/src/util/bitmap.cc                 |  8 +--
 be/src/util/bitmap.h                  | 42 ++-------------
 6 files changed, 32 insertions(+), 122 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0fbb5b7e/be/src/benchmarks/bitmap-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bitmap-benchmark.cc b/be/src/benchmarks/bitmap-benchmark.cc
index 26c51d8..5c1c9e6 100644
--- a/be/src/benchmarks/bitmap-benchmark.cc
+++ b/be/src/benchmarks/bitmap-benchmark.cc
@@ -110,7 +110,7 @@ struct TestData {
 void Benchmark(int batch_size, void* data) {
   TestData* d = reinterpret_cast<TestData*>(data);
   for (int i = 0; i < batch_size; ++i) {
-    d->bm.Set<true>(d->data[i & (d->data.size() - 1)], true);
+    d->bm.Set(d->data[i & (d->data.size() - 1)] % d->bm.num_bits(), true);
   }
 }
 
@@ -122,7 +122,7 @@ struct TestData {
   TestData(int64_t size)
     : bm(size), data (1ull << 20) {
     for (size_t i = 0; i < size/2; ++i) {
-      bm.Set<true>(MakeNonNegativeRand(), true);
+      bm.Set(MakeNonNegativeRand() % size, true);
     }
     for (size_t i = 0; i < data.size(); ++i) {
       data[i] = MakeNonNegativeRand();
@@ -138,7 +138,7 @@ struct TestData {
 void Benchmark(int batch_size, void* data) {
   TestData* d = reinterpret_cast<TestData*>(data);
   for (int i = 0; i < batch_size; ++i) {
-    d->result += d->bm.Get<true>(d->data[i & (d->data.size() - 1)]);
+    d->result += d->bm.Get(d->data[i & (d->data.size() - 1)] % d->bm.num_bits());
   }
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0fbb5b7e/be/src/exec/hash-table.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.h b/be/src/exec/hash-table.h
index 404b294..2ebc22f 100644
--- a/be/src/exec/hash-table.h
+++ b/be/src/exec/hash-table.h
@@ -290,11 +290,11 @@ class HashTableCtx {
 
     /// Returns true if the current row is null but nulls are not considered in the current
     /// phase (build or probe).
-    bool ALWAYS_INLINE IsRowNull() const { return null_bitmap_.Get<false>(CurIdx()); }
+    bool ALWAYS_INLINE IsRowNull() const { return null_bitmap_.Get(CurIdx()); }
 
     /// Record in a bitmap that the current row is null but nulls are not considered in
     /// the current phase (build or probe).
-    void ALWAYS_INLINE SetRowNull() { null_bitmap_.Set<false>(CurIdx(), true); }
+    void ALWAYS_INLINE SetRowNull() { null_bitmap_.Set(CurIdx(), true); }
 
     /// Returns the hash values of the current row.
     uint32_t ALWAYS_INLINE CurExprValuesHash() const { return *cur_expr_values_hash_; }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0fbb5b7e/be/src/exec/nested-loop-join-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/nested-loop-join-node.cc b/be/src/exec/nested-loop-join-node.cc
index 0a115c6..6c75797 100644
--- a/be/src/exec/nested-loop-join-node.cc
+++ b/be/src/exec/nested-loop-join-node.cc
@@ -410,7 +410,7 @@ Status NestedLoopJoinNode::GetNextRightSemiJoin(RuntimeState* state,
       }
 
       // Check if we already have a match for the build row.
-      if (matching_build_rows_->Get<false>(current_build_row_idx_)) {
+      if (matching_build_rows_->Get(current_build_row_idx_)) {
         build_row_iterator_.Next();
         ++current_build_row_idx_;
         continue;
@@ -424,7 +424,7 @@ Status NestedLoopJoinNode::GetNextRightSemiJoin(RuntimeState* state,
         continue;
       }
       TupleRow* output_row = output_batch->GetRow(output_batch->AddRow());
-      matching_build_rows_->Set<false>(current_build_row_idx_, true);
+      matching_build_rows_->Set(current_build_row_idx_, true);
       output_batch->CopyRow(build_row_iterator_.GetRow(), output_row);
       build_row_iterator_.Next();
       ++current_build_row_idx_;
@@ -461,7 +461,7 @@ Status NestedLoopJoinNode::GetNextRightAntiJoin(RuntimeState* state,
         RETURN_IF_ERROR(QueryMaintenance(state));
       }
 
-      if (matching_build_rows_->Get<false>(current_build_row_idx_)) {
+      if (matching_build_rows_->Get(current_build_row_idx_)) {
         build_row_iterator_.Next();
         ++current_build_row_idx_;
         continue;
@@ -469,7 +469,7 @@ Status NestedLoopJoinNode::GetNextRightAntiJoin(RuntimeState* state,
       CreateOutputRow(semi_join_staging_row_, current_probe_row_,
           build_row_iterator_.GetRow());
       if (EvalConjuncts(join_conjunct_ctxs, num_join_ctxs, semi_join_staging_row_)) {
-        matching_build_rows_->Set<false>(current_build_row_idx_, true);
+        matching_build_rows_->Set(current_build_row_idx_, true);
       }
       build_row_iterator_.Next();
       ++current_build_row_idx_;
@@ -548,7 +548,7 @@ Status NestedLoopJoinNode::ProcessUnmatchedBuildRows(
       RETURN_IF_ERROR(QueryMaintenance(state));
     }
 
-    if (matching_build_rows_->Get<false>(current_build_row_idx_)) {
+    if (matching_build_rows_->Get(current_build_row_idx_)) {
       build_row_iterator_.Next();
       ++current_build_row_idx_;
       continue;
@@ -612,7 +612,7 @@ Status NestedLoopJoinNode::FindBuildMatches(
     if (!EvalConjuncts(join_conjunct_ctxs, num_join_ctxs, output_row)) continue;
     matched_probe_ = true;
     if (matching_build_rows_ != NULL) {
-      matching_build_rows_->Set<false>(current_build_row_idx_ - 1, true);
+      matching_build_rows_->Set(current_build_row_idx_ - 1, true);
     }
     if (!EvalConjuncts(conjunct_ctxs, num_ctxs, output_row)) continue;
     VLOG_ROW << "match row: " << PrintRow(output_row, row_desc());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0fbb5b7e/be/src/util/bitmap-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bitmap-test.cc b/be/src/util/bitmap-test.cc
index 238b5fc..c0a3899 100644
--- a/be/src/util/bitmap-test.cc
+++ b/be/src/util/bitmap-test.cc
@@ -30,14 +30,14 @@ namespace impala {
 
 void CreateRandomBitmap(Bitmap* bitmap) {
   for (int64_t i = 0; i < bitmap->num_bits(); ++i) {
-    bitmap->Set<false>(i, rand() % 2 == 0);
+    bitmap->Set(i, rand() % 2 == 0);
   }
 }
 
 // Returns true if all the bits in the bitmap are equal to 'value'.
 bool CheckAll(const Bitmap& bitmap, const bool value) {
   for (int64_t i = 0; i < bitmap.num_bits(); ++i) {
-    if (bitmap.Get<false>(i) != value) return false;
+    if (bitmap.Get(i) != value) return false;
   }
   return true;
 }
@@ -70,24 +70,6 @@ TEST(Bitmap, SetAllTest) {
   EXPECT_TRUE(CheckAll(bm, false));
 }
 
-TEST(Bitmap, AndTest) {
-  Bitmap bm(1024);
-  CreateRandomBitmap(&bm);
-  Bitmap bm_zeros(1024);
-  bm_zeros.SetAllBits(false);
-  bm.And(&bm_zeros);
-  EXPECT_TRUE(CheckAll(bm, false));
-}
-
-TEST(Bitmap, OrTest) {
-  Bitmap bm(1024);
-  CreateRandomBitmap(&bm);
-  Bitmap bm_ones(1024);
-  bm_ones.SetAllBits(true);
-  bm.Or(&bm_ones);
-  EXPECT_TRUE(CheckAll(bm, true));
-}
-
 // Regression test for IMPALA-2155.
 TEST(Bitmap, SetGetTest) {
   Bitmap bm(1024);
@@ -96,36 +78,16 @@ TEST(Bitmap, SetGetTest) {
   // to 0 and 1.
   for (int64_t bit_idx = 0; bit_idx < 63; ++bit_idx) {
     for (int64_t i = 0; i < 4; ++i) {
-      bm.Set<false>((1 << (6 + i)) + bit_idx, (i + bit_idx) % 2 == 0);
+      bm.Set((1 << (6 + i)) + bit_idx, (i + bit_idx) % 2 == 0);
     }
   }
   for (int64_t bit_idx = 0; bit_idx < 63; ++bit_idx) {
     for (int64_t i = 0; i < 4; ++i) {
-      EXPECT_EQ(bm.Get<false>((1 << (6 + i)) + bit_idx), (i + bit_idx) % 2 == 0);
+      EXPECT_EQ(bm.Get((1 << (6 + i)) + bit_idx), (i + bit_idx) % 2 == 0);
     }
   }
 }
 
-TEST(Bitmap, SetGetModTest) {
-  Bitmap bm(256);
-  bm.SetAllBits(false);
-  for (int64_t bit_idx = 0; bit_idx < 1024; ++bit_idx) {
-    bm.Set<true>(bit_idx, true);
-    EXPECT_TRUE(bm.Get<true>(bit_idx));
-    bm.Set<true>(bit_idx, false);
-    EXPECT_FALSE(bm.Get<true>(bit_idx));
-  }
-
-  bm.SetAllBits(false);
-  EXPECT_TRUE(CheckAll(bm, false));
-  for (int64_t bit_idx = 0; bit_idx < 1024; ++bit_idx) {
-    bm.Set<true>(bit_idx, bit_idx % 2 == 0);
-  }
-  for (int64_t bit_idx = 0; bit_idx < 1024; ++bit_idx) {
-    EXPECT_EQ(bm.Get<true>(bit_idx), bit_idx % 2 == 0);
-  }
-}
-
 /// Regression test for IMPALA-2307.
 TEST(Bitmap, OverflowTest) {
   Bitmap bm(64);
@@ -133,36 +95,20 @@ TEST(Bitmap, OverflowTest) {
   int64_t bit_idx = 45;
   int64_t ovr_idx = 13;
 
-  bm.Set<false>(bit_idx, true);
-  EXPECT_FALSE(bm.Get<false>(ovr_idx));
-  EXPECT_TRUE(bm.Get<false>(bit_idx));
-
-  bm.SetAllBits(false);
-  bm.Set<false>(ovr_idx, true);
-  EXPECT_FALSE(bm.Get<false>(bit_idx));
-  EXPECT_TRUE(bm.Get<false>(ovr_idx));
-
-  bm.SetAllBits(false);
-  bm.Set<false>(ovr_idx, true);
-  bm.Set<false>(bit_idx, false);
-  EXPECT_TRUE(bm.Get<false>(ovr_idx));
-  EXPECT_FALSE(bm.Get<false>(bit_idx));
-
-  bm.SetAllBits(false);
-  bm.Set<true>(bit_idx, true);
-  EXPECT_FALSE(bm.Get<true>(ovr_idx));
-  EXPECT_TRUE(bm.Get<true>(bit_idx));
+  bm.Set(bit_idx, true);
+  EXPECT_FALSE(bm.Get(ovr_idx));
+  EXPECT_TRUE(bm.Get(bit_idx));
 
   bm.SetAllBits(false);
-  bm.Set<true>(ovr_idx, true);
-  EXPECT_FALSE(bm.Get<true>(bit_idx));
-  EXPECT_TRUE(bm.Get<true>(ovr_idx));
+  bm.Set(ovr_idx, true);
+  EXPECT_FALSE(bm.Get(bit_idx));
+  EXPECT_TRUE(bm.Get(ovr_idx));
 
   bm.SetAllBits(false);
-  bm.Set<true>(ovr_idx, true);
-  bm.Set<true>(bit_idx, false);
-  EXPECT_TRUE(bm.Get<true>(ovr_idx));
-  EXPECT_FALSE(bm.Get<true>(bit_idx));
+  bm.Set(ovr_idx, true);
+  bm.Set(bit_idx, false);
+  EXPECT_TRUE(bm.Get(ovr_idx));
+  EXPECT_FALSE(bm.Get(bit_idx));
 }
 
 /// Test that bitmap memory usage calculation is correct.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0fbb5b7e/be/src/util/bitmap.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bitmap.cc b/be/src/util/bitmap.cc
index b1f54bc..504c919 100644
--- a/be/src/util/bitmap.cc
+++ b/be/src/util/bitmap.cc
@@ -23,21 +23,21 @@
 
 using namespace impala;
 
-string Bitmap::DebugString(bool print_bits) {
+string Bitmap::DebugString(bool print_bits) const {
   int64_t words = BitUtil::RoundUp(num_bits_, 64) / 64;
   stringstream ss;
   ss << "Size (" << num_bits_ << ") words (" << words << ") ";
   if (print_bits) {
     for (int i = 0; i < num_bits(); ++i) {
-      if (Get<false>(i)) {
+      if (Get(i)) {
         ss << "1";
       } else {
         ss << "0";
       }
     }
   } else {
-    for (vector<uint64_t>::iterator it = buffer_.begin(); it != buffer_.end(); ++it) {
-      ss << *it << ".";
+    for (auto v : buffer_) {
+      ss << v << ".";
     }
   }
   ss << endl;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0fbb5b7e/be/src/util/bitmap.h
----------------------------------------------------------------------
diff --git a/be/src/util/bitmap.h b/be/src/util/bitmap.h
index 1ff8050..5f02f60 100644
--- a/be/src/util/bitmap.h
+++ b/be/src/util/bitmap.h
@@ -37,10 +37,6 @@ class Bitmap {
     num_bits_ = num_bits;
   }
 
-  Bitmap(const uint64_t* from_buf, int64_t num_bits) {
-    SetFromBuffer(from_buf, num_bits);
-  }
-
   /// Resize bitmap and set all bits to zero.
   void Reset(int64_t num_bits) {
     DCHECK_GE(num_bits, 0);
@@ -49,30 +45,17 @@ class Bitmap {
     SetAllBits(false);
   }
 
-  void SetFromBuffer(const uint64_t* from_buf, int64_t num_bits) {
-    buffer_.resize(BitUtil::RoundUpNumi64(num_bits));
-    for (int i = 0; i < buffer_.size(); ++i) {
-      buffer_[i] = from_buf[i];
-    }
-    num_bits_ = num_bits;
-  }
-
   /// Compute memory usage of a bitmap, not including the Bitmap object itself.
   static int64_t MemUsage(int64_t num_bits) {
     DCHECK_GE(num_bits, 0);
     return BitUtil::RoundUpNumi64(num_bits) * sizeof(int64_t);
   }
 
-  static int64_t DefaultHashSeed() { return 1234; }
-
   /// Compute memory usage of this bitmap, not including the Bitmap object itself.
   int64_t MemUsage() const { return MemUsage(num_bits_); }
 
-  /// Sets the bit at 'bit_index' to v. If mod is true, this
-  /// function will first mod the bit_index by the bitmap size.
-  template<bool mod>
+  /// Sets the bit at 'bit_index' to v.
   void Set(int64_t bit_index, bool v) {
-    if (mod) bit_index &= (num_bits() - 1);
     int64_t word_index = bit_index >> NUM_OFFSET_BITS;
     bit_index &= BIT_INDEX_MASK;
     DCHECK_LT(word_index, buffer_.size());
@@ -83,33 +66,14 @@ class Bitmap {
     }
   }
 
-  /// Returns true if the bit at 'bit_index' is set. If mod is true, this
-  /// function will first mod the bit_index by the bitmap size.
-  template<bool mod>
+  /// Returns true if the bit at 'bit_index' is set.
   bool Get(int64_t bit_index) const {
-    if (mod) bit_index &= (num_bits() - 1);
     int64_t word_index = bit_index >> NUM_OFFSET_BITS;
     bit_index &= BIT_INDEX_MASK;
     DCHECK_LT(word_index, buffer_.size());
     return (buffer_[word_index] & (1LL << bit_index)) != 0;
   }
 
-  /// Bitwise ANDs the src bitmap into this one.
-  void And(const Bitmap* src) {
-    DCHECK_EQ(num_bits(), src->num_bits());
-    for (int i = 0; i < buffer_.size(); ++i) {
-      buffer_[i] &= src->buffer_[i];
-    }
-  }
-
-  /// Bitwise ORs the src bitmap into this one.
-  void Or(const Bitmap* src) {
-    DCHECK_EQ(num_bits(), src->num_bits());
-    for (int i = 0; i < buffer_.size(); ++i) {
-      buffer_[i] |= src->buffer_[i];
-    }
-  }
-
   void SetAllBits(bool b) {
     memset(&buffer_[0], 255 * b, buffer_.size() * sizeof(uint64_t));
   }
@@ -117,7 +81,7 @@ class Bitmap {
   int64_t num_bits() const { return num_bits_; }
 
   /// If 'print_bits' prints 0/1 per bit, otherwise it prints the int64_t value.
-  std::string DebugString(bool print_bits);
+  std::string DebugString(bool print_bits) const;
 
  private:
   std::vector<uint64_t> buffer_;

[2/2] incubator-impala git commit: IMPALA-4300: Speed up BloomFilter::Or with SIMD

Posted by he...@apache.org.

IMPALA-4300: Speed up BloomFilter::Or with SIMD

The previous code was not written in a way that GCC could
auto-vectorize it. Manually vectorizing speeds up BloomFilter::Or by
up to 184x.

Change-Id: I840799d9cfb81285c796e2abfe2029bb869b0f67
Reviewed-on: http://gerrit.cloudera.org:8080/4813
Reviewed-by: Jim Apple <jb...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/61fcb489
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/61fcb489
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/61fcb489

Branch: refs/heads/master
Commit: 61fcb489745f3f0b3f1abbf9fbf666a29a6363de
Parents: 0fbb5b7
Author: Jim Apple <jb...@cloudera.com>
Authored: Fri Oct 21 07:46:42 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Mon Oct 24 18:07:25 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/bloom-filter-benchmark.cc | 198 +++++++++++++++--------
 be/src/testutil/mem-util.h                  |   2 +
 be/src/util/bloom-filter.cc                 |  49 +++++-
 be/src/util/cpu-info.cc                     |   1 +
 be/src/util/cpu-info.h                      |   3 +-
 5 files changed, 178 insertions(+), 75 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/benchmarks/bloom-filter-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc b/be/src/benchmarks/bloom-filter-benchmark.cc
index 1aa0619..d9019c8 100644
--- a/be/src/benchmarks/bloom-filter-benchmark.cc
+++ b/be/src/benchmarks/bloom-filter-benchmark.cc
@@ -30,13 +30,14 @@
 using namespace std;
 using namespace impala;
 
-// Tests Bloom filter performance on four tasks:
+// Tests Bloom filter performance on:
 //
 // 1. Construct/destruct pairs
 // 2. Inserts
 // 3. Lookups when the item is present
 // 4. Lookups when the item is absent (this is theoretically faster than when the item is
 //    present in some Bloom filter variants)
+// 5. Unions
 //
 // As in bloom-filter.h, ndv refers to the number of unique items inserted into a filter
 // and fpp is the probability of false positives.
@@ -46,89 +47,116 @@ using namespace impala;
 // initialize:                Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           7.05e+03 7.27e+03 7.34e+03         1X         1X         1X
-//            ndv      10k fpp    1.0%           3.79e+03 3.93e+03 3.96e+03     0.538X     0.541X      0.54X
-//            ndv      10k fpp    0.1%           1.39e+03 1.42e+03 1.44e+03     0.198X     0.196X     0.196X
-//            ndv    1000k fpp   10.0%               4.62     4.78     4.81  0.000655X  0.000658X  0.000655X
-//            ndv    1000k fpp    1.0%               2.49     2.55      2.6  0.000354X  0.000351X  0.000354X
-//            ndv    1000k fpp    0.1%               2.45     2.55      2.6  0.000347X  0.000351X  0.000354X
-//            ndv  100000k fpp   10.0%              0.035   0.0358    0.037  4.96e-06X  4.93e-06X  5.04e-06X
-//            ndv  100000k fpp    1.0%             0.0347   0.0361   0.0372  4.93e-06X  4.96e-06X  5.06e-06X
-//            ndv  100000k fpp    0.1%             0.0176   0.0181   0.0186   2.5e-06X  2.49e-06X  2.53e-06X
+//            ndv      10k fpp   10.0%           5.89e+03 5.98e+03 6.03e+03         1X         1X         1X
+//            ndv      10k fpp    1.0%           3.22e+03 3.25e+03 3.27e+03     0.546X     0.543X     0.542X
+//            ndv      10k fpp    0.1%           1.13e+03 1.17e+03 1.18e+03     0.191X     0.195X     0.195X
+//            ndv    1000k fpp   10.0%               3.85     3.93     3.93  0.000654X  0.000657X  0.000652X
+//            ndv    1000k fpp    1.0%               2.04     2.12     2.12  0.000346X  0.000354X  0.000351X
+//            ndv    1000k fpp    0.1%               2.04     2.12     2.12  0.000346X  0.000354X  0.000351X
+//            ndv  100000k fpp   10.0%             0.0281    0.029   0.0294  4.77e-06X  4.85e-06X  4.87e-06X
+//            ndv  100000k fpp    1.0%             0.0284    0.029   0.0298  4.82e-06X  4.85e-06X  4.93e-06X
+//            ndv  100000k fpp    0.1%             0.0144   0.0147   0.0149  2.44e-06X  2.47e-06X  2.47e-06X
 //
 // With AVX2:
 //
 // insert:                    Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           2.03e+05 2.05e+05 2.08e+05         1X         1X         1X
-//            ndv      10k fpp    1.0%           2.03e+05 2.06e+05 2.08e+05     0.997X         1X         1X
-//            ndv      10k fpp    0.1%           2.03e+05 2.05e+05 2.07e+05     0.997X     0.998X     0.997X
-//            ndv    1000k fpp   10.0%           1.82e+05 1.87e+05 1.89e+05     0.896X      0.91X     0.907X
-//            ndv    1000k fpp    1.0%           1.49e+05 1.53e+05 1.56e+05     0.731X     0.747X      0.75X
-//            ndv    1000k fpp    0.1%           1.79e+05 1.82e+05 1.83e+05     0.881X     0.886X     0.882X
-//            ndv  100000k fpp   10.0%           4.08e+04 4.49e+04 5.44e+04     0.201X     0.219X     0.262X
-//            ndv  100000k fpp    1.0%           3.94e+04  4.4e+04 5.04e+04     0.194X     0.214X     0.242X
-//            ndv  100000k fpp    0.1%           4.08e+04 4.48e+04 5.68e+04     0.201X     0.218X     0.273X
+//            ndv      10k fpp   10.0%           1.17e+05 1.23e+05 1.25e+05         1X         1X         1X
+//            ndv      10k fpp    1.0%           1.17e+05 1.24e+05 1.25e+05         1X         1X         1X
+//            ndv      10k fpp    0.1%            1.2e+05 1.23e+05 1.24e+05      1.02X     0.996X     0.991X
+//            ndv    1000k fpp   10.0%            1.1e+05 1.18e+05  1.2e+05     0.944X     0.959X      0.96X
+//            ndv    1000k fpp    1.0%           1.11e+05 1.16e+05 1.17e+05     0.954X     0.938X     0.934X
+//            ndv    1000k fpp    0.1%           9.73e+04 1.16e+05 1.17e+05     0.834X     0.937X     0.936X
+//            ndv  100000k fpp   10.0%           2.96e+04 4.19e+04 5.44e+04     0.254X      0.34X     0.436X
+//            ndv  100000k fpp    1.0%           2.92e+04 3.81e+04 4.89e+04      0.25X     0.308X     0.391X
+//            ndv  100000k fpp    0.1%           2.44e+04 3.28e+04 4.31e+04     0.209X     0.266X     0.345X
 //
 // find:                      Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//    present ndv      10k fpp   10.0%           2.48e+05 2.51e+05 2.53e+05         1X         1X         1X
-//    absent  ndv      10k fpp   10.0%           2.47e+05 2.52e+05 2.55e+05     0.995X         1X      1.01X
-//    present ndv      10k fpp    1.0%           2.49e+05 2.52e+05 2.55e+05         1X      1.01X      1.01X
-//    absent  ndv      10k fpp    1.0%           2.47e+05 2.53e+05 2.56e+05     0.997X      1.01X      1.01X
-//    present ndv      10k fpp    0.1%           2.49e+05 2.53e+05 2.54e+05         1X      1.01X      1.01X
-//    absent  ndv      10k fpp    0.1%           2.47e+05 2.53e+05 2.56e+05     0.997X      1.01X      1.01X
-//    present ndv    1000k fpp   10.0%           1.98e+05 2.04e+05 2.06e+05       0.8X     0.814X     0.812X
-//    absent  ndv    1000k fpp   10.0%           2.01e+05 2.07e+05  2.1e+05     0.808X     0.826X     0.829X
-//    present ndv    1000k fpp    1.0%           1.83e+05 1.95e+05 2.02e+05     0.737X      0.78X     0.798X
-//    absent  ndv    1000k fpp    1.0%           2.01e+05 2.04e+05 2.08e+05     0.808X     0.815X      0.82X
-//    present ndv    1000k fpp    0.1%           1.96e+05 2.01e+05 2.03e+05     0.788X       0.8X     0.801X
-//    absent  ndv    1000k fpp    0.1%              2e+05 2.05e+05 2.07e+05     0.808X     0.817X     0.818X
-//    present ndv  100000k fpp   10.0%            4.6e+04 5.09e+04 6.08e+04     0.185X     0.203X      0.24X
-//    absent  ndv  100000k fpp   10.0%           4.11e+04 4.36e+04 4.53e+04     0.166X     0.174X     0.179X
-//    present ndv  100000k fpp    1.0%           4.55e+04 4.96e+04 6.19e+04     0.184X     0.198X     0.245X
-//    absent  ndv  100000k fpp    1.0%           3.83e+04 4.15e+04 4.69e+04     0.154X     0.166X     0.186X
-//    present ndv  100000k fpp    0.1%           4.73e+04 5.43e+04 6.58e+04     0.191X     0.217X      0.26X
-//    absent  ndv  100000k fpp    0.1%           3.77e+04 4.07e+04 4.37e+04     0.152X     0.163X     0.173X
+//    present ndv      10k fpp   10.0%           1.16e+05 1.17e+05 1.18e+05         1X         1X         1X
+//    absent  ndv      10k fpp   10.0%           1.16e+05 1.17e+05 1.18e+05         1X         1X     0.998X
+//    present ndv      10k fpp    1.0%           1.15e+05 1.17e+05 1.18e+05     0.988X         1X     0.999X
+//    absent  ndv      10k fpp    1.0%           1.14e+05 1.17e+05 1.19e+05     0.978X         1X         1X
+//    present ndv      10k fpp    0.1%           1.09e+05 1.17e+05 1.18e+05     0.939X         1X         1X
+//    absent  ndv      10k fpp    0.1%           1.13e+05 1.17e+05 1.18e+05      0.97X         1X         1X
+//    present ndv    1000k fpp   10.0%           1.09e+05 1.13e+05 1.15e+05     0.942X     0.968X      0.97X
+//    absent  ndv    1000k fpp   10.0%           1.09e+05 1.15e+05 1.16e+05     0.937X     0.982X     0.982X
+//    present ndv    1000k fpp    1.0%           9.44e+04 1.12e+05 1.13e+05     0.814X     0.952X     0.951X
+//    absent  ndv    1000k fpp    1.0%           1.02e+05 1.14e+05 1.15e+05     0.877X     0.973X     0.972X
+//    present ndv    1000k fpp    0.1%           1.01e+05 1.11e+05 1.12e+05     0.868X     0.951X     0.949X
+//    absent  ndv    1000k fpp    0.1%           1.08e+05 1.14e+05 1.15e+05     0.927X     0.975X     0.975X
+//    present ndv  100000k fpp   10.0%           3.18e+04 3.94e+04 5.18e+04     0.274X     0.336X     0.437X
+//    absent  ndv  100000k fpp   10.0%           2.74e+04 3.07e+04 3.49e+04     0.236X     0.262X     0.294X
+//    present ndv  100000k fpp    1.0%           3.07e+04 4.29e+04 5.51e+04     0.265X     0.366X     0.465X
+//    absent  ndv  100000k fpp    1.0%           2.67e+04  2.9e+04 3.25e+04      0.23X     0.248X     0.274X
+//    present ndv  100000k fpp    0.1%           2.78e+04 3.88e+04  4.9e+04      0.24X     0.331X     0.413X
+//    absent  ndv  100000k fpp    0.1%           2.44e+04 2.84e+04 3.02e+04     0.211X     0.242X     0.255X
 //
-// Without AVX2:
+// union:                     Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//            ndv      10k fpp   10.0%           1.81e+04 1.84e+04 1.86e+04         1X         1X         1X
+//            ndv      10k fpp    1.0%           8.25e+03 8.39e+03 8.47e+03     0.455X     0.455X     0.455X
+//            ndv      10k fpp    0.1%           4.02e+03 4.31e+03 4.35e+03     0.222X     0.234X     0.234X
+//            ndv    1000k fpp   10.0%                105      111      112   0.00577X   0.00603X   0.00602X
+//            ndv    1000k fpp    1.0%               45.9     46.4     46.9   0.00253X   0.00252X   0.00252X
+//            ndv    1000k fpp    0.1%               46.2     46.6     46.9   0.00255X   0.00253X   0.00252X
+//            ndv  100000k fpp   10.0%                0.2      0.2      0.2   1.1e-05X  1.08e-05X  1.07e-05X
+//            ndv  100000k fpp    1.0%                0.2      0.2      0.2   1.1e-05X  1.08e-05X  1.07e-05X
+//            ndv  100000k fpp    0.1%              0.133    0.143    0.145  7.35e-06X  7.75e-06X  7.79e-06X
+//
+//
+// Without AVX or AVX2:
 //
 // insert:                    Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           1.25e+05 1.27e+05 1.28e+05         1X         1X         1X
-//            ndv      10k fpp    1.0%           1.27e+05 1.29e+05  1.3e+05      1.01X      1.02X      1.02X
-//            ndv      10k fpp    0.1%           1.26e+05 1.28e+05  1.3e+05         1X      1.01X      1.01X
-//            ndv    1000k fpp   10.0%           1.23e+05 1.25e+05 1.26e+05     0.977X     0.981X     0.985X
-//            ndv    1000k fpp    1.0%           1.16e+05 1.22e+05 1.23e+05     0.925X     0.958X     0.958X
-//            ndv    1000k fpp    0.1%           1.16e+05 1.22e+05 1.23e+05     0.928X     0.958X     0.957X
-//            ndv  100000k fpp   10.0%           3.77e+04 4.06e+04 5.62e+04     0.301X     0.319X     0.438X
-//            ndv  100000k fpp    1.0%           3.71e+04 4.06e+04 5.45e+04     0.296X      0.32X     0.425X
-//            ndv  100000k fpp    0.1%           3.37e+04 3.68e+04 5.15e+04     0.269X      0.29X     0.401X
+//            ndv      10k fpp   10.0%           9.27e+04 9.33e+04  9.4e+04         1X         1X         1X
+//            ndv      10k fpp    1.0%           9.43e+04 9.49e+04 9.61e+04      1.02X      1.02X      1.02X
+//            ndv      10k fpp    0.1%           9.36e+04  9.5e+04 9.58e+04      1.01X      1.02X      1.02X
+//            ndv    1000k fpp   10.0%            8.4e+04 9.49e+04 9.61e+04     0.906X      1.02X      1.02X
+//            ndv    1000k fpp    1.0%           7.64e+04 9.34e+04 9.45e+04     0.824X         1X      1.01X
+//            ndv    1000k fpp    0.1%           8.24e+04 9.34e+04 9.44e+04     0.888X         1X         1X
+//            ndv  100000k fpp   10.0%           3.22e+04    4e+04 5.03e+04     0.347X     0.429X     0.535X
+//            ndv  100000k fpp    1.0%           2.77e+04  3.6e+04  4.8e+04     0.298X     0.386X      0.51X
+//            ndv  100000k fpp    0.1%           2.54e+04 2.93e+04 4.32e+04     0.274X     0.314X      0.46X
 //
 // find:                      Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//    present ndv      10k fpp   10.0%            1.6e+05 1.64e+05 1.66e+05         1X         1X         1X
-//    absent  ndv      10k fpp   10.0%           1.11e+05 1.14e+05 1.15e+05     0.696X     0.697X     0.695X
-//    present ndv      10k fpp    1.0%           1.57e+05 1.63e+05 1.64e+05     0.982X     0.994X     0.989X
-//    absent  ndv      10k fpp    1.0%            1.3e+05 1.33e+05 1.35e+05     0.814X     0.813X     0.812X
-//    present ndv      10k fpp    0.1%           1.55e+05 1.58e+05 1.61e+05     0.967X     0.968X     0.969X
-//    absent  ndv      10k fpp    0.1%           2.26e+05 2.29e+05 2.31e+05      1.41X       1.4X       1.4X
-//    present ndv    1000k fpp   10.0%           1.21e+05 1.23e+05 1.25e+05     0.758X     0.753X     0.756X
-//    absent  ndv    1000k fpp   10.0%            7.6e+04 7.72e+04 7.81e+04     0.475X     0.472X     0.471X
-//    present ndv    1000k fpp    1.0%           1.23e+05 1.27e+05 1.28e+05     0.771X     0.773X      0.77X
-//    absent  ndv    1000k fpp    1.0%           1.19e+05 1.21e+05 1.22e+05     0.744X     0.739X     0.738X
-//    present ndv    1000k fpp    0.1%           1.17e+05 1.18e+05  1.2e+05     0.731X     0.724X     0.723X
-//    absent  ndv    1000k fpp    0.1%           1.13e+05 1.16e+05 1.17e+05     0.707X     0.706X     0.705X
-//    present ndv  100000k fpp   10.0%           3.42e+04 3.63e+04  3.9e+04     0.214X     0.222X     0.235X
-//    absent  ndv  100000k fpp   10.0%            3.6e+04 3.77e+04 3.82e+04     0.225X      0.23X      0.23X
-//    present ndv  100000k fpp    1.0%           3.18e+04 3.42e+04 3.57e+04     0.199X     0.209X     0.216X
-//    absent  ndv  100000k fpp    1.0%           3.63e+04 3.73e+04 3.79e+04     0.227X     0.228X     0.229X
-//    present ndv  100000k fpp    0.1%           2.89e+04  3.2e+04 3.33e+04      0.18X     0.196X     0.201X
-//    absent  ndv  100000k fpp    0.1%           4.56e+04 4.78e+04 4.86e+04     0.285X     0.292X     0.293X
+//    present ndv      10k fpp   10.0%            1.3e+05 1.31e+05 1.33e+05         1X         1X         1X
+//    absent  ndv      10k fpp   10.0%           8.74e+04 8.83e+04 8.92e+04     0.674X     0.673X     0.671X
+//    present ndv      10k fpp    1.0%           1.25e+05  1.3e+05 1.31e+05      0.96X     0.991X     0.988X
+//    absent  ndv      10k fpp    1.0%           1.04e+05 1.06e+05 1.07e+05     0.805X     0.809X     0.807X
+//    present ndv      10k fpp    0.1%           1.28e+05  1.3e+05 1.31e+05     0.986X     0.988X     0.984X
+//    absent  ndv      10k fpp    0.1%           1.69e+05 1.72e+05 1.74e+05       1.3X      1.31X      1.31X
+//    present ndv    1000k fpp   10.0%           9.33e+04  9.6e+04 9.69e+04     0.719X     0.732X     0.729X
+//    absent  ndv    1000k fpp   10.0%           5.99e+04 6.07e+04 6.12e+04     0.462X     0.462X     0.461X
+//    present ndv    1000k fpp    1.0%           9.48e+04 1.01e+05 1.02e+05     0.731X     0.768X     0.768X
+//    absent  ndv    1000k fpp    1.0%           9.49e+04 9.67e+04 9.74e+04     0.731X     0.737X     0.734X
+//    present ndv    1000k fpp    0.1%           8.46e+04  9.3e+04 9.41e+04     0.652X     0.709X     0.709X
+//    absent  ndv    1000k fpp    0.1%           9.05e+04 9.18e+04 9.28e+04     0.697X       0.7X     0.699X
+//    present ndv  100000k fpp   10.0%            2.6e+04 2.88e+04 3.11e+04     0.201X      0.22X     0.235X
+//    absent  ndv  100000k fpp   10.0%           2.88e+04 2.99e+04 3.08e+04     0.222X     0.228X     0.232X
+//    present ndv  100000k fpp    1.0%           2.34e+04 2.76e+04 2.91e+04      0.18X      0.21X     0.219X
+//    absent  ndv  100000k fpp    1.0%           2.86e+04 2.97e+04 3.03e+04      0.22X     0.227X     0.228X
+//    present ndv  100000k fpp    0.1%           2.34e+04 2.65e+04 2.81e+04      0.18X     0.202X     0.211X
+//    absent  ndv  100000k fpp    0.1%           3.73e+04 3.85e+04 3.91e+04     0.287X     0.293X     0.295X
+//
+// union:                     Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//            ndv      10k fpp   10.0%           3.06e+03  3.1e+03 3.12e+03         1X         1X         1X
+//            ndv      10k fpp    1.0%           1.51e+03 1.55e+03 1.57e+03     0.493X     0.502X     0.503X
+//            ndv      10k fpp    0.1%                748      775      782     0.244X      0.25X     0.251X
+//            ndv    1000k fpp   10.0%               19.6       20     20.2    0.0064X   0.00646X   0.00647X
+//            ndv    1000k fpp    1.0%               9.41       10     10.1   0.00307X   0.00324X   0.00323X
+//            ndv    1000k fpp    0.1%                9.9       10     10.1   0.00323X   0.00324X   0.00323X
+//            ndv  100000k fpp   10.0%             0.0671   0.0714   0.0725  2.19e-05X   2.3e-05X  2.32e-05X
+//            ndv  100000k fpp    1.0%             0.0676   0.0709   0.0719  2.21e-05X  2.29e-05X  2.31e-05X
+//            ndv  100000k fpp    0.1%             0.0338    0.035   0.0356   1.1e-05X  1.13e-05X  1.14e-05X
 
 // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
 // produced by rand().
@@ -221,6 +249,27 @@ void Absent(int batch_size, void* data) {
 
 }  // namespace find
 
+// Benchmark or
+namespace either {
+
+struct TestData {
+  explicit TestData(int log_heap_size) {
+    BloomFilter bf(log_heap_size);
+    BloomFilter::ToThrift(&bf, &tbf);
+  }
+
+  TBloomFilter tbf;
+};
+
+void Benchmark(int batch_size, void* data) {
+  TestData* d = reinterpret_cast<TestData*>(data);
+  for (int i = 0; i < batch_size; ++i) {
+    BloomFilter::Or(d->tbf, &d->tbf);
+  }
+}
+
+} // namespace either
+
 void RunBenchmarks() {
 
   char name[120];
@@ -254,6 +303,20 @@ void RunBenchmarks() {
     }
     cout << suite.Measure() << endl;
   }
+
+  {
+    Benchmark suite("union");
+    vector<unique_ptr<either::TestData> > testdata;
+    for (int ndv = 10000; ndv <= 100 * 1000 * 1000; ndv *= 100) {
+      for (double fpp = 0.1; fpp >= 0.001; fpp /= 10) {
+        testdata.emplace_back(
+            new either::TestData(BloomFilter::MinLogSpace(ndv, fpp)));
+        snprintf(name, sizeof(name), "ndv %7dk fpp %6.1f%%", ndv/1000, fpp*100);
+        suite.AddBenchmark(name, either::Benchmark, testdata.back().get());
+      }
+    }
+    cout << suite.Measure() << endl;
+  }
 }
 
 int main(int argc, char **argv) {
@@ -277,7 +340,8 @@ int main(int argc, char **argv) {
 
   cout << "With AVX2:" << endl << endl;
   RunBenchmarks();
-  cout << endl << "Without AVX2:" << endl << endl;
-  CpuInfo::TempDisable t(CpuInfo::AVX2);
+  cout << endl << "Without AVX or AVX2:" << endl << endl;
+  CpuInfo::TempDisable t1(CpuInfo::AVX);
+  CpuInfo::TempDisable t2(CpuInfo::AVX2);
   RunBenchmarks();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/testutil/mem-util.h
----------------------------------------------------------------------
diff --git a/be/src/testutil/mem-util.h b/be/src/testutil/mem-util.h
index 78b7b48..b4ce9ea 100644
--- a/be/src/testutil/mem-util.h
+++ b/be/src/testutil/mem-util.h
@@ -21,6 +21,8 @@
 #include <cstdint>
 #include <cstdlib>
 
+#include <glog/logging.h>
+
 #include "gutil/macros.h"
 
 namespace impala {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/bloom-filter.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.cc b/be/src/util/bloom-filter.cc
index 6fd53f5..2aadc05 100644
--- a/be/src/util/bloom-filter.cc
+++ b/be/src/util/bloom-filter.cc
@@ -154,6 +154,21 @@ bool BloomFilter::BucketFind(
   return true;
 }
 
+namespace {
+// Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' using AVX
+// instructions. 'n' must be a multiple of 32.
+void __attribute__((target("avx"))) OrEqualArrayAvx(size_t n, const char* in, char* out) {
+  constexpr size_t REGISTER_SIZE = sizeof(__m256d);
+  DCHECK_EQ(n % REGISTER_SIZE, 0) << "Invalid Bloom Filter directory size";
+  const double* simd_in = reinterpret_cast<const double*>(in);
+  double* simd_out = reinterpret_cast<double*>(out);
+  const size_t simd_size = n / REGISTER_SIZE;
+  for (size_t i = 0; i < simd_size; i += REGISTER_SIZE / sizeof(simd_in[0])) {
+    _mm256_storeu_pd(simd_out + i,
+        _mm256_or_pd(_mm256_loadu_pd(simd_out + i), _mm256_loadu_pd(simd_in + i)));
+  }
+}
+} //namespace
 
 void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) {
   DCHECK(out != NULL);
@@ -163,8 +178,29 @@ void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) {
     out->directory.resize(0);
     return;
   }
-
-  for (int i = 0; i < in.directory.size(); ++i) out->directory[i] |= in.directory[i];
+  // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
+  // written in a way that is very friendly to auto-vectorization. Instead, we manually
+  // vectorize, increasing the speed by up to 184x.
+  //
+  // TODO: Tune gcc flags to auto-vectorize the trivial loop instead of hand-vectorizing
+  // it. This might not be possible.
+  if (CpuInfo::IsSupported(CpuInfo::AVX)) {
+    OrEqualArrayAvx(in.directory.size(), &in.directory[0], &out->directory[0]);
+  } else {
+    const __m128i* simd_in = reinterpret_cast<const __m128i*>(&in.directory[0]);
+    __m128i* simd_out = reinterpret_cast<__m128i*>(&out->directory[0]);
+    const size_t simd_size =
+        (in.directory.size() * sizeof(in.directory[0])) / sizeof(simd_in[0]);
+    // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(__m128i)
+    // == 16, we can do two _mm_or_si128's in each iteration without checking array
+    // bounds.
+    for (size_t i = 0; i < simd_size; i += 2) {
+      _mm_storeu_si128(simd_out + i,
+          _mm_or_si128(_mm_loadu_si128(simd_out + i), _mm_loadu_si128(simd_in + i)));
+      _mm_storeu_si128(simd_out + i + 1, _mm_or_si128(_mm_loadu_si128(simd_out + i + 1),
+                                             _mm_loadu_si128(simd_in + i + 1)));
+    }
+  }
 }
 
 // The following three methods are derived from
@@ -187,14 +223,13 @@ int BloomFilter::MinLogSpace(const size_t ndv, const double fpp) {
   const double m = -k * ndv / log(1 - pow(fpp, 1.0 / k));
 
   // Handle case where ndv == 1 => ceil(log2(m/8)) < 0.
-  return max(0, static_cast<int>(ceil(log2(m/8))));
+  return max(0, static_cast<int>(ceil(log2(m / 8))));
 }
 
 double BloomFilter::FalsePositiveProb(const size_t ndv, const int log_heap_space) {
-  return pow(
-      1 - exp((-1.0 * static_cast<double>(BUCKET_WORDS) * static_cast<double>(ndv)) /
-              static_cast<double>(1ull << (log_heap_space + 3))),
+  return pow(1 - exp((-1.0 * static_cast<double>(BUCKET_WORDS) * static_cast<double>(ndv))
+                     / static_cast<double>(1ull << (log_heap_space + 3))),
       BUCKET_WORDS);
 }
 
-}  // namespace impala
+} // namespace impala

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/cpu-info.cc
----------------------------------------------------------------------
diff --git a/be/src/util/cpu-info.cc b/be/src/util/cpu-info.cc
index 532c9dd..6329ca8 100644
--- a/be/src/util/cpu-info.cc
+++ b/be/src/util/cpu-info.cc
@@ -77,6 +77,7 @@ static struct {
   { "sse4_1", CpuInfo::SSE4_1 },
   { "sse4_2", CpuInfo::SSE4_2 },
   { "popcnt", CpuInfo::POPCNT },
+  { "avx",    CpuInfo::AVX },
   { "avx2",   CpuInfo::AVX2 },
 };
 static const long num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/cpu-info.h
----------------------------------------------------------------------
diff --git a/be/src/util/cpu-info.h b/be/src/util/cpu-info.h
index cb577c2..868d2dd 100644
--- a/be/src/util/cpu-info.h
+++ b/be/src/util/cpu-info.h
@@ -36,7 +36,8 @@ class CpuInfo {
   static const int64_t SSE4_1  = (1 << 2);
   static const int64_t SSE4_2  = (1 << 3);
   static const int64_t POPCNT  = (1 << 4);
-  static const int64_t AVX2    = (1 << 5);
+  static const int64_t AVX     = (1 << 5);
+  static const int64_t AVX2    = (1 << 6);
 
   /// Cache enums for L1 (data), L2 and L3
   enum CacheLevel {