Posted to commits@quickstep.apache.org by na...@apache.org on 2016/07/16 22:56:52 UTC

incubator-quickstep git commit: Adaptively adjust the batch size.

Repository: incubator-quickstep
Updated Branches:
  refs/heads/expt_bloom_filter_hash_fn 6369ee91c -> d04402d84


Adaptively adjust the batch size.


Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/d04402d8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/d04402d8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/d04402d8

Branch: refs/heads/expt_bloom_filter_hash_fn
Commit: d04402d842a1c63cc2000c7a827d4c7386330fd2
Parents: 6369ee9
Author: Navneet Potti <na...@gmail.com>
Authored: Sat Jul 16 17:10:32 2016 -0500
Committer: Navneet Potti <na...@gmail.com>
Committed: Sat Jul 16 17:49:31 2016 -0500

----------------------------------------------------------------------
 storage/HashTable.hpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/d04402d8/storage/HashTable.hpp
----------------------------------------------------------------------
diff --git a/storage/HashTable.hpp b/storage/HashTable.hpp
index 564b783..a74d71c 100644
--- a/storage/HashTable.hpp
+++ b/storage/HashTable.hpp
@@ -2274,12 +2274,22 @@ void HashTable<ValueT, resizable, serializable, force_key_copy, allow_duplicate_
       bloom_filter_adapter.reset(new BloomFilterAdapter(
               probe_bloom_filters_, probe_attribute_ids_, attr_size_vectors));
 
-      static const uint32_t kMaxBatchSize = FLAGS_bloom_adapter_batch_size;
+      // We want large batch sizes for cache efficiency while probing, but
+      // small batch sizes to ensure that the adaptation logic kicks in (and
+      // does so early). We use exponentially increasing batch sizes to
+      // strike a balance between the two.
+      //
+      // We also keep track of num_tuples_left in the block, to ensure that
+      // we don't reserve an unnecessarily large vector.
+      std::uint32_t batch_size_try = FLAGS_bloom_adapter_batch_size;
+      std::uint32_t num_tuples_left = accessor->getNumTuples();
       std::vector<tuple_id> batch;
-      batch.reserve(kMaxBatchSize);
 
       do {
-        while (batch.size() < kMaxBatchSize && accessor->next())
+        std::uint32_t batch_size =
+            batch_size_try < num_tuples_left ? batch_size_try : num_tuples_left;
+        batch.reserve(batch_size);
+        while (batch.size() < batch_size && accessor->next())
           batch.push_back(accessor->getCurrentPosition());
 
         std::size_t num_hits = bloom_filter_adapter->bulkProbe(accessor, batch);
@@ -2303,6 +2313,8 @@ void HashTable<ValueT, resizable, serializable, force_key_copy, allow_duplicate_
           }
         }
         batch.clear();
+        num_tuples_left -= batch_size;
+        batch_size_try = batch_size * 2;
       } while (!accessor->iterationFinished());
     }
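
----------------------------------------------------------------------
For illustration, the loop below is a minimal, self-contained sketch of the
batching strategy in this patch: start with a small batch so the adapter's
adaptation logic gets feedback early, then double the batch on every pass for
better cache behaviour, while capping each batch at the number of tuples still
left in the block. MockAccessor and probeBatch are hypothetical stand-ins for
the accessor and BloomFilterAdapter::bulkProbe used in the real code, and the
initial size of 64 stands in for FLAGS_bloom_adapter_batch_size.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for a block's tuple accessor (hypothetical, not a Quickstep API).
struct MockAccessor {
  std::uint32_t num_tuples;
  std::uint32_t pos;
  bool next() { return pos < num_tuples ? (++pos, true) : false; }
  std::uint32_t getCurrentPosition() const { return pos - 1; }
  bool iterationFinished() const { return pos >= num_tuples; }
};

// Stand-in for the adapter's bulk probe: pretend every tuple passes.
std::size_t probeBatch(const std::vector<std::uint32_t> &batch) {
  return batch.size();
}

int main() {
  MockAccessor accessor{/* num_tuples = */ 1000, /* pos = */ 0};
  std::uint32_t batch_size_try = 64;                    // initial batch size
  std::uint32_t num_tuples_left = accessor.num_tuples;  // cap for reserve()
  std::vector<std::uint32_t> batch;

  do {
    // Never reserve more than what is actually left in the block.
    const std::uint32_t batch_size = std::min(batch_size_try, num_tuples_left);
    batch.reserve(batch_size);
    while (batch.size() < batch_size && accessor.next())
      batch.push_back(accessor.getCurrentPosition());

    const std::size_t num_hits = probeBatch(batch);
    std::printf("batch of %zu tuples, %zu hits\n", batch.size(), num_hits);

    num_tuples_left -= static_cast<std::uint32_t>(batch.size());
    batch.clear();
    batch_size_try *= 2;  // grow toward large, cache-friendly batches
  } while (!accessor.iterationFinished());
  return 0;
}

Doubling keeps the number of bulk-probe calls logarithmic in the block size,
while the first few small batches still give the adaptation logic early
feedback.
----------------------------------------------------------------------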