You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@quickstep.apache.org by na...@apache.org on 2016/07/13 04:49:39 UTC
[5/5] incubator-quickstep git commit: Probe Bloom filter adapter in
batches
Probe Bloom filter adapter in batches
Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/0161a4de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/0161a4de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/0161a4de
Branch: refs/heads/expt_bloom_filter_hash_fn
Commit: 0161a4dee295ea4efe8c8d9ebdf56759e2c56da2
Parents: fa815ff
Author: Navneet Potti <na...@gmail.com>
Authored: Tue Jul 12 23:49:05 2016 -0500
Committer: Navneet Potti <na...@gmail.com>
Committed: Tue Jul 12 23:49:05 2016 -0500
----------------------------------------------------------------------
storage/HashTable.hpp | 87 ++++++++++++++++-----------
utility/BloomFilterAdapter.hpp | 117 ++++++++++++++++++++----------------
2 files changed, 115 insertions(+), 89 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/0161a4de/storage/HashTable.hpp
----------------------------------------------------------------------
diff --git a/storage/HashTable.hpp b/storage/HashTable.hpp
index a4b09fb..b666e0c 100644
--- a/storage/HashTable.hpp
+++ b/storage/HashTable.hpp
@@ -2254,15 +2254,16 @@ void HashTable<ValueT, resizable, serializable, force_key_copy, allow_duplicate_
// Find (and cache) the size of each attribute in the probe lists.
// NOTE(nav): This code uses the accessor to get the size,
// and hence only works if there's at least one tuple.
- // NOTE(nav): This code is meant to be called only for PKs, and so
- // we assume non-null values.
std::vector<std::vector<std::size_t>> attr_size_vectors;
attr_size_vectors.reserve(probe_attribute_ids_.size());
for (const auto &probe_attr_vector : probe_attribute_ids_) {
std::vector<std::size_t> attr_sizes;
attr_sizes.reserve(probe_attr_vector.size());
for (const auto &probe_attr : probe_attr_vector) {
- auto val_and_size = accessor->getUntypedValueAndByteLengthAtAbsolutePosition(0, probe_attr);
+ // Using the function below is the easiest way to get attribute size
+ // here. The template parameter check_null is set to false to ensure
+ // that we get the size even if the first attr value is null.
+ auto val_and_size = accessor->template getUntypedValueAndByteLengthAtAbsolutePosition<false>(0, probe_attr);
attr_sizes.push_back(val_and_size.second);
}
attr_size_vectors.push_back(attr_sizes);
@@ -2270,44 +2271,58 @@ void HashTable<ValueT, resizable, serializable, force_key_copy, allow_duplicate_
bloom_filter_adapter.reset(new BloomFilterAdapter(
probe_bloom_filters_, probe_attribute_ids_, attr_size_vectors));
+
+ static const uint32_t kMaxBatchSize = 4000;
+ std::vector<tuple_id> batch;
+ batch.reserve(kMaxBatchSize);
+
+ do {
+ while (batch.size() < kMaxBatchSize && accessor->next())
+ batch.push_back(accessor->getCurrentPosition());
+
+ std::size_t num_hits = bloom_filter_adapter->bulkProbe(accessor, batch);
+
+ for (std::size_t t = 0; t < num_hits; ++t){
+ tuple_id probe_tid = batch[t];
+ TypedValue key = accessor->getTypedValueAtAbsolutePosition(key_attr_id, probe_tid);
+ if (check_for_null_keys && key.isNull()) {
+ continue;
+ }
+ const std::size_t true_hash = use_scalar_literal_hash_template ? key.getHashScalarLiteral()
+ : key.getHash();
+ const std::size_t adjusted_hash = adjust_hashes_template ? this->AdjustHash(true_hash)
+ : true_hash;
+ std::size_t entry_num = 0;
+ const ValueT *value;
+ while (this->getNextEntryForKey(key, adjusted_hash, &value, &entry_num)) {
+ (*functor)(probe_tid, *value);
+ if (!allow_duplicate_keys)
+ break;
+ }
+ }
+ batch.clear();
+ } while (!accessor->iterationFinished());
}
- // std::size_t numFalsePositives = 0;
- // std::size_t numMisses = 0;
- // std::size_t numHits = 0;
- while (accessor->next()) {
- if (has_probe_side_bloom_filter_ && bloom_filter_adapter->miss(accessor)) {
- // numMisses++;
- continue; // On a bloom filter miss, probing the hash table can be skipped.
- }
- TypedValue key = accessor->getTypedValue(key_attr_id);
- if (check_for_null_keys && key.isNull()) {
- continue;
- }
- const std::size_t true_hash = use_scalar_literal_hash_template ? key.getHashScalarLiteral()
- : key.getHash();
- const std::size_t adjusted_hash = adjust_hashes_template ? this->AdjustHash(true_hash)
- : true_hash;
- std::size_t entry_num = 0;
- const ValueT *value;
- bool wasHit = false;
- while (this->getNextEntryForKey(key, adjusted_hash, &value, &entry_num)) {
- wasHit = true;
- (*functor)(*accessor, *value);
- if (!allow_duplicate_keys) {
- break;
+ else { // no Bloom filters to probe
+ while(accessor->next()) {
+ TypedValue key = accessor->getTypedValue(key_attr_id);
+ if (check_for_null_keys && key.isNull()) {
+ continue;
+ }
+ const std::size_t true_hash = use_scalar_literal_hash_template ? key.getHashScalarLiteral()
+ : key.getHash();
+ const std::size_t adjusted_hash = adjust_hashes_template ? this->AdjustHash(true_hash)
+ : true_hash;
+ std::size_t entry_num = 0;
+ const ValueT *value;
+ while (this->getNextEntryForKey(key, adjusted_hash, &value, &entry_num)) {
+ (*functor)(*accessor, *value);
+ if (!allow_duplicate_keys)
+ break;
}
}
-
- // if (!wasHit)
- // numFalsePositives++;
- // else
- // numHits++;
-
}
- // std::cerr << "numFalsePositives: " << numFalsePositives
- // << " numMisses: " << numMisses
- // << " numHits: " << numHits << "\n";
});
}
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/0161a4de/utility/BloomFilterAdapter.hpp
----------------------------------------------------------------------
diff --git a/utility/BloomFilterAdapter.hpp b/utility/BloomFilterAdapter.hpp
index 10d1450..f1916dc 100644
--- a/utility/BloomFilterAdapter.hpp
+++ b/utility/BloomFilterAdapter.hpp
@@ -40,59 +40,33 @@ namespace quickstep {
class BloomFilterAdapter {
public:
BloomFilterAdapter(const std::vector<const BloomFilter*> &bloom_filters,
- const std::vector<std::vector<attribute_id>> &attribute_ids,
- const std::vector<std::vector<std::size_t>> &attr_sizes)
- : num_bloom_filters_(bloom_filters.size()) {
+ const std::vector<std::vector<attribute_id>> attribute_ids,
+ const std::vector<std::vector<std::size_t>> attr_sizes) {
DCHECK_EQ(bloom_filters.size(), attribute_ids.size());
DCHECK_EQ(bloom_filters.size(), attr_sizes.size());
- bloom_filter_entries_.reserve(num_bloom_filters_);
- bloom_filter_entry_indices_.reserve(num_bloom_filters_);
-
- for (std::size_t i = 0; i < num_bloom_filters_; ++i) {
- bloom_filter_entries_.emplace_back(bloom_filters[i], attribute_ids[i], attr_sizes[i]);
- bloom_filter_entry_indices_.emplace_back(i);
+ bloom_filter_entries_.reserve(bloom_filters.size());
+ for (std::size_t i = 0; i < bloom_filters.size(); ++i) {
+ auto entry = new BloomFilterEntry(
+ bloom_filters[i], attribute_ids[i], attr_sizes[i]);
+ bloom_filter_entries_.push_back(entry);
}
}
- template <typename ValueAccessorT>
- inline bool miss(const ValueAccessorT *accessor) {
- return missImpl<ValueAccessorT, true>(accessor);
+ ~BloomFilterAdapter() {
+ for (auto &entry : bloom_filter_entries_)
+ delete entry;
}
- template <typename ValueAccessorT, bool adapt_filters>
- inline bool missImpl(const ValueAccessorT *accessor) {
- for (std::size_t i = 0; i < num_bloom_filters_; ++i) {
- const std::size_t entry_idx = bloom_filter_entry_indices_[i];
- BloomFilterEntry &entry = bloom_filter_entries_[entry_idx];
- if (adapt_filters) {
- ++entry.cnt;
- }
-
- const BloomFilter *bloom_filter = entry.bloom_filter;
- for (std::size_t i = 0; i < entry.attribute_ids.size(); ++i) {
- const attribute_id &attr_id = entry.attribute_ids[i];
- const std::size_t size = entry.attribute_sizes[i];
- auto value = static_cast<const std::uint8_t*>(accessor->template getUntypedValue<false>(attr_id));
- if (!bloom_filter->contains(value, size)) {
- if (adapt_filters) {
- // Record miss
- ++entry.miss;
-
- // Update entry order
- if (!(entry.miss % kNumMissesBeforeAdapt) && i > 0) {
- const std::size_t prev_entry_idx = bloom_filter_entry_indices_[i-1];
- if (entry.isBetterThan(bloom_filter_entries_[prev_entry_idx])) {
- bloom_filter_entry_indices_[i-1] = entry_idx;
- bloom_filter_entry_indices_[i] = prev_entry_idx;
- }
- }
- }
- return true;
- }
- }
- }
- return false;
+ template <typename ValueAccessorT, bool adapt_filters = true>
+ std::size_t bulkProbe(
+ const ValueAccessorT *accessor,
+ std::vector<tuple_id> &batch) {
+ std::size_t out_size = batch.size();
+ for (auto &entry : bloom_filter_entries_)
+ out_size = bulkProbeBloomFilterEntry(*entry, accessor, batch, out_size);
+ adaptEntryOrder();
+ return out_size;
}
private:
@@ -107,22 +81,59 @@ class BloomFilterAdapter {
cnt(0) {
}
- inline bool isBetterThan(const BloomFilterEntry& other) {
- return static_cast<std::uint64_t>(miss) * other.cnt
- > static_cast<std::uint64_t>(cnt + 5) * (other.miss + 5);
+ static bool isBetterThan(const BloomFilterEntry *a,
+ const BloomFilterEntry *b) {
+ return a->miss_rate > b->miss_rate;
+ // return static_cast<std::uint64_t>(a.miss) * b.cnt
+ // > static_cast<std::uint64_t>(a.cnt + 5) * (b.miss + 5);
}
const BloomFilter *bloom_filter;
- const std::vector<attribute_id> &attribute_ids;
+ std::vector<attribute_id> attribute_ids;
std::vector<std::size_t> attribute_sizes;
std::uint32_t miss;
std::uint32_t cnt;
+ float miss_rate;
};
- const std::size_t kNumMissesBeforeAdapt = 64;
- const std::size_t num_bloom_filters_;
- std::vector<BloomFilterEntry> bloom_filter_entries_;
- std::vector<std::size_t> bloom_filter_entry_indices_;
+ template <typename ValueAccessorT, bool adapt_filters = true>
+ std::size_t bulkProbeBloomFilterEntry(
+ BloomFilterEntry &entry,
+ const ValueAccessorT *accessor,
+ std::vector<tuple_id> &batch,
+ const std::size_t in_size) {
+ std::size_t out_size = 0;
+ const BloomFilter *bloom_filter = entry.bloom_filter;
+
+ for (std::size_t t = 0; t < in_size; ++t) {
+ tuple_id tid = batch[t];
+ for (std::size_t a = 0; a < entry.attribute_ids.size(); ++a) {
+ const attribute_id &attr_id = entry.attribute_ids[a];
+ const std::size_t size = entry.attribute_sizes[a];
+ auto value = static_cast<const std::uint8_t*>(
+ accessor->getUntypedValueAtAbsolutePosition(attr_id, tid));
+ if (bloom_filter->contains(value, size)) {
+ batch[out_size] = tid;
+ ++out_size;
+ }
+ }
+ }
+ if (adapt_filters) {
+ entry.cnt += in_size;
+ entry.miss += (in_size - out_size);
+ }
+ return out_size;
+ }
+
+ void adaptEntryOrder() {
+ for (auto &entry : bloom_filter_entries_)
+ entry->miss_rate = static_cast<float>(entry->miss) / entry->cnt;
+ std::sort(bloom_filter_entries_.begin(),
+ bloom_filter_entries_.end(),
+ BloomFilterEntry::isBetterThan);
+ }
+
+ std::vector<BloomFilterEntry *> bloom_filter_entries_;
DISALLOW_COPY_AND_ASSIGN(BloomFilterAdapter);
};