You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/07/21 03:08:07 UTC

[incubator-datasketches-cpp] branch tuple_sketch updated: reuse update base in intersection, cleanup

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/tuple_sketch by this push:
     new 75791ce  reuse update base in intersection, cleanup
75791ce is described below

commit 75791ce40c9970d780efeecec07b82f7de55a878
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Mon Jul 20 20:07:58 2020 -0700

    reuse update base in intersection, cleanup
---
 tuple/include/theta_intersection_base.hpp        | 14 +---
 tuple/include/theta_intersection_base_impl.hpp   | 98 +++++++-----------------
 tuple/include/theta_set_difference_base_impl.hpp |  2 +-
 tuple/include/theta_sketch_experimental.hpp      |  6 +-
 tuple/include/theta_sketch_experimental_impl.hpp | 46 ++---------
 tuple/include/theta_union_base.hpp               |  2 +-
 tuple/include/theta_union_base_impl.hpp          |  6 +-
 tuple/include/theta_union_experimental.hpp       |  2 +-
 tuple/include/theta_union_experimental_impl.hpp  |  6 +-
 tuple/include/theta_update_sketch_base.hpp       |  7 +-
 tuple/include/theta_update_sketch_base_impl.hpp  | 54 ++++++-------
 tuple/include/tuple_sketch.hpp                   |  2 +-
 tuple/include/tuple_sketch_impl.hpp              |  6 +-
 tuple/include/tuple_union.hpp                    |  2 +-
 tuple/include/tuple_union_impl.hpp               |  6 +-
 15 files changed, 87 insertions(+), 172 deletions(-)

diff --git a/tuple/include/theta_intersection_base.hpp b/tuple/include/theta_intersection_base.hpp
index f6ca4d0..1313e11 100644
--- a/tuple/include/theta_intersection_base.hpp
+++ b/tuple/include/theta_intersection_base.hpp
@@ -32,12 +32,10 @@ template<
 >
 class theta_intersection_base {
 public:
+  using hash_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
+  using resize_factor = typename hash_table::resize_factor;
   using comparator = compare_by_key<ExtractKey>;
   theta_intersection_base(uint64_t seed, const Policy& policy, const Allocator& allocator);
-  ~theta_intersection_base();
-  void destroy_objects();
-
-  // TODO: copy and move
 
   template<typename FwdSketch>
   void update(FwdSketch&& sketch);
@@ -47,15 +45,9 @@ public:
   bool has_result() const;
 
 private:
-  Allocator allocator_;
   Policy policy_;
   bool is_valid_;
-  bool is_empty_;
-  uint8_t lg_size_;
-  uint16_t seed_hash_;
-  uint32_t num_entries_;
-  uint64_t theta_;
-  Entry* entries_;
+  hash_table table_;
 };
 
 } /* namespace datasketches */
diff --git a/tuple/include/theta_intersection_base_impl.hpp b/tuple/include/theta_intersection_base_impl.hpp
index e56abcb..970ec50 100644
--- a/tuple/include/theta_intersection_base_impl.hpp
+++ b/tuple/include/theta_intersection_base_impl.hpp
@@ -27,77 +27,45 @@ namespace datasketches {
 
 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
 theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
-allocator_(allocator),
 policy_(policy),
 is_valid_(false),
-is_empty_(false),
-lg_size_(0),
-seed_hash_(compute_seed_hash(seed)),
-num_entries_(0),
-theta_(theta_constants::MAX_THETA),
-entries_(nullptr)
+table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
 {}
 
 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
-theta_intersection_base<EN, EK, P, S, CS, A>::~theta_intersection_base() {
-  destroy_objects();
-  if (entries_ != nullptr) allocator_.deallocate(entries_, 1 << lg_size_);
-}
-
-template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
-void theta_intersection_base<EN, EK, P, S, CS, A>::destroy_objects() {
-  if (entries_ != nullptr) {
-    const size_t size = 1 << lg_size_;
-    for (size_t i = 0; i < size; ++i) {
-      if (EK()(entries_[i]) != 0) {
-        entries_[i].~EN();
-        EK()(entries_[i]) = 0;
-      }
-    }
-  }
-}
-
-template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
 template<typename SS>
 void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
-  if (is_empty_) return;
-  if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
-  is_empty_ |= sketch.is_empty();
-  theta_ = std::min(theta_, sketch.get_theta64());
-  if (is_valid_ && num_entries_ == 0) return;
+  if (table_.is_empty_) return;
+  if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
+  table_.is_empty_ |= sketch.is_empty();
+  table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
+  if (is_valid_ && table_.num_entries_ == 0) return;
   if (sketch.get_num_retained() == 0) {
     is_valid_ = true;
-    destroy_objects();
-    allocator_.deallocate(entries_, 1 << lg_size_);
-    entries_ = nullptr;
-    lg_size_ = 0;
-    num_entries_ = 0;
+    table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
     return;
   }
   if (!is_valid_) { // first update, copy or move incoming sketch
     is_valid_ = true;
-    lg_size_ = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
-    const size_t size = 1 << lg_size_;
-    entries_ = allocator_.allocate(size);
-    for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
+    const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
+    table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
     for (auto& entry: sketch) {
-      auto result = theta_update_sketch_base<EN, EK, A>::find(entries_, lg_size_, EK()(entry));
+      auto result = table_.find(EK()(entry));
       if (result.second) {
         throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
       }
-      new (result.first) EN(conditional_forward<SS>(entry));
-      ++num_entries_;
+      table_.insert(result.first, conditional_forward<SS>(entry));
     }
-    if (num_entries_ != sketch.get_num_retained()) throw std::invalid_argument("num entries mismatch, possibly corrupted input sketch");
+    if (table_.num_entries_ != sketch.get_num_retained()) throw std::invalid_argument("num entries mismatch, possibly corrupted input sketch");
   } else { // intersection
-    const uint32_t max_matches = std::min(num_entries_, sketch.get_num_retained());
-    std::vector<EN, A> matched_entries;
+    const uint32_t max_matches = std::min(table_.num_entries_, sketch.get_num_retained());
+    std::vector<EN, A> matched_entries(table_.allocator_);
     matched_entries.reserve(max_matches);
     uint32_t match_count = 0;
     uint32_t count = 0;
     for (auto& entry: sketch) {
-      if (EK()(entry) < theta_) {
-        auto result = theta_update_sketch_base<EN, EK, A>::find(entries_, lg_size_, EK()(entry));
+      if (EK()(entry) < table_.theta_) {
+        auto result = table_.find(EK()(entry));
         if (result.second) {
           if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
           policy_(*result.first, conditional_forward<SS>(entry));
@@ -114,28 +82,16 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
     } else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
       throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
     }
-    destroy_objects();
     if (match_count == 0) {
-      allocator_.deallocate(entries_, 1 << lg_size_);
-      entries_ = nullptr;
-      lg_size_ = 0;
-      num_entries_ = 0;
-      if (theta_ == theta_constants::MAX_THETA) is_empty_ = true;
+      table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
+      if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
     } else {
       const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
-      const size_t size = 1 << lg_size;
-      if (lg_size != lg_size_) {
-        allocator_.deallocate(entries_, 1 << lg_size_);
-        entries_ = nullptr;
-        lg_size_ = lg_size;
-        entries_ = allocator_.allocate(size);
-        for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
-      }
+      table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
       for (uint32_t i = 0; i < match_count; i++) {
-        auto result = theta_update_sketch_base<EN, EK, A>::find(entries_, lg_size_, EK()(matched_entries[i]));
-        new (result.first) EN(std::move(matched_entries[i]));
+        auto result = table_.find(EK()(matched_entries[i]));
+        table_.insert(result.first, std::move(matched_entries[i]));
       }
-      num_entries_ = match_count;
     }
   }
 }
@@ -143,13 +99,13 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
 CS theta_intersection_base<EN, EK, P, S, CS, A>::get_result(bool ordered) const {
   if (!is_valid_) throw std::invalid_argument("calling get_result() before calling update() is undefined");
-  std::vector<EN, A> entries_copy;
-  if (num_entries_ > 0) {
-    entries_copy.reserve(num_entries_);
-    std::copy_if(entries_, entries_ + (1 << lg_size_), std::back_inserter(entries_copy), key_not_zero<EN, EK>());
-    if (ordered) std::sort(entries_copy.begin(), entries_copy.end(), comparator());
+  std::vector<EN, A> entries(table_.allocator_);
+  if (table_.num_entries_ > 0) {
+    entries.reserve(table_.num_entries_);
+    std::copy_if(table_.begin(), table_.end(), std::back_inserter(entries), key_not_zero<EN, EK>());
+    if (ordered) std::sort(entries.begin(), entries.end(), comparator());
   }
-  return CS(is_empty_, ordered, seed_hash_, theta_, std::move(entries_copy));
+  return CS(table_.is_empty_, ordered, compute_seed_hash(table_.seed_), table_.theta_, std::move(entries));
 }
 
 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
diff --git a/tuple/include/theta_set_difference_base_impl.hpp b/tuple/include/theta_set_difference_base_impl.hpp
index e3c51bb..cf5484b 100644
--- a/tuple/include/theta_set_difference_base_impl.hpp
+++ b/tuple/include/theta_set_difference_base_impl.hpp
@@ -50,7 +50,7 @@ CS theta_set_difference_base<EN, EK, S, CS, A>::compute(SS&& a, const S& b, bool
           conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
     } else { // hash-based
       const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
-      hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, allocator_); // seed is not used here
+      hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
       for (const auto& entry: b) {
         const uint64_t hash = EK()(entry);
         if (hash < theta) {
diff --git a/tuple/include/theta_sketch_experimental.hpp b/tuple/include/theta_sketch_experimental.hpp
index 1e24560..da15c0b 100644
--- a/tuple/include/theta_sketch_experimental.hpp
+++ b/tuple/include/theta_sketch_experimental.hpp
@@ -33,8 +33,6 @@ template<typename A = std::allocator<uint64_t>>
 class theta_sketch_experimental {
 public:
   using resize_factor = theta_constants::resize_factor;
-  using AllocBytes = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
-  using vector_bytes = std::vector<uint8_t, AllocBytes>;
 
   class builder: public theta_base_builder<builder> {
   public:
@@ -58,8 +56,6 @@ public:
 
   string<A> to_string(bool detail = false) const;
 
-  vector_bytes serialize(unsigned header_size_bytes = 0) const;
-
   using const_iterator = theta_const_iterator<uint64_t, trivial_extract_key>;
   const_iterator begin() const;
   const_iterator end() const;
@@ -71,7 +67,7 @@ private:
   using theta_table = theta_update_sketch_base<uint64_t, trivial_extract_key, A>;
   theta_table table_;
 
-  theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const A& allocator);
+  theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator);
 };
 
 template<typename A = std::allocator<uint64_t>>
diff --git a/tuple/include/theta_sketch_experimental_impl.hpp b/tuple/include/theta_sketch_experimental_impl.hpp
index 15fbb0e..19fb707 100644
--- a/tuple/include/theta_sketch_experimental_impl.hpp
+++ b/tuple/include/theta_sketch_experimental_impl.hpp
@@ -25,8 +25,8 @@ namespace datasketches {
 
 template<typename A>
 theta_sketch_experimental<A>::theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
-    float p, uint64_t seed, const A& allocator):
-table_(lg_cur_size, lg_nom_size, rf, p, seed, allocator)
+    uint64_t theta, uint64_t seed, const A& allocator):
+table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
 {}
 
 template<typename A>
@@ -66,36 +66,6 @@ string<A> theta_sketch_experimental<A>::to_string(bool detail) const {
 }
 
 template<typename A>
-auto theta_sketch_experimental<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
-  const uint8_t preamble_longs = 3;
-  const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * (1 << table_.lg_cur_size_);
-  vector_bytes bytes(size);
-  uint8_t* ptr = bytes.data() + header_size_bytes;
-
-  const uint8_t preamble_longs_and_rf = preamble_longs | (table_.rf_ << 6);
-  ptr += copy_to_mem(&preamble_longs_and_rf, ptr, sizeof(preamble_longs_and_rf));
-  const uint8_t serial_version = 0;
-  ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
-  const uint8_t type = 0;
-  ptr += copy_to_mem(&type, ptr, sizeof(type));
-  ptr += copy_to_mem(&table_.lg_nom_size_, ptr, sizeof(table_.lg_nom_size_));
-  ptr += copy_to_mem(&table_.lg_cur_size_, ptr, sizeof(table_.lg_cur_size_));
-  const uint8_t flags_byte(
-    (this->is_empty() ? 1 << flags::IS_EMPTY : 0)
-  );
-  ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
-  const uint16_t seed_hash = 0;
-  ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
-  ptr += copy_to_mem(&table_.num_entries_, ptr, sizeof(table_.num_entries_));
-  const float p = 1;
-  ptr += copy_to_mem(&p, ptr, sizeof(p));
-  ptr += copy_to_mem(&table_.theta_, ptr, sizeof(table_.theta_));
-  ptr += copy_to_mem(table_.entries_, ptr, sizeof(uint64_t) * (1 << table_.lg_cur_size_));
-
-  return bytes;
-}
-
-template<typename A>
 auto theta_sketch_experimental<A>::begin() const -> const_iterator {
   return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
 }
@@ -105,6 +75,11 @@ auto theta_sketch_experimental<A>::end() const -> const_iterator {
   return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
 }
 
+template<typename A>
+compact_theta_sketch_experimental<A> theta_sketch_experimental<A>::compact(bool ordered) const {
+  return compact_theta_sketch_experimental<A>(*this, ordered);
+}
+
 // builder
 
 template<typename A>
@@ -112,12 +87,7 @@ theta_sketch_experimental<A>::builder::builder(const A& allocator): allocator_(a
 
 template<typename A>
 theta_sketch_experimental<A> theta_sketch_experimental<A>::builder::build() const {
-  return theta_sketch_experimental(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->seed_, allocator_);
-}
-
-template<typename A>
-compact_theta_sketch_experimental<A> theta_sketch_experimental<A>::compact(bool ordered) const {
-  return compact_theta_sketch_experimental<A>(*this, ordered);
+  return theta_sketch_experimental(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, allocator_);
 }
 
 // experimental compact theta sketch
diff --git a/tuple/include/theta_union_base.hpp b/tuple/include/theta_union_base.hpp
index 03e71fd..72d6eca 100644
--- a/tuple/include/theta_union_base.hpp
+++ b/tuple/include/theta_union_base.hpp
@@ -38,7 +38,7 @@ public:
   using resize_factor = typename hash_table::resize_factor;
   using comparator = compare_by_key<ExtractKey>;
 
-  theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Policy& policy, const Allocator& allocator);
+  theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
 
   template<typename FwdSketch>
   void update(FwdSketch&& sketch);
diff --git a/tuple/include/theta_union_base_impl.hpp b/tuple/include/theta_union_base_impl.hpp
index a1015ef..2bc458d 100644
--- a/tuple/include/theta_union_base_impl.hpp
+++ b/tuple/include/theta_union_base_impl.hpp
@@ -25,9 +25,9 @@ namespace datasketches {
 
 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
 theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
-    float p, uint64_t seed, const P& policy, const A& allocator):
+    uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
 policy_(policy),
-table_(lg_cur_size, lg_nom_size, rf, p, seed, allocator),
+table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
 union_theta_(table_.theta_)
 {}
 
@@ -56,7 +56,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
 
 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
 CS theta_union_base<EN, EK, P, S, CS, A>::get_result(bool ordered) const {
-  std::vector<EN, A> entries;
+  std::vector<EN, A> entries(table_.allocator_);
   if (table_.is_empty_) return CS(true, true, compute_seed_hash(table_.seed_), union_theta_, std::move(entries));
   entries.reserve(table_.num_entries_);
   uint64_t theta = std::min(union_theta_, table_.theta_);
diff --git a/tuple/include/theta_union_experimental.hpp b/tuple/include/theta_union_experimental.hpp
index 5a5cb84..8f93248 100644
--- a/tuple/include/theta_union_experimental.hpp
+++ b/tuple/include/theta_union_experimental.hpp
@@ -67,7 +67,7 @@ private:
   State state_;
 
   // for builder
-  theta_union_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Allocator& allocator);
+  theta_union_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
 };
 
 template<typename A>
diff --git a/tuple/include/theta_union_experimental_impl.hpp b/tuple/include/theta_union_experimental_impl.hpp
index 51fe13e..4aac362 100644
--- a/tuple/include/theta_union_experimental_impl.hpp
+++ b/tuple/include/theta_union_experimental_impl.hpp
@@ -20,8 +20,8 @@
 namespace datasketches {
 
 template<typename A>
-theta_union_experimental<A>::theta_union_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const A& allocator):
-state_(lg_cur_size, lg_nom_size, rf, p, seed, pass_through_policy(), allocator)
+theta_union_experimental<A>::theta_union_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
+state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
 {}
 
 template<typename A>
@@ -41,7 +41,7 @@ template<typename A>
 auto theta_union_experimental<A>::builder::build() const -> theta_union_experimental {
   return theta_union_experimental(
       this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
-      this->lg_k_, this->rf_, this->p_, this->seed_, allocator_);
+      this->lg_k_, this->rf_, this->starting_theta(), this->seed_, allocator_);
 }
 
 } /* namespace datasketches */
diff --git a/tuple/include/theta_update_sketch_base.hpp b/tuple/include/theta_update_sketch_base.hpp
index 0da68c5..15b6acd 100644
--- a/tuple/include/theta_update_sketch_base.hpp
+++ b/tuple/include/theta_update_sketch_base.hpp
@@ -43,8 +43,8 @@ struct theta_update_sketch_base {
   using resize_factor = theta_constants::resize_factor;
   using comparator = compare_by_key<ExtractKey>;
 
-  theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
-      uint64_t seed, const Allocator& allocator);
+  theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
+      uint64_t seed, const Allocator& allocator, bool is_empty = true);
   theta_update_sketch_base(const theta_update_sketch_base& other);
   theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
   ~theta_update_sketch_base();
@@ -87,8 +87,6 @@ struct theta_update_sketch_base {
 
   static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
   static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
-
-  static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
 };
 
 // builder
@@ -144,6 +142,7 @@ protected:
   float p_;
   uint64_t seed_;
 
+  uint64_t starting_theta() const;
   uint8_t starting_lg_size() const;
   static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
 };
diff --git a/tuple/include/theta_update_sketch_base_impl.hpp b/tuple/include/theta_update_sketch_base_impl.hpp
index 9fbf6f0..cf6b46d 100644
--- a/tuple/include/theta_update_sketch_base_impl.hpp
+++ b/tuple/include/theta_update_sketch_base_impl.hpp
@@ -24,21 +24,22 @@
 namespace datasketches {
 
 template<typename EN, typename EK, typename A>
-theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const A& allocator):
+theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
 allocator_(allocator),
-is_empty_(true),
+is_empty_(is_empty),
 lg_cur_size_(lg_cur_size),
 lg_nom_size_(lg_nom_size),
 rf_(rf),
 num_entries_(0),
-theta_(theta_constants::MAX_THETA),
+theta_(theta),
 seed_(seed),
 entries_(nullptr)
 {
-  const size_t size = 1 << lg_cur_size;
-  entries_ = allocator_.allocate(size);
-  for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
-  if (p < 1) this->theta_ *= p;
+  if (lg_cur_size > 0) {
+    const size_t size = 1 << lg_cur_size;
+    entries_ = allocator_.allocate(size);
+    for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
+  }
 }
 
 template<typename EN, typename EK, typename A>
@@ -123,19 +124,27 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
 }
 
 template<typename EN, typename EK, typename A>
-auto theta_update_sketch_base<EN, EK, A>::find(EN* entries, uint8_t lg_size, uint64_t key) -> std::pair<iterator, bool> {
-  const size_t size = 1 << lg_size;
+uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data, size_t length) {
+  is_empty_ = false;
+  const uint64_t hash = compute_hash(data, length, seed_);
+  if (hash >= theta_) return 0; // hash == 0 is reserved to mark empty slots in the table
+  return hash;
+}
+
+template<typename EN, typename EK, typename A>
+auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
+  const size_t size = 1 << lg_cur_size_;
   const size_t mask = size - 1;
-  const uint32_t stride = get_stride(key, lg_size);
+  const uint32_t stride = get_stride(key, lg_cur_size_);
   uint32_t index = static_cast<uint32_t>(key) & mask;
   // search for duplicate or zero
   const uint32_t loop_index = index;
   do {
-    const uint64_t probe = EK()(entries[index]);
+    const uint64_t probe = EK()(entries_[index]);
     if (probe == 0) {
-      return std::pair<iterator, bool>(&entries[index], false);
+      return std::pair<iterator, bool>(&entries_[index], false);
     } else if (probe == key) {
-      return std::pair<iterator, bool>(&entries[index], true);
+      return std::pair<iterator, bool>(&entries_[index], true);
     }
     index = (index + stride) & mask;
   } while (index != loop_index);
@@ -143,19 +152,6 @@ auto theta_update_sketch_base<EN, EK, A>::find(EN* entries, uint8_t lg_size, uin
 }
 
 template<typename EN, typename EK, typename A>
-uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data, size_t length) {
-  is_empty_ = false;
-  const uint64_t hash = compute_hash(data, length, seed_);
-  if (hash >= theta_) return 0; // hash == 0 is reserved to mark empty slots in the table
-  return hash;
-}
-
-template<typename EN, typename EK, typename A>
-auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
-  return find(entries_, lg_cur_size_, key);
-}
-
-template<typename EN, typename EK, typename A>
 template<typename Fwd>
 void theta_update_sketch_base<EN, EK, A>::insert(iterator it, Fwd&& entry) {
   new (it) EN(std::forward<Fwd>(entry));
@@ -272,6 +268,12 @@ Derived& theta_base_builder<Derived>::set_seed(uint64_t seed) {
 }
 
 template<typename Derived>
+uint64_t theta_base_builder<Derived>::starting_theta() const {
+  if (p_ < 1) return theta_constants::MAX_THETA * p_;
+  return theta_constants::MAX_THETA;
+}
+
+template<typename Derived>
 uint8_t theta_base_builder<Derived>::starting_lg_size() const {
   return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
 }
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 0f22005..6ae8833 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -324,7 +324,7 @@ private:
   tuple_map map_;
 
   // for builder
-  update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Policy& policy, const Allocator& allocator);
+  update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
 
   virtual void print_specifics(std::ostringstream& os) const;
 };
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index aa68cd5..7178e50 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -82,9 +82,9 @@ string<A> tuple_sketch<S, A>::to_string(bool detail) const {
 // update sketch
 
 template<typename S, typename U, typename P, typename A>
-update_tuple_sketch<S, U, P, A>::update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const P& policy, const A& allocator):
+update_tuple_sketch<S, U, P, A>::update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
 policy_(policy),
-map_(lg_cur_size, lg_nom_size, rf, p, seed, allocator)
+map_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
 {}
 
 template<typename S, typename U, typename P, typename A>
@@ -549,7 +549,7 @@ policy_(policy), allocator_(allocator) {}
 
 template<typename S, typename U, typename P, typename A>
 auto update_tuple_sketch<S, U, P, A>::builder::build() const -> update_tuple_sketch {
-  return update_tuple_sketch(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->seed_, policy_, allocator_);
+  return update_tuple_sketch(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, policy_, allocator_);
 }
 
 } /* namespace datasketches */
diff --git a/tuple/include/tuple_union.hpp b/tuple/include/tuple_union.hpp
index d716664..0c87af3 100644
--- a/tuple/include/tuple_union.hpp
+++ b/tuple/include/tuple_union.hpp
@@ -83,7 +83,7 @@ private:
   State state_;
 
   // for builder
-  tuple_union(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Policy& policy, const Allocator& allocator);
+  tuple_union(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
 };
 
 template<typename S, typename P, typename A>
diff --git a/tuple/include/tuple_union_impl.hpp b/tuple/include/tuple_union_impl.hpp
index 6df2794..e184fbc 100644
--- a/tuple/include/tuple_union_impl.hpp
+++ b/tuple/include/tuple_union_impl.hpp
@@ -20,8 +20,8 @@
 namespace datasketches {
 
 template<typename S, typename P, typename A>
-tuple_union<S, P, A>::tuple_union(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const P& policy, const A& allocator):
-state_(lg_cur_size, lg_nom_size, rf, p, seed, internal_policy(policy), allocator)
+tuple_union<S, P, A>::tuple_union(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
+state_(lg_cur_size, lg_nom_size, rf, theta, seed, internal_policy(policy), allocator)
 {}
 
 template<typename S, typename P, typename A>
@@ -41,7 +41,7 @@ policy_(policy), allocator_(allocator) {}
 
 template<typename S, typename P, typename A>
 auto tuple_union<S, P, A>::builder::build() const -> tuple_union {
-  return tuple_union(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->seed_, policy_, allocator_);
+  return tuple_union(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, policy_, allocator_);
 }
 
 } /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org