You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/11/01 22:13:31 UTC

[datasketches-cpp] 01/02: equality operator instance

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch equal_to_instance
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 687c089226f4f0f91004cffb2dbdce5e70634086
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Nov 1 14:52:52 2022 -0700

    equality operator instance
---
 fi/include/frequent_items_sketch.hpp               | 34 ++++----
 fi/include/frequent_items_sketch_impl.hpp          | 14 ++--
 fi/include/reverse_purge_hash_map.hpp              | 13 ++-
 fi/include/reverse_purge_hash_map_impl.hpp         | 25 +++---
 fi/test/frequent_items_sketch_custom_type_test.cpp | 93 ++++++++++++----------
 fi/test/reverse_purge_hash_map_test.cpp            |  9 ++-
 6 files changed, 114 insertions(+), 74 deletions(-)

diff --git a/fi/include/frequent_items_sketch.hpp b/fi/include/frequent_items_sketch.hpp
index 79c8e57..d0161c5 100644
--- a/fi/include/frequent_items_sketch.hpp
+++ b/fi/include/frequent_items_sketch.hpp
@@ -59,11 +59,13 @@ public:
    * @param lg_max_map_size Log2 of the physical size of the internal hash map managed by this
    * sketch. The maximum capacity of this internal hash map is 0.75 times 2^lg_max_map_size.
    * Both the ultimate accuracy and size of this sketch are functions of lg_max_map_size.
-   *
    * @param lg_start_map_size Log2 of the starting physical size of the internal hash
    * map managed by this sketch.
+   * @param equal instance of Equality operator
+   * @param allocator instance of an Allocator
    */
-  explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE, const A& allocator = A());
+  explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE,
+      const E& equal = E(), const A& allocator = A());
 
   /**
    * Update this sketch with an item and a positive weight (frequency count).
@@ -157,7 +159,7 @@ public:
   /**
    * Returns epsilon used to compute <i>a priori</i> error.
    * This is just the value <i>3.5 / maxMapSize</i>.
-   * @param maxMapSize the planned map size to be used when constructing this sketch.
+   * @param lg_max_map_size the planned map size to be used when constructing this sketch.
    * @return epsilon used to compute <i>a priori</i> error.
    */
   static double get_epsilon(uint8_t lg_max_map_size);
@@ -166,13 +168,13 @@ public:
    * Returns the estimated <i>a priori</i> error given the max_map_size for the sketch and the
    * estimated_total_stream_weight.
    * @param lg_max_map_size the planned map size to be used when constructing this sketch.
-   * @param estimated_total_stream_weight the estimated total stream weight.
+   * @param estimated_total_weight the estimated total stream weight.
    * @return the estimated <i>a priori</i> error.
    */
   static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight);
 
   class row;
-  typedef typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>> vector_row; // alias for users
+  using vector_row = typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>>;
 
   /**
    * Returns an array of rows that include frequent items, estimates, upper and lower bounds
@@ -224,7 +226,7 @@ public:
   /**
    * Computes size needed to serialize the current state of the sketch.
    * This can be expensive since every item needs to be looked at.
-   * @param instance of a SerDe
+   * @param serde instance of a SerDe
    * @return size in bytes needed to serialize this sketch
    */
   template<typename SerDe = serde<T>>
@@ -233,7 +235,7 @@ public:
   /**
    * This method serializes the sketch into a given stream in a binary form
    * @param os output stream
-   * @param instance of a SerDe
+   * @param serde instance of a SerDe
    */
   template<typename SerDe = serde<T>>
   void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
@@ -248,7 +250,7 @@ public:
    * It is a blank space of a given size.
    * This header is used in Datasketches PostgreSQL extension.
    * @param header_size_bytes space to reserve in front of the sketch
-   * @param instance of a SerDe
+   * @param serde instance of a SerDe
    * @return serialized sketch as a vector of bytes
    */
   template<typename SerDe = serde<T>>
@@ -257,23 +259,27 @@ public:
   /**
    * This method deserializes a sketch from a given stream.
    * @param is input stream
-   * @param instance of a SerDe
-   * @param instance of an Allocator
+   * @param serde instance of a SerDe
+   * @param equal instance of Equality operator
+   * @param allocator instance of an Allocator
    * @return an instance of the sketch
    */
   template<typename SerDe = serde<T>>
-  static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
+  static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
+      const E& equal = E(), const A& allocator = A());
 
   /**
    * This method deserializes a sketch from a given array of bytes.
    * @param bytes pointer to the array of bytes
    * @param size the size of the array
-   * @param instance of a SerDe
-   * @param instance of an Allocator
+   * @param serde instance of a SerDe
+   * @param equal instance of Equality operator
+   * @param allocator instance of an Allocator
    * @return an instance of the sketch
    */
   template<typename SerDe = serde<T>>
-  static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
+  static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
+      const E& equal = E(), const A& allocator = A());
 
   /**
    * Returns a human readable summary of this sketch
diff --git a/fi/include/frequent_items_sketch_impl.hpp b/fi/include/frequent_items_sketch_impl.hpp
index e485da7..d72f462 100644
--- a/fi/include/frequent_items_sketch_impl.hpp
+++ b/fi/include/frequent_items_sketch_impl.hpp
@@ -34,12 +34,14 @@ template<typename T, typename W, typename H, typename E, typename A>
 const uint8_t frequent_items_sketch<T, W, H, E, A>::LG_MIN_MAP_SIZE;
 
 template<typename T, typename W, typename H, typename E, typename A>
-frequent_items_sketch<T, W, H, E, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size, const A& allocator):
+frequent_items_sketch<T, W, H, E, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size,
+    const E& equal, const A& allocator):
 total_weight(0),
 offset(0),
 map(
   std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
   std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
+  equal,
   allocator
 )
 {
@@ -286,7 +288,8 @@ private:
 
 template<typename T, typename W, typename H, typename E, typename A>
 template<typename SerDe>
-frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
+frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(std::istream& is,
+    const SerDe& sd, const E& equal, const A& allocator) {
   const auto preamble_longs = read<uint8_t>(is);
   const auto serial_version = read<uint8_t>(is);
   const auto family_id = read<uint8_t>(is);
@@ -302,7 +305,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
   check_family_id(family_id);
   check_size(lg_cur_size, lg_max_size);
 
-  frequent_items_sketch sketch(lg_max_size, lg_cur_size, allocator);
+  frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
   if (!is_empty) {
     const auto num_items = read<uint32_t>(is);
     read<uint32_t>(is); // unused
@@ -330,7 +333,8 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
 
 template<typename T, typename W, typename H, typename E, typename A>
 template<typename SerDe>
-frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
+frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(const void* bytes, size_t size,
+    const SerDe& sd, const E& equal, const A& allocator) {
   ensure_minimum_memory(size, 8);
   const char* ptr = static_cast<const char*>(bytes);
   const char* base = static_cast<const char*>(bytes);
@@ -356,7 +360,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
   check_size(lg_cur_size, lg_max_size);
   ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
 
-  frequent_items_sketch sketch(lg_max_size, lg_cur_size, allocator);
+  frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
   if (!is_empty) {
     uint32_t num_items;
     ptr += copy_from_mem(ptr, num_items);
diff --git a/fi/include/reverse_purge_hash_map.hpp b/fi/include/reverse_purge_hash_map.hpp
index 1f42191..6e2e53a 100644
--- a/fi/include/reverse_purge_hash_map.hpp
+++ b/fi/include/reverse_purge_hash_map.hpp
@@ -33,17 +33,23 @@ namespace datasketches {
  * author Alexander Saydakov
  */
 
-template<typename K, typename V = uint64_t, typename H = std::hash<K>, typename E = std::equal_to<K>, typename A = std::allocator<K>>
+template<
+  typename K,
+  typename V = uint64_t,
+  typename H = std::hash<K>,
+  typename E = std::equal_to<K>,
+  typename A = std::allocator<K>
+>
 class reverse_purge_hash_map {
 public:
   using AllocV = typename std::allocator_traits<A>::template rebind_alloc<V>;
   using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
 
-  reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const A& allocator);
+  reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const E& equal, const A& allocator);
   reverse_purge_hash_map(const reverse_purge_hash_map& other);
   reverse_purge_hash_map(reverse_purge_hash_map&& other) noexcept;
   ~reverse_purge_hash_map();
-  reverse_purge_hash_map& operator=(reverse_purge_hash_map other);
+  reverse_purge_hash_map& operator=(const reverse_purge_hash_map& other);
   reverse_purge_hash_map& operator=(reverse_purge_hash_map&& other);
 
   template<typename FwdK>
@@ -65,6 +71,7 @@ private:
   static constexpr uint16_t DRIFT_LIMIT = 1024; // used only for stress testing
   static constexpr uint32_t MAX_SAMPLE_SIZE = 1024; // number of samples to compute approximate median during purge
 
+  E equal_;
   A allocator_;
   uint8_t lg_cur_size_;
   uint8_t lg_max_size_;
diff --git a/fi/include/reverse_purge_hash_map_impl.hpp b/fi/include/reverse_purge_hash_map_impl.hpp
index 77461de..fa2ad82 100644
--- a/fi/include/reverse_purge_hash_map_impl.hpp
+++ b/fi/include/reverse_purge_hash_map_impl.hpp
@@ -34,7 +34,9 @@ template<typename K, typename V, typename H, typename E, typename A>
 constexpr uint32_t reverse_purge_hash_map<K, V, H, E, A>::MAX_SAMPLE_SIZE;
 
 template<typename K, typename V, typename H, typename E, typename A>
-reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size, const A& allocator):
+reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size,
+    const E& equal, const A& allocator):
+equal_(equal),
 allocator_(allocator),
 lg_cur_size_(lg_cur_size),
 lg_max_size_(lg_max_size),
@@ -52,6 +54,7 @@ states_(nullptr)
 
 template<typename K, typename V, typename H, typename E, typename A>
 reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(const reverse_purge_hash_map<K, V, H, E, A>& other):
+equal_(other.equal_),
 allocator_(other.allocator_),
 lg_cur_size_(other.lg_cur_size_),
 lg_max_size_(other.lg_max_size_),
@@ -80,6 +83,7 @@ states_(nullptr)
 
 template<typename K, typename V, typename H, typename E, typename A>
 reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(reverse_purge_hash_map<K, V, H, E, A>&& other) noexcept:
+equal_(std::move(other.equal_)),
 allocator_(std::move(other.allocator_)),
 lg_cur_size_(other.lg_cur_size_),
 lg_max_size_(other.lg_max_size_),
@@ -119,19 +123,22 @@ reverse_purge_hash_map<K, V, H, E, A>::~reverse_purge_hash_map() {
 }
 
 template<typename K, typename V, typename H, typename E, typename A>
-reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A> other) {
-  std::swap(allocator_, other.allocator_);
-  std::swap(lg_cur_size_, other.lg_cur_size_);
-  std::swap(lg_max_size_, other.lg_max_size_);
-  std::swap(num_active_, other.num_active_);
-  std::swap(keys_, other.keys_);
-  std::swap(values_, other.values_);
-  std::swap(states_, other.states_);
+reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(const reverse_purge_hash_map<K, V, H, E, A>& other) {
+  reverse_purge_hash_map copy(other);
+  std::swap(equal_, copy.equal_);
+  std::swap(allocator_, copy.allocator_);
+  std::swap(lg_cur_size_, copy.lg_cur_size_);
+  std::swap(lg_max_size_, copy.lg_max_size_);
+  std::swap(num_active_, copy.num_active_);
+  std::swap(keys_, copy.keys_);
+  std::swap(values_, copy.values_);
+  std::swap(states_, copy.states_);
   return *this;
 }
 
 template<typename K, typename V, typename H, typename E, typename A>
 reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A>&& other) {
+  std::swap(equal_, other.equal_);
   std::swap(allocator_, other.allocator_);
   std::swap(lg_cur_size_, other.lg_cur_size_);
   std::swap(lg_max_size_, other.lg_max_size_);
diff --git a/fi/test/frequent_items_sketch_custom_type_test.cpp b/fi/test/frequent_items_sketch_custom_type_test.cpp
index bf12989..6d077da 100644
--- a/fi/test/frequent_items_sketch_custom_type_test.cpp
+++ b/fi/test/frequent_items_sketch_custom_type_test.cpp
@@ -31,58 +31,71 @@ using frequent_test_type_sketch = frequent_items_sketch<test_type, float, test_t
 using alloc = test_allocator<test_type>;
 
 TEST_CASE("frequent items: custom type", "[frequent_items_sketch]") {
-  frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
-  sketch.update(1, 10); // should survive the purge
-  sketch.update(2);
-  sketch.update(3);
-  sketch.update(4);
-  sketch.update(5);
-  sketch.update(6);
-  sketch.update(7);
-  test_type a8(8);
-  sketch.update(a8);
-  REQUIRE_FALSE(sketch.is_empty());
-  REQUIRE(sketch.get_total_weight() == 17);
-  REQUIRE(sketch.get_estimate(1) == 10);
+  test_allocator_total_bytes = 0;
+  {
+    frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
+    sketch.update(1, 10); // should survive the purge
+    sketch.update(2);
+    sketch.update(3);
+    sketch.update(4);
+    sketch.update(5);
+    sketch.update(6);
+    sketch.update(7);
+    test_type a8(8);
+    sketch.update(a8);
+    REQUIRE_FALSE(sketch.is_empty());
+    REQUIRE(sketch.get_total_weight() == 17);
+    REQUIRE(sketch.get_estimate(1) == 10);
 
-  auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
-  REQUIRE(items.size() == 1); // only 1 item should be above threshold
-  REQUIRE(items[0].get_item().get_value() == 1);
-  REQUIRE(items[0].get_estimate() == 10);
+    auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
+    REQUIRE(items.size() == 1); // only 1 item should be above threshold
+    REQUIRE(items[0].get_item().get_value() == 1);
+    REQUIRE(items[0].get_estimate() == 10);
 
-  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
-  sketch.serialize(s, test_type_serde());
-  auto sketch2 = frequent_test_type_sketch::deserialize(s, test_type_serde(), alloc(0));
-  REQUIRE_FALSE(sketch2.is_empty());
-  REQUIRE(sketch2.get_total_weight() == 17);
-  REQUIRE(sketch2.get_estimate(1) == 10);
-  REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
-  REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
+    std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
+    sketch.serialize(s, test_type_serde());
+    auto sketch2 = frequent_test_type_sketch::deserialize(s, test_type_serde(), test_type_equal(), 0);
+    REQUIRE_FALSE(sketch2.is_empty());
+    REQUIRE(sketch2.get_total_weight() == 17);
+    REQUIRE(sketch2.get_estimate(1) == 10);
+    REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
+    REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
 
-  auto bytes = sketch.serialize(0, test_type_serde());
-  auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), test_type_serde(), alloc(0));
-  REQUIRE_FALSE(sketch3.is_empty());
-  REQUIRE(sketch3.get_total_weight() == 17);
-  REQUIRE(sketch3.get_estimate(1) == 10);
-  REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
-  REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
+    auto bytes = sketch.serialize(0, test_type_serde());
+    auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), test_type_serde(),
+        test_type_equal(), 0);
+    REQUIRE_FALSE(sketch3.is_empty());
+    REQUIRE(sketch3.get_total_weight() == 17);
+    REQUIRE(sketch3.get_estimate(1) == 10);
+    REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
+    REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
+  }
+  REQUIRE(test_allocator_total_bytes == 0);
 }
 
 // this is to see the debug print from test_type if enabled there to make sure items are moved
 TEST_CASE("frequent items: moving merge", "[frequent_items_sketch]") {
-  frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
-  sketch1.update(1);
+  test_allocator_total_bytes = 0;
+  {
+    frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
+    sketch1.update(1);
 
-  frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
-  sketch2.update(2);
+    frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
+    sketch2.update(2);
 
-  sketch2.merge(std::move(sketch1));
-  REQUIRE(sketch2.get_total_weight() == 2);
+    sketch2.merge(std::move(sketch1));
+    REQUIRE(sketch2.get_total_weight() == 2);
+  }
+  REQUIRE(test_allocator_total_bytes == 0);
 }
 
 TEST_CASE("frequent items: negative weight", "[frequent_items_sketch]") {
-  frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
-  REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
+  test_allocator_total_bytes = 0;
+  {
+    frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
+    REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
+  }
+  REQUIRE(test_allocator_total_bytes == 0);
 }
 
 } /* namespace datasketches */
diff --git a/fi/test/reverse_purge_hash_map_test.cpp b/fi/test/reverse_purge_hash_map_test.cpp
index 55c61a2..a9ae092 100644
--- a/fi/test/reverse_purge_hash_map_test.cpp
+++ b/fi/test/reverse_purge_hash_map_test.cpp
@@ -19,25 +19,28 @@
 
 #include <catch2/catch.hpp>
 
+#include <iostream>
+
 #include <reverse_purge_hash_map.hpp>
 
 namespace datasketches {
 
 TEST_CASE("reverse purge hash map: empty", "[frequent_items_sketch]") {
-  reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
+  std::cout << "sizeof(reverse_purge_hash_map<int>)=" << sizeof(reverse_purge_hash_map<int>) << '\n';
+  reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
   REQUIRE(map.get_num_active() == 0);
   REQUIRE(map.get_lg_cur_size() == 3);  // static_cast<uint8_t>(3)
 }
 
 TEST_CASE("reverse purge hash map: one item", "[frequent_items_sketch]") {
-  reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
+  reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
   map.adjust_or_insert(1, 1);
   REQUIRE(map.get_num_active() == 1);
   REQUIRE(map.get(1) == 1);
 }
 
 TEST_CASE("reverse purge hash map: iterator", "[frequent_items_sketch]") {
-  reverse_purge_hash_map<int> map(3, 4, std::allocator<int>());
+  reverse_purge_hash_map<int> map(3, 4, std::equal_to<int>(), std::allocator<int>());
   for (int i = 0; i < 11; i++) map.adjust_or_insert(i, 1); // this should fit with no purge
   uint64_t sum = 0;
   for (auto it: map) sum += it.second;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org