You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/11/01 22:13:31 UTC
[datasketches-cpp] 01/02: equality operator instance
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch equal_to_instance
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit 687c089226f4f0f91004cffb2dbdce5e70634086
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Nov 1 14:52:52 2022 -0700
equality operator instance
---
fi/include/frequent_items_sketch.hpp | 34 ++++----
fi/include/frequent_items_sketch_impl.hpp | 14 ++--
fi/include/reverse_purge_hash_map.hpp | 13 ++-
fi/include/reverse_purge_hash_map_impl.hpp | 25 +++---
fi/test/frequent_items_sketch_custom_type_test.cpp | 93 ++++++++++++----------
fi/test/reverse_purge_hash_map_test.cpp | 9 ++-
6 files changed, 114 insertions(+), 74 deletions(-)
diff --git a/fi/include/frequent_items_sketch.hpp b/fi/include/frequent_items_sketch.hpp
index 79c8e57..d0161c5 100644
--- a/fi/include/frequent_items_sketch.hpp
+++ b/fi/include/frequent_items_sketch.hpp
@@ -59,11 +59,13 @@ public:
* @param lg_max_map_size Log2 of the physical size of the internal hash map managed by this
* sketch. The maximum capacity of this internal hash map is 0.75 times 2^lg_max_map_size.
* Both the ultimate accuracy and size of this sketch are functions of lg_max_map_size.
- *
* @param lg_start_map_size Log2 of the starting physical size of the internal hash
* map managed by this sketch.
+ * @param equal instance of Equality operator
+ * @param allocator instance of an Allocator
*/
- explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE, const A& allocator = A());
+ explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE,
+ const E& equal = E(), const A& allocator = A());
/**
* Update this sketch with an item and a positive weight (frequency count).
@@ -157,7 +159,7 @@ public:
/**
* Returns epsilon used to compute <i>a priori</i> error.
* This is just the value <i>3.5 / maxMapSize</i>.
- * @param maxMapSize the planned map size to be used when constructing this sketch.
+ * @param lg_max_map_size the planned map size to be used when constructing this sketch.
* @return epsilon used to compute <i>a priori</i> error.
*/
static double get_epsilon(uint8_t lg_max_map_size);
@@ -166,13 +168,13 @@ public:
* Returns the estimated <i>a priori</i> error given the max_map_size for the sketch and the
* estimated_total_stream_weight.
* @param lg_max_map_size the planned map size to be used when constructing this sketch.
- * @param estimated_total_stream_weight the estimated total stream weight.
+ * @param estimated_total_weight the estimated total stream weight.
* @return the estimated <i>a priori</i> error.
*/
static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight);
class row;
- typedef typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>> vector_row; // alias for users
+ using vector_row = typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>>;
/**
* Returns an array of rows that include frequent items, estimates, upper and lower bounds
@@ -224,7 +226,7 @@ public:
/**
* Computes size needed to serialize the current state of the sketch.
* This can be expensive since every item needs to be looked at.
- * @param instance of a SerDe
+ * @param serde instance of a SerDe
* @return size in bytes needed to serialize this sketch
*/
template<typename SerDe = serde<T>>
@@ -233,7 +235,7 @@ public:
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
- * @param instance of a SerDe
+ * @param serde instance of a SerDe
*/
template<typename SerDe = serde<T>>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
@@ -248,7 +250,7 @@ public:
* It is a blank space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
- * @param instance of a SerDe
+ * @param serde instance of a SerDe
* @return serialized sketch as a vector of bytes
*/
template<typename SerDe = serde<T>>
@@ -257,23 +259,27 @@ public:
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
- * @param instance of a SerDe
- * @param instance of an Allocator
+ * @param serde instance of a SerDe
+ * @param equal instance of Equality operator
+ * @param allocator instance of an Allocator
* @return an instance of the sketch
*/
template<typename SerDe = serde<T>>
- static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
+ static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
+ const E& equal = E(), const A& allocator = A());
/**
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
- * @param instance of a SerDe
- * @param instance of an Allocator
+ * @param serde instance of a SerDe
+ * @param equal instance of Equality operator
+ * @param allocator instance of an Allocator
* @return an instance of the sketch
*/
template<typename SerDe = serde<T>>
- static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
+ static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
+ const E& equal = E(), const A& allocator = A());
/**
* Returns a human readable summary of this sketch
diff --git a/fi/include/frequent_items_sketch_impl.hpp b/fi/include/frequent_items_sketch_impl.hpp
index e485da7..d72f462 100644
--- a/fi/include/frequent_items_sketch_impl.hpp
+++ b/fi/include/frequent_items_sketch_impl.hpp
@@ -34,12 +34,14 @@ template<typename T, typename W, typename H, typename E, typename A>
const uint8_t frequent_items_sketch<T, W, H, E, A>::LG_MIN_MAP_SIZE;
template<typename T, typename W, typename H, typename E, typename A>
-frequent_items_sketch<T, W, H, E, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size, const A& allocator):
+frequent_items_sketch<T, W, H, E, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size,
+ const E& equal, const A& allocator):
total_weight(0),
offset(0),
map(
std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
+ equal,
allocator
)
{
@@ -286,7 +288,8 @@ private:
template<typename T, typename W, typename H, typename E, typename A>
template<typename SerDe>
-frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
+frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(std::istream& is,
+ const SerDe& sd, const E& equal, const A& allocator) {
const auto preamble_longs = read<uint8_t>(is);
const auto serial_version = read<uint8_t>(is);
const auto family_id = read<uint8_t>(is);
@@ -302,7 +305,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
check_family_id(family_id);
check_size(lg_cur_size, lg_max_size);
- frequent_items_sketch sketch(lg_max_size, lg_cur_size, allocator);
+ frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
if (!is_empty) {
const auto num_items = read<uint32_t>(is);
read<uint32_t>(is); // unused
@@ -330,7 +333,8 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
template<typename T, typename W, typename H, typename E, typename A>
template<typename SerDe>
-frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
+frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deserialize(const void* bytes, size_t size,
+ const SerDe& sd, const E& equal, const A& allocator) {
ensure_minimum_memory(size, 8);
const char* ptr = static_cast<const char*>(bytes);
const char* base = static_cast<const char*>(bytes);
@@ -356,7 +360,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
check_size(lg_cur_size, lg_max_size);
ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
- frequent_items_sketch sketch(lg_max_size, lg_cur_size, allocator);
+ frequent_items_sketch sketch(lg_max_size, lg_cur_size, equal, allocator);
if (!is_empty) {
uint32_t num_items;
ptr += copy_from_mem(ptr, num_items);
diff --git a/fi/include/reverse_purge_hash_map.hpp b/fi/include/reverse_purge_hash_map.hpp
index 1f42191..6e2e53a 100644
--- a/fi/include/reverse_purge_hash_map.hpp
+++ b/fi/include/reverse_purge_hash_map.hpp
@@ -33,17 +33,23 @@ namespace datasketches {
* author Alexander Saydakov
*/
-template<typename K, typename V = uint64_t, typename H = std::hash<K>, typename E = std::equal_to<K>, typename A = std::allocator<K>>
+template<
+ typename K,
+ typename V = uint64_t,
+ typename H = std::hash<K>,
+ typename E = std::equal_to<K>,
+ typename A = std::allocator<K>
+>
class reverse_purge_hash_map {
public:
using AllocV = typename std::allocator_traits<A>::template rebind_alloc<V>;
using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
- reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const A& allocator);
+ reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const E& equal, const A& allocator);
reverse_purge_hash_map(const reverse_purge_hash_map& other);
reverse_purge_hash_map(reverse_purge_hash_map&& other) noexcept;
~reverse_purge_hash_map();
- reverse_purge_hash_map& operator=(reverse_purge_hash_map other);
+ reverse_purge_hash_map& operator=(const reverse_purge_hash_map& other);
reverse_purge_hash_map& operator=(reverse_purge_hash_map&& other);
template<typename FwdK>
@@ -65,6 +71,7 @@ private:
static constexpr uint16_t DRIFT_LIMIT = 1024; // used only for stress testing
static constexpr uint32_t MAX_SAMPLE_SIZE = 1024; // number of samples to compute approximate median during purge
+ E equal_;
A allocator_;
uint8_t lg_cur_size_;
uint8_t lg_max_size_;
diff --git a/fi/include/reverse_purge_hash_map_impl.hpp b/fi/include/reverse_purge_hash_map_impl.hpp
index 77461de..fa2ad82 100644
--- a/fi/include/reverse_purge_hash_map_impl.hpp
+++ b/fi/include/reverse_purge_hash_map_impl.hpp
@@ -34,7 +34,9 @@ template<typename K, typename V, typename H, typename E, typename A>
constexpr uint32_t reverse_purge_hash_map<K, V, H, E, A>::MAX_SAMPLE_SIZE;
template<typename K, typename V, typename H, typename E, typename A>
-reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size, const A& allocator):
+reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size,
+ const E& equal, const A& allocator):
+equal_(equal),
allocator_(allocator),
lg_cur_size_(lg_cur_size),
lg_max_size_(lg_max_size),
@@ -52,6 +54,7 @@ states_(nullptr)
template<typename K, typename V, typename H, typename E, typename A>
reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(const reverse_purge_hash_map<K, V, H, E, A>& other):
+equal_(other.equal_),
allocator_(other.allocator_),
lg_cur_size_(other.lg_cur_size_),
lg_max_size_(other.lg_max_size_),
@@ -80,6 +83,7 @@ states_(nullptr)
template<typename K, typename V, typename H, typename E, typename A>
reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(reverse_purge_hash_map<K, V, H, E, A>&& other) noexcept:
+equal_(std::move(other.equal_)),
allocator_(std::move(other.allocator_)),
lg_cur_size_(other.lg_cur_size_),
lg_max_size_(other.lg_max_size_),
@@ -119,19 +123,22 @@ reverse_purge_hash_map<K, V, H, E, A>::~reverse_purge_hash_map() {
}
template<typename K, typename V, typename H, typename E, typename A>
-reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A> other) {
- std::swap(allocator_, other.allocator_);
- std::swap(lg_cur_size_, other.lg_cur_size_);
- std::swap(lg_max_size_, other.lg_max_size_);
- std::swap(num_active_, other.num_active_);
- std::swap(keys_, other.keys_);
- std::swap(values_, other.values_);
- std::swap(states_, other.states_);
+reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(const reverse_purge_hash_map<K, V, H, E, A>& other) {
+ reverse_purge_hash_map copy(other);
+ std::swap(equal_, copy.equal_);
+ std::swap(allocator_, copy.allocator_);
+ std::swap(lg_cur_size_, copy.lg_cur_size_);
+ std::swap(lg_max_size_, copy.lg_max_size_);
+ std::swap(num_active_, copy.num_active_);
+ std::swap(keys_, copy.keys_);
+ std::swap(values_, copy.values_);
+ std::swap(states_, copy.states_);
return *this;
}
template<typename K, typename V, typename H, typename E, typename A>
reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A>&& other) {
+ std::swap(equal_, other.equal_);
std::swap(allocator_, other.allocator_);
std::swap(lg_cur_size_, other.lg_cur_size_);
std::swap(lg_max_size_, other.lg_max_size_);
diff --git a/fi/test/frequent_items_sketch_custom_type_test.cpp b/fi/test/frequent_items_sketch_custom_type_test.cpp
index bf12989..6d077da 100644
--- a/fi/test/frequent_items_sketch_custom_type_test.cpp
+++ b/fi/test/frequent_items_sketch_custom_type_test.cpp
@@ -31,58 +31,71 @@ using frequent_test_type_sketch = frequent_items_sketch<test_type, float, test_t
using alloc = test_allocator<test_type>;
TEST_CASE("frequent items: custom type", "[frequent_items_sketch]") {
- frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
- sketch.update(1, 10); // should survive the purge
- sketch.update(2);
- sketch.update(3);
- sketch.update(4);
- sketch.update(5);
- sketch.update(6);
- sketch.update(7);
- test_type a8(8);
- sketch.update(a8);
- REQUIRE_FALSE(sketch.is_empty());
- REQUIRE(sketch.get_total_weight() == 17);
- REQUIRE(sketch.get_estimate(1) == 10);
+ test_allocator_total_bytes = 0;
+ {
+ frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
+ sketch.update(1, 10); // should survive the purge
+ sketch.update(2);
+ sketch.update(3);
+ sketch.update(4);
+ sketch.update(5);
+ sketch.update(6);
+ sketch.update(7);
+ test_type a8(8);
+ sketch.update(a8);
+ REQUIRE_FALSE(sketch.is_empty());
+ REQUIRE(sketch.get_total_weight() == 17);
+ REQUIRE(sketch.get_estimate(1) == 10);
- auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
- REQUIRE(items.size() == 1); // only 1 item should be above threshold
- REQUIRE(items[0].get_item().get_value() == 1);
- REQUIRE(items[0].get_estimate() == 10);
+ auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
+ REQUIRE(items.size() == 1); // only 1 item should be above threshold
+ REQUIRE(items[0].get_item().get_value() == 1);
+ REQUIRE(items[0].get_estimate() == 10);
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
- sketch.serialize(s, test_type_serde());
- auto sketch2 = frequent_test_type_sketch::deserialize(s, test_type_serde(), alloc(0));
- REQUIRE_FALSE(sketch2.is_empty());
- REQUIRE(sketch2.get_total_weight() == 17);
- REQUIRE(sketch2.get_estimate(1) == 10);
- REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
- REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
+ sketch.serialize(s, test_type_serde());
+ auto sketch2 = frequent_test_type_sketch::deserialize(s, test_type_serde(), test_type_equal(), 0);
+ REQUIRE_FALSE(sketch2.is_empty());
+ REQUIRE(sketch2.get_total_weight() == 17);
+ REQUIRE(sketch2.get_estimate(1) == 10);
+ REQUIRE(sketch.get_num_active_items() == sketch2.get_num_active_items());
+ REQUIRE(sketch.get_maximum_error() == sketch2.get_maximum_error());
- auto bytes = sketch.serialize(0, test_type_serde());
- auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), test_type_serde(), alloc(0));
- REQUIRE_FALSE(sketch3.is_empty());
- REQUIRE(sketch3.get_total_weight() == 17);
- REQUIRE(sketch3.get_estimate(1) == 10);
- REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
- REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
+ auto bytes = sketch.serialize(0, test_type_serde());
+ auto sketch3 = frequent_test_type_sketch::deserialize(bytes.data(), bytes.size(), test_type_serde(),
+ test_type_equal(), 0);
+ REQUIRE_FALSE(sketch3.is_empty());
+ REQUIRE(sketch3.get_total_weight() == 17);
+ REQUIRE(sketch3.get_estimate(1) == 10);
+ REQUIRE(sketch.get_num_active_items() == sketch3.get_num_active_items());
+ REQUIRE(sketch.get_maximum_error() == sketch3.get_maximum_error());
+ }
+ REQUIRE(test_allocator_total_bytes == 0);
}
// this is to see the debug print from test_type if enabled there to make sure items are moved
TEST_CASE("frequent items: moving merge", "[frequent_items_sketch]") {
- frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
- sketch1.update(1);
+ test_allocator_total_bytes = 0;
+ {
+ frequent_test_type_sketch sketch1(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
+ sketch1.update(1);
- frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
- sketch2.update(2);
+ frequent_test_type_sketch sketch2(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
+ sketch2.update(2);
- sketch2.merge(std::move(sketch1));
- REQUIRE(sketch2.get_total_weight() == 2);
+ sketch2.merge(std::move(sketch1));
+ REQUIRE(sketch2.get_total_weight() == 2);
+ }
+ REQUIRE(test_allocator_total_bytes == 0);
}
TEST_CASE("frequent items: negative weight", "[frequent_items_sketch]") {
- frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, 0);
- REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
+ test_allocator_total_bytes = 0;
+ {
+ frequent_test_type_sketch sketch(3, frequent_test_type_sketch::LG_MIN_MAP_SIZE, test_type_equal(), 0);
+ REQUIRE_THROWS_AS(sketch.update(1, -1), std::invalid_argument);
+ }
+ REQUIRE(test_allocator_total_bytes == 0);
}
} /* namespace datasketches */
diff --git a/fi/test/reverse_purge_hash_map_test.cpp b/fi/test/reverse_purge_hash_map_test.cpp
index 55c61a2..a9ae092 100644
--- a/fi/test/reverse_purge_hash_map_test.cpp
+++ b/fi/test/reverse_purge_hash_map_test.cpp
@@ -19,25 +19,28 @@
#include <catch2/catch.hpp>
+#include <iostream>
+
#include <reverse_purge_hash_map.hpp>
namespace datasketches {
TEST_CASE("reverse purge hash map: empty", "[frequent_items_sketch]") {
- reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
+ std::cout << "sizeof(reverse_purge_hash_map<int>)=" << sizeof(reverse_purge_hash_map<int>) << '\n';
+ reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
REQUIRE(map.get_num_active() == 0);
REQUIRE(map.get_lg_cur_size() == 3); // static_cast<uint8_t>(3)
}
TEST_CASE("reverse purge hash map: one item", "[frequent_items_sketch]") {
- reverse_purge_hash_map<int> map(3, 3, std::allocator<int>());
+ reverse_purge_hash_map<int> map(3, 3, std::equal_to<int>(), std::allocator<int>());
map.adjust_or_insert(1, 1);
REQUIRE(map.get_num_active() == 1);
REQUIRE(map.get(1) == 1);
}
TEST_CASE("reverse purge hash map: iterator", "[frequent_items_sketch]") {
- reverse_purge_hash_map<int> map(3, 4, std::allocator<int>());
+ reverse_purge_hash_map<int> map(3, 4, std::equal_to<int>(), std::allocator<int>());
for (int i = 0; i < 11; i++) map.adjust_or_insert(i, 1); // this should fit with no purge
uint64_t sum = 0;
for (auto it: map) sum += it.second;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org