You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2023/06/29 04:39:04 UTC
[datasketches-cpp] 01/01: documentation fixes
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch doxygen
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit ef01d71b973eeb97c267ec5bab935079dfd10d05
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Wed Jun 28 21:38:52 2023 -0700
documentation fixes
---
common/include/common_defs.hpp | 1 +
common/include/kolmogorov_smirnov.hpp | 15 +-
common/include/quantiles_sorted_view.hpp | 92 ++++++++++++
common/include/serde.hpp | 87 ++++++++---
count/include/count_min.hpp | 166 ++++++++++++++-------
count/include/count_min_impl.hpp | 16 --
cpc/include/cpc_sketch.hpp | 66 ++++----
cpc/include/cpc_union.hpp | 37 ++++-
density/include/density_sketch.hpp | 38 +++--
fi/include/frequent_items_sketch.hpp | 22 ++-
hll/include/HllSketch-internal.hpp | 4 +-
hll/include/hll.hpp | 110 ++++++++------
kll/include/kll_sketch.hpp | 38 ++++-
quantiles/include/quantiles_sketch.hpp | 80 ++++++++--
req/include/req_common.hpp | 10 +-
req/include/req_sketch.hpp | 99 +++++++++++-
sampling/include/var_opt_sketch.hpp | 92 ++++++++----
sampling/include/var_opt_sketch_impl.hpp | 6 +-
sampling/include/var_opt_union.hpp | 14 +-
theta/include/bounds_on_ratios_in_sampled_sets.hpp | 1 +
.../bounds_on_ratios_in_theta_sketched_sets.hpp | 13 +-
theta/include/theta_a_not_b.hpp | 9 ++
theta/include/theta_intersection.hpp | 6 +-
theta/include/theta_intersection_impl.hpp | 6 +-
theta/include/theta_jaccard_similarity.hpp | 3 +-
theta/include/theta_jaccard_similarity_base.hpp | 1 +
theta/include/theta_sketch.hpp | 56 ++++++-
theta/include/theta_union.hpp | 5 +
theta/include/theta_union_impl.hpp | 6 +-
theta/include/theta_update_sketch_base.hpp | 3 +-
tuple/include/array_of_doubles_sketch.hpp | 13 +-
tuple/include/tuple_intersection.hpp | 10 ++
tuple/include/tuple_jaccard_similarity.hpp | 1 +
tuple/include/tuple_sketch.hpp | 83 ++++++++---
tuple/include/tuple_union.hpp | 8 +
tuple/test/tuple_sketch_test.cpp | 2 +-
36 files changed, 904 insertions(+), 315 deletions(-)
diff --git a/common/include/common_defs.hpp b/common/include/common_defs.hpp
index fc80fe0..280ad7a 100644
--- a/common/include/common_defs.hpp
+++ b/common/include/common_defs.hpp
@@ -28,6 +28,7 @@
#include <chrono>
#include <thread>
+/// DataSketches namespace
namespace datasketches {
static const uint64_t DEFAULT_SEED = 9001;
diff --git a/common/include/kolmogorov_smirnov.hpp b/common/include/kolmogorov_smirnov.hpp
index d00853d..90f226c 100644
--- a/common/include/kolmogorov_smirnov.hpp
+++ b/common/include/kolmogorov_smirnov.hpp
@@ -22,13 +22,16 @@
namespace datasketches {
+/**
+ * Kolmogorov-Smirnov test for KLL or Quantiles sketches
+ */
class kolmogorov_smirnov {
public:
/**
* Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
- * @param sketch1 KLL sketch 1
- * @param sketch2 KLL sketch 2
+ * @param sketch1 sketch 1
+ * @param sketch2 sketch 2
* @return the raw delta between two KLL quantile sketches
*/
template<typename Sketch>
@@ -39,8 +42,8 @@ public:
* Adjusts the computed threshold by the error epsilons of the two given sketches.
* See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
- * @param sketch1 KLL sketch 1
- * @param sketch2 KLL sketch 2
+ * @param sketch1 sketch 1
+ * @param sketch2 sketch 2
* @param p Target p-value. Typically .001 to .1, e.g., .05.
* @return the adjusted threshold to be compared with the raw delta
*/
@@ -52,8 +55,8 @@ public:
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* Note: if the given sketches have insufficient data or if the sketch sizes are too small,
* this will return false.
- * @param sketch1 KLL sketch 1
- * @param sketch2 KLL sketch 2
+ * @param sketch1 sketch 1
+ * @param sketch2 sketch 2
* @param p Target p-value. Typically .001 to .1, e.g., .05.
* @return Boolean indicating whether we can reject the null hypothesis (that the sketches
* reflect the same underlying distribution) using the provided p-value.
diff --git a/common/include/quantiles_sorted_view.hpp b/common/include/quantiles_sorted_view.hpp
index e965cb1..465f186 100755
--- a/common/include/quantiles_sorted_view.hpp
+++ b/common/include/quantiles_sorted_view.hpp
@@ -27,6 +27,9 @@
namespace datasketches {
+/**
+ * Sorted view for quantiles sketches (REQ, KLL and Quantiles)
+ */
template<
typename T,
typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
@@ -34,30 +37,119 @@ template<
>
class quantiles_sorted_view {
public:
+ /// Entry type
using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
using Container = std::vector<Entry, AllocEntry>;
+ /// @private
quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator);
+ /// @private
template<typename Iterator>
void add(Iterator begin, Iterator end, uint64_t weight);
+ /// @private
void convert_to_cummulative();
class const_iterator;
+
+ /**
+ * Iterator pointing to the first entry in the view.
+ * If the view is empty, the returned iterator must not be dereferenced or incremented.
+ * @return iterator pointing to the first entry
+ */
const_iterator begin() const;
+
+ /**
+ * Iterator pointing to the past-the-end entry in the view.
+ * The past-the-end entry is the hypothetical entry that would follow the last entry.
+ * It does not point to any entry, and must not be dereferenced or incremented.
+ * @return iterator pointing to the past-the-end entry
+ */
const_iterator end() const;
+ /// @return size of the view
size_t size() const;
+ /**
+ * Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
+ *
+ * <p>If the view is empty this throws std::runtime_error.
+ *
+ * @param item to be ranked
+ * @param inclusive if true the weight of the given item is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
+ * according to the Comparator.
+ *
+ * @return an approximate rank of the given item
+ */
double get_rank(const T& item, bool inclusive = true) const;
+ /**
+ * Quantile return type.
+ * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
+ */
using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
+
+ /**
+ * Returns an item from the sketch that is the best approximation to an item
+ * from the original stream with the given rank.
+ *
+ * <p>If the view is empty this throws std::runtime_error.
+ *
+ * @param rank of an item in the hypothetical sorted stream.
+ * @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
+ *
+ * @return approximate quantile associated with the given rank
+ */
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
+
+ /**
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+ * cumulative analog of the PMF, of the input stream given a set of split points (items).
+ *
+ * <p>If the view is empty this throws std::runtime_error.
+ *
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
+ *
+ * @param size the number of split points in the array
+ *
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
+ * if the sketch contains items equal to a slit point, then in CDF such items are
+ * included into the interval to the left of split point. Otherwise they are included into
+ * the interval to the right of split point.
+ *
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
+ * of the input stream given the split_points. The value at array position j of the returned
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
+ * array. This can be viewed as array of ranks of the given split points plus one more value
+ * that is always 1.
+ */
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
+
+ /**
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+ * given a set of split points (items).
+ *
+ * <p>If the view is empty this throws std::runtime_error.
+ *
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
+ *
+ * @param size the number of split points in the array
+ *
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
+ * if the sketch contains items equal to a slit point, then in PMF such items are
+ * included into the interval to the left of split point. Otherwise they are included into the interval
+ * to the right of split point.
+ *
+ * @return an array of m+1 doubles each of which is an approximation
+ * to the fraction of the input stream items (the mass) that fall into one of those intervals.
+ */
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
private:
diff --git a/common/include/serde.hpp b/common/include/serde.hpp
index 9b3349b..5cc55a8 100644
--- a/common/include/serde.hpp
+++ b/common/include/serde.hpp
@@ -30,23 +30,56 @@
namespace datasketches {
-// serialize and deserialize
+/// Interface for serializing and deserializing items
template<typename T, typename Enable = void> struct serde {
- // stream serialization
+ /**
+ * Stream serialization
+ * @param os output stream
+ * @param items pointer to array of items
+ * @param num number of items
+ */
void serialize(std::ostream& os, const T* items, unsigned num) const;
+
+ /**
+ * Stream deserialization
+ * @param is input stream
+ * @param items pointer to array of items
+ * @param num number of items
+ */
void deserialize(std::istream& is, T* items, unsigned num) const; // items allocated but not initialized
- // raw bytes serialization
- size_t size_of_item(const T& item) const;
+ /**
+ * Raw bytes serialization
+ * @param ptr pointer to output buffer
+ * @param capacity size of the buffer in bytes
+ * @param items pointer to array of items
+ * @param num number of items
+ */
size_t serialize(void* ptr, size_t capacity, const T* items, unsigned num) const;
- size_t deserialize(const void* ptr, size_t capacity, T* items, unsigned num) const; // items allocated but not initialized
+
+ /**
+ * Raw bytes deserialization
+ * @param ptr pointer to input buffer
+ * @param capacity size of the buffer in bytes
+ * @param items pointer to array of items (items in the array are allocated but not initialized)
+ * @param num number of items
+ */
+ size_t deserialize(const void* ptr, size_t capacity, T* items, unsigned num) const;
+
+ /**
+ * Size of the given item
+ * @param item to be sized
+ * @return size of the given item in bytes
+ */
+ size_t size_of_item(const T& item) const;
};
-// serde for all fixed-size arithmetic types (int and float of different sizes)
-// in particular, kll_sketch<int64_t> should produce sketches binary-compatible
-// with LongsSketch and ItemsSketch<Long> with ArrayOfLongsSerDe in Java
+/// serde for all fixed-size arithmetic types (int and float of different sizes).
+/// in particular, kll_sketch<int64_t> should produce sketches binary-compatible
+/// with LongsSketch and ItemsSketch<Long> with ArrayOfLongsSerDe in Java
template<typename T>
struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
+ /// @copydoc serde::serialize
void serialize(std::ostream& os, const T* items, unsigned num) const {
bool failure = false;
try {
@@ -58,6 +91,7 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
throw std::runtime_error("error writing to std::ostream with " + std::to_string(num) + " items");
}
}
+
void deserialize(std::istream& is, T* items, unsigned num) const {
bool failure = false;
try {
@@ -70,30 +104,37 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
}
}
- size_t size_of_item(const T&) const {
- return sizeof(T);
- }
+ /// @copydoc serde::serialize(void*,size_t,const T*,unsigned) const
size_t serialize(void* ptr, size_t capacity, const T* items, unsigned num) const {
const size_t bytes_written = sizeof(T) * num;
check_memory_size(bytes_written, capacity);
memcpy(ptr, items, bytes_written);
return bytes_written;
}
+
+ /// @copydoc serde::deserialize(const void*,size_t,T*,unsigned) const
size_t deserialize(const void* ptr, size_t capacity, T* items, unsigned num) const {
const size_t bytes_read = sizeof(T) * num;
check_memory_size(bytes_read, capacity);
memcpy(items, ptr, bytes_read);
return bytes_read;
}
+
+ /// @copydoc serde::size_of_item
+ size_t size_of_item(const T& item) const {
+ unused(item);
+ return sizeof(T);
+ }
};
-// serde for std::string items
-// This should produce sketches binary-compatible with
-// ItemsSketch<String> with ArrayOfStringsSerDe in Java.
-// The length of each string is stored as a 32-bit integer (historically),
-// which may be too wasteful. Treat this as an example.
+/// serde for std::string items.
+/// This should produce sketches binary-compatible with
+/// ItemsSketch<String> with ArrayOfStringsSerDe in Java.
+/// The length of each string is stored as a 32-bit integer (historically),
+/// which may be too wasteful. Treat this as an example.
template<>
struct serde<std::string> {
+ /// @copydoc serde::serialize
void serialize(std::ostream& os, const std::string* items, unsigned num) const {
unsigned i = 0;
bool failure = false;
@@ -110,6 +151,8 @@ struct serde<std::string> {
throw std::runtime_error("error writing to std::ostream at item " + std::to_string(i));
}
}
+
+ /// @copydoc serde::deserialize
void deserialize(std::istream& is, std::string* items, unsigned num) const {
unsigned i = 0;
bool failure = false;
@@ -137,9 +180,8 @@ struct serde<std::string> {
throw std::runtime_error("error reading from std::istream at item " + std::to_string(i));
}
}
- size_t size_of_item(const std::string& item) const {
- return sizeof(uint32_t) + item.size();
- }
+
+ /// @copydoc serde::serialize(void*,size_t,const T*,unsigned) const
size_t serialize(void* ptr, size_t capacity, const std::string* items, unsigned num) const {
size_t bytes_written = 0;
for (unsigned i = 0; i < num; ++i) {
@@ -154,6 +196,8 @@ struct serde<std::string> {
}
return bytes_written;
}
+
+ /// @copydoc serde::deserialize(const void*,size_t,T*,unsigned) const
size_t deserialize(const void* ptr, size_t capacity, std::string* items, unsigned num) const {
size_t bytes_read = 0;
unsigned i = 0;
@@ -189,6 +233,11 @@ struct serde<std::string> {
return bytes_read;
}
+
+ /// @copydoc serde::size_of_item
+ size_t size_of_item(const std::string& item) const {
+ return sizeof(uint32_t) + item.size();
+ }
};
} /* namespace datasketches */
diff --git a/count/include/count_min.hpp b/count/include/count_min.hpp
index b394b75..7884653 100644
--- a/count/include/count_min.hpp
+++ b/count/include/count_min.hpp
@@ -25,26 +25,27 @@
namespace datasketches {
- /*
- * C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
- * [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
- * The template type W is the type of the vector that contains the weights of the objects inserted into the sketch,
- * not the type of the input items themselves.
- * @author Charlie Dickens
- */
-
+/**
+ * C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
+ * [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
+ * The template type W is the type of the vector that contains the weights of the objects inserted into the sketch,
+ * not the type of the input items themselves.
+ * @author Charlie Dickens
+ */
template <typename W,
typename Allocator = std::allocator<W>>
class count_min_sketch{
static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
public:
using allocator_type = Allocator;
+ using const_iterator = typename std::vector<W, Allocator>::const_iterator;
/**
* Creates an instance of the sketch given parameters _num_hashes, _num_buckets and hash seed, `seed`.
* @param num_hashes : number of hash functions in the sketch. Equivalently the number of rows in the array
* @param num_buckets : number of buckets that hash functions map into. Equivalently the number of columns in the array
* @param seed for hash function
+ * @param allocator to use by this instance
*
* The items inserted into the sketch can be arbitrary type, so long as they are hashable via murmurhash.
* Only update and estimate methods are added for uint64_t and string types.
@@ -79,21 +80,23 @@ public:
*/
W get_total_weight() const;
- /*
- * @param relative_error : double -- the desired accuracy within which estimates should lie.
+ /**
+ * Suggests the number of buckets required to achieve the given relative error
+ * @param relative_error the desired accuracy within which estimates should lie.
* For example, when relative_error = 0.05, then the returned frequency estimates satisfy the
* `relative_error` guarantee that never overestimates the weights but may underestimate the weights
* by 5% of the total weight in the sketch.
- * @return number_of_buckets : the number of hash buckets at every level of the
+ * @return the number of hash buckets at every level of the
* sketch required in order to obtain the specified relative error.
* [1] - Section 3 ``Data Structure'', page 6.
*/
static uint32_t suggest_num_buckets(double relative_error);
- /*
- * @param confidence : double -- the desired confidence with which estimates should be correct.
+ /**
+ * Suggests the number of hash functions required to achieve the given confidence
+ * @param confidence the desired confidence with which estimates should be correct.
* For example, with 95% confidence, frequency estimates satisfy the `relative_error` guarantee.
- * @return number_of_hashes : the number of hash functions that are required in
+ * @return the number of hash functions that are required in
* order to achieve the specified confidence of the sketch.
* confidence = 1 - delta, with delta denoting the sketch failure probability in the literature.
* [1] - Section 3 ``Data Structure'', page 6.
@@ -111,7 +114,7 @@ public:
/**
* Specific get_estimate function for int64_t type
* see generic get_estimate function
- * @param item : uint64_t type.
+ * @param item : int64_t type.
* @return an estimate of the item's frequency.
*/
W get_estimate(int64_t item) const;
@@ -127,8 +130,8 @@ public:
/**
* This is the generic estimate query function for any of the given datatypes.
* Query the sketch for the estimate of a given item.
- * @param item : pointer to the data item to be query from the sketch.
- * @param size : size_t
+ * @param item pointer to the data item to be query from the sketch.
+ * @param size size of the item in bytes
* @return the estimated frequency of the item denoted f_est satisfying
* f_true - relative_error*_total_weight <= f_est <= f_true
*/
@@ -136,60 +139,106 @@ public:
/**
* Query the sketch for the upper bound of a given item.
- * @param item : uint64_t or std::string to query
+ * @param item to query
+ * @param size of the item in bytes
* @return the upper bound on the true frequency of the item
* f_true <= f_est + relative_error*_total_weight
*/
W get_upper_bound(const void* item, size_t size) const;
- W get_upper_bound(int64_t) const;
- W get_upper_bound(uint64_t) const;
+
+ /**
+ * Query the sketch for the upper bound of a given item.
+ * @param item to query
+ * @return the upper bound on the true frequency of the item
+ * f_true <= f_est + relative_error*_total_weight
+ */
+ W get_upper_bound(int64_t item) const;
+
+ /**
+ * Query the sketch for the upper bound of a given item.
+ * @param item to query
+ * @return the upper bound on the true frequency of the item
+ * f_true <= f_est + relative_error*_total_weight
+ */
+ W get_upper_bound(uint64_t item) const;
+
+ /**
+ * Query the sketch for the upper bound of a given item.
+ * @param item to query
+ * @return the upper bound on the true frequency of the item
+ * f_true <= f_est + relative_error*_total_weight
+ */
W get_upper_bound(const std::string& item) const;
/**
* Query the sketch for the lower bound of a given item.
- * @param item : uint64_t or std::string to query
+ * @param item to query
+ * @param size of the item in bytes
* @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
* f_true - relative_error*_total_weight <= f_est
*/
W get_lower_bound(const void* item, size_t size) const;
- W get_lower_bound(int64_t) const;
- W get_lower_bound(uint64_t) const;
+
+ /**
+ * Query the sketch for the lower bound of a given item.
+ * @param item to query
+ * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
+ * f_true - relative_error*_total_weight <= f_est
+ */
+ W get_lower_bound(int64_t item) const;
+
+ /**
+ * Query the sketch for the lower bound of a given item.
+ * @param item to query
+ * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
+ * f_true - relative_error*_total_weight <= f_est
+ */
+ W get_lower_bound(uint64_t item) const;
+
+ /**
+ * Query the sketch for the lower bound of a given item.
+ * @param item to query
+ * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
+ * f_true - relative_error*_total_weight <= f_est
+ */
W get_lower_bound(const std::string& item) const;
- /*
+ /**
* Update this sketch with given data of any type.
- * This is a "universal" update that covers all cases above,
- * but may produce different hashes.
+ * This is a "universal" update that covers all cases,
+ * but may produce different hashes compared to specialized update methods.
* @param item pointer to the data item to be inserted into the sketch.
* @param size of the data in bytes
- * @return vector of uint64_t which each represent the index to which `value' must update in the sketch
+ * @param weight arithmetic type
*/
void update(const void* item, size_t size, W weight);
/**
- * Update this sketch with a given uint64_t item.
- * @param item : uint64_t to update the sketch with
- * @param weight : arithmetic type
- * void function which inserts an item of type uint64_t into the sketch
+ * Update this sketch with a given item.
+ * @param item to update the sketch with
+ * @param weight arithmetic type
*/
- void update(uint64_t item, W weight);
- void update(uint64_t item);
- void update(int64_t item, W weight);
- void update(int64_t item);
+ void update(uint64_t item, W weight = 1);
+
+ /**
+ * Update this sketch with a given item.
+ * @param item to update the sketch with
+ * @param weight arithmetic type
+ */
+ void update(int64_t item, W weight = 1);
/**
* Update this sketch with a given string.
- * @param item : string to update the sketch with
- * @param weight : arithmetic type
- * void function which inserts an item of type std::string into the sketch
+ * @param item string to update the sketch with
+ * @param weight arithmetic type
*/
- void update(const std::string& item, W weight);
- void update(const std::string& item);
+ void update(const std::string& item, W weight = 1);
- /*
- * merges a separate count_min_sketch into this count_min_sketch.
+ /**
+ * Merges another count_min_sketch into this count_min_sketch.
+ * @param other_sketch
*/
- void merge(const count_min_sketch &other_sketch);
+ void merge(const count_min_sketch& other_sketch);
/**
* Returns true if this sketch is empty.
@@ -205,15 +254,23 @@ public:
*/
string<Allocator> to_string() const;
- // Iterators
- using const_iterator = typename std::vector<W, Allocator>::const_iterator;
+ /**
+ * Iterator pointing to the first item in the sketch.
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
+ * @return iterator pointing to the first item in the sketch
+ */
const_iterator begin() const;
- const_iterator end() const;
/**
- * This method serializes the sketch into a given stream in a binary form
- * @param os output stream
- * The byte output has the following structure
+ * Iterator pointing to the past-the-end item in the sketch.
+ * The past-the-end item is the hypothetical item that would follow the last item.
+ * It does not point to any item, and must not be dereferenced or incremented.
+ * @return iterator pointing to the past-the-end item in the sketch
+ */
+ const_iterator end() const;
+
+ /*
+ * The serialized sketch binary form has the following structure
* Byte 0:
* 1 - if and only if the sketch is empty
* 0 - otherwise
@@ -254,8 +311,6 @@ public:
||---------------------------- sketch entries ---------------------------|
...
- *
- *
*/
@@ -266,7 +321,8 @@ public:
size_t get_serialized_size_bytes() const;
/**
- * This method serializes a binary image of the sketch to an output stream.
+ * This method serializes the sketch into a given stream in a binary form
+ * @param os output stream
*/
void serialize(std::ostream& os) const;
@@ -287,6 +343,7 @@ public:
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
+ * @param allocator instance of an Allocator
* @return an instance of a sketch
*/
static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
@@ -296,12 +353,12 @@ public:
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param seed the seed for the hash function that was used to create the sketch
+ * @param allocator instance of an Allocator
* @return an instance of the sketch
*/
static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
- * Returns the allocator for this sketch.
* @return allocator
*/
allocator_type get_allocator() const;
@@ -331,9 +388,6 @@ private:
*/
static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte);
-
-
-
/*
* Obtain the hash values when inserting an item into the sketch.
* @param item pointer to the data item to be inserted into the sketch.
diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp
index 568debc..45376e7 100644
--- a/count/include/count_min_impl.hpp
+++ b/count/include/count_min_impl.hpp
@@ -169,33 +169,17 @@ void count_min_sketch<W,A>::update(uint64_t item, W weight) {
update(&item, sizeof(item), weight);
}
-template<typename W, typename A>
-void count_min_sketch<W,A>::update(uint64_t item) {
- update(&item, sizeof(item), 1);
-}
-
template<typename W, typename A>
void count_min_sketch<W,A>::update(int64_t item, W weight) {
update(&item, sizeof(item), weight);
}
-template<typename W, typename A>
-void count_min_sketch<W,A>::update(int64_t item) {
- update(&item, sizeof(item), 1);
-}
-
template<typename W, typename A>
void count_min_sketch<W,A>::update(const std::string& item, W weight) {
if (item.empty()) return;
update(item.c_str(), item.length(), weight);
}
-template<typename W, typename A>
-void count_min_sketch<W,A>::update(const std::string& item) {
- if (item.empty()) return;
- update(item.c_str(), item.length(), 1);
-}
-
template<typename W, typename A>
void count_min_sketch<W,A>::update(const void* item, size_t size, W weight) {
/*
diff --git a/cpc/include/cpc_sketch.hpp b/cpc/include/cpc_sketch.hpp
index c000ac9..164c33a 100644
--- a/cpc/include/cpc_sketch.hpp
+++ b/cpc/include/cpc_sketch.hpp
@@ -33,58 +33,56 @@
namespace datasketches {
-/*
- * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch
- *
- * This is a very compact (in serialized form) distinct counting sketch.
- * The theory is described in the following paper:
- * https://arxiv.org/abs/1708.06839
- *
- * author Kevin Lang
- * author Alexander Saydakov
- */
-
// forward-declarations
template<typename A> class cpc_sketch_alloc;
template<typename A> class cpc_union_alloc;
-// alias with default allocator for convenience
+/// CPC sketch alias with default allocator
using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
-// allocation and initialization of global decompression (decoding) tables
-// call this before anything else if you want to control the initialization time
-// for instance, to have this happen outside of a transaction context
-// otherwise initialization happens on the first use (serialization or deserialization)
-// it is safe to call more than once assuming no race conditions
-// this is not thread safe! neither is the rest of the library
+/**
+ * Allocation and initialization of global decompression (decoding) tables.
+ * Call this before anything else if you want to control the initialization time.
+ * For instance, to have this happen outside of a transaction context.
+ * Otherwise initialization happens on the first use (serialization or deserialization).
+ * It is safe to call more than once assuming no race conditions.
+ * This is not thread safe! Neither is the rest of the library.
+ */
template<typename A> void cpc_init();
+/**
+ * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch
+ *
+ * This is a very compact (in serialized form) distinct counting sketch.
+ * The theory is described in the following paper:
+ * https://arxiv.org/abs/1708.06839
+ *
+ * @author Kevin Lang
+ * @author Alexander Saydakov
+ */
template<typename A>
class cpc_sketch_alloc {
public:
+ using allocator_type = A;
+
/**
* Creates an instance of the sketch given the lg_k parameter and hash seed.
* @param lg_k base 2 logarithm of the number of bins in the sketch
* @param seed for hash function
+ * @param allocator instance of an allocator
*/
explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
- using allocator_type = A;
+ /// @return allocator
A get_allocator() const;
- /**
- * @return configured lg_k of this sketch
- */
+ /// @return configured lg_k of this sketch
uint8_t get_lg_k() const;
- /**
- * @return true if this sketch represents an empty set
- */
+ /// @return true if this sketch represents an empty set
bool is_empty() const;
- /**
- * @return estimate of the distinct count of the input stream
- */
+ /// @return estimate of the distinct count of the input stream
double get_estimate() const;
/**
@@ -189,13 +187,14 @@ public:
* Otherwise two sketches that should represent overlapping sets will be disjoint
* For instance, for signed 32-bit values call update(int32_t) method above,
* which does widening conversion to int64_t, if compatibility with Java is expected
- * @param data pointer to the data
- * @param length of the data in bytes
+ * @param value pointer to the data
+ * @param size of the data in bytes
*/
void update(const void* value, size_t size);
/**
* Returns a human-readable summary of this sketch
+ * @return a human-readable summary of this sketch
*/
string<A> to_string() const;
@@ -215,6 +214,7 @@ public:
* It is an uninitialized space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
+ * @return serialized sketch as a vector of bytes
*/
vector_bytes serialize(unsigned header_size_bytes = 0) const;
@@ -222,6 +222,7 @@ public:
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
+ * @param allocator instance of an Allocator
* @return an instance of a sketch
*/
static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
@@ -231,6 +232,7 @@ public:
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param seed the seed for the hash function that was used to create the sketch
+ * @param allocator instance of an Allocator
* @return an instance of the sketch
*/
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
@@ -246,10 +248,10 @@ public:
*/
static size_t get_max_serialized_size_bytes(uint8_t lg_k);
- // for internal use
+ /// @private for internal use
uint32_t get_num_coupons() const;
- // for debugging
+ /// @private for debugging
// this should catch some forms of corruption during serialization-deserialization
bool validate() const;
diff --git a/cpc/include/cpc_union.hpp b/cpc/include/cpc_union.hpp
index 54fbed5..f380fb7 100644
--- a/cpc/include/cpc_union.hpp
+++ b/cpc/include/cpc_union.hpp
@@ -27,16 +27,15 @@
namespace datasketches {
-/*
+/// CPC union alias with default allocator
+using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>;
+
+/**
* High performance C++ implementation of Compressed Probabilistic Counting (CPC) Union
*
* author Kevin Lang
* author Alexander Saydakov
*/
-
-// alias with default allocator for convenience
-using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>;
-
template<typename A>
class cpc_union_alloc {
public:
@@ -44,14 +43,36 @@ public:
* Creates an instance of the union given the lg_k parameter and hash seed.
* @param lg_k base 2 logarithm of the number of bins in the sketch
* @param seed for hash function
+ * @param allocator instance of an allocator
*/
explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
+ /**
+ * Copy constructor
+ * @param other union to be copied
+ */
cpc_union_alloc(const cpc_union_alloc<A>& other);
+
+ /**
+ * Move constructor
+ * @param other union to be moved
+ */
cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
+
~cpc_union_alloc();
+ /**
+ * Copy assignment
+ * @param other union to be copied
+ * @return reference to this union
+ */
cpc_union_alloc<A>& operator=(const cpc_union_alloc<A>& other);
+
+ /**
+ * Move assignment
+ * @param other union to be moved
+ * @return reference to this union
+ */
cpc_union_alloc<A>& operator=(cpc_union_alloc<A>&& other) noexcept;
/**
@@ -73,9 +94,9 @@ public:
cpc_sketch_alloc<A> get_result() const;
private:
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> AllocU8;
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
- typedef typename std::allocator_traits<A>::template rebind_alloc<cpc_sketch_alloc<A>> AllocCpc;
+ using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
+ using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
+ using AllocCpc = typename std::allocator_traits<A>::template rebind_alloc<cpc_sketch_alloc<A>>;
uint8_t lg_k;
uint64_t seed;
diff --git a/density/include/density_sketch.hpp b/density/include/density_sketch.hpp
index 734f477..e5674c6 100755
--- a/density/include/density_sketch.hpp
+++ b/density/include/density_sketch.hpp
@@ -28,15 +28,6 @@
#include "common_defs.hpp"
-/*
- * Based on the following paper:
- * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
- * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
- *
- * Inspired by the following implementation:
- * https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
- */
-
namespace datasketches {
template<typename T>
@@ -46,6 +37,18 @@ struct gaussian_kernel {
}
};
+/**
+ * Density sketch.
+ *
+ * Builds a coreset from the given set of input points. Provides density estimate at a given point.
+ *
+ * Based on the following paper:
+ * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
+ * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
+ *
+ * Inspired by the following implementation:
+ * https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
+ */
template<
typename T,
typename Kernel = gaussian_kernel<T>,
@@ -118,6 +121,10 @@ public:
template<typename FwdSketch>
void merge(FwdSketch&& other);
+ /**
+ * Density estimate at a given point
+ * @return density estimate at a given point
+ */
T get_estimate(const std::vector<T>& point) const;
/**
@@ -172,7 +179,20 @@ public:
string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
class const_iterator;
+
+ /**
+ * Iterator pointing to the first item in the sketch.
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
+ * @return iterator pointing to the first item in the sketch
+ */
const_iterator begin() const;
+
+ /**
+ * Iterator pointing to the past-the-end item in the sketch.
+ * The past-the-end item is the hypothetical item that would follow the last item.
+ * It does not point to any item, and must not be dereferenced or incremented.
+ * @return iterator pointing to the past-the-end item in the sketch
+ */
const_iterator end() const;
private:
diff --git a/fi/include/frequent_items_sketch.hpp b/fi/include/frequent_items_sketch.hpp
index 02a0854..beec930 100644
--- a/fi/include/frequent_items_sketch.hpp
+++ b/fi/include/frequent_items_sketch.hpp
@@ -32,15 +32,15 @@
namespace datasketches {
-/*
+enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES };
+
+/**
+ * Frequent Items sketch.
+ *
* Based on Java implementation here:
* https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java
- * author Alexander Saydakov
+ * @author Alexander Saydakov
*/
-
-enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES };
-
-// type W for weight must be an arithmetic type (integral or floating point)
template<
typename T,
typename W = uint64_t,
@@ -49,6 +49,7 @@ template<
typename A = std::allocator<T>
>
class frequent_items_sketch {
+ static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
public:
static const uint8_t LG_MIN_MAP_SIZE = 3;
@@ -194,7 +195,7 @@ public:
* There may be items omitted from the set with true frequencies greater than the
* threshold (false negatives).</p>
*
- * @param error_type determines whether no false positives or no false negatives are desired.
+ * @param err_type determines whether no false positives or no false negatives are desired.
* @return an array of frequent items
*/
vector_row get_frequent_items(frequent_items_error_type err_type) const;
@@ -217,7 +218,7 @@ public:
* There may be items omitted from the set with true frequencies greater than the
* threshold (false negatives).</p>
*
- * @param error_type determines whether no false positives or no false negatives are desired.
+ * @param err_type determines whether no false positives or no false negatives are desired.
* @param threshold to include items in the result list
* @return an array of frequent items
*/
@@ -318,14 +319,19 @@ private:
class items_deleter;
};
+/// Row in the output from #get_frequent_items
template<typename T, typename W, typename H, typename E, typename A>
class frequent_items_sketch<T, W, H, E, A>::row {
public:
row(const T* item, W weight, W offset):
item(item), weight(weight), offset(offset) {}
+ /// @return item
const T& get_item() const { return *item; }
+ /// @return frequency (weight) estimate
W get_estimate() const { return weight + offset; }
+ /// @return estimate lower bound
W get_lower_bound() const { return weight; }
+ /// @return estimate upper bound
W get_upper_bound() const { return weight + offset; }
private:
const T* item;
diff --git a/hll/include/HllSketch-internal.hpp b/hll/include/HllSketch-internal.hpp
index f3b99c1..dfd7e43 100644
--- a/hll/include/HllSketch-internal.hpp
+++ b/hll/include/HllSketch-internal.hpp
@@ -94,14 +94,14 @@ hll_sketch_alloc<A>::hll_sketch_alloc(HllSketchImpl<A>* that) :
{}
template<typename A>
-hll_sketch_alloc<A> hll_sketch_alloc<A>::operator=(const hll_sketch_alloc<A>& other) {
+hll_sketch_alloc<A>& hll_sketch_alloc<A>::operator=(const hll_sketch_alloc<A>& other) {
sketch_impl->get_deleter()(sketch_impl);
sketch_impl = other.sketch_impl->copy();
return *this;
}
template<typename A>
-hll_sketch_alloc<A> hll_sketch_alloc<A>::operator=(hll_sketch_alloc<A>&& other) {
+hll_sketch_alloc<A>& hll_sketch_alloc<A>::operator=(hll_sketch_alloc<A>&& other) {
std::swap(sketch_impl, other.sketch_impl);
return *this;
}
diff --git a/hll/include/hll.hpp b/hll/include/hll.hpp
index 63b9f4f..1417c82 100644
--- a/hll/include/hll.hpp
+++ b/hll/include/hll.hpp
@@ -29,40 +29,6 @@
#include <vector>
namespace datasketches {
-
- /**
- * This is a high performance implementation of Phillipe Flajolet’s HLL sketch but with
- * significantly improved error behavior. If the ONLY use case for sketching is counting
- * uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for
- * storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to
- * 16 times smaller than the Theta sketch family for the same accuracy.
- *
- * <p>This implementation offers three different types of HLL sketch, each with different
- * trade-offs with accuracy, space and performance. These types are specified with the
- * {@link TgtHllType} parameter.
- *
- * <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error
- * distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
- * The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>,
- * where <i>K</i> is the number of buckets or slots for the sketch.
- *
- * <p>During warmup, when the sketch has only received a small number of unique items
- * (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
- * algorithms with significantly better accuracy.
- *
- * <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
- * created by the user, the sketch will perform all of its updates and internal phase transitions
- * in that object, which can actually reside either on-heap or off-heap based on how it is
- * configured. In large systems that must update and merge many millions of sketches, having the
- * sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
- * to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
- * delays.
- *
- * author Jon Malkin
- * author Lee Rhodes
- * author Kevin Lang
- */
-
/**
* Specifies the target type of HLL sketch to be created. It is a target in that the actual
@@ -109,6 +75,39 @@ class hll_union_alloc;
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
+/**
+ * This is a high performance implementation of Phillipe Flajolet's HLL sketch but with
+ * significantly improved error behavior. If the ONLY use case for sketching is counting
+ * uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for
+ * storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to
+ * 16 times smaller than the Theta sketch family for the same accuracy.
+ *
+ * <p>This implementation offers three different types of HLL sketch, each with different
+ * trade-offs with accuracy, space and performance. These types are specified with the
+ * {@link target_hll_type} parameter.
+ *
+ * <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error
+ * distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
+ * The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>,
+ * where <i>K</i> is the number of buckets or slots for the sketch.
+ *
+ * <p>During warmup, when the sketch has only received a small number of unique items
+ * (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
+ * algorithms with significantly better accuracy.
+ *
+ * <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
+ * created by the user, the sketch will perform all of its updates and internal phase transitions
+ * in that object, which can actually reside either on-heap or off-heap based on how it is
+ * configured. In large systems that must update and merge many millions of sketches, having the
+ * sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
+ * to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
+ * delays.
+ *
+ * author Jon Malkin
+ * author Lee Rhodes
+ * author Kevin Lang
+ */
+
template<typename A = std::allocator<uint8_t> >
class hll_sketch_alloc final {
public:
@@ -119,27 +118,33 @@ class hll_sketch_alloc final {
* @param start_full_size Indicates whether to start in HLL mode,
* keeping memory use constant (if HLL_6 or HLL_8) at the cost of
* starting out using much more memory
+ * @param allocator instance of an Allocator
*/
explicit hll_sketch_alloc(uint8_t lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false, const A& allocator = A());
/**
* Copy constructor
+ * @param that sketch to be copied
*/
hll_sketch_alloc(const hll_sketch_alloc<A>& that);
/**
* Copy constructor to a new target type
+ * @param that sketch to be copied
+ * @param tgt_type target_hll_type
*/
hll_sketch_alloc(const hll_sketch_alloc<A>& that, target_hll_type tgt_type);
/**
* Move constructor
+ * @param that sketch to be moved
*/
hll_sketch_alloc(hll_sketch_alloc<A>&& that) noexcept;
/**
* Reconstructs a sketch from a serialized image on a stream.
* @param is An input stream with a binary image of a sketch
+ * @param allocator instance of an Allocator
*/
static hll_sketch_alloc deserialize(std::istream& is, const A& allocator = A());
@@ -147,17 +152,26 @@ class hll_sketch_alloc final {
* Reconstructs a sketch from a serialized image in a byte array.
* @param bytes An input array with a binary image of a sketch
* @param len Length of the input array, in bytes
+ * @param allocator instance of an Allocator
*/
static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A());
//! Class destructor
virtual ~hll_sketch_alloc();
- //! Copy assignment operator
- hll_sketch_alloc operator=(const hll_sketch_alloc<A>& other);
+ /**
+ * Copy assignment operator
+ * @param other sketch to be copied
+ * @return reference to this sketch
+ */
+ hll_sketch_alloc& operator=(const hll_sketch_alloc<A>& other);
- //! Move assignment operator
- hll_sketch_alloc operator=(hll_sketch_alloc<A>&& other);
+ /**
+ * Move assignment operator
+ * @param other sketch to be moved
+ * @return reference to this sketch
+ */
+ hll_sketch_alloc& operator=(hll_sketch_alloc<A>&& other);
/**
* Resets the sketch to an empty state in coupon collection mode.
@@ -165,18 +179,20 @@ class hll_sketch_alloc final {
*/
void reset();
- typedef vector_u8<A> vector_bytes; // alias for users
+ using vector_bytes = vector_u8<A>; // alias for users
/**
* Serializes the sketch to a byte array, compacting data structures
* where feasible to eliminate unused storage in the serialized image.
* @param header_size_bytes Allows for PostgreSQL integration
+ * @return serialized sketch in binary form
*/
vector_bytes serialize_compact(unsigned header_size_bytes = 0) const;
/**
* Serializes the sketch to a byte array, retaining all internal
* data structures in their current form.
+ * @return serialized sketch in binary form
*/
vector_bytes serialize_updatable() const;
@@ -413,8 +429,8 @@ class hll_sketch_alloc final {
* <p>Although the API for this union operator parallels many of the methods of the
* <i>HllSketch</i>, the behavior of the union operator has some fundamental differences.
*
- * <p>First, the user cannot specify the #tgt_hll_type as an input parameter.
- * Instead, it is specified for the sketch returned with #get_result(tgt_hll_tyope).
+ * <p>First, the user cannot specify the #target_hll_type as an input parameter.
+ * Instead, it is specified for the sketch returned with #get_result.
*
* <p>Second, the internal effective value of log-base-2 of <i>k</i> for the union operation can
* change dynamically based on the smallest <i>lg_config_k</i> that the union operation has seen.
@@ -423,7 +439,6 @@ class hll_sketch_alloc final {
* author Lee Rhodes
* author Kevin Lang
*/
-
template<typename A = std::allocator<uint8_t> >
class hll_union_alloc {
public:
@@ -431,6 +446,7 @@ class hll_union_alloc {
* Construct an hll_union operator with the given maximum log2 of k.
* @param lg_max_k The maximum size, in log2, of k. The value must
* be between 7 and 21, inclusive.
+ * @param allocator instance of an Allocator
*/
explicit hll_union_alloc(uint8_t lg_max_k, const A& allocator = A());
@@ -495,7 +511,7 @@ class hll_union_alloc {
/**
* Returns the result of this union operator with the specified
- * #tgt_hll_type.
+ * #target_hll_type.
* @param tgt_type The tgt_hll_type enum value of the desired result (Default: HLL_4)
* @return The result of this union with the specified tgt_hll_type
*/
@@ -629,11 +645,11 @@ class hll_union_alloc {
hll_sketch_alloc<A> gadget_;
};
-/// convenience alias for hll_sketch with default allocator
-typedef hll_sketch_alloc<> hll_sketch;
+/// HLL sketch alias with default allocator
+using hll_sketch = hll_sketch_alloc<std::allocator<uint8_t>>;
-/// convenience alias for hll_union with default allocator
-typedef hll_union_alloc<> hll_union;
+/// HLL union alias with default allocator
+using hll_union = hll_union_alloc<std::allocator<uint8_t>>;
} // namespace datasketches
diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index 560f097..e9fc73e 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp
@@ -29,7 +29,13 @@
namespace datasketches {
-/*
+/// KLL sketch constants
+namespace kll_constants {
+ /// default value of parameter K
+ const uint16_t DEFAULT_K = 200;
+}
+
+/**
* Implementation of a very compact quantiles sketch with lazy compaction scheme
* and nearly optimal accuracy per retained item.
* See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
@@ -146,10 +152,6 @@ namespace datasketches {
* author Lee Rhodes
*/
-namespace kll_constants {
- const uint16_t DEFAULT_K = 200;
-}
-
template <
typename T,
typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
@@ -160,6 +162,13 @@ class kll_sketch {
using value_type = T;
using comparator = C;
using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
+ using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
+
+ /**
+ * Quantile return type.
+ * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
+ */
+ using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
static const uint8_t DEFAULT_M = 8;
static const uint16_t MIN_K = DEFAULT_M;
@@ -262,7 +271,6 @@ class kll_sketch {
*
* @return approximate quantile associated with the given rank
*/
- using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
/**
@@ -339,7 +347,6 @@ class kll_sketch {
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
- using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
/**
@@ -489,9 +496,26 @@ class kll_sketch {
string<A> to_string(bool print_levels = false, bool print_items = false) const;
class const_iterator;
+
+ /**
+ * Iterator pointing to the first item in the sketch.
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
+ * @return iterator pointing to the first item in the sketch
+ */
const_iterator begin() const;
+
+ /**
+ * Iterator pointing to the past-the-end item in the sketch.
+ * The past-the-end item is the hypothetical item that would follow the last item.
+ * It does not point to any item, and must not be dereferenced or incremented.
+ * @return iterator pointing to the past-the-end item in the sketch
+ */
const_iterator end() const;
+ /**
+ * Gets the sorted view of this sketch
+ * @return the sorted view of this sketch
+ */
quantiles_sorted_view<T, C, A> get_sorted_view() const;
private:
diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp
index 469dc72..6f728db 100644
--- a/quantiles/include/quantiles_sketch.hpp
+++ b/quantiles/include/quantiles_sketch.hpp
@@ -30,6 +30,16 @@
namespace datasketches {
+/// Constants for Quantiles sketch
+namespace quantiles_constants {
+ /// default value of parameter K
+ const uint16_t DEFAULT_K = 128;
+ /// minimum value of parameter K
+ const uint16_t MIN_K = 2;
+ /// maximum value of parameter K
+ const uint16_t MAX_K = 1 << 15;
+}
+
/**
* This is a stochastic streaming sketch that enables near-real time analysis of the
* approximate distribution from a very large stream in a single pass.
@@ -136,13 +146,6 @@ Table Guide for DoublesSketch Size in Bytes and Approximate Error:
* @author Alexander Saydakov
* @author Jon Malkin
*/
-
-namespace quantiles_constants {
- const uint16_t DEFAULT_K = 128;
- const uint16_t MIN_K = 2;
- const uint16_t MAX_K = 1 << 15;
-}
-
template <typename T,
typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
typename Allocator = std::allocator<T>>
@@ -151,13 +154,43 @@ public:
using value_type = T;
using allocator_type = Allocator;
using comparator = Comparator;
+ using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
+ using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
+ /**
+ * Constructor
+ * @param k
+ * @param comparator
+ * @param allocator
+ */
explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K,
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
+
+ /**
+ * Copy constructor
+ * @param other sketch to be copied
+ */
quantiles_sketch(const quantiles_sketch& other);
+
+ /** Move constructor
+ * @param other sketch to be moved
+ */
quantiles_sketch(quantiles_sketch&& other) noexcept;
+
~quantiles_sketch();
+
+ /**
+ * Copy assignment
+ * @param other sketch to be copied
+ * @return reference to this sketch
+ */
quantiles_sketch& operator=(const quantiles_sketch& other);
+
+ /**
+ * Move assignment
+ * @param other sketch to be moved
+ * @return reference to this sketch
+ */
quantiles_sketch& operator=(quantiles_sketch&& other) noexcept;
/**
@@ -247,10 +280,11 @@ public:
* If the sketch is empty this throws std::runtime_error.
*
* @param rank the specified normalized rank in the hypothetical sorted stream.
- *
+ * @param inclusive if true the weight of the given item is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
+ * according to the Comparator.
* @return the approximation to the item at the given rank
*/
- using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
/**
@@ -264,7 +298,9 @@ public:
* @param ranks given array of normalized ranks in the hypothetical sorted stream.
* These ranks must be in the interval [0.0, 1.0], inclusive.
* @param size the number of ranks in the array
- *
+ * @param inclusive if true the weight of the given item is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
+ * according to the Comparator.
* @return array of approximations to items associated with given ranks in the same order as given ranks
* in the input array.
*
@@ -282,7 +318,9 @@ public:
* This must be an integer greater than 0. A value of 1 is equivalent to get_quantiles([0]).
* A value of 2 is equivalent to get_quantiles([0, 1]). A value of 3 is equivalent to
* get_quantiles([0, 0.5, 1]), etc.
- *
+ * @param inclusive if true the weight of the given item is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
+ * according to the Comparator.
* @return array of approximations to items associated with the given number of evenly-spaced normalized ranks.
*
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
@@ -300,7 +338,7 @@ public:
* @param item to be ranked
* @param inclusive if true the weight of the given item is included into the rank.
* Otherwise the rank equals the sum of the weights of all items that are less than the given item
- * according to the comparator C.
+ * according to the Comparator.
* @return an approximate normalized rank of the given item
*/
double get_rank(const T& item, bool inclusive = true) const;
@@ -327,7 +365,6 @@ public:
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
- using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
/**
@@ -451,9 +488,26 @@ public:
string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
class const_iterator;
+
+ /**
+ * Iterator pointing to the first item in the sketch.
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
+ * @return iterator pointing to the first item in the sketch
+ */
const_iterator begin() const;
+
+ /**
+ * Iterator pointing to the past-the-end item in the sketch.
+ * The past-the-end item is the hypothetical item that would follow the last item.
+ * It does not point to any item, and must not be dereferenced or incremented.
+ * @return iterator pointing to the past-the-end item in the sketch
+ */
const_iterator end() const;
+ /**
+ * Gets the sorted view of this sketch
+ * @return the sorted view of this sketch
+ */
quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const;
private:
diff --git a/req/include/req_common.hpp b/req/include/req_common.hpp
index 1ea4b41..aaead87 100755
--- a/req/include/req_common.hpp
+++ b/req/include/req_common.hpp
@@ -27,10 +27,14 @@
namespace datasketches {
+/// REQ sketch constants
namespace req_constants {
- static const uint16_t MIN_K = 4;
- static const uint8_t INIT_NUM_SECTIONS = 3;
- static const unsigned MULTIPLIER = 2;
+ /// minimum value of parameter K
+ const uint16_t MIN_K = 4;
+ /// initial number of sections
+ const uint8_t INIT_NUM_SECTIONS = 3;
+ /// multiplier for nominal capacity
+ const unsigned MULTIPLIER = 2;
}
} /* namespace datasketches */
diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp
index 29658bf..99b4524 100755
--- a/req/include/req_sketch.hpp
+++ b/req/include/req_sketch.hpp
@@ -28,6 +28,48 @@
namespace datasketches {
+/**
+ * Relative Error Quantiles Sketch.
+ * This is an implementation based on the paper
+ * "Relative Error Streaming Quantiles" by Graham Cormode, Zohar Karnin, Edo Liberty,
+ * Justin Thaler, Pavel Veselý, and loosely derived from a Python prototype written by Pavel Veselý.
+ *
+ * <p>Reference: https://arxiv.org/abs/2004.01668</p>
+ *
+ * <p>This implementation differs from the algorithm described in the paper in the following:</p>
+ *
+ * <ul>
+ * <li>The algorithm requires no upper bound on the stream length.
+ * Instead, each relative-compactor counts the number of compaction operations performed
+ * so far (via variable state). Initially, the relative-compactor starts with INIT_NUMBER_OF_SECTIONS.
+ * Each time the number of compactions (variable state) exceeds 2^{numSections - 1}, we double
+ * numSections. Note that after merging the sketch with another one variable state may not correspond
+ * to the number of compactions performed at a particular level, however, since the state variable
+ * never exceeds the number of compactions, the guarantees of the sketch remain valid.</li>
+ *
+ * <li>The size of each section (variable k and section_size in the code and parameter k in
+ * the paper) is initialized with a number set by the user via variable k.
+ * When the number of sections doubles, we decrease section_size by a factor of sqrt(2).
+ * This is applied at each level separately. Thus, when we double the number of sections, the
+ * nominal compactor size increases by a factor of approx. sqrt(2) (+/- rounding).</li>
+ *
+ * <li>The merge operation here does not perform "special compactions", which are used in the paper
+ * to allow for a tight mathematical analysis of the sketch.</li>
+ * </ul>
+ *
+ * <p>This implementation provides a number of capabilities not discussed in the paper or provided
+ * in the Python prototype.</p>
+ *
+ * <ul><li>The Python prototype only implemented high accuracy for low ranks. This implementation
+ * provides the user with the ability to choose either high rank accuracy or low rank accuracy at
+ * the time of sketch construction.</li>
+ * <li>The Python prototype only implemented a comparison criterion of "INCLUSIVE". This implementation
+ * allows the user to use both the "INCLUSIVE" criterion and the "EXCLUSIVE" criterion.</li>
+ * <li>This implementation provides extensive debug visibility into the operation of the sketch with
+ * two levels of detail output. This is not only useful for debugging, but is a powerful tool to
+ * help users understand how the sketch works.</li>
+ * </ul>
+ */
template<
typename T,
typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
@@ -39,6 +81,13 @@ public:
using comparator = Comparator;
using Compactor = req_compactor<T, Comparator, Allocator>;
using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>;
+ using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
+
+ /**
+ * Quantile return type.
+ * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
+ */
+ using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
/**
* Constructor
@@ -46,19 +95,41 @@ public:
* Value of 12 roughly corresponds to 1% relative error guarantee at 95% confidence.
* @param hra if true, the default, the high ranks are prioritized for better
* accuracy. Otherwise the low ranks are prioritized for better accuracy.
- * @param comparator to use by this instance
- * @param allocator to use by this instance
+ * @param comparator instance to use by this sketch instance
+ * @param allocator instance to use by this sketch instance
*/
explicit req_sketch(uint16_t k, bool hra = true, const Comparator& comparator = Comparator(),
const Allocator& allocator = Allocator());
- ~req_sketch();
+ /**
+ * Copy constructor
+ * @param other sketch to be copied
+ */
req_sketch(const req_sketch& other);
+
+ /**
+ * Move constructor
+ * @param other sketch to be moved
+ */
req_sketch(req_sketch&& other) noexcept;
+
+ ~req_sketch();
+
+ /**
+ * Copy assignment
+ * @param other sketch to be copied
+ * @return reference to this sketch
+ */
req_sketch& operator=(const req_sketch& other);
+
+ /**
+ * Move assignment
+ * @param other sketch to be moved
+ * @return reference to this sketch
+ */
req_sketch& operator=(req_sketch&& other);
- /*
+ /**
* Type converting constructor.
* @param other sketch of a different type
* @param comparator instance of a Comparator
@@ -177,7 +248,6 @@ public:
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
- using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
/**
@@ -214,7 +284,6 @@ public:
*
* @return approximate quantile associated with the given rank
*/
- using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
/**
@@ -223,6 +292,7 @@ public:
*
* @param ranks given array of normalized ranks.
* @param size the number of ranks in the array.
+ * @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
*
* @return array of quantiles that correspond to the given array of normalized ranks
*
@@ -333,9 +403,26 @@ public:
string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
class const_iterator;
+
+ /**
+ * Iterator pointing to the first item in the sketch.
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
+ * @return iterator pointing to the first item in the sketch
+ */
const_iterator begin() const;
+
+ /**
+ * Iterator pointing to the past-the-end item in the sketch.
+ * The past-the-end item is the hypothetical item that would follow the last item.
+ * It does not point to any item, and must not be dereferenced or incremented.
+ * @return iterator pointing to the past-the-end item in the sketch
+ */
const_iterator end() const;
+ /**
+ * Gets the sorted view of this sketch
+ * @return the sorted view of this sketch
+ */
quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const;
private:
diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index 10a7019..40dac80 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -26,22 +26,12 @@
#include <iterator>
#include <vector>
-
-/**
- * This sketch samples data from a stream of items, designed for optimal (minimum) variance when
- * querying the sketch to estimate subset sums of items matchng a provided predicate. Variance
- * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
- * subset sum estimation.
- *
- * author Kevin Lang
- * author Jon Malkin
- */
namespace datasketches {
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
-/**
+/*
* A struct to hold the result of subset sum queries
*/
struct subset_summary {
@@ -53,11 +43,23 @@ struct subset_summary {
template <typename T, typename A> class var_opt_union; // forward declaration
+/// VarOpt sketch constants
namespace var_opt_constants {
- const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
- const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
+ /// default resize factor
+ const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
+ /// maximum value of parameter K
+ const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
}
+/**
+ * This sketch samples data from a stream of items. Designed for optimal (minimum) variance when
+ * querying the sketch to estimate subset sums of items matching a provided predicate. Variance
+ * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
+ * subset sum estimation.
+ *
+ * author Kevin Lang
+ * author Jon Malkin
+ */
template<
typename T,
typename A = std::allocator<T>
@@ -68,15 +70,42 @@ class var_opt_sketch {
static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR;
static const uint32_t MAX_K = var_opt_constants::MAX_K;
+ /**
+ * Constructor
+ * @param k sketch size
+ * @param rf resize factor
+ * @param allocator instance of an allocator
+ */
explicit var_opt_sketch(uint32_t k,
resize_factor rf = var_opt_constants::DEFAULT_RESIZE_FACTOR,
const A& allocator = A());
+
+ /**
+ * Copy constructor
+ * @param other sketch to be copied
+ */
var_opt_sketch(const var_opt_sketch& other);
+
+ /**
+ * Move constructor
+ * @param other sketch to be moved
+ */
var_opt_sketch(var_opt_sketch&& other) noexcept;
~var_opt_sketch();
+ /**
+ * Copy assignment
+ * @param other sketch to be copied
+ * @return reference to this sketch
+ */
var_opt_sketch& operator=(const var_opt_sketch& other);
+
+ /**
+ * Move assignment
+ * @param other sketch to be moved
+ * @return reference to this sketch
+ */
var_opt_sketch& operator=(var_opt_sketch&& other);
/**
@@ -85,7 +114,7 @@ class var_opt_sketch {
* @param item an item from a stream of items
* @param weight the weight of the item
*/
- void update(const T& item, double weight=1.0);
+ void update(const T& item, double weight = 1.0);
/**
* Updates this sketch with the given data item with the given weight.
@@ -93,7 +122,7 @@ class var_opt_sketch {
* @param item an item from a stream of items
* @param weight the weight of the item
*/
- void update(T&& item, double weight=1.0);
+ void update(T&& item, double weight = 1.0);
/**
* Returns the configured maximum sample size.
@@ -117,7 +146,7 @@ class var_opt_sketch {
* Computes an estimated subset sum from the entire stream for objects matching a given
* predicate. Provides a lower bound, estimate, and upper bound using a target of 2 standard
* deviations. This is technically a heuristic method and tries to err on the conservative side.
- * @param P a predicate function
+ * @param predicate a predicate function
* @return a subset_summary item with estimate, upper and lower bounds,
* and total sketch weight
*/
@@ -138,7 +167,7 @@ class var_opt_sketch {
/**
* Computes size needed to serialize the current state of the sketch.
* This version is for fixed-size arithmetic types (integral and floating point).
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
* @return size in bytes needed to serialize this sketch
*/
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
@@ -147,7 +176,7 @@ class var_opt_sketch {
/**
* Computes size needed to serialize the current state of the sketch.
* This version is for all other types and can be expensive since every item needs to be looked at.
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
* @return size in bytes needed to serialize this sketch
*/
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
@@ -155,7 +184,7 @@ class var_opt_sketch {
// This is a convenience alias for users
// The type returned by the following serialize method
- typedef vector_u8<A> vector_bytes;
+ using vector_bytes = vector_u8<A>;
/**
* This method serializes the sketch as a vector of bytes.
@@ -163,7 +192,7 @@ class var_opt_sketch {
* It is a blank space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
*/
template<typename SerDe = serde<T>>
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
@@ -171,7 +200,7 @@ class var_opt_sketch {
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
*/
template<typename SerDe = serde<T>>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
@@ -179,8 +208,8 @@ class var_opt_sketch {
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
- * @param instance of a SerDe
- * @param instance of an Allocator
+ * @param sd instance of a SerDe
+ * @param allocator instance of an allocator
* @return an instance of a sketch
*/
template<typename SerDe = serde<T>>
@@ -190,8 +219,8 @@ class var_opt_sketch {
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
- * @param instance of a SerDe
- * @param instance of an Allocator
+ * @param sd instance of a SerDe
+ * @param allocator instance of an allocator
* @return an instance of a sketch
*/
template<typename SerDe = serde<T>>
@@ -213,7 +242,20 @@ class var_opt_sketch {
string<A> items_to_string() const;
class const_iterator;
+
+ /**
+ * Iterator pointing to the first item in the sketch.
+ * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
+ * @return iterator pointing to the first item in the sketch
+ */
const_iterator begin() const;
+
+ /**
+ * Iterator pointing to the past-the-end item in the sketch.
+ * The past-the-end item is the hypothetical item that would follow the last item.
+ * It does not point to any item, and must not be dereferenced or incremented.
+ * @return iterator pointing to the past-the-end item in the sketch
+ */
const_iterator end() const;
private:
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 8ce66a5..7bf4095 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -36,7 +36,7 @@
namespace datasketches {
-/**
+/*
* Implementation code for the VarOpt sketch.
*
* author Kevin Lang
@@ -895,7 +895,7 @@ void var_opt_sketch<T, A>::update_heavy_r_eq1(O&& item, double weight, bool mark
grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
}
-/**
+/*
* Decreases sketch's value of k by 1, updating stored values as needed.
*
* <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by
@@ -1685,7 +1685,7 @@ bool var_opt_sketch<T, A>::iterator::get_mark() const {
return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
}
-/**
+/*
* Checks if target sampling allocation is more than 50% of max sampling size.
* If so, returns max sampling size, otherwise passes through target size.
*/
diff --git a/sampling/include/var_opt_union.hpp b/sampling/include/var_opt_union.hpp
index 3d00735..f91377e 100644
--- a/sampling/include/var_opt_union.hpp
+++ b/sampling/include/var_opt_union.hpp
@@ -91,7 +91,7 @@ public:
/**
* Computes size needed to serialize the current state of the union.
* This version is for all other types and can be expensive since every item needs to be looked at.
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
* @return size in bytes needed to serialize this sketch
*/
template<typename SerDe = serde<T>>
@@ -108,7 +108,7 @@ public:
* It is a blank space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
*/
template<typename SerDe = serde<T>>
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
@@ -117,7 +117,7 @@ public:
* NOTE: This method may be deprecated in a future version.
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
*/
template<typename SerDe = serde<T>>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
@@ -126,8 +126,8 @@ public:
* NOTE: This method may be deprecated in a future version.
* This method deserializes a union from a given stream.
* @param is input stream
- * @param instance of a SerDe
- * @param instance of an Allocator
+ * @param sd instance of a SerDe
+ * @param allocator instance of an Allocator
* @return an instance of a union
*/
template<typename SerDe = serde<T>>
@@ -138,8 +138,8 @@ public:
* This method deserializes a union from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
- * @param instance of a SerDe
- * @param instance of an Allocator
+ * @param sd instance of a SerDe
+ * @param allocator instance of an Allocator
* @return an instance of a union
*/
template<typename SerDe = serde<T>>
diff --git a/theta/include/bounds_on_ratios_in_sampled_sets.hpp b/theta/include/bounds_on_ratios_in_sampled_sets.hpp
index aa44b1f..413d71a 100644
--- a/theta/include/bounds_on_ratios_in_sampled_sets.hpp
+++ b/theta/include/bounds_on_ratios_in_sampled_sets.hpp
@@ -29,6 +29,7 @@
namespace datasketches {
/**
+ * Bounds on ratios in sampled sets.
* This class is used to compute the bounds on the estimate of the ratio <i>|B| / |A|</i>, where:
* <ul>
* <li><i>|A|</i> is the unknown size of a set <i>A</i> of unique identifiers.</li>
diff --git a/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp b/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp
index 1779ec1..b62f92c 100644
--- a/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp
+++ b/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp
@@ -28,6 +28,7 @@
namespace datasketches {
/**
+ * Bounds on ratios in Theta sketched sets.
* This is to compute the bounds on the estimate of the ratio <i>B / A</i>, where:
* <ul>
* <li><i>A</i> is a Theta Sketch of population <i>PopA</i>.</li>
@@ -50,8 +51,8 @@ class bounds_on_ratios_in_theta_sketched_sets {
public:
/**
* Gets the approximate lower bound for B over A based on a 95% confidence interval
- * @param sketchA the sketch A
- * @param sketchB the sketch B
+ * @param sketch_a the sketch A
+ * @param sketch_b the sketch B
* @return the approximate lower bound for B over A
*/
template<typename SketchA, typename SketchB>
@@ -72,8 +73,8 @@ public:
/**
* Gets the approximate upper bound for B over A based on a 95% confidence interval
- * @param sketchA the sketch A
- * @param sketchB the sketch B
+ * @param sketch_a the sketch A
+ * @param sketch_b the sketch B
* @return the approximate upper bound for B over A
*/
template<typename SketchA, typename SketchB>
@@ -94,8 +95,8 @@ public:
/**
* Gets the estimate for B over A
- * @param sketchA the sketch A
- * @param sketchB the sketch B
+ * @param sketch_a the sketch A
+ * @param sketch_b the sketch B
* @return the estimate for B over A
*/
template<typename SketchA, typename SketchB>
diff --git a/theta/include/theta_a_not_b.hpp b/theta/include/theta_a_not_b.hpp
index 4beef60..0ca5c7e 100644
--- a/theta/include/theta_a_not_b.hpp
+++ b/theta/include/theta_a_not_b.hpp
@@ -25,6 +25,10 @@
namespace datasketches {
+/**
+ * Theta A-not-B (set difference).
+ * Computes set difference of Theta sketches.
+ */
template<typename Allocator = std::allocator<uint64_t>>
class theta_a_not_b_alloc {
public:
@@ -33,6 +37,11 @@ public:
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
+ /**
+ * Constructor
+ * @param seed for the hash function that was used to create the sketch
+ * @param allocator to use for allocating and deallocating memory
+ */
explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
diff --git a/theta/include/theta_intersection.hpp b/theta/include/theta_intersection.hpp
index 19397e5..93a4a7c 100644
--- a/theta/include/theta_intersection.hpp
+++ b/theta/include/theta_intersection.hpp
@@ -25,6 +25,10 @@
namespace datasketches {
+/**
+ * Theta intersection.
+ * Computes intersection of Theta sketches.
+ */
template<typename Allocator = std::allocator<uint64_t>>
class theta_intersection_alloc {
public:
@@ -41,7 +45,7 @@ public:
};
using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
- /*
+ /**
* Constructor
* @param seed for the hash function that was used to create the sketch
* @param allocator to use for allocating and deallocating memory
diff --git a/theta/include/theta_intersection_impl.hpp b/theta/include/theta_intersection_impl.hpp
index 5f0575f..bc3d822 100644
--- a/theta/include/theta_intersection_impl.hpp
+++ b/theta/include/theta_intersection_impl.hpp
@@ -28,9 +28,9 @@ state_(seed, nop_policy(), allocator)
{}
template<typename A>
-template<typename SS>
-void theta_intersection_alloc<A>::update(SS&& sketch) {
- state_.update(std::forward<SS>(sketch));
+template<typename FwdSketch>
+void theta_intersection_alloc<A>::update(FwdSketch&& sketch) {
+ state_.update(std::forward<FwdSketch>(sketch));
}
template<typename A>
diff --git a/theta/include/theta_jaccard_similarity.hpp b/theta/include/theta_jaccard_similarity.hpp
index 417ed54..126b78d 100644
--- a/theta/include/theta_jaccard_similarity.hpp
+++ b/theta/include/theta_jaccard_similarity.hpp
@@ -26,10 +26,11 @@
namespace datasketches {
+/// Theta Jaccard similarity alias
template<typename Allocator = std::allocator<uint64_t>>
using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_alloc<Allocator>, theta_intersection_alloc<Allocator>, trivial_extract_key>;
-// alias with default allocator for convenience
+/// Theta Jaccard similarity alias with default allocator
using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
diff --git a/theta/include/theta_jaccard_similarity_base.hpp b/theta/include/theta_jaccard_similarity_base.hpp
index 5e8dcf5..48d8676 100644
--- a/theta/include/theta_jaccard_similarity_base.hpp
+++ b/theta/include/theta_jaccard_similarity_base.hpp
@@ -30,6 +30,7 @@
namespace datasketches {
+/// Base class for Jaccard similarity
template<typename Union, typename Intersection, typename ExtractKey>
class jaccard_similarity_base {
public:
diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp
index 4bc1c2e..4a022c5 100644
--- a/theta/include/theta_sketch.hpp
+++ b/theta/include/theta_sketch.hpp
@@ -25,6 +25,7 @@
namespace datasketches {
+/// Abstract base class for Theta sketch
template<typename Allocator = std::allocator<uint64_t>>
class base_theta_sketch_alloc {
public:
@@ -106,6 +107,7 @@ protected:
virtual void print_items(std::ostringstream& os) const = 0;
};
+/// Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
template<typename Allocator = std::allocator<uint64_t>>
class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
public:
@@ -149,6 +151,11 @@ protected:
// forward declaration
template<typename A> class compact_theta_sketch_alloc;
+/**
+ * Update Theta sketch.
+ * The purpose of this class is to build a Theta sketch from input data via the update() methods.
+ * There is no constructor. Use builder instead.
+ */
template<typename Allocator = std::allocator<uint64_t>>
class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
public:
@@ -307,8 +314,10 @@ private:
virtual void print_specifics(std::ostringstream& os) const;
};
-// compact sketch
-
+/**
+ * Compact Theta sketch.
+ * This is an immutable form of the Theta sketch, the form that can be serialized and deserialized.
+ */
template<typename Allocator = std::allocator<uint64_t>>
class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
public:
@@ -327,8 +336,15 @@ public:
// - as a result of a set operation
// - by deserializing a previously serialized compact sketch
+ /**
+ * Copy constructor.
+ * Constructs a compact sketch from any other type of Theta sketch
+ * @param other sketch to be constructed from
+ * @param ordered if true make the resulting sketch ordered
+ */
template<typename Other>
compact_theta_sketch_alloc(const Other& other, bool ordered);
+
compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
virtual ~compact_theta_sketch_alloc() = default;
@@ -385,6 +401,7 @@ public:
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
+ * @param allocator instance of an Allocator
* @return an instance of the sketch
*/
static compact_theta_sketch_alloc deserialize(std::istream& is,
@@ -395,12 +412,13 @@ public:
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param seed the seed for the hash function that was used to create the sketch
+ * @param allocator instance of an Allocator
* @return an instance of the sketch
*/
static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
- // for internal use
+ /// @private constructor for internal use
compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
private:
@@ -425,18 +443,26 @@ private:
virtual void print_specifics(std::ostringstream& os) const;
};
+/// Update Theta sketch builder
template<typename Allocator>
class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
public:
+ /**
+ * Constructor
+ * @param allocator
+ */
builder(const Allocator& allocator = Allocator());
+ /// @return instance of Update Theta sketch
update_theta_sketch_alloc build() const;
};
-// This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization.
-// It does not take the ownership of the buffer.
-
+/**
+ * Wrapped Compact Theta sketch.
+ * This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization.
+ * It does not take the ownership of the buffer.
+ */
template<typename Allocator = std::allocator<uint64_t>>
-class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
+class wrapped_compact_theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
public:
class const_iterator;
@@ -447,7 +473,17 @@ public:
uint32_t get_num_retained() const;
uint16_t get_seed_hash() const;
+ /**
+ * Const iterator over hash values in this sketch.
+ * @return begin iterator
+ */
const_iterator begin() const;
+
+ /**
+ * Const iterator pointing past the valid range.
+ * Not to be incremented or dereferenced.
+ * @return end iterator
+ */
const_iterator end() const;
/**
@@ -455,6 +491,7 @@ public:
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param seed the seed for the hash function that was used to create the sketch
+ * @param dump_on_error if true prints hex dump of the input
* @return an instance of the sketch
*/
static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
@@ -499,10 +536,13 @@ private:
uint64_t buffer_[8];
};
-// aliases with default allocator for convenience
+/// Theta sketch alias with default allocator
using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
+/// Update Theta sketch alias with default allocator
using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
+/// Compact Theta sketch alias with default allocator
using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
+/// Wrapped Compact Theta sketch alias with default allocator
using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
diff --git a/theta/include/theta_union.hpp b/theta/include/theta_union.hpp
index d90c53a..ca59629 100644
--- a/theta/include/theta_union.hpp
+++ b/theta/include/theta_union.hpp
@@ -26,6 +26,10 @@
namespace datasketches {
+/**
+ * Theta Union.
+ * Computes union of Theta sketches. There is no constructor. Use builder instead.
+ */
template<typename Allocator = std::allocator<uint64_t>>
class theta_union_alloc {
public:
@@ -72,6 +76,7 @@ private:
theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
};
+/// Theta union builder
template<typename A>
class theta_union_alloc<A>::builder: public theta_base_builder<builder, A> {
public:
diff --git a/theta/include/theta_union_impl.hpp b/theta/include/theta_union_impl.hpp
index 8618618..310d58a 100644
--- a/theta/include/theta_union_impl.hpp
+++ b/theta/include/theta_union_impl.hpp
@@ -28,9 +28,9 @@ state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
{}
template<typename A>
-template<typename SS>
-void theta_union_alloc<A>::update(SS&& sketch) {
- state_.update(std::forward<SS>(sketch));
+template<typename FwdSketch>
+void theta_union_alloc<A>::update(FwdSketch&& sketch) {
+ state_.update(std::forward<FwdSketch>(sketch));
}
template<typename A>
diff --git a/theta/include/theta_update_sketch_base.hpp b/theta/include/theta_update_sketch_base.hpp
index a8babf2..9101363 100644
--- a/theta/include/theta_update_sketch_base.hpp
+++ b/theta/include/theta_update_sketch_base.hpp
@@ -91,13 +91,14 @@ struct theta_update_sketch_base {
static void consolidate_non_empty(Entry* entries, size_t size, size_t num);
};
-// builder
+/// Theta base builder
template<typename Derived, typename Allocator>
class theta_base_builder {
public:
/**
* Creates and instance of the builder with default parameters.
+ * @param allocator instance of an Allocator to pass to created sketches
*/
theta_base_builder(const Allocator& allocator);
diff --git a/tuple/include/array_of_doubles_sketch.hpp b/tuple/include/array_of_doubles_sketch.hpp
index 4c468a3..3f50227 100644
--- a/tuple/include/array_of_doubles_sketch.hpp
+++ b/tuple/include/array_of_doubles_sketch.hpp
@@ -109,6 +109,10 @@ template<typename A> class compact_array_of_doubles_sketch_alloc;
template<typename A> using AllocAOD = typename std::allocator_traits<A>::template rebind_alloc<aod<A>>;
+/**
+ * Update Array of doubles sketch.
+ * There is no constructor. Use builder instead.
+ */
template<typename A = std::allocator<double>>
class update_array_of_doubles_sketch_alloc: public update_tuple_sketch<aod<A>, aod<A>, array_of_doubles_update_policy<A>, AllocAOD<A>> {
public:
@@ -118,6 +122,8 @@ public:
class builder;
compact_array_of_doubles_sketch_alloc<A> compact(bool ordered = true) const;
+
+ /// @return number of double values in array
uint8_t get_num_values() const;
private:
@@ -126,9 +132,10 @@ private:
uint64_t seed, const array_of_doubles_update_policy<A>& policy, const A& allocator);
};
-// alias with the default allocator for convenience
+/// alias with the default allocator for convenience
using update_array_of_doubles_sketch = update_array_of_doubles_sketch_alloc<>;
+/// Update Array of doubles sketch builder
template<typename A>
class update_array_of_doubles_sketch_alloc<A>::builder: public tuple_base_builder<builder, array_of_doubles_update_policy<A>, A> {
public:
@@ -136,6 +143,7 @@ public:
update_array_of_doubles_sketch_alloc<A> build() const;
};
+/// Compact Array of doubles sketch
template<typename A = std::allocator<double>>
class compact_array_of_doubles_sketch_alloc: public compact_tuple_sketch<aod<A>, AllocAOD<A>> {
public:
@@ -153,6 +161,7 @@ public:
template<typename Sketch>
compact_array_of_doubles_sketch_alloc(const Sketch& other, bool ordered = true);
+ /// @return number of double values in array
uint8_t get_num_values() const;
void serialize(std::ostream& os) const;
@@ -169,7 +178,7 @@ private:
uint8_t num_values_;
};
-// alias with the default allocator for convenience
+/// alias with the default allocator for convenience
using compact_array_of_doubles_sketch = compact_array_of_doubles_sketch_alloc<>;
} /* namespace datasketches */
diff --git a/tuple/include/tuple_intersection.hpp b/tuple/include/tuple_intersection.hpp
index 966ea9f..4d9df08 100644
--- a/tuple/include/tuple_intersection.hpp
+++ b/tuple/include/tuple_intersection.hpp
@@ -38,6 +38,10 @@ struct example_intersection_policy {
};
*/
+/**
+ * Tuple intersection.
+ * Computes intersection of Tuple sketches.
+ */
template<
typename Summary,
typename Policy,
@@ -67,6 +71,12 @@ public:
using State = theta_intersection_base<Entry, ExtractKey, internal_policy, Sketch, CompactSketch, AllocEntry>;
+ /**
+ * Constructor
+ * @param seed for the hash function that was used to create the sketch
+ * @param policy user-defined way of combining Summary during intersection
+ * @param allocator to use for allocating and deallocating memory
+ */
explicit tuple_intersection(uint64_t seed = DEFAULT_SEED, const Policy& policy = Policy(), const Allocator& allocator = Allocator());
/**
diff --git a/tuple/include/tuple_jaccard_similarity.hpp b/tuple/include/tuple_jaccard_similarity.hpp
index 0a6633c..9d4cea5 100644
--- a/tuple/include/tuple_jaccard_similarity.hpp
+++ b/tuple/include/tuple_jaccard_similarity.hpp
@@ -26,6 +26,7 @@
namespace datasketches {
+/// Tuple Jaccard similarity alias
template<
typename Summary,
typename IntersectionPolicy,
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 70571df..eeb776f 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -43,6 +43,10 @@ struct pair_extract_key {
}
};
+/**
+ * Base class for Tuple sketch.
+ * This is an extension of Theta sketch that allows keeping arbitrary Summary associated with each retained key.
+ */
template<
typename Summary,
typename Allocator = std::allocator<Summary>
@@ -199,6 +203,11 @@ struct default_update_policy {
}
};
+/**
+ * Update Tuple sketch.
+ * The purpose of this class is to build a Tuple sketch from input data via the update() methods.
+ * There is no constructor. Use builder instead.
+ */
template<
typename Summary,
typename Update = Summary,
@@ -244,21 +253,24 @@ public:
/**
* Update this sketch with a given string.
- * @param value string to update the sketch with
+ * @param key string to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(const std::string& key, FwdUpdate&& value);
/**
* Update this sketch with a given unsigned 64-bit integer.
- * @param value uint64_t to update the sketch with
+ * @param key uint64_t to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(uint64_t key, FwdUpdate&& value);
/**
* Update this sketch with a given signed 64-bit integer.
- * @param value int64_t to update the sketch with
+ * @param key int64_t to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(int64_t key, FwdUpdate&& value);
@@ -266,7 +278,8 @@ public:
/**
* Update this sketch with a given unsigned 32-bit integer.
* For compatibility with Java implementation.
- * @param value uint32_t to update the sketch with
+ * @param key uint32_t to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(uint32_t key, FwdUpdate&& value);
@@ -274,7 +287,8 @@ public:
/**
* Update this sketch with a given signed 32-bit integer.
* For compatibility with Java implementation.
- * @param value int32_t to update the sketch with
+ * @param key int32_t to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(int32_t key, FwdUpdate&& value);
@@ -282,7 +296,8 @@ public:
/**
* Update this sketch with a given unsigned 16-bit integer.
* For compatibility with Java implementation.
- * @param value uint16_t to update the sketch with
+ * @param key uint16_t to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(uint16_t key, FwdUpdate&& value);
@@ -290,7 +305,8 @@ public:
/**
* Update this sketch with a given signed 16-bit integer.
* For compatibility with Java implementation.
- * @param value int16_t to update the sketch with
+ * @param key int16_t to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(int16_t key, FwdUpdate&& value);
@@ -298,7 +314,8 @@ public:
/**
* Update this sketch with a given unsigned 8-bit integer.
* For compatibility with Java implementation.
- * @param value uint8_t to update the sketch with
+ * @param key uint8_t to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(uint8_t key, FwdUpdate&& value);
@@ -306,7 +323,8 @@ public:
/**
* Update this sketch with a given signed 8-bit integer.
* For compatibility with Java implementation.
- * @param value int8_t to update the sketch with
+ * @param key int8_t to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(int8_t key, FwdUpdate&& value);
@@ -314,7 +332,8 @@ public:
/**
* Update this sketch with a given double-precision floating point value.
* For compatibility with Java implementation.
- * @param value double to update the sketch with
+ * @param key double to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(double key, FwdUpdate&& value);
@@ -322,7 +341,8 @@ public:
/**
* Update this sketch with a given floating point value.
* For compatibility with Java implementation.
- * @param value float to update the sketch with
+ * @param key float to update the sketch with
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(float key, FwdUpdate&& value);
@@ -337,8 +357,9 @@ public:
* Otherwise two sketches that should represent overlapping sets will be disjoint
* For instance, for signed 32-bit values call update(int32_t) method above,
* which does widening conversion to int64_t, if compatibility with Java is expected
- * @param data pointer to the data
+ * @param key pointer to the data
* @param length of the data in bytes
+ * @param value to update the sketch with
*/
template<typename FwdUpdate>
void update(const void* key, size_t length, FwdUpdate&& value);
@@ -375,8 +396,10 @@ protected:
virtual void print_specifics(std::ostringstream& os) const;
};
-// compact sketch
-
+/**
+ * Compact Tuple sketch.
+ * This is an immutable form of the Tuple sketch, the form that can be serialized and deserialized.
+ */
template<
typename Summary,
typename Allocator = std::allocator<Summary>
@@ -406,13 +429,26 @@ public:
// - as a result of a set operation
// - by deserializing a previously serialized compact sketch
+ /**
+ * Copy constructor.
+ * Constructs a compact sketch from another sketch (either update or compact)
+ * @param other sketch to be copied
+ * @param ordered if true make the resulting sketch ordered
+ */
compact_tuple_sketch(const Base& other, bool ordered);
+
compact_tuple_sketch(const compact_tuple_sketch&) = default;
compact_tuple_sketch(compact_tuple_sketch&&) noexcept;
virtual ~compact_tuple_sketch() = default;
compact_tuple_sketch& operator=(const compact_tuple_sketch&) = default;
compact_tuple_sketch& operator=(compact_tuple_sketch&&) = default;
+ /**
+ * Constructor from Theta sketch
+ * @param other Theta sketch to be constructed from
+ * @param summary Summary instance to be associated with each entry
+ * @param ordered if true make the resulting sketch ordered
+ */
compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const Summary& summary, bool ordered = true);
virtual Allocator get_allocator() const;
@@ -425,7 +461,7 @@ public:
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
*/
template<typename SerDe = serde<Summary>>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
@@ -436,7 +472,7 @@ public:
* It is a blank space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
- * @param instance of a SerDe
+ * @param sd instance of a SerDe
* @return serialized sketch as a vector of bytes
*/
template<typename SerDe = serde<Summary>>
@@ -451,8 +487,8 @@ public:
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
- * @param instance of a SerDe
- * @param instance of an Allocator
+ * @param sd instance of a SerDe
+ * @param allocator instance of an Allocator
* @return an instance of a sketch
*/
template<typename SerDe = serde<Summary>>
@@ -464,8 +500,8 @@ public:
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param seed the seed for the hash function that was used to create the sketch
- * @param instance of a SerDe
- * @param instance of an Allocator
+ * @param sd instance of a SerDe
+ * @param allocator instance of an Allocator
* @return an instance of the sketch
*/
template<typename SerDe = serde<Summary>>
@@ -522,8 +558,7 @@ protected:
};
-// builder
-
+// base builder
template<typename Derived, typename Policy, typename Allocator>
class tuple_base_builder: public theta_base_builder<Derived, Allocator> {
public:
@@ -533,11 +568,15 @@ protected:
Policy policy_;
};
+/// Update Tuple sketch builder
template<typename S, typename U, typename P, typename A>
class update_tuple_sketch<S, U, P, A>::builder: public tuple_base_builder<builder, P, A> {
public:
/**
+ * Constructor
* Creates and instance of the builder with default parameters.
+ * @param policy user-defined way of creating and updating Summary
+ * @param allocator instance of an Allocator to pass to created sketches
*/
builder(const P& policy = P(), const A& allocator = A());
diff --git a/tuple/include/tuple_union.hpp b/tuple/include/tuple_union.hpp
index 1c518da..381b550 100644
--- a/tuple/include/tuple_union.hpp
+++ b/tuple/include/tuple_union.hpp
@@ -33,6 +33,10 @@ struct default_union_policy {
}
};
+/**
+ * Tuple Union.
+ * Computes union of Tuple sketches. There is no constructor. Use builder instead.
+ */
template<
typename Summary,
typename Policy = default_union_policy<Summary>,
@@ -92,11 +96,15 @@ protected:
tuple_union(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
};
+/// Tuple union builder
template<typename S, typename P, typename A>
class tuple_union<S, P, A>::builder: public tuple_base_builder<builder, P, A> {
public:
/**
+ * Constructor.
* Creates and instance of the builder with default parameters.
+ * @param policy
+ * @param allocator
*/
builder(const P& policy = P(), const A& allocator = A());
diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp
index 352f953..b5316e8 100644
--- a/tuple/test/tuple_sketch_test.cpp
+++ b/tuple/test/tuple_sketch_test.cpp
@@ -86,7 +86,7 @@ TEST_CASE("tuple sketch float: empty", "[tuple_sketch]") {
REQUIRE(update_sketch.compact(false).is_ordered());
}
-TEST_CASE("tuple sketch: single item", "[theta_sketch]") {
+TEST_CASE("tuple sketch: single item", "[tuple_sketch]") {
auto update_sketch = update_tuple_sketch<float>::builder().build();
update_sketch.update(1, 1.0f);
REQUIRE_FALSE(update_sketch.is_empty());
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org