You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/10/06 19:27:26 UTC
[datasketches-cpp] branch universal_sorted_view updated: removed get_quantiles(), used generic term item, documentation changes
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch universal_sorted_view
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
The following commit(s) were added to refs/heads/universal_sorted_view by this push:
new f61e959 removed get_quantiles(), used generic term item, documentation changes
f61e959 is described below
commit f61e959e94143e51f916bfd0de2fbcad74b4cca0
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Oct 6 12:27:21 2022 -0700
removed get_quantiles(), used generic term item, documentation changes
---
kll/include/kll_sketch.hpp | 182 ++++++++++-------------
kll/include/kll_sketch_impl.hpp | 214 ++++++++++++----------------
kll/test/kll_sketch_test.cpp | 41 ------
kll/test/kll_sketch_validation.cpp | 8 +-
quantiles/include/quantiles_sketch.hpp | 79 ++++------
quantiles/include/quantiles_sketch_impl.hpp | 10 +-
req/include/req_sketch.hpp | 47 +++---
req/include/req_sketch_impl.hpp | 25 +---
req/test/req_sketch_test.cpp | 16 ---
9 files changed, 220 insertions(+), 402 deletions(-)
diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index 4a76e34..715717c 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp
@@ -35,9 +35,9 @@ namespace datasketches {
* See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
*
* <p>This is a stochastic streaming sketch that enables near real-time analysis of the
- * approximate distribution of values from a very large stream in a single pass, requiring only
- * that the values are comparable.
- * The analysis is obtained using <i>get_quantile()</i> or <i>get_quantiles()</i> functions or the
+ * approximate distribution of items from a very large stream in a single pass, requiring only
+ * that the items are comparable.
+ * The analysis is obtained using <i>get_quantile()</i> function or the
* inverse functions get_rank(), get_PMF() (Probability Mass Function), and get_CDF()
* (Cumulative Distribution Function).
*
@@ -45,14 +45,15 @@ namespace datasketches {
* with the equivalent Java implementation only when template parameter T = float
* (32-bit single precision values).
*
- * <p>Given an input stream of <i>N</i> numeric values, the <i>absolute rank</i> of any specific
- * value is defined as its index <i>(0 to N-1)</i> in the hypothetical sorted stream of all
- * <i>N</i> input values.
+ * <p>Given an input stream of <i>N</i> items, the <i>natural rank</i> of any specific
+ * item is defined as its index <i>(1 to N)</i> in inclusive mode
+ * or <i>(0 to N-1)</i> in exclusive mode
+ * in the hypothetical sorted stream of all <i>N</i> input items.
*
- * <p>The <i>normalized rank</i> (<i>rank</i>) of any specific value is defined as its
- * <i>absolute rank</i> divided by <i>N</i>.
- * Thus, the <i>normalized rank</i> is a value between zero and one.
- * In the documentation for this sketch <i>absolute rank</i> is never used so any
+ * <p>The <i>normalized rank</i> (<i>rank</i>) of any specific item is defined as its
+ * <i>natural rank</i> divided by <i>N</i>.
+ * Thus, the <i>normalized rank</i> is between zero and one.
+ * In the documentation for this sketch <i>natural rank</i> is never used so any
* reference to just <i>rank</i> should be interpreted to mean <i>normalized rank</i>.
*
* <p>This sketch is configured with a parameter <i>k</i>, which affects the size of the sketch
@@ -61,18 +62,18 @@ namespace datasketches {
* <p>The estimation error is commonly called <i>epsilon</i> (or <i>eps</i>) and is a fraction
* between zero and one. Larger values of <i>k</i> result in smaller values of epsilon.
* Epsilon is always with respect to the rank and cannot be applied to the
- * corresponding values.
+ * corresponding items.
*
- * <p>The relationship between the normalized rank and the corresponding values can be viewed
+ * <p>The relationship between the normalized rank and the corresponding items can be viewed
* as a two dimensional monotonic plot with the normalized rank on one axis and the
- * corresponding values on the other axis. If the y-axis is specified as the value-axis and
+ * corresponding items on the other axis. If the y-axis is specified as the item-axis and
* the x-axis as the normalized rank, then <i>y = get_quantile(x)</i> is a monotonically
* increasing function.
*
- * <p>The functions <i>get_quantile(rank)</i> and get_quantiles(...) translate ranks into
- * corresponding values. The functions <i>get_rank(value),
+ * <p>The function <i>get_quantile(rank)</i> translates ranks into
+ * corresponding quantiles. The functions <i>get_rank(item),
* get_CDF(...) (Cumulative Distribution Function), and get_PMF(...)
- * (Probability Mass Function)</i> perform the opposite operation and translate values into ranks.
+ * (Probability Mass Function)</i> perform the opposite operation and translate items into ranks.
*
* <p>The <i>getPMF(...)</i> function has about 13 to 47% worse rank error (depending
* on <i>k</i>) than the other queries because the mass of each "bin" of the PMF has
@@ -84,60 +85,60 @@ namespace datasketches {
*
* <p>A <i>get_quantile(rank)</i> query has the following guarantees:
* <ul>
- * <li>Let <i>v = get_quantile(r)</i> where <i>r</i> is the rank between zero and one.</li>
- * <li>The value <i>v</i> will be a value from the input stream.</li>
- * <li>Let <i>trueRank</i> be the true rank of <i>v</i> derived from the hypothetical sorted
- * stream of all <i>N</i> values.</li>
+ * <li>Let <i>q = get_quantile(r)</i> where <i>r</i> is the rank between zero and one.</li>
+ * <li>The quantile <i>q</i> will be an item from the input stream.</li>
+ * <li>Let <i>trueRank</i> be the true rank of <i>q</i> derived from the hypothetical sorted
+ * stream of all <i>N</i> items.</li>
* <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
* <li>Then <i>r - eps ≤ trueRank ≤ r + eps</i> with a confidence of 99%. Note that the
- * error is on the rank, not the value.</li>
+ * error is on the rank, not the quantile.</li>
* </ul>
*
- * <p>A <i>get_rank(value)</i> query has the following guarantees:
+ * <p>A <i>get_rank(item)</i> query has the following guarantees:
* <ul>
- * <li>Let <i>r = get_rank(v)</i> where <i>v</i> is a value between the min and max values of
+ * <li>Let <i>r = get_rank(i)</i> where <i>i</i> is an item between the min and max items of
* the input stream.</li>
- * <li>Let <i>true_rank</i> be the true rank of <i>v</i> derived from the hypothetical sorted
- * stream of all <i>N</i> values.</li>
+ * <li>Let <i>true_rank</i> be the true rank of <i>i</i> derived from the hypothetical sorted
+ * stream of all <i>N</i> items.</li>
* <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
* <li>Then <i>r - eps ≤ trueRank ≤ r + eps</i> with a confidence of 99%.</li>
* </ul>
*
* <p>A <i>get_PMF()</i> query has the following guarantees:
* <ul>
- * <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(v1, v2, ..., vm)</i> where <i>v1, v2</i> are values
- * between the min and max values of the input stream.
- * <li>Let <i>mass<sub>i</sub> = estimated mass between v<sub>i</sub> and v<sub>i+1</sub></i>.</li>
- * <li>Let <i>trueMass</i> be the true mass between the values of <i>v<sub>i</sub>,
- * v<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> values.</li>
+ * <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(s1, s2, ..., sm)</i> where <i>s1, s2</i> are
+ * split points (items from the input domain) between the min and max items of the input stream.
+ * <li>Let <i>mass<sub>i</sub> = estimated mass between s<sub>i</sub> and s<sub>i+1</sub></i>.</li>
+ * <li>Let <i>trueMass</i> be the true mass between the items of <i>s<sub>i</sub>,
+ * s<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> items.</li>
* <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
* <li>then <i>mass - eps ≤ trueMass ≤ mass + eps</i> with a confidence of 99%.</li>
- * <li>r(m+1) includes the mass of all points larger than vm.</li>
+ * <li>r(m+1) includes the mass of all points larger than sm.</li>
* </ul>
*
* <p>A <i>get_CDF(...)</i> query has the following guarantees;
* <ul>
- * <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(v1, v2, ..., vm)</i> where <i>v1, v2</i> are values
- * between the min and max values of the input stream.
+ * <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(s1, s2, ..., sm)</i> where <i>s1, s2, ...</i> are
+ * split points (items from the input domain) between the min and max items of the input stream.
* <li>Let <i>mass<sub>i</sub> = r<sub>i+1</sub> - r<sub>i</sub></i>.</li>
- * <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>v<sub>i</sub>,
- * v<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> values.</li>
+ * <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>s<sub>i</sub>,
+ * s<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> items.</li>
* <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
* <li>then <i>mass - eps ≤ trueMass ≤ mass + eps</i> with a confidence of 99%.</li>
- * <li>1 - r(m+1) includes the mass of all points larger than vm.</li>
+ * <li>1 - r(m+1) includes the mass of all points larger than sm.</li>
* </ul>
*
* <p>From the above, it might seem like we could make some estimates to bound the
- * <em>value</em> returned from a call to <em>get_quantile()</em>. The sketch, however, does not
- * let us derive error bounds or confidences around values. Because errors are independent, we
+ * <em>item</em> returned from a call to <em>get_quantile()</em>. The sketch, however, does not
+ * let us derive error bounds or confidences around items. Because errors are independent, we
* can approximately bracket a value as shown below, but there are no error estimates available.
* Additionally, the interval may be quite large for certain distributions.
* <ul>
- * <li>Let <i>v = get_quantile(r)</i>, the estimated quantile value of rank <i>r</i>.</li>
+ * <li>Let <i>q = get_quantile(r)</i>, the estimated quantile of rank <i>r</i>.</li>
* <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
- * <li>Let <i>v<sub>lo</sub></i> = estimated quantile value of rank <i>(r - eps)</i>.</li>
- * <li>Let <i>v<sub>hi</sub></i> = estimated quantile value of rank <i>(r + eps)</i>.</li>
- * <li>Then <i>v<sub>lo</sub> ≤ v ≤ v<sub>hi</sub></i>, with 99% confidence.</li>
+ * <li>Let <i>q<sub>lo</sub></i> = estimated quantile of rank <i>(r - eps)</i>.</li>
+ * <li>Let <i>q<sub>hi</sub></i> = estimated quantile of rank <i>(r + eps)</i>.</li>
+ * <li>Then <i>q<sub>lo</sub> ≤ q ≤ q<sub>hi</sub></i>, with 99% confidence.</li>
* </ul>
*
* author Kevin Lang
@@ -145,13 +146,6 @@ namespace datasketches {
* author Lee Rhodes
*/
-template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
-template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
-template<typename A> using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
-template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
-template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
-template<typename A> using vector_d = std::vector<double, AllocD<A>>;
-
namespace kll_constants {
const uint16_t DEFAULT_K = 200;
}
@@ -165,6 +159,7 @@ class kll_sketch {
public:
using value_type = T;
using comparator = C;
+ using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
static const uint8_t DEFAULT_M = 8;
static const uint16_t MIN_K = DEFAULT_M;
@@ -266,36 +261,6 @@ class kll_sketch {
using quantile_return_type = typename quantile_sketch_sorted_view<T, C, A>::quantile_return_type;
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
- /**
- * This returns an array that could have been generated by using get_quantile() for each
- * rank separately.
- *
- * <p>If the sketch is empty this returns an empty vector.
- *
- * @param ranks given array of ranks in the hypothetical sorted stream.
- * These ranks must be in the interval [0.0, 1.0].
- * @param inclusive if true, the given ranks are considered inclusive (include weights of items)
- *
- * @return array of approximate quantiles corresponding to the given ranks in the same order.
- */
- std::vector<T, A> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
-
- /**
- * This is a multiple-query version of get_quantile() that allows the caller to
- * specify the number of evenly-spaced ranks.
- *
- * <p>If the sketch is empty this returns an empty vector.
- *
- * @param num an integer that specifies the number of evenly-spaced ranks.
- * This must be an integer greater than 0. A value of 1 will return the quantile of rank 0.
- * A value of 2 will return quantiles of ranks 0 and 1. A value of 3 will return quantiles of ranks 0,
- * 0.5 (median) and 1, etc.
- * @param inclusive if true, the ranks are considered inclusive (include weights of items)
- *
- * @return array of approximate quantiles corresponding to the given number of evenly-spaced ranks.
- */
- std::vector<T, A> get_quantiles(uint32_t num, bool inclusive = true) const;
-
/**
* Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
*
@@ -323,18 +288,20 @@ class kll_sketch {
* <p>If the sketch is empty this returns an empty vector.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
- * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
+ *
* @param size the number of split points in the array
*
- * @param inclusive if false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
- * split point, with the exception that the last interval will include the maximum value.
- * If true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
- * split point.
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
+ * if the sketch contains items equal to a slit point, then in PMF such items are
+ * included into the interval to the left of split point. Otherwise they are included into the interval
+ * to the right of split point.
*
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
- vector_d<A> get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
+ using vector_double = typename quantile_sketch_sorted_view<T, C, A>::vector_double;
+ vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
/**
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
@@ -347,19 +314,21 @@ class kll_sketch {
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
+ *
* @param size the number of split points in the array
*
- * @param inclusive if false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
- * split point, with the exception that the last interval will include the maximum value.
- * If true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
- * split point.
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
+ * if the sketch contains items equal to a slit point, then in CDF such items are
+ * included into the interval to the left of split point. Otherwise they are included into
+ * the interval to the right of split point.
*
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
* of the input stream given the split_points. The value at array position j of the returned
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
- * array.
+ * array. This can be viewed as array of ranks of the given split points plus one more value
+ * that is always 1.
*/
- vector_d<A> get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
+ vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
/**
* Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
@@ -425,7 +394,7 @@ class kll_sketch {
// This is a convenience alias for users
// The type returned by the following serialize method
- using vector_bytes = vector_u8<A>;
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
/**
* This method serializes the sketch as a vector of bytes.
@@ -513,22 +482,22 @@ class kll_sketch {
uint8_t num_levels_;
bool is_level_zero_sorted_;
uint64_t n_;
- vector_u32<A> levels_;
+ vector_u32 levels_;
T* items_;
uint32_t items_size_;
- T* min_value_;
- T* max_value_;
+ T* min_item_;
+ T* max_item_;
mutable quantile_sketch_sorted_view<T, C, A>* sorted_view_;
// for deserialization
class item_deleter;
class items_deleter;
- kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
- std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
- std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted);
+ kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_item,
+ std::unique_ptr<T, item_deleter> max_item, bool is_level_zero_sorted);
// common update code
- inline void update_min_max(const T& value);
+ inline void update_min_max(const T& item);
inline uint32_t internal_update();
// The following code is only valid in the special case of exactly reaching capacity while updating.
@@ -557,24 +526,23 @@ class kll_sketch {
// implementations for floating point types
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
- static const TT& get_invalid_value() {
- static TT value = std::numeric_limits<TT>::quiet_NaN();
- return value;
+ static const TT& get_invalid_item() {
+ return std::numeric_limits<TT>::quiet_NaN();
}
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
- static inline bool check_update_value(TT value) {
- return !std::isnan(value);
+ static inline bool check_update_item(TT item) {
+ return !std::isnan(item);
}
// implementations for all other types
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
- static const TT& get_invalid_value() {
- throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of value");
+ static const TT& get_invalid_item() {
+ throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of item");
}
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
- static inline bool check_update_value(TT) {
+ static inline bool check_update_item(TT) {
return true;
}
diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp
index c83c6bd..85eb1c2 100644
--- a/kll/include/kll_sketch_impl.hpp
+++ b/kll/include/kll_sketch_impl.hpp
@@ -44,8 +44,8 @@ n_(0),
levels_(2, 0, allocator),
items_(nullptr),
items_size_(k_),
-min_value_(nullptr),
-max_value_(nullptr),
+min_item_(nullptr),
+max_item_(nullptr),
sorted_view_(nullptr)
{
if (k < MIN_K || k > MAX_K) {
@@ -67,14 +67,14 @@ n_(other.n_),
levels_(other.levels_),
items_(nullptr),
items_size_(other.items_size_),
-min_value_(nullptr),
-max_value_(nullptr),
+min_item_(nullptr),
+max_item_(nullptr),
sorted_view_(nullptr)
{
items_ = allocator_.allocate(items_size_);
for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
- if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
- if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
+ if (other.min_item_ != nullptr) min_item_ = new (allocator_.allocate(1)) T(*other.min_item_);
+ if (other.max_item_ != nullptr) max_item_ = new (allocator_.allocate(1)) T(*other.max_item_);
}
template<typename T, typename C, typename A>
@@ -89,13 +89,13 @@ n_(other.n_),
levels_(std::move(other.levels_)),
items_(other.items_),
items_size_(other.items_size_),
-min_value_(other.min_value_),
-max_value_(other.max_value_),
+min_item_(other.min_item_),
+max_item_(other.max_item_),
sorted_view_(nullptr)
{
other.items_ = nullptr;
- other.min_value_ = nullptr;
- other.max_value_ = nullptr;
+ other.min_item_ = nullptr;
+ other.max_item_ = nullptr;
}
template<typename T, typename C, typename A>
@@ -111,8 +111,8 @@ kll_sketch<T, C, A>& kll_sketch<T, C, A>::operator=(const kll_sketch& other) {
std::swap(levels_, copy.levels_);
std::swap(items_, copy.items_);
std::swap(items_size_, copy.items_size_);
- std::swap(min_value_, copy.min_value_);
- std::swap(max_value_, copy.max_value_);
+ std::swap(min_item_, copy.min_item_);
+ std::swap(max_item_, copy.max_item_);
reset_sorted_view();
return *this;
}
@@ -129,8 +129,8 @@ kll_sketch<T, C, A>& kll_sketch<T, C, A>::operator=(kll_sketch&& other) {
std::swap(levels_, other.levels_);
std::swap(items_, other.items_);
std::swap(items_size_, other.items_size_);
- std::swap(min_value_, other.min_value_);
- std::swap(max_value_, other.max_value_);
+ std::swap(min_item_, other.min_item_);
+ std::swap(max_item_, other.max_item_);
reset_sorted_view();
return *this;
}
@@ -143,13 +143,13 @@ kll_sketch<T, C, A>::~kll_sketch() {
for (uint32_t i = begin; i < end; i++) items_[i].~T();
allocator_.deallocate(items_, items_size_);
}
- if (min_value_ != nullptr) {
- min_value_->~T();
- allocator_.deallocate(min_value_, 1);
+ if (min_item_ != nullptr) {
+ min_item_->~T();
+ allocator_.deallocate(min_item_, 1);
}
- if (max_value_ != nullptr) {
- max_value_->~T();
- allocator_.deallocate(max_value_, 1);
+ if (max_item_ != nullptr) {
+ max_item_->~T();
+ allocator_.deallocate(max_item_, 1);
}
reset_sorted_view();
}
@@ -167,8 +167,8 @@ n_(other.n_),
levels_(other.levels_, allocator_),
items_(nullptr),
items_size_(other.items_size_),
-min_value_(nullptr),
-max_value_(nullptr),
+min_item_(nullptr),
+max_item_(nullptr),
sorted_view_(nullptr)
{
static_assert(
@@ -177,29 +177,29 @@ sorted_view_(nullptr)
);
items_ = allocator_.allocate(items_size_);
for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
- if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
- if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
+ if (other.min_item_ != nullptr) min_item_ = new (allocator_.allocate(1)) T(*other.min_item_);
+ if (other.max_item_ != nullptr) max_item_ = new (allocator_.allocate(1)) T(*other.max_item_);
check_sorting();
}
template<typename T, typename C, typename A>
template<typename FwdT>
-void kll_sketch<T, C, A>::update(FwdT&& value) {
- if (!check_update_value(value)) { return; }
- update_min_max(value);
+void kll_sketch<T, C, A>::update(FwdT&& item) {
+ if (!check_update_item(item)) { return; }
+ update_min_max(item);
const uint32_t index = internal_update();
- new (&items_[index]) T(std::forward<FwdT>(value));
+ new (&items_[index]) T(std::forward<FwdT>(item));
reset_sorted_view();
}
template<typename T, typename C, typename A>
-void kll_sketch<T, C, A>::update_min_max(const T& value) {
+void kll_sketch<T, C, A>::update_min_max(const T& item) {
if (is_empty()) {
- min_value_ = new (allocator_.allocate(1)) T(value);
- max_value_ = new (allocator_.allocate(1)) T(value);
+ min_item_ = new (allocator_.allocate(1)) T(item);
+ max_item_ = new (allocator_.allocate(1)) T(item);
} else {
- if (C()(value, *min_value_)) *min_value_ = value;
- if (C()(*max_value_, value)) *max_value_ = value;
+ if (C()(item, *min_item_)) *min_item_ = item;
+ if (C()(*max_item_, item)) *max_item_ = item;
}
}
@@ -219,11 +219,11 @@ void kll_sketch<T, C, A>::merge(FwdSk&& other) {
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
}
if (is_empty()) {
- min_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.min_value_));
- max_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.max_value_));
+ min_item_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.min_item_));
+ max_item_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.max_item_));
} else {
- if (C()(*other.min_value_, *min_value_)) *min_value_ = conditional_forward<FwdSk>(*other.min_value_);
- if (C()(*max_value_, *other.max_value_)) *max_value_ = conditional_forward<FwdSk>(*other.max_value_);
+ if (C()(*other.min_item_, *min_item_)) *min_item_ = conditional_forward<FwdSk>(*other.min_item_);
+ if (C()(*max_item_, *other.max_item_)) *max_item_ = conditional_forward<FwdSk>(*other.max_item_);
}
const uint64_t final_n = n_ + other.n_;
for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
@@ -264,14 +264,14 @@ bool kll_sketch<T, C, A>::is_estimation_mode() const {
template<typename T, typename C, typename A>
T kll_sketch<T, C, A>::get_min_item() const {
- if (is_empty()) return get_invalid_value();
- return *min_value_;
+ if (is_empty()) return get_invalid_item();
+ return *min_item_;
}
template<typename T, typename C, typename A>
T kll_sketch<T, C, A>::get_max_item() const {
- if (is_empty()) return get_invalid_value();
- return *max_value_;
+ if (is_empty()) return get_invalid_item();
+ return *max_item_;
}
template<typename T, typename C, typename A>
@@ -281,7 +281,7 @@ C kll_sketch<T, C, A>::get_comparator() const {
template<typename T, typename C, typename A>
auto kll_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> quantile_return_type {
- if (is_empty()) return get_invalid_value();
+ if (is_empty()) return get_invalid_item();
if ((rank < 0.0) || (rank > 1.0)) {
throw std::invalid_argument("normalized rank cannot be less than zero or greater than 1.0");
}
@@ -290,42 +290,6 @@ auto kll_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> qua
return sorted_view_->get_quantile(rank, inclusive);
}
-template<typename T, typename C, typename A>
-std::vector<T, A> kll_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size, bool inclusive) const {
- std::vector<T, A> quantiles(allocator_);
- if (is_empty()) return quantiles;
- quantiles.reserve(size);
-
- // may have a side effect of sorting level zero if needed
- setup_sorted_view();
-
- for (uint32_t i = 0; i < size; i++) {
- const double rank = ranks[i];
- if ((rank < 0.0) || (rank > 1.0)) {
- throw std::invalid_argument("normalized rank cannot be less than 0 or greater than 1");
- }
- quantiles.push_back(sorted_view_->get_quantile(rank, inclusive));
- }
- return quantiles;
-}
-
-template<typename T, typename C, typename A>
-std::vector<T, A> kll_sketch<T, C, A>::get_quantiles(uint32_t num, bool inclusive) const {
- if (is_empty()) return std::vector<T, A>(allocator_);
- if (num == 0) {
- throw std::invalid_argument("num must be > 0");
- }
- vector_d<A> ranks(num, 0, allocator_);
- ranks[0] = 0.0;
- for (size_t i = 1; i < num; i++) {
- ranks[i] = static_cast<double>(i) / (num - 1);
- }
- if (num > 1) {
- ranks[num - 1] = 1.0;
- }
- return get_quantiles(ranks.data(), num, inclusive);
-}
-
template<typename T, typename C, typename A>
double kll_sketch<T, C, A>::get_rank(const T& item, bool inclusive) const {
if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
@@ -334,13 +298,13 @@ double kll_sketch<T, C, A>::get_rank(const T& item, bool inclusive) const {
}
template<typename T, typename C, typename A>
-vector_d<A> kll_sketch<T, C, A>::get_PMF(const T* split_points, uint32_t size, bool inclusive) const {
+auto kll_sketch<T, C, A>::get_PMF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
setup_sorted_view();
return sorted_view_->get_PMF(split_points, size, inclusive);
}
template<typename T, typename C, typename A>
-vector_d<A> kll_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size, bool inclusive) const {
+auto kll_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
setup_sorted_view();
return sorted_view_->get_CDF(split_points, size, inclusive);
}
@@ -372,8 +336,8 @@ size_t kll_sketch<T, C, A>::get_serialized_size_bytes(const SerDe& sd) const {
}
// the last integer in the levels_ array is not serialized because it can be derived
size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
- size += sd.size_of_item(*min_value_);
- size += sd.size_of_item(*max_value_);
+ size += sd.size_of_item(*min_item_);
+ size += sd.size_of_item(*max_item_);
for (auto it: *this) size += sd.size_of_item(it.first);
return size;
}
@@ -425,18 +389,18 @@ void kll_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& sd) const {
write(os, num_levels_);
write(os, unused);
write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
- sd.serialize(os, min_value_, 1);
- sd.serialize(os, max_value_, 1);
+ sd.serialize(os, min_item_, 1);
+ sd.serialize(os, max_item_, 1);
}
sd.serialize(os, &items_[levels_[0]], get_num_retained());
}
template<typename T, typename C, typename A>
template<typename SerDe>
-vector_u8<A> kll_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
+auto kll_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
const bool is_single_item = n_ == 1;
const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
- vector_u8<A> bytes(size, 0, allocator_);
+ vector_bytes bytes(size, 0, allocator_);
uint8_t* ptr = bytes.data() + header_size_bytes;
const uint8_t* end_ptr = ptr + size;
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
@@ -461,8 +425,8 @@ vector_u8<A> kll_sketch<T, C, A>::serialize(unsigned header_size_bytes, const Se
ptr += copy_to_mem(num_levels_, ptr);
ptr += sizeof(uint8_t); // unused
ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
- ptr += sd.serialize(ptr, end_ptr - ptr, min_value_, 1);
- ptr += sd.serialize(ptr, end_ptr - ptr, max_value_, 1);
+ ptr += sd.serialize(ptr, end_ptr - ptr, min_item_, 1);
+ ptr += sd.serialize(ptr, end_ptr - ptr, max_item_, 1);
}
const size_t bytes_remaining = end_ptr - ptr;
ptr += sd.serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
@@ -506,7 +470,7 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
num_levels = read<uint8_t>(is);
read<uint8_t>(is); // skip unused byte
}
- vector_u32<A> levels(num_levels + 1, 0, allocator);
+ vector_u32 levels(num_levels + 1, 0, allocator);
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
if (is_single_item) {
levels[0] = capacity - 1;
@@ -517,17 +481,17 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
levels[num_levels] = capacity;
A alloc(allocator);
auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
- std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
- std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
- std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
- std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_item_buffer(alloc.allocate(1), item_buffer_deleter);
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_item_buffer(alloc.allocate(1), item_buffer_deleter);
+ std::unique_ptr<T, item_deleter> min_item(nullptr, item_deleter(allocator));
+ std::unique_ptr<T, item_deleter> max_item(nullptr, item_deleter(allocator));
if (!is_single_item) {
- sd.deserialize(is, min_value_buffer.get(), 1);
+ sd.deserialize(is, min_item_buffer.get(), 1);
// serde call did not throw, repackage with destrtuctor
- min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
- sd.deserialize(is, max_value_buffer.get(), 1);
+ min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
+ sd.deserialize(is, max_item_buffer.get(), 1);
// serde call did not throw, repackage with destrtuctor
- max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
+ max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
}
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
@@ -537,17 +501,17 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
if (is_single_item) {
- new (min_value_buffer.get()) T(items.get()[levels[0]]);
+ new (min_item_buffer.get()) T(items.get()[levels[0]]);
// copy did not throw, repackage with destrtuctor
- min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
- new (max_value_buffer.get()) T(items.get()[levels[0]]);
+ min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
+ new (max_item_buffer.get()) T(items.get()[levels[0]]);
// copy did not throw, repackage with destrtuctor
- max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
+ max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
}
if (!is.good())
throw std::runtime_error("error reading from std::istream");
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
- std::move(min_value), std::move(max_value), is_level_zero_sorted);
+ std::move(min_item), std::move(max_item), is_level_zero_sorted);
}
template<typename T, typename C, typename A>
@@ -593,7 +557,7 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
ptr += copy_from_mem(ptr, num_levels);
ptr += sizeof(uint8_t); // skip unused byte
}
- vector_u32<A> levels(num_levels + 1, 0, allocator);
+ vector_u32 levels(num_levels + 1, 0, allocator);
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
if (is_single_item) {
levels[0] = capacity - 1;
@@ -604,17 +568,17 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
levels[num_levels] = capacity;
A alloc(allocator);
auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
- std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
- std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
- std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
- std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_item_buffer(alloc.allocate(1), item_buffer_deleter);
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_item_buffer(alloc.allocate(1), item_buffer_deleter);
+ std::unique_ptr<T, item_deleter> min_item(nullptr, item_deleter(allocator));
+ std::unique_ptr<T, item_deleter> max_item(nullptr, item_deleter(allocator));
if (!is_single_item) {
- ptr += sd.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
+ ptr += sd.deserialize(ptr, end_ptr - ptr, min_item_buffer.get(), 1);
// serde call did not throw, repackage with destrtuctor
- min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
- ptr += sd.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
+ min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
+ ptr += sd.deserialize(ptr, end_ptr - ptr, max_item_buffer.get(), 1);
// serde call did not throw, repackage with destrtuctor
- max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
+ max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
}
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
@@ -626,15 +590,15 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
if (is_single_item) {
- new (min_value_buffer.get()) T(items.get()[levels[0]]);
+ new (min_item_buffer.get()) T(items.get()[levels[0]]);
// copy did not throw, repackage with destrtuctor
- min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
- new (max_value_buffer.get()) T(items.get()[levels[0]]);
+ min_item = std::unique_ptr<T, item_deleter>(min_item_buffer.release(), item_deleter(allocator));
+ new (max_item_buffer.get()) T(items.get()[levels[0]]);
// copy did not throw, repackage with destrtuctor
- max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
+ max_item = std::unique_ptr<T, item_deleter>(max_item_buffer.release(), item_deleter(allocator));
}
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
- std::move(min_value), std::move(max_value), is_level_zero_sorted);
+ std::move(min_item), std::move(max_item), is_level_zero_sorted);
}
/*
@@ -653,9 +617,9 @@ double kll_sketch<T, C, A>::get_normalized_rank_error(uint16_t k, bool pmf) {
// for deserialization
template<typename T, typename C, typename A>
-kll_sketch<T, C, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
- std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
- std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted):
+kll_sketch<T, C, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_item,
+ std::unique_ptr<T, item_deleter> max_item, bool is_level_zero_sorted):
allocator_(levels.get_allocator()),
k_(k),
m_(DEFAULT_M),
@@ -666,8 +630,8 @@ n_(n),
levels_(std::move(levels)),
items_(items.release()),
items_size_(items_size),
-min_value_(min_value.release()),
-max_value_(max_value.release()),
+min_item_(min_item.release()),
+max_item_(max_item.release()),
sorted_view_(nullptr)
{}
@@ -818,8 +782,8 @@ void kll_sketch<T, C, A>::merge_higher_levels(O&& other, uint64_t final_n) {
const std::unique_ptr<T, decltype(tmp_items_deleter)> workbuf(allocator_.allocate(tmp_num_items), tmp_items_deleter);
const uint8_t ub = kll_helper::ub_on_num_levels(final_n);
const size_t work_levels_size = ub + 2; // ub+1 does not work
- vector_u32<A> worklevels(work_levels_size, 0, allocator_);
- vector_u32<A> outlevels(work_levels_size, 0, allocator_);
+ vector_u32 worklevels(work_levels_size, 0, allocator_);
+ vector_u32 outlevels(work_levels_size, 0, allocator_);
const uint8_t provisional_num_levels = std::max(num_levels_, other.num_levels_);
@@ -959,8 +923,8 @@ string<A> kll_sketch<T, C, A>::to_string(bool print_levels, bool print_items) co
os << " Capacity items : " << items_size_ << std::endl;
os << " Retained items : " << get_num_retained() << std::endl;
if (!is_empty()) {
- os << " Min value : " << *min_value_ << std::endl;
- os << " Max value : " << *max_value_ << std::endl;
+ os << " Min item : " << *min_item_ << std::endl;
+ os << " Max item : " << *max_item_ << std::endl;
}
os << "### End sketch summary" << std::endl;
diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp
index 0d9d5a6..aec8535 100644
--- a/kll/test/kll_sketch_test.cpp
+++ b/kll/test/kll_sketch_test.cpp
@@ -67,8 +67,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
REQUIRE(std::isnan(sketch.get_min_item()));
REQUIRE(std::isnan(sketch.get_max_item()));
REQUIRE(std::isnan(sketch.get_quantile(0.5)));
- const double fractions[3] {0, 0.5, 1};
- REQUIRE(sketch.get_quantiles(fractions, 3).size() == 0);
const float split_points[1] {0};
REQUIRE(sketch.get_PMF(split_points, 1).size() == 0);
REQUIRE(sketch.get_CDF(split_points, 1).size() == 0);
@@ -99,12 +97,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
REQUIRE(sketch.get_min_item() == 1.0);
REQUIRE(sketch.get_max_item() == 1.0);
REQUIRE(sketch.get_quantile(0.5) == 1.0);
- const double fractions[3] {0, 0.5, 1};
- auto quantiles = sketch.get_quantiles(fractions, 3);
- REQUIRE(quantiles.size() == 3);
- REQUIRE(quantiles[0] == 1.0);
- REQUIRE(quantiles[1] == 1.0);
- REQUIRE(quantiles[2] == 1.0);
int count = 0;
for (auto it: sketch) {
@@ -139,26 +131,12 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
REQUIRE(sketch.get_max_item() == n - 1);
REQUIRE(sketch.get_quantile(1) == n - 1);
- const double ranks[3] {0, 0.5, 1};
- auto quantiles = sketch.get_quantiles(ranks, 3, false);
- REQUIRE(quantiles.size() == 3);
- REQUIRE(quantiles[0] == 0.0);
- REQUIRE(quantiles[1] == n / 2);
- REQUIRE(quantiles[2] == n - 1 );
-
for (uint32_t i = 0; i < n; i++) {
const double true_rank = (double) i / n;
REQUIRE(sketch.get_rank(static_cast<float>(i), false) == true_rank);
const double true_rank_inclusive = (double) (i + 1) / n;
REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank_inclusive);
}
-
- // the alternative method must produce the same result
- auto quantiles2 = sketch.get_quantiles(3, false);
- REQUIRE(quantiles2.size() == 3);
- REQUIRE(quantiles[0] == quantiles2[0]);
- REQUIRE(quantiles[1] == quantiles2[1]);
- REQUIRE(quantiles[2] == quantiles2[2]);
}
SECTION("10 items") {
@@ -207,25 +185,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
REQUIRE(sketch.get_rank(static_cast<float>(i), false) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
}
- // test quantiles at every 0.1 percentage point
- double ranks[1001];
- double reverse_ranks[1001]; // check that ordering does not matter
- for (int i = 0; i < 1001; i++) {
- ranks[i] = (double) i / 1000;
- reverse_ranks[1000 - i] = ranks[i];
- }
- auto quantiles = sketch.get_quantiles(ranks, 1001);
- auto reverse_quantiles = sketch.get_quantiles(reverse_ranks, 1001);
- float previous_quantile = 0;
- for (int i = 0; i < 1001; i++) {
- // expensive in a loop, just to check the equivalence here, not advised for real code
- const float quantile = sketch.get_quantile(ranks[i]);
- REQUIRE(quantiles[i] == quantile);
- REQUIRE(reverse_quantiles[1000 - i] == quantile);
- REQUIRE(previous_quantile <= quantile);
- previous_quantile = quantile;
- }
-
//std::cout << sketch.to_string();
uint32_t count = 0;
diff --git a/kll/test/kll_sketch_validation.cpp b/kll/test/kll_sketch_validation.cpp
index 31ab2c1..2dbf60d 100644
--- a/kll/test/kll_sketch_validation.cpp
+++ b/kll/test/kll_sketch_validation.cpp
@@ -151,11 +151,11 @@ const int64_t correct_results[num_tests * 7] = {
113, 200, 8311133, 6554171, 16, 637, 121111429906734123
};
-static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
+static std::vector<int> make_input_array(unsigned n, unsigned stride) {
if (!kll_helper::is_odd(stride)) throw std::logic_error("stride must be odd");
unsigned mask = (1 << 23) - 1; // because items are single-precision floats at the moment
unsigned cur = 0;
- std::unique_ptr<int[]> arr(new int[n]);
+ std::vector<int> arr(n, 0);
for (unsigned i = 0; i < n; i++) {
cur += stride;
cur &= mask;
@@ -187,7 +187,7 @@ TEST_CASE("kll validation", "[kll_sketch][validation]") {
unsigned k = correct_results[7 * i + 1];
unsigned n = correct_results[7 * i + 2];
unsigned stride = correct_results[7 * i + 3];
- std::unique_ptr<int[]> input_array = make_input_array(n, stride);
+ auto input_array = make_input_array(n, stride);
kll_sketch<float> sketch(k);
kll_next_offset = 0;
for (unsigned j = 0; j < n; j++) {
@@ -201,7 +201,7 @@ TEST_CASE("kll validation", "[kll_sketch][validation]") {
if (correct_results[7 * i + 6] == p.first) {
std::cout << " pass" << std::endl;
} else {
- std::cout << " " << (correct_results[7 * i + 6]) << " != " << p.first;
+ std::cout << " " << (correct_results[7 * i + 6]) << " != " << p.first << "\n";
std::cout << sketch.to_string();
FAIL();
}
diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp
index 5a9b95f..d6efec0 100644
--- a/quantiles/include/quantiles_sketch.hpp
+++ b/quantiles/include/quantiles_sketch.hpp
@@ -37,7 +37,7 @@ namespace datasketches {
* the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF().
*
* <p>Consider a large stream of one million values such as packet sizes coming into a network node.
- * The natural rank of any specific size value is simply its index in the hypothetical sorted
+ * The natural rank of any specific size value is its index in the hypothetical sorted
* array of values.
* The normalized rank is the natural rank divided by the stream size,
* in this case one million.
@@ -54,7 +54,7 @@ namespace datasketches {
* <li>20% of the values were ≥ 500 and < 900, and</li>
* <li>10% of the values were ≥ 900.</li>
* </ul>
- * A frequency histogram can be obtained by simply multiplying these fractions by get_n(),
+ * A frequency histogram can be obtained by multiplying these fractions by get_n(),
* which is the total count of values received.
* The get_CDF() works similarly, but produces the cumulative distribution instead.
*
@@ -65,8 +65,8 @@ namespace datasketches {
* <p>The accuracy of this sketch is a function of the configured value <i>k</i>, which also affects
* the overall size of the sketch. Accuracy of this quantile sketch is always with respect to
* the normalized rank. A <i>k</i> of 128 produces a normalized, rank error of about 1.7%.
- * For example, the median value returned from getQuantile(0.5) will be between the actual values
- * from the hypothetically sorted array of input values at normalized ranks of 0.483 and 0.517, with
+ * For example, the median item returned from getQuantile(0.5) will be between the actual items
+ * from the hypothetically sorted array of input items at normalized ranks of 0.483 and 0.517, with
* a confidence of about 99%.</p>
*
* <pre>
@@ -119,17 +119,17 @@ Table Guide for DoublesSketch Size in Bytes and Approximate Error:
* by Agarwal, Cormode, Huang, Phillips, Wei, and Yi.
* <a href="http://dblp.org/rec/html/journals/tods/AgarwalCHPWY13"></a></p>
*
- * <p>This algorithm is independent of the distribution of values and
- * requires only that the values be comparable.</p>
+ * <p>This algorithm is independent of the distribution of items and
+ * requires only that the items be comparable.</p>
*
- * <p>This algorithm intentionally inserts randomness into the sampling process for values that
+ * <p>This algorithm intentionally inserts randomness into the sampling process for items that
* ultimately get retained in the sketch. The results produced by this algorithm are not
* deterministic. For example, if the same stream is inserted into two different instances of this
* sketch, the answers obtained from the two sketches may not be identical.</p>
*
* <p>Similarly, there may be directional inconsistencies. For example, the result
* obtained from get_quantile(rank) input into the reverse directional query
- * get_rank(item) may not result in the original value.</p>
+ * get_rank(item) may not result in the original item.</p>
*
* @author Kevin Lang
* @author Lee Rhodes
@@ -254,37 +254,6 @@ public:
using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
- /**
- * This is a multiple-query version of get_quantile().
- * <p>
- * This returns an array that could have been generated by using get_quantile() for each
- * fractional rank separately.
- *
- * <p>If the sketch is empty this returns an empty vector.
- *
- * @param ranks given array of normalized ranks in the hypothetical sorted stream.
- * These ranks must be in the interval [0.0, 1.0], inclusive.
- *
- * @return array of approximations to items associated with given ranks in the same order as given ranks
- * in the input array.
- */
- std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
-
- /**
- * This is a multiple-query version of get_quantile() that allows the caller to
- * specify the number of evenly-spaced normalized ranks.
- *
- * <p>If the sketch is empty this returns an empty vector.
- *
- * @param num an integer that specifies the number of evenly-spaced ranks.
- * This must be an integer greater than 0. A value of 1 is equivalent to get_quantiles([0]).
- * A value of 2 is equivalent to get_quantiles([0, 1]). A value of 3 is equivalent to
- * get_quantiles([0, 0.5, 1]), etc.
- *
- * @return array of approximations to items associated with the given number of evenly-spaced normalized ranks.
- */
- std::vector<T, Allocator> get_quantiles(uint32_t num, bool inclusive = true) const;
-
/**
* Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
*
@@ -311,17 +280,17 @@ public:
* <p>If the sketch is empty this returns an empty vector.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
- * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
*
* @param size of the array of split points.
*
- * @param inclusive if false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
- * split point, with the exception that the last interval will include the maximum item.
- * If true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
- * split point.
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
+ * if the sketch contains items equal to a slit point, then in PMF such items are
+ * included into the interval to the left of split point. Otherwise they are included into the interval
+ * to the right of split point.
*
* @return an array of m+1 doubles each of which is an approximation
- * to the fraction of the input stream values (the mass) that fall into one of those intervals.
+ * to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
@@ -339,15 +308,16 @@ public:
*
* @param size of the array of split points.
*
- * @param inclusive if false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
- * split point, with the exception that the last interval will include the maximum item.
- * If true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
- * split point.
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
+ * if the sketch contains items equal to a slit point, then in CDF such items are
+ * included into the interval to the left of split point. Otherwise they are included into
+ * the interval to the right of split point.
*
* @return an array of m+1 double values, which are a consecutive approximation to the CDF
* of the input stream given the split_points. The value at array position j of the returned
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
- * array.
+ * array. This can be viewed as array of ranks of the given split points plus one more value
+ * that is always 1.
*/
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
@@ -558,9 +528,8 @@ private:
// implementations for floating point types
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
- static const TT& get_invalid_value() {
- static TT value = std::numeric_limits<TT>::quiet_NaN();
- return value;
+ static const TT& get_invalid_item() {
+ return std::numeric_limits<TT>::quiet_NaN();
}
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
@@ -570,8 +539,8 @@ private:
// implementations for all other types
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
- static const TT& get_invalid_value() {
- throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values");
+ static const TT& get_invalid_item() {
+ throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of items");
}
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp
index f5d80b3..713327b 100644
--- a/quantiles/include/quantiles_sketch_impl.hpp
+++ b/quantiles/include/quantiles_sketch_impl.hpp
@@ -675,13 +675,13 @@ uint32_t quantiles_sketch<T, C, A>::get_num_retained() const {
template<typename T, typename C, typename A>
const T& quantiles_sketch<T, C, A>::get_min_item() const {
- if (is_empty()) return get_invalid_value();
+ if (is_empty()) return get_invalid_item();
return *min_item_;
}
template<typename T, typename C, typename A>
const T& quantiles_sketch<T, C, A>::get_max_item() const {
- if (is_empty()) return get_invalid_value();
+ if (is_empty()) return get_invalid_item();
return *max_item_;
}
@@ -750,7 +750,7 @@ quantile_sketch_sorted_view<T, C, A> quantiles_sketch<T, C, A>::get_sorted_view(
template<typename T, typename C, typename A>
auto quantiles_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> quantile_return_type {
- if (is_empty()) return get_invalid_value();
+ if (is_empty()) return get_invalid_item();
if ((rank < 0.0) || (rank > 1.0)) {
throw std::invalid_argument("Normalized rank cannot be less than 0 or greater than 1");
}
@@ -1111,7 +1111,7 @@ void quantiles_sketch<T, C, A>::standard_merge(quantiles_sketch& tgt, FwdSk&& sr
throw std::logic_error("Failed internal consistency check after standard_merge()");
}
- // update min and max values
+ // update min and max items
// can't just check is_empty() since min and max might not have been set if
// there were no base buffer items added via update()
if (tgt.min_item_ == nullptr) {
@@ -1187,7 +1187,7 @@ void quantiles_sketch<T, C, A>::downsampling_merge(quantiles_sketch& tgt, FwdSk&
throw std::logic_error("Failed internal consistency check after downsampling_merge()");
}
- // update min and max values
+ // update min and max items
// can't just check is_empty() since min and max might not have been set if
// there were no base buffer items added via update()
if (tgt.min_item_ == nullptr) {
diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp
index 0b9a255..cab57ac 100755
--- a/req/include/req_sketch.hpp
+++ b/req/include/req_sketch.hpp
@@ -158,16 +158,16 @@ public:
* <p>If the sketch is empty this returns an empty vector.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
- * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
*
- * @param size of the array of split points.
+ * @param size the number of split points in the array
*
- * @param inclusive if false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
- * split point, with the exception that the last interval will include the maximum item.
- * If true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
- * split point.
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
+ * if the sketch contains items equal to a slit point, then in PMF such items are
+ * included into the interval to the left of split point. Otherwise they are included into the interval
+ * to the right of split point.
*
- * @return an array of m+1 double values each of which is an approximation
+ * @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
@@ -181,17 +181,18 @@ public:
* @param split_points an array of <i>m</i> unique, monotonically increasing items
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
*
- * @param size of the array of split points.
+ * @param size the number of split points in the array
*
- * @param inclusive if false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
- * split point, with the exception that the last interval will include the maximum item.
- * If true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
- * split point.
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
+ * if the sketch contains items equal to a slit point, then in CDF such items are
+ * included into the interval to the left of split point. Otherwise they are included into
+ * the interval to the right of split point.
*
- * @return an array of m+1 double values, which are a consecutive approximation to the CDF
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
* of the input stream given the split_points. The value at array position j of the returned
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
- * array.
+ * array. This can be viewed as array of ranks of the given split points plus one more value
+ * that is always 1.
*/
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
@@ -206,13 +207,6 @@ public:
using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
- /**
- * Returns an array of quantiles that correspond to the given array of normalized ranks.
- * @param ranks given array of normalized ranks.
- * @return array of quantiles that correspond to the given array of normalized ranks
- */
- std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
-
/**
* Returns an approximate lower bound of the given normalized rank.
* @param rank the given rank, a value between 0 and 1.0.
@@ -362,19 +356,18 @@ private:
// implementations for floating point types
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
- static const TT& get_invalid_value() {
- static TT value = std::numeric_limits<TT>::quiet_NaN();
- return value;
+ static const TT& get_invalid_item() {
+ return std::numeric_limits<TT>::quiet_NaN();
}
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
- static inline bool check_update_item(const TT& value) {
- return !std::isnan(value);
+ static inline bool check_update_item(const TT& item) {
+ return !std::isnan(item);
}
// implementations for all other types
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
- static const TT& get_invalid_value() {
+ static const TT& get_invalid_item() {
throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of items");
}
diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp
index d404fd7..6cc714f 100755
--- a/req/include/req_sketch_impl.hpp
+++ b/req/include/req_sketch_impl.hpp
@@ -222,13 +222,13 @@ void req_sketch<T, C, A>::merge(FwdSk&& other) {
template<typename T, typename C, typename A>
const T& req_sketch<T, C, A>::get_min_item() const {
- if (is_empty()) return get_invalid_value();
+ if (is_empty()) return get_invalid_item();
return *min_item_;
}
template<typename T, typename C, typename A>
const T& req_sketch<T, C, A>::get_max_item() const {
- if (is_empty()) return get_invalid_value();
+ if (is_empty()) return get_invalid_item();
return *max_item_;
}
@@ -260,7 +260,7 @@ auto req_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size, bool inc
template<typename T, typename C, typename A>
auto req_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> quantile_return_type {
- if (is_empty()) return get_invalid_value();
+ if (is_empty()) return get_invalid_item();
if ((rank < 0.0) || (rank > 1.0)) {
throw std::invalid_argument("Normalized rank cannot be less than 0 or greater than 1");
}
@@ -269,25 +269,6 @@ auto req_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> qua
return sorted_view_->get_quantile(rank, inclusive);
}
-template<typename T, typename C, typename A>
-std::vector<T, A> req_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size, bool inclusive) const {
- std::vector<T, A> quantiles(allocator_);
- if (is_empty()) return quantiles;
- quantiles.reserve(size);
-
- // possible side-effect of sorting level zero
- setup_sorted_view();
-
- for (uint32_t i = 0; i < size; ++i) {
- const double rank = ranks[i];
- if ((rank < 0.0) || (rank > 1.0)) {
- throw std::invalid_argument("Normalized rank cannot be less than 0 or greater than 1");
- }
- quantiles.push_back(sorted_view_->get_quantile(rank, inclusive));
- }
- return quantiles;
-}
-
template<typename T, typename C, typename A>
quantile_sketch_sorted_view<T, C, A> req_sketch<T, C, A>::get_sorted_view() const {
if (!compactors_[0].is_sorted()) {
diff --git a/req/test/req_sketch_test.cpp b/req/test/req_sketch_test.cpp
index abe4979..e427039 100755
--- a/req/test/req_sketch_test.cpp
+++ b/req/test/req_sketch_test.cpp
@@ -50,8 +50,6 @@ TEST_CASE("req sketch: empty", "[req_sketch]") {
REQUIRE(std::isnan(sketch.get_quantile(0)));
REQUIRE(std::isnan(sketch.get_quantile(0.5)));
REQUIRE(std::isnan(sketch.get_quantile(1)));
- const double ranks[3] {0, 0.5, 1};
- REQUIRE(sketch.get_quantiles(ranks, 3).size() == 0);
const float split_points[1] {0};
REQUIRE(sketch.get_CDF(split_points, 1).empty());
@@ -74,13 +72,6 @@ TEST_CASE("req sketch: single value, lra", "[req_sketch]") {
REQUIRE(sketch.get_quantile(0.5, false) == 1);
REQUIRE(sketch.get_quantile(1, false) == 1);
- const double ranks[3] {0, 0.5, 1};
- auto quantiles = sketch.get_quantiles(ranks, 3, false);
- REQUIRE(quantiles.size() == 3);
- REQUIRE(quantiles[0] == 1);
- REQUIRE(quantiles[1] == 1);
- REQUIRE(quantiles[2] == 1);
-
unsigned count = 0;
for (auto it: sketch) {
REQUIRE(it.second == 1);
@@ -143,13 +134,6 @@ TEST_CASE("req sketch: exact mode", "[req_sketch]") {
REQUIRE(sketch.get_quantile(0.9) == 9);
REQUIRE(sketch.get_quantile(1) == 10);
- const double ranks[3] {0, 0.5, 1};
- auto quantiles = sketch.get_quantiles(ranks, 3, false);
- REQUIRE(quantiles.size() == 3);
- REQUIRE(quantiles[0] == 1);
- REQUIRE(quantiles[1] == 6);
- REQUIRE(quantiles[2] == 10);
-
const float splits[3] {2, 6, 9};
auto cdf = sketch.get_CDF(splits, 3, false);
REQUIRE(cdf[0] == 0.1);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org