You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/01/08 01:35:53 UTC
[datasketches-cpp] 01/01: kll inclusive rank
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch kll_inclusive_rank
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit c0504fcc352ff2ea00af084b5e36a1c4d3019745
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Jan 7 17:35:28 2022 -0800
kll inclusive rank
---
kll/include/kll_sketch.hpp | 26 ++++++++++++++++++++++----
kll/include/kll_sketch_impl.hpp | 20 +++++++++++++-------
kll/test/kll_sketch_test.cpp | 8 ++++++--
req/include/req_sketch.hpp | 3 +--
4 files changed, 42 insertions(+), 15 deletions(-)
diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index fa55520..3a3e7bd 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp
@@ -35,7 +35,7 @@ namespace datasketches {
* and nearly optimal accuracy per retained item.
* See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
*
- * <p>This is a stochastic streaming sketch that enables near-real time analysis of the
+ * <p>This is a stochastic streaming sketch that enables near real-time analysis of the
* approximate distribution of values from a very large stream in a single pass, requiring only
* that the values are comparable.
* The analysis is obtained using <i>get_quantile()</i> or <i>get_quantiles()</i> functions or the
@@ -157,7 +157,12 @@ namespace kll_constants {
const uint16_t DEFAULT_K = 200;
}
-template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
+template <
+ typename T,
+ typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
+ typename S = serde<T>,
+ typename A = std::allocator<T>
+>
class kll_sketch {
public:
using value_type = T;
@@ -309,6 +314,9 @@ class kll_sketch {
/**
* Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
* inclusive.
+ * With the template parameter inclusive=true the weight of the given value is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+ * according to the comparator C.
*
* <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
* get_normalized_rank_error(false) function.
@@ -318,6 +326,7 @@ class kll_sketch {
* @param value to be ranked
* @return an approximate rank of the given value
*/
+ template<bool inclusive = false>
double get_rank(const T& value) const;
/**
@@ -338,9 +347,12 @@ class kll_sketch {
*
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream values (the mass) that fall into one of those intervals.
- * The definition of an "interval" is inclusive of the left split point and exclusive of the right
- * split point, with the exception that the last interval will include maximum value.
+ * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
+ * split point, with the exception that the last interval will include the maximum value.
+ * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
+ * split point.
*/
+ template<bool inclusive = false>
vector_d<A> get_PMF(const T* split_points, uint32_t size) const;
/**
@@ -364,6 +376,7 @@ class kll_sketch {
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
* array.
*/
+ template<bool inclusive = false>
vector_d<A> get_CDF(const T* split_points, uint32_t size) const;
/**
@@ -536,11 +549,16 @@ class kll_sketch {
void add_empty_top_level_to_completely_full_sketch();
void sort_level_zero();
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> get_quantile_calculator();
+
+ template<bool inclusive>
vector_d<A> get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const;
+ template<bool inclusive>
void increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
const T* split_points, uint32_t size, double* buckets) const;
+ template<bool inclusive>
void increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
const T* split_points, uint32_t size, double* buckets) const;
+
template<typename O> void merge_higher_levels(O&& other, uint64_t final_n);
void populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels);
void populate_work_arrays(kll_sketch&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels);
diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp
index e346549..2276b05 100644
--- a/kll/include/kll_sketch_impl.hpp
+++ b/kll/include/kll_sketch_impl.hpp
@@ -320,6 +320,7 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
}
template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
uint8_t level = 0;
@@ -329,7 +330,7 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
const auto from_index(levels_[level]);
const auto to_index(levels_[level + 1]); // exclusive
for (uint32_t i = from_index; i < to_index; i++) {
- if (C()(items_[i], value)) {
+ if (inclusive ? !C()(value, items_[i]) : C()(items_[i], value)) {
total += weight;
} else if ((level > 0) || is_level_zero_sorted_) {
break; // levels above 0 are sorted, no point comparing further
@@ -342,13 +343,15 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
}
template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
- return get_PMF_or_CDF(split_points, size, false);
+ return get_PMF_or_CDF<inclusive>(split_points, size, false);
}
template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
- return get_PMF_or_CDF(split_points, size, true);
+ return get_PMF_or_CDF<inclusive>(split_points, size, true);
}
template<typename T, typename C, typename S, typename A>
@@ -798,6 +801,7 @@ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantil
}
template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
if (is_empty()) return vector_d<A>(allocator_);
kll_helper::validate_values<T, C>(split_points, size);
@@ -808,9 +812,9 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
const auto from_index = levels_[level];
const auto to_index = levels_[level + 1]; // exclusive
if ((level == 0) && !is_level_zero_sorted_) {
- increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
+ increment_buckets_unsorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
} else {
- increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
+ increment_buckets_sorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
}
level++;
weight *= 2;
@@ -831,13 +835,14 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
}
template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
const T* split_points, uint32_t size, double* buckets) const
{
for (uint32_t i = from_index; i < to_index; i++) {
uint32_t j;
for (j = 0; j < size; j++) {
- if (C()(items_[i], split_points[j])) {
+ if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
break;
}
}
@@ -846,13 +851,14 @@ void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_inde
}
template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
const T* split_points, uint32_t size, double* buckets) const
{
uint32_t i = from_index;
uint32_t j = 0;
while ((i < to_index) && (j < size)) {
- if (C()(items_[i], split_points[j])) {
+ if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
buckets[j] += weight; // this sample goes into this bucket
i++; // move on to next sample and see whether it also goes into this bucket
} else {
diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp
index 7b230d7..5da33bc 100644
--- a/kll/test/kll_sketch_test.cpp
+++ b/kll/test/kll_sketch_test.cpp
@@ -90,7 +90,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
REQUIRE(sketch.get_n() == 1);
REQUIRE(sketch.get_num_retained() == 1);
REQUIRE(sketch.get_rank(1.0f) == 0.0);
+ REQUIRE(sketch.get_rank<true>(1.0f) == 1.0);
REQUIRE(sketch.get_rank(2.0f) == 1.0);
+ REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
REQUIRE(sketch.get_min_value() == 1.0);
REQUIRE(sketch.get_max_value() == 1.0);
REQUIRE(sketch.get_quantile(0.5) == 1.0);
@@ -142,8 +144,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
REQUIRE(quantiles[2] == n - 1 );
for (uint32_t i = 0; i < n; i++) {
- const double trueRank = (double) i / n;
- REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
+ const double true_rank = (double) i / n;
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank);
+ const double true_rank_inclusive = (double) (i + 1) / n;
+ REQUIRE(sketch.get_rank<true>(static_cast<float>(i)) == true_rank_inclusive);
}
// the alternative method must produce the same result
diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp
index 779caba..1ef2157 100755
--- a/req/include/req_sketch.hpp
+++ b/req/include/req_sketch.hpp
@@ -28,7 +28,7 @@ namespace datasketches {
template<
typename T,
- typename Comparator = std::less<T>,
+ typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
typename SerDe = serde<T>,
typename Allocator = std::allocator<T>
>
@@ -123,7 +123,6 @@ public:
* @param item to be ranked
* @return an approximate rank of the given item
*/
-
template<bool inclusive = false>
double get_rank(const T& item) const;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org