You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/01/08 01:35:53 UTC
[datasketches-cpp] 01/01: kll inclusive rank

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch kll_inclusive_rank
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit c0504fcc352ff2ea00af084b5e36a1c4d3019745
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Jan 7 17:35:28 2022 -0800

    kll inclusive rank
---
 kll/include/kll_sketch.hpp      | 26 ++++++++++++++++++++++----
 kll/include/kll_sketch_impl.hpp | 20 +++++++++++++-------
 kll/test/kll_sketch_test.cpp    |  8 ++++++--
 req/include/req_sketch.hpp      |  3 +--
 4 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index fa55520..3a3e7bd 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp
@@ -35,7 +35,7 @@ namespace datasketches {
  * and nearly optimal accuracy per retained item.
  * See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
  *
- * <p>This is a stochastic streaming sketch that enables near-real time analysis of the
+ * <p>This is a stochastic streaming sketch that enables near real-time analysis of the
  * approximate distribution of values from a very large stream in a single pass, requiring only
  * that the values are comparable.
  * The analysis is obtained using <i>get_quantile()</i> or <i>get_quantiles()</i> functions or the
@@ -157,7 +157,12 @@ namespace kll_constants {
   const uint16_t DEFAULT_K = 200;
 }
 
-template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
+template <
+  typename T,
+  typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
+  typename S = serde<T>,
+  typename A = std::allocator<T>
+>
 class kll_sketch {
   public:
     using value_type = T;
@@ -309,6 +314,9 @@ class kll_sketch {
     /**
      * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
      * inclusive.
+     * With the template parameter inclusive=true the weight of the given value is included into the rank.
+     * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+     * according to the comparator C.
      *
      * <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
      * get_normalized_rank_error(false) function.
@@ -318,6 +326,7 @@ class kll_sketch {
      * @param value to be ranked
      * @return an approximate rank of the given value
      */
+    template<bool inclusive = false>
     double get_rank(const T& value) const;
 
     /**
@@ -338,9 +347,12 @@ class kll_sketch {
      *
      * @return an array of m+1 doubles each of which is an approximation
      * to the fraction of the input stream values (the mass) that fall into one of those intervals.
-     * The definition of an "interval" is inclusive of the left split point and exclusive of the right
-     * split point, with the exception that the last interval will include maximum value.
+     * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
+     * split point, with the exception that the last interval will include the maximum value.
+     * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
+     * split point.
      */
+    template<bool inclusive = false>
     vector_d<A> get_PMF(const T* split_points, uint32_t size) const;
 
     /**
@@ -364,6 +376,7 @@ class kll_sketch {
      * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
      * array.
      */
+    template<bool inclusive = false>
     vector_d<A> get_CDF(const T* split_points, uint32_t size) const;
 
     /**
@@ -536,11 +549,16 @@ class kll_sketch {
     void add_empty_top_level_to_completely_full_sketch();
     void sort_level_zero();
     std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> get_quantile_calculator();
+
+    template<bool inclusive>
     vector_d<A> get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const;
+    template<bool inclusive>
     void increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
         const T* split_points, uint32_t size, double* buckets) const;
+    template<bool inclusive>
     void increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
         const T* split_points, uint32_t size, double* buckets) const;
+
     template<typename O> void merge_higher_levels(O&& other, uint64_t final_n);
     void populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels);
     void populate_work_arrays(kll_sketch&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels);
diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp
index e346549..2276b05 100644
--- a/kll/include/kll_sketch_impl.hpp
+++ b/kll/include/kll_sketch_impl.hpp
@@ -320,6 +320,7 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
 }
 
 template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
 double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
   if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
   uint8_t level = 0;
@@ -329,7 +330,7 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
     const auto from_index(levels_[level]);
     const auto to_index(levels_[level + 1]); // exclusive
     for (uint32_t i = from_index; i < to_index; i++) {
-      if (C()(items_[i], value)) {
+      if (inclusive ? !C()(value, items_[i]) : C()(items_[i], value)) {
         total += weight;
       } else if ((level > 0) || is_level_zero_sorted_) {
         break; // levels above 0 are sorted, no point comparing further
@@ -342,13 +343,15 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
 }
 
 template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
 vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
-  return get_PMF_or_CDF(split_points, size, false);
+  return get_PMF_or_CDF<inclusive>(split_points, size, false);
 }
 
 template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
 vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
-  return get_PMF_or_CDF(split_points, size, true);
+  return get_PMF_or_CDF<inclusive>(split_points, size, true);
 }
 
 template<typename T, typename C, typename S, typename A>
@@ -798,6 +801,7 @@ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantil
 }
 
 template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
 vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
   if (is_empty()) return vector_d<A>(allocator_);
   kll_helper::validate_values<T, C>(split_points, size);
@@ -808,9 +812,9 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
     const auto from_index = levels_[level];
     const auto to_index = levels_[level + 1]; // exclusive
     if ((level == 0) && !is_level_zero_sorted_) {
-      increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
+      increment_buckets_unsorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
     } else {
-      increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
+      increment_buckets_sorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
     }
     level++;
     weight *= 2;
@@ -831,13 +835,14 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
 }
 
 template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
 void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
     const T* split_points, uint32_t size, double* buckets) const
 {
   for (uint32_t i = from_index; i < to_index; i++) {
     uint32_t j;
     for (j = 0; j < size; j++) {
-      if (C()(items_[i], split_points[j])) {
+      if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
         break;
       }
     }
@@ -846,13 +851,14 @@ void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_inde
 }
 
 template<typename T, typename C, typename S, typename A>
+template<bool inclusive>
 void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
     const T* split_points, uint32_t size, double* buckets) const
 {
   uint32_t i = from_index;
   uint32_t j = 0;
   while ((i <  to_index) && (j < size)) {
-    if (C()(items_[i], split_points[j])) {
+    if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
       buckets[j] += weight; // this sample goes into this bucket
       i++; // move on to next sample and see whether it also goes into this bucket
     } else {
diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp
index 7b230d7..5da33bc 100644
--- a/kll/test/kll_sketch_test.cpp
+++ b/kll/test/kll_sketch_test.cpp
@@ -90,7 +90,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
     REQUIRE(sketch.get_n() == 1);
     REQUIRE(sketch.get_num_retained() == 1);
     REQUIRE(sketch.get_rank(1.0f) == 0.0);
+    REQUIRE(sketch.get_rank<true>(1.0f) == 1.0);
     REQUIRE(sketch.get_rank(2.0f) == 1.0);
+    REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
     REQUIRE(sketch.get_min_value() == 1.0);
     REQUIRE(sketch.get_max_value() == 1.0);
     REQUIRE(sketch.get_quantile(0.5) == 1.0);
@@ -142,8 +144,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
     REQUIRE(quantiles[2] == n - 1 );
 
     for (uint32_t i = 0; i < n; i++) {
-      const double trueRank = (double) i / n;
-      REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
+      const double true_rank = (double) i / n;
+      REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank);
+      const double true_rank_inclusive = (double) (i + 1) / n;
+      REQUIRE(sketch.get_rank<true>(static_cast<float>(i)) == true_rank_inclusive);
     }
 
     // the alternative method must produce the same result
diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp
index 779caba..1ef2157 100755
--- a/req/include/req_sketch.hpp
+++ b/req/include/req_sketch.hpp
@@ -28,7 +28,7 @@ namespace datasketches {
 
 template<
   typename T,
-  typename Comparator = std::less<T>,
+  typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
   typename SerDe = serde<T>,
   typename Allocator = std::allocator<T>
 >
@@ -123,7 +123,6 @@ public:
    * @param item to be ranked
    * @return an approximate rank of the given item
    */
-
   template<bool inclusive = false>
   double get_rank(const T& item) const;
 

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org