You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/11/03 19:58:32 UTC

[incubator-datasketches-cpp] branch req_sketch updated: quantile calculator

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch req_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/req_sketch by this push:
     new 4a63aa6  quantile calculator
4a63aa6 is described below

commit 4a63aa648a460cf39e3d2dbc7572d5398eef15fa
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Nov 3 11:56:01 2020 -0800

    quantile calculator
---
 req/CMakeLists.txt                           |  4 ++
 req/include/req_compactor.hpp                |  2 +
 req/include/req_compactor_impl.hpp           | 12 +++++-
 req/include/req_quantile_calculator.hpp      | 61 ++++++++++++++++++++++++++++
 req/include/req_quantile_calculator_impl.hpp | 60 +++++++++++++++++++++++++++
 req/include/req_sketch.hpp                   |  4 ++
 req/include/req_sketch_impl.hpp              | 19 +++++++++
 req/test/req_sketch_test.cpp                 | 29 +++++++++++++
 8 files changed, 190 insertions(+), 1 deletion(-)

diff --git a/req/CMakeLists.txt b/req/CMakeLists.txt
index 95bfb0a..36e7d90 100755
--- a/req/CMakeLists.txt
+++ b/req/CMakeLists.txt
@@ -38,6 +38,8 @@ list(APPEND req_HEADERS "include/req_sketch.hpp")
 list(APPEND req_HEADERS "include/req_sketch_impl.hpp")
 list(APPEND req_HEADERS "include/req_compactor.hpp")
 list(APPEND req_HEADERS "include/req_compactor_impl.hpp")
+list(APPEND req_HEADERS "include/req_quantile_calculator.hpp")
+list(APPEND req_HEADERS "include/req_quantile_calculator_impl.hpp")
 
 install(TARGETS req
   EXPORT ${PROJECT_NAME}
@@ -53,4 +55,6 @@ target_sources(req
     ${CMAKE_CURRENT_SOURCE_DIR}/include/req_sketch_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/req_compactor.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/req_compactor_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/req_quantile_calculator.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/req_quantile_calculator_impl.hpp
 )
diff --git a/req/include/req_compactor.hpp b/req/include/req_compactor.hpp
index 8da2c44..939dd5e 100755
--- a/req/include/req_compactor.hpp
+++ b/req/include/req_compactor.hpp
@@ -35,6 +35,8 @@ public:
   bool is_sorted() const;
   uint32_t get_num_items() const;
   uint32_t get_nom_capacity() const;
+  uint8_t get_lg_weight() const;
+  const std::vector<T, Allocator>& get_items() const;
 
   template<bool inclusive>
   uint64_t compute_weight(const T& item) const;
diff --git a/req/include/req_compactor_impl.hpp b/req/include/req_compactor_impl.hpp
index 170edf0..9e62970 100755
--- a/req/include/req_compactor_impl.hpp
+++ b/req/include/req_compactor_impl.hpp
@@ -57,6 +57,16 @@ uint32_t req_compactor<T, H, C, A>::get_nom_capacity() const {
 }
 
 template<typename T, bool H, typename C, typename A>
+uint8_t req_compactor<T, H, C, A>::get_lg_weight() const {
+  return lg_weight_;
+}
+
+template<typename T, bool H, typename C, typename A>
+const std::vector<T, A>& req_compactor<T, H, C, A>::get_items() const {
+  return items_;
+}
+
+template<typename T, bool H, typename C, typename A>
 template<bool inclusive>
 uint64_t req_compactor<T, H, C, A>::compute_weight(const T& item) const {
   if (!sorted_) const_cast<req_compactor*>(this)->sort(); // allow sorting as a side effect
@@ -85,7 +95,7 @@ void req_compactor<T, H, C, A>::merge_sort_in(std::vector<T, A>&& items) {
   if (items_.capacity() < items_.size() + items.size()) items_.reserve(items_.size() + items.size());
   auto middle = items_.end();
   std::move(items.begin(), items.end(), std::back_inserter(items_));
-  std::inplace_merge(items_.begin(), middle, items_.end());
+  std::inplace_merge(items_.begin(), middle, items_.end(), C());
 }
 
 template<typename T, bool H, typename C, typename A>
diff --git a/req/include/req_quantile_calculator.hpp b/req/include/req_quantile_calculator.hpp
new file mode 100755
index 0000000..ccf901c
--- /dev/null
+++ b/req/include/req_quantile_calculator.hpp
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef REQ_QUANTILE_CALCULATOR_HPP_
+#define REQ_QUANTILE_CALCULATOR_HPP_
+
+#include <functional>
+
+namespace datasketches {
+
+template<
+  typename T,
+  typename Allocator
+>
+class req_quantile_calculator {
+public:
+  req_quantile_calculator(uint64_t n, const Allocator& allocator);
+
+  void add(const std::vector<T, Allocator>& items, uint8_t lg_weight);
+
+  template<bool inclusive>
+  void convert_to_cummulative();
+
+  const T& get_quantile(double rank) const;
+
+private:
+  using Entry = std::pair<const T*, uint64_t>;
+  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
+  using Container = std::vector<Entry, AllocEntry>;
+
+  struct compare_pairs_by_second {
+    bool operator()(const Entry& a, const Entry& b) {
+      return a.second < b.second;
+    }
+  };
+
+  uint64_t n_;
+  Container entries_;
+};
+
+} /* namespace datasketches */
+
+#include "req_quantile_calculator_impl.hpp"
+
+#endif
diff --git a/req/include/req_quantile_calculator_impl.hpp b/req/include/req_quantile_calculator_impl.hpp
new file mode 100755
index 0000000..bc143bf
--- /dev/null
+++ b/req/include/req_quantile_calculator_impl.hpp
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef REQ_QUANTILE_CALCULATOR_IMPL_HPP_
+#define REQ_QUANTILE_CALCULATOR_IMPL_HPP_
+
+namespace datasketches {
+
+template<typename T, typename A>
+req_quantile_calculator<T, A>::req_quantile_calculator(uint64_t n, const A& allocator):
+n_(n),
+entries_(allocator)
+{}
+
+template<typename T, typename A>
+void req_quantile_calculator<T, A>::add(const std::vector<T, A>& items, uint8_t lg_weight) {
+  if (entries_.capacity() < entries_.size() + items.size()) entries_.reserve(entries_.size() + items.size());
+  auto middle = entries_.end();
+  for (const auto& item: items) entries_.push_back(Entry(&item, 1 << lg_weight));
+  std::inplace_merge(entries_.begin(), middle, entries_.end(), compare_pairs_by_second());
+}
+
+template<typename T, typename A>
+template<bool inclusive>
+void req_quantile_calculator<T, A>::convert_to_cummulative() {
+  uint64_t subtotal = 0;
+  for (auto& entry: entries_) {
+    const uint64_t new_subtotal = subtotal + entry.second;
+    entry.second = inclusive ? new_subtotal : subtotal;
+    subtotal = new_subtotal;
+  }
+}
+
+template<typename T, typename A>
+const T& req_quantile_calculator<T, A>::get_quantile(double rank) const {
+  uint64_t weight = rank * n_;
+  auto it = std::lower_bound(entries_.begin(), entries_.end(), Entry(nullptr, weight), compare_pairs_by_second());
+  if (it == entries_.end()) return *(entries_[entries_.size() - 1].first);
+  return *(it->first);
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp
index d99f687..30841c3 100755
--- a/req/include/req_sketch.hpp
+++ b/req/include/req_sketch.hpp
@@ -22,6 +22,7 @@
 
 #include "req_common.hpp"
 #include "req_compactor.hpp"
+#include "req_quantile_calculator.hpp"
 
 namespace datasketches {
 
@@ -78,6 +79,9 @@ public:
   template<bool inclusive = false>
   double get_rank(const T& item) const;
 
+  template<bool inclusive = false>
+  const T& get_quantile(double rank) const;
+
   /**
    * Prints a summary of the sketch.
    * @param print_levels if true include information about levels
diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp
index 6137e99..f8f592a 100755
--- a/req/include/req_sketch_impl.hpp
+++ b/req/include/req_sketch_impl.hpp
@@ -88,6 +88,25 @@ double req_sketch<T, H, C, S, A>::get_rank(const T& item) const {
 }
 
 template<typename T, bool H, typename C, typename S, typename A>
+template<bool inclusive>
+const T& req_sketch<T, H, C, S, A>::get_quantile(double rank) const {
+  if (is_empty()) throw new std::invalid_argument("sketch is empty");
+  if ((rank < 0.0) || (rank > 1.0)) {
+    throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
+  }
+  // TODO: min and max
+  if (!compactors_[0].is_sorted()) {
+    const_cast<req_compactor<T, H, C, A>&>(compactors_[0]).sort(); // allow this side effect
+  }
+  req_quantile_calculator<T, A> quantile_calculator(n_, allocator_);
+  for (auto& compactor: compactors_) {
+    quantile_calculator.add(compactor.get_items(), compactor.get_lg_weight());
+  }
+  quantile_calculator.template convert_to_cummulative<inclusive>();
+  return quantile_calculator.get_quantile(rank);
+}
+
+template<typename T, bool H, typename C, typename S, typename A>
 void req_sketch<T, H, C, S, A>::grow() {
   const uint8_t lg_weight = get_num_levels();
   compactors_.push_back(Compactor(lg_weight, k_, allocator_));
diff --git a/req/test/req_sketch_test.cpp b/req/test/req_sketch_test.cpp
index 5451fcc..d55907f 100755
--- a/req/test/req_sketch_test.cpp
+++ b/req/test/req_sketch_test.cpp
@@ -52,6 +52,9 @@ TEST_CASE("req sketch: single value", "[req_sketch]") {
   REQUIRE(sketch.get_rank<true>(1) == 1);
   REQUIRE(sketch.get_rank(1.1) == 1);
   REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1);
+  REQUIRE(sketch.get_quantile(0) == 1);
+  REQUIRE(sketch.get_quantile(0.5) == 1);
+  REQUIRE(sketch.get_quantile(1) == 1);
 }
 
 TEST_CASE("req sketch: repeated values", "[req_sketch]") {
@@ -79,8 +82,34 @@ TEST_CASE("req sketch: exact mode", "[req_sketch]") {
   REQUIRE_FALSE(sketch.is_estimation_mode());
   REQUIRE(sketch.get_n() == 100);
   REQUIRE(sketch.get_num_retained() == 100);
+
+  // like KLL
+  REQUIRE(sketch.get_rank(0) == 0);
+  REQUIRE(sketch.get_rank(1) == 0.01);
   REQUIRE(sketch.get_rank(50) == 0.5);
+  REQUIRE(sketch.get_rank(98) == 0.98);
+  REQUIRE(sketch.get_rank(99) == 0.99);
+
+  // inclusive
+  REQUIRE(sketch.get_rank<true>(0) == 0.01);
+  REQUIRE(sketch.get_rank<true>(1) == 0.02);
   REQUIRE(sketch.get_rank<true>(49) == 0.5);
+  REQUIRE(sketch.get_rank<true>(98) == 0.99);
+  REQUIRE(sketch.get_rank<true>(99) == 1);
+
+  // like KLL
+  REQUIRE(sketch.get_quantile(0) == 0);
+  REQUIRE(sketch.get_quantile(0.01) == 1);
+  REQUIRE(sketch.get_quantile(0.5) == 50);
+  REQUIRE(sketch.get_quantile(.99) == 99);
+  REQUIRE(sketch.get_quantile(1) == 99);
+
+  // inclusive
+  REQUIRE(sketch.get_quantile<true>(0) == 0);
+  REQUIRE(sketch.get_quantile<true>(0.01) == 0);
+  REQUIRE(sketch.get_quantile<true>(0.5) == 49);
+  REQUIRE(sketch.get_quantile<true>(0.99) == 98);
+  REQUIRE(sketch.get_quantile<true>(1) == 99);
 }
 
 TEST_CASE("req sketch: estimation mode", "[req_sketch]") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org