You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/11/03 19:58:32 UTC
[incubator-datasketches-cpp] branch req_sketch updated: quantile
calculator
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch req_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
The following commit(s) were added to refs/heads/req_sketch by this push:
new 4a63aa6 quantile calculator
4a63aa6 is described below
commit 4a63aa648a460cf39e3d2dbc7572d5398eef15fa
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Nov 3 11:56:01 2020 -0800
quantile calculator
---
req/CMakeLists.txt | 4 ++
req/include/req_compactor.hpp | 2 +
req/include/req_compactor_impl.hpp | 12 +++++-
req/include/req_quantile_calculator.hpp | 61 ++++++++++++++++++++++++++++
req/include/req_quantile_calculator_impl.hpp | 60 +++++++++++++++++++++++++++
req/include/req_sketch.hpp | 4 ++
req/include/req_sketch_impl.hpp | 19 +++++++++
req/test/req_sketch_test.cpp | 29 +++++++++++++
8 files changed, 190 insertions(+), 1 deletion(-)
diff --git a/req/CMakeLists.txt b/req/CMakeLists.txt
index 95bfb0a..36e7d90 100755
--- a/req/CMakeLists.txt
+++ b/req/CMakeLists.txt
@@ -38,6 +38,8 @@ list(APPEND req_HEADERS "include/req_sketch.hpp")
list(APPEND req_HEADERS "include/req_sketch_impl.hpp")
list(APPEND req_HEADERS "include/req_compactor.hpp")
list(APPEND req_HEADERS "include/req_compactor_impl.hpp")
+list(APPEND req_HEADERS "include/req_quantile_calculator.hpp")
+list(APPEND req_HEADERS "include/req_quantile_calculator_impl.hpp")
install(TARGETS req
EXPORT ${PROJECT_NAME}
@@ -53,4 +55,6 @@ target_sources(req
${CMAKE_CURRENT_SOURCE_DIR}/include/req_sketch_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/req_compactor.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/req_compactor_impl.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/req_quantile_calculator.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/req_quantile_calculator_impl.hpp
)
diff --git a/req/include/req_compactor.hpp b/req/include/req_compactor.hpp
index 8da2c44..939dd5e 100755
--- a/req/include/req_compactor.hpp
+++ b/req/include/req_compactor.hpp
@@ -35,6 +35,8 @@ public:
bool is_sorted() const;
uint32_t get_num_items() const;
uint32_t get_nom_capacity() const;
+ uint8_t get_lg_weight() const;
+ const std::vector<T, Allocator>& get_items() const;
template<bool inclusive>
uint64_t compute_weight(const T& item) const;
diff --git a/req/include/req_compactor_impl.hpp b/req/include/req_compactor_impl.hpp
index 170edf0..9e62970 100755
--- a/req/include/req_compactor_impl.hpp
+++ b/req/include/req_compactor_impl.hpp
@@ -57,6 +57,16 @@ uint32_t req_compactor<T, H, C, A>::get_nom_capacity() const {
}
template<typename T, bool H, typename C, typename A>
+uint8_t req_compactor<T, H, C, A>::get_lg_weight() const {
+ return lg_weight_;
+}
+
+template<typename T, bool H, typename C, typename A>
+const std::vector<T, A>& req_compactor<T, H, C, A>::get_items() const {
+ return items_;
+}
+
+template<typename T, bool H, typename C, typename A>
template<bool inclusive>
uint64_t req_compactor<T, H, C, A>::compute_weight(const T& item) const {
if (!sorted_) const_cast<req_compactor*>(this)->sort(); // allow sorting as a side effect
@@ -85,7 +95,7 @@ void req_compactor<T, H, C, A>::merge_sort_in(std::vector<T, A>&& items) {
if (items_.capacity() < items_.size() + items.size()) items_.reserve(items_.size() + items.size());
auto middle = items_.end();
std::move(items.begin(), items.end(), std::back_inserter(items_));
- std::inplace_merge(items_.begin(), middle, items_.end());
+ std::inplace_merge(items_.begin(), middle, items_.end(), C());
}
template<typename T, bool H, typename C, typename A>
diff --git a/req/include/req_quantile_calculator.hpp b/req/include/req_quantile_calculator.hpp
new file mode 100755
index 0000000..ccf901c
--- /dev/null
+++ b/req/include/req_quantile_calculator.hpp
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef REQ_QUANTILE_CALCULATOR_HPP_
+#define REQ_QUANTILE_CALCULATOR_HPP_
+
+#include <functional>
+
+namespace datasketches {
+
+template<
+ typename T,
+ typename Allocator
+>
+class req_quantile_calculator {
+public:
+ req_quantile_calculator(uint64_t n, const Allocator& allocator);
+
+ void add(const std::vector<T, Allocator>& items, uint8_t lg_weight);
+
+ template<bool inclusive>
+ void convert_to_cummulative();
+
+ const T& get_quantile(double rank) const;
+
+private:
+ using Entry = std::pair<const T*, uint64_t>;
+ using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
+ using Container = std::vector<Entry, AllocEntry>;
+
+ struct compare_pairs_by_second {
+ bool operator()(const Entry& a, const Entry& b) {
+ return a.second < b.second;
+ }
+ };
+
+ uint64_t n_;
+ Container entries_;
+};
+
+} /* namespace datasketches */
+
+#include "req_quantile_calculator_impl.hpp"
+
+#endif
diff --git a/req/include/req_quantile_calculator_impl.hpp b/req/include/req_quantile_calculator_impl.hpp
new file mode 100755
index 0000000..bc143bf
--- /dev/null
+++ b/req/include/req_quantile_calculator_impl.hpp
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef REQ_QUANTILE_CALCULATOR_IMPL_HPP_
+#define REQ_QUANTILE_CALCULATOR_IMPL_HPP_
+
+namespace datasketches {
+
+template<typename T, typename A>
+req_quantile_calculator<T, A>::req_quantile_calculator(uint64_t n, const A& allocator):
+n_(n),
+entries_(allocator)
+{}
+
+template<typename T, typename A>
+void req_quantile_calculator<T, A>::add(const std::vector<T, A>& items, uint8_t lg_weight) {
+ if (entries_.capacity() < entries_.size() + items.size()) entries_.reserve(entries_.size() + items.size());
+ auto middle = entries_.end();
+ for (const auto& item: items) entries_.push_back(Entry(&item, 1 << lg_weight));
+ std::inplace_merge(entries_.begin(), middle, entries_.end(), compare_pairs_by_second());
+}
+
+template<typename T, typename A>
+template<bool inclusive>
+void req_quantile_calculator<T, A>::convert_to_cummulative() {
+ uint64_t subtotal = 0;
+ for (auto& entry: entries_) {
+ const uint64_t new_subtotal = subtotal + entry.second;
+ entry.second = inclusive ? new_subtotal : subtotal;
+ subtotal = new_subtotal;
+ }
+}
+
+template<typename T, typename A>
+const T& req_quantile_calculator<T, A>::get_quantile(double rank) const {
+ uint64_t weight = rank * n_;
+ auto it = std::lower_bound(entries_.begin(), entries_.end(), Entry(nullptr, weight), compare_pairs_by_second());
+ if (it == entries_.end()) return *(entries_[entries_.size() - 1].first);
+ return *(it->first);
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp
index d99f687..30841c3 100755
--- a/req/include/req_sketch.hpp
+++ b/req/include/req_sketch.hpp
@@ -22,6 +22,7 @@
#include "req_common.hpp"
#include "req_compactor.hpp"
+#include "req_quantile_calculator.hpp"
namespace datasketches {
@@ -78,6 +79,9 @@ public:
template<bool inclusive = false>
double get_rank(const T& item) const;
+ template<bool inclusive = false>
+ const T& get_quantile(double rank) const;
+
/**
* Prints a summary of the sketch.
* @param print_levels if true include information about levels
diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp
index 6137e99..f8f592a 100755
--- a/req/include/req_sketch_impl.hpp
+++ b/req/include/req_sketch_impl.hpp
@@ -88,6 +88,25 @@ double req_sketch<T, H, C, S, A>::get_rank(const T& item) const {
}
template<typename T, bool H, typename C, typename S, typename A>
+template<bool inclusive>
+const T& req_sketch<T, H, C, S, A>::get_quantile(double rank) const {
+ if (is_empty()) throw new std::invalid_argument("sketch is empty");
+ if ((rank < 0.0) || (rank > 1.0)) {
+ throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
+ }
+ // TODO: min and max
+ if (!compactors_[0].is_sorted()) {
+ const_cast<req_compactor<T, H, C, A>&>(compactors_[0]).sort(); // allow this side effect
+ }
+ req_quantile_calculator<T, A> quantile_calculator(n_, allocator_);
+ for (auto& compactor: compactors_) {
+ quantile_calculator.add(compactor.get_items(), compactor.get_lg_weight());
+ }
+ quantile_calculator.template convert_to_cummulative<inclusive>();
+ return quantile_calculator.get_quantile(rank);
+}
+
+template<typename T, bool H, typename C, typename S, typename A>
void req_sketch<T, H, C, S, A>::grow() {
const uint8_t lg_weight = get_num_levels();
compactors_.push_back(Compactor(lg_weight, k_, allocator_));
diff --git a/req/test/req_sketch_test.cpp b/req/test/req_sketch_test.cpp
index 5451fcc..d55907f 100755
--- a/req/test/req_sketch_test.cpp
+++ b/req/test/req_sketch_test.cpp
@@ -52,6 +52,9 @@ TEST_CASE("req sketch: single value", "[req_sketch]") {
REQUIRE(sketch.get_rank<true>(1) == 1);
REQUIRE(sketch.get_rank(1.1) == 1);
REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1);
+ REQUIRE(sketch.get_quantile(0) == 1);
+ REQUIRE(sketch.get_quantile(0.5) == 1);
+ REQUIRE(sketch.get_quantile(1) == 1);
}
TEST_CASE("req sketch: repeated values", "[req_sketch]") {
@@ -79,8 +82,34 @@ TEST_CASE("req sketch: exact mode", "[req_sketch]") {
REQUIRE_FALSE(sketch.is_estimation_mode());
REQUIRE(sketch.get_n() == 100);
REQUIRE(sketch.get_num_retained() == 100);
+
+ // like KLL
+ REQUIRE(sketch.get_rank(0) == 0);
+ REQUIRE(sketch.get_rank(1) == 0.01);
REQUIRE(sketch.get_rank(50) == 0.5);
+ REQUIRE(sketch.get_rank(98) == 0.98);
+ REQUIRE(sketch.get_rank(99) == 0.99);
+
+ // inclusive
+ REQUIRE(sketch.get_rank<true>(0) == 0.01);
+ REQUIRE(sketch.get_rank<true>(1) == 0.02);
REQUIRE(sketch.get_rank<true>(49) == 0.5);
+ REQUIRE(sketch.get_rank<true>(98) == 0.99);
+ REQUIRE(sketch.get_rank<true>(99) == 1);
+
+ // like KLL
+ REQUIRE(sketch.get_quantile(0) == 0);
+ REQUIRE(sketch.get_quantile(0.01) == 1);
+ REQUIRE(sketch.get_quantile(0.5) == 50);
+ REQUIRE(sketch.get_quantile(.99) == 99);
+ REQUIRE(sketch.get_quantile(1) == 99);
+
+ // inclusive
+ REQUIRE(sketch.get_quantile<true>(0) == 0);
+ REQUIRE(sketch.get_quantile<true>(0.01) == 0);
+ REQUIRE(sketch.get_quantile<true>(0.5) == 49);
+ REQUIRE(sketch.get_quantile<true>(0.99) == 98);
+ REQUIRE(sketch.get_quantile<true>(1) == 99);
}
TEST_CASE("req sketch: estimation mode", "[req_sketch]") {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org