You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2022/04/26 07:17:24 UTC

[datasketches-cpp] 03/03: clone and minorly adapt python kll testing to classic quantiles

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch quantiles
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 64272ddfbca72dd608b945859cab5254fe4a7c2d
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Tue Apr 26 00:17:00 2022 -0700

    clone and minorly adapt python kll testing to classic quantiles
---
 python/src/datasketches.cpp      |   2 +
 python/src/quantiles_wrapper.cpp |  20 +++----
 python/tests/quantiles_test.py   | 125 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+), 10 deletions(-)

diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp
index 4ef491b..d0a26b7 100644
--- a/python/src/datasketches.cpp
+++ b/python/src/datasketches.cpp
@@ -28,6 +28,7 @@ void init_cpc(py::module& m);
 void init_theta(py::module& m);
 void init_vo(py::module& m);
 void init_req(py::module& m);
+void init_quantiles(py::module& m);
 void init_vector_of_kll(py::module& m);
 
 PYBIND11_MODULE(datasketches, m) {
@@ -38,5 +39,6 @@ PYBIND11_MODULE(datasketches, m) {
   init_theta(m);
   init_vo(m);
   init_req(m);
+  init_quantiles(m);
   init_vector_of_kll(m);
 }
diff --git a/python/src/quantiles_wrapper.cpp b/python/src/quantiles_wrapper.cpp
index ca3b7f0..7633ea2 100644
--- a/python/src/quantiles_wrapper.cpp
+++ b/python/src/quantiles_wrapper.cpp
@@ -51,8 +51,8 @@ double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
 
 template<typename T>
 double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
-                           const T& item,
-                           bool inclusive) {
+                                 const T& item,
+                                 bool inclusive) {
   if (inclusive)
     return sk.template get_rank<true>(item);
   else
@@ -61,8 +61,8 @@ double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
 
 template<typename T>
 T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
-                          double rank,
-                          bool inclusive) {
+                                double rank,
+                                bool inclusive) {
   if (inclusive)
     return T(sk.template get_quantile<true>(rank));
   else
@@ -71,8 +71,8 @@ T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
 
 template<typename T>
 py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
-                                  std::vector<double>& fractions,
-                                  bool inclusive) {
+                                        std::vector<double>& fractions,
+                                        bool inclusive) {
   size_t n_quantiles = fractions.size();
   auto result = inclusive
      ? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
@@ -89,8 +89,8 @@ py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
 
 template<typename T>
 py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
-                            std::vector<T>& split_points,
-                            bool inclusive) {
+                                  std::vector<T>& split_points,
+                                  bool inclusive) {
   size_t n_points = split_points.size();
   auto result = inclusive
      ? sk.template get_PMF<true>(&split_points[0], n_points)
@@ -106,8 +106,8 @@ py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
 
 template<typename T>
 py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
-                            std::vector<T>& split_points,
-                            bool inclusive) {
+                                  std::vector<T>& split_points,
+                                  bool inclusive) {
   size_t n_points = split_points.size();
   auto result = inclusive
      ? sk.template get_CDF<true>(&split_points[0], n_points)
diff --git a/python/tests/quantiles_test.py b/python/tests/quantiles_test.py
new file mode 100644
index 0000000..1c606ab
--- /dev/null
+++ b/python/tests/quantiles_test.py
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
+import numpy as np
+
+class QuantilesTest(unittest.TestCase):
+    def test_quantiles_example(self):
+      k = 128
+      n = 2 ** 20
+
+      # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
+      quantiles = quantiles_floats_sketch(k)
+      quantiles.update(np.random.normal(size=n-1))
+      quantiles.update(0.0)
+
+      # 0 should be near the median
+      self.assertAlmostEqual(0.5, quantiles.get_rank(0.0), delta=0.035)
+      
+      # the median should be near 0
+      self.assertAlmostEqual(0.0, quantiles.get_quantile(0.5), delta=0.035)
+
+      # we also track the min/max independently from the rest of the data
+      # which lets us know the full observed data range
+      self.assertLessEqual(quantiles.get_min_value(), quantiles.get_quantile(0.01))
+      self.assertLessEqual(0.0, quantiles.get_rank(quantiles.get_min_value()))
+      self.assertGreaterEqual(quantiles.get_max_value(), quantiles.get_quantile(0.99))
+      self.assertGreaterEqual(1.0, quantiles.get_rank(quantiles.get_max_value()))
+
+      # we can also extract a list of values at a time,
+      # here the values should give us something close to [-2, -1, 0, 1, 2].
+      # then get the CDF, which will return something close to
+      # the original values used in get_quantiles()
+      # finally, can check the normalized rank error bound
+      pts = quantiles.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
+      cdf = quantiles.get_cdf(pts)  # include 1.0 at end to account for all probability mass
+      self.assertEqual(len(cdf), len(pts)+1)
+      err = quantiles.normalized_rank_error(False)
+      self.assertEqual(err, quantiles_floats_sketch.get_normalized_rank_error(k, False))
+
+      # and a few basic queries about the sketch
+      self.assertFalse(quantiles.is_empty())
+      self.assertTrue(quantiles.is_estimation_mode())
+      self.assertEqual(quantiles.get_n(), n)
+      self.assertEqual(quantiles.get_k(), k)
+      self.assertLess(quantiles.get_num_retained(), n)
+
+      # merging itself will double the number of items the sketch has seen
+      quantiles.merge(quantiles)
+      self.assertEqual(quantiles.get_n(), 2*n)
+
+      # we can then serialize and reconstruct the sketch
+      quantiles_bytes = quantiles.serialize()
+      new_quantiles = quantiles.deserialize(quantiles_bytes)
+      self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
+      self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
+      self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
+      self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
+      self.assertEqual(quantiles.get_rank(0.0), new_quantiles.get_rank(0.0))
+
+    def test_quantiles_ints_sketch(self):
+        k = 128
+        n = 10
+        quantiles = quantiles_ints_sketch(k)
+        for i in range(0, n):
+          quantiles.update(i)
+
+        self.assertEqual(quantiles.get_min_value(), 0)
+        self.assertEqual(quantiles.get_max_value(), n-1)
+        self.assertEqual(quantiles.get_n(), n)
+        self.assertFalse(quantiles.is_empty())
+        self.assertFalse(quantiles.is_estimation_mode()) # n < k
+        self.assertEqual(quantiles.get_k(), k)
+
+        pmf = quantiles.get_pmf([round(n/2)])
+        self.assertIsNotNone(pmf)
+        self.assertEqual(len(pmf), 2)
+
+        cdf = quantiles.get_cdf([round(n/2)])
+        self.assertIsNotNone(cdf)
+        self.assertEqual(len(cdf), 2)
+
+        self.assertEqual(quantiles.get_quantile(0.5), round(n/2))
+        quants = quantiles.get_quantiles([0.25, 0.5, 0.75])
+        self.assertIsNotNone(quants)
+        self.assertEqual(len(quants), 3)
+
+        self.assertEqual(quantiles.get_rank(round(n/2)), 0.5)
+
+        # merge self
+        quantiles.merge(quantiles)
+        self.assertEqual(quantiles.get_n(), 2 * n)
+
+        sk_bytes = quantiles.serialize()
+        self.assertTrue(isinstance(quantiles_ints_sketch.deserialize(sk_bytes), quantiles_ints_sketch))
+
+    def test_quantiles_floats_sketch(self):
+      # already tested ints and it's templatized, so just make sure it instantiates properly
+      k = 256
+      quantiles = quantiles_floats_sketch(k)
+      self.assertTrue(quantiles.is_empty())
+
+    def test_quantiles_doubles_sketch(self):
+      # already tested ints and it's templatized, so just make sure it instantiates properly
+      k = 128
+      quantiles = quantiles_doubles_sketch(k)
+      self.assertTrue(quantiles.is_empty())
+
+if __name__ == '__main__':
+    unittest.main()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org