You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2022/04/26 07:17:21 UTC

[datasketches-cpp] branch quantiles updated (e2d2bec -> 64272dd)

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a change to branch quantiles
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


    from e2d2bec  clean up a few warnings
     new 8a5c65e  improve test coverage
     new 9e224d7  more test coverage
     new 64272dd  clone and minorly adapt python kll testing to classic quantiles

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 python/src/datasketches.cpp                 |   2 +
 python/src/quantiles_wrapper.cpp            |  20 ++--
 python/tests/quantiles_test.py              | 125 +++++++++++++++++++++++++
 quantiles/include/quantiles_sketch_impl.hpp |  19 ----
 quantiles/test/quantiles_sketch_test.cpp    | 139 +++++++++++++++++++++++++++-
 5 files changed, 273 insertions(+), 32 deletions(-)
 create mode 100644 python/tests/quantiles_test.py


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[datasketches-cpp] 02/03: more test coverage

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch quantiles
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 9e224d743b0557c7b65b00e682887441dc68dca9
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Mon Apr 25 23:59:42 2022 -0700

    more test coverage
---
 quantiles/test/quantiles_sketch_test.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/quantiles/test/quantiles_sketch_test.cpp b/quantiles/test/quantiles_sketch_test.cpp
index 1c8764e..619b059 100644
--- a/quantiles/test/quantiles_sketch_test.cpp
+++ b/quantiles/test/quantiles_sketch_test.cpp
@@ -738,7 +738,7 @@ TEST_CASE("quantiles sketch", "[quantiles_sketch]") {
     REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
     REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
 
-    const int n = 1000;
+    const int n = 10000;
     for (int i = 0; i < n; i++) sketch.update(i);
 
     std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
@@ -803,11 +803,11 @@ TEST_CASE("quantiles sketch", "[quantiles_sketch]") {
     REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
     REQUIRE(sketch1.get_serialized_size_bytes() == 8);
 
-    const int n = 1000;
+    const int n = 10000;
     for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
 
     REQUIRE(sketch1.get_min_value() == std::string("0"));
-    REQUIRE(sketch1.get_max_value() == std::string("999"));
+    REQUIRE(sketch1.get_max_value() == std::string("9999"));
 
     auto bytes = sketch1.serialize();
     REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[datasketches-cpp] 03/03: clone and minorly adapt python kll testing to classic quantiles

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch quantiles
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 64272ddfbca72dd608b945859cab5254fe4a7c2d
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Tue Apr 26 00:17:00 2022 -0700

    clone and minorly adapt python kll testing to classic quantiles
---
 python/src/datasketches.cpp      |   2 +
 python/src/quantiles_wrapper.cpp |  20 +++----
 python/tests/quantiles_test.py   | 125 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+), 10 deletions(-)

diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp
index 4ef491b..d0a26b7 100644
--- a/python/src/datasketches.cpp
+++ b/python/src/datasketches.cpp
@@ -28,6 +28,7 @@ void init_cpc(py::module& m);
 void init_theta(py::module& m);
 void init_vo(py::module& m);
 void init_req(py::module& m);
+void init_quantiles(py::module& m);
 void init_vector_of_kll(py::module& m);
 
 PYBIND11_MODULE(datasketches, m) {
@@ -38,5 +39,6 @@ PYBIND11_MODULE(datasketches, m) {
   init_theta(m);
   init_vo(m);
   init_req(m);
+  init_quantiles(m);
   init_vector_of_kll(m);
 }
diff --git a/python/src/quantiles_wrapper.cpp b/python/src/quantiles_wrapper.cpp
index ca3b7f0..7633ea2 100644
--- a/python/src/quantiles_wrapper.cpp
+++ b/python/src/quantiles_wrapper.cpp
@@ -51,8 +51,8 @@ double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
 
 template<typename T>
 double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
-                           const T& item,
-                           bool inclusive) {
+                                 const T& item,
+                                 bool inclusive) {
   if (inclusive)
     return sk.template get_rank<true>(item);
   else
@@ -61,8 +61,8 @@ double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
 
 template<typename T>
 T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
-                          double rank,
-                          bool inclusive) {
+                                double rank,
+                                bool inclusive) {
   if (inclusive)
     return T(sk.template get_quantile<true>(rank));
   else
@@ -71,8 +71,8 @@ T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
 
 template<typename T>
 py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
-                                  std::vector<double>& fractions,
-                                  bool inclusive) {
+                                        std::vector<double>& fractions,
+                                        bool inclusive) {
   size_t n_quantiles = fractions.size();
   auto result = inclusive
      ? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
@@ -89,8 +89,8 @@ py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
 
 template<typename T>
 py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
-                            std::vector<T>& split_points,
-                            bool inclusive) {
+                                  std::vector<T>& split_points,
+                                  bool inclusive) {
   size_t n_points = split_points.size();
   auto result = inclusive
      ? sk.template get_PMF<true>(&split_points[0], n_points)
@@ -106,8 +106,8 @@ py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
 
 template<typename T>
 py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
-                            std::vector<T>& split_points,
-                            bool inclusive) {
+                                  std::vector<T>& split_points,
+                                  bool inclusive) {
   size_t n_points = split_points.size();
   auto result = inclusive
      ? sk.template get_CDF<true>(&split_points[0], n_points)
diff --git a/python/tests/quantiles_test.py b/python/tests/quantiles_test.py
new file mode 100644
index 0000000..1c606ab
--- /dev/null
+++ b/python/tests/quantiles_test.py
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
+import numpy as np
+
+class QuantilesTest(unittest.TestCase):
+    def test_quantiles_example(self):
+      k = 128
+      n = 2 ** 20
+
+      # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
+      quantiles = quantiles_floats_sketch(k)
+      quantiles.update(np.random.normal(size=n-1))
+      quantiles.update(0.0)
+
+      # 0 should be near the median
+      self.assertAlmostEqual(0.5, quantiles.get_rank(0.0), delta=0.035)
+      
+      # the median should be near 0
+      self.assertAlmostEqual(0.0, quantiles.get_quantile(0.5), delta=0.035)
+
+      # we also track the min/max independently from the rest of the data
+      # which lets us know the full observed data range
+      self.assertLessEqual(quantiles.get_min_value(), quantiles.get_quantile(0.01))
+      self.assertLessEqual(0.0, quantiles.get_rank(quantiles.get_min_value()))
+      self.assertGreaterEqual(quantiles.get_max_value(), quantiles.get_quantile(0.99))
+      self.assertGreaterEqual(1.0, quantiles.get_rank(quantiles.get_max_value()))
+
+      # we can also extract a list of values at a time,
+      # here the values should give us something close to [-2, -1, 0, 1, 2].
+      # then get the CDF, which will return something close to
+      # the original values used in get_quantiles()
+      # finally, can check the normalized rank error bound
+      pts = quantiles.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
+      cdf = quantiles.get_cdf(pts)  # include 1.0 at end to account for all probability mass
+      self.assertEqual(len(cdf), len(pts)+1)
+      err = quantiles.normalized_rank_error(False)
+      self.assertEqual(err, quantiles_floats_sketch.get_normalized_rank_error(k, False))
+
+      # and a few basic queries about the sketch
+      self.assertFalse(quantiles.is_empty())
+      self.assertTrue(quantiles.is_estimation_mode())
+      self.assertEqual(quantiles.get_n(), n)
+      self.assertEqual(quantiles.get_k(), k)
+      self.assertLess(quantiles.get_num_retained(), n)
+
+      # merging itself will double the number of items the sketch has seen
+      quantiles.merge(quantiles)
+      self.assertEqual(quantiles.get_n(), 2*n)
+
+      # we can then serialize and reconstruct the sketch
+      quantiles_bytes = quantiles.serialize()
+      new_quantiles = quantiles.deserialize(quantiles_bytes)
+      self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
+      self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
+      self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
+      self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
+      self.assertEqual(quantiles.get_rank(0.0), new_quantiles.get_rank(0.0))
+
+    def test_quantiles_ints_sketch(self):
+        k = 128
+        n = 10
+        quantiles = quantiles_ints_sketch(k)
+        for i in range(0, n):
+          quantiles.update(i)
+
+        self.assertEqual(quantiles.get_min_value(), 0)
+        self.assertEqual(quantiles.get_max_value(), n-1)
+        self.assertEqual(quantiles.get_n(), n)
+        self.assertFalse(quantiles.is_empty())
+        self.assertFalse(quantiles.is_estimation_mode()) # n < k
+        self.assertEqual(quantiles.get_k(), k)
+
+        pmf = quantiles.get_pmf([round(n/2)])
+        self.assertIsNotNone(pmf)
+        self.assertEqual(len(pmf), 2)
+
+        cdf = quantiles.get_cdf([round(n/2)])
+        self.assertIsNotNone(cdf)
+        self.assertEqual(len(cdf), 2)
+
+        self.assertEqual(quantiles.get_quantile(0.5), round(n/2))
+        quants = quantiles.get_quantiles([0.25, 0.5, 0.75])
+        self.assertIsNotNone(quants)
+        self.assertEqual(len(quants), 3)
+
+        self.assertEqual(quantiles.get_rank(round(n/2)), 0.5)
+
+        # merge self
+        quantiles.merge(quantiles)
+        self.assertEqual(quantiles.get_n(), 2 * n)
+
+        sk_bytes = quantiles.serialize()
+        self.assertTrue(isinstance(quantiles_ints_sketch.deserialize(sk_bytes), quantiles_ints_sketch))
+
+    def test_quantiles_floats_sketch(self):
+      # already tested ints and it's templatized, so just make sure it instantiates properly
+      k = 256
+      quantiles = quantiles_floats_sketch(k)
+      self.assertTrue(quantiles.is_empty())
+
+    def test_quantiles_doubles_sketch(self):
+      # already tested ints and it's templatized, so just make sure it instantiates properly
+      k = 128
+      quantiles = quantiles_doubles_sketch(k)
+      self.assertTrue(quantiles.is_empty())
+
+if __name__ == '__main__':
+    unittest.main()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[datasketches-cpp] 01/03: improve test coverage

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch quantiles
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 8a5c65e302473a2cc339249bf2fe79ee1d278b8b
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Mon Apr 25 23:55:28 2022 -0700

    improve test coverage
---
 quantiles/include/quantiles_sketch_impl.hpp |  19 ----
 quantiles/test/quantiles_sketch_test.cpp    | 133 ++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+), 19 deletions(-)

diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp
index cf620d3..2065396 100644
--- a/quantiles/include/quantiles_sketch_impl.hpp
+++ b/quantiles/include/quantiles_sketch_impl.hpp
@@ -208,25 +208,6 @@ void quantiles_sketch<T, C, A>::merge(FwdSk&& other) {
     }
     *this = sk_copy;
   }
-
-/*
-  // update min/max values
-  // can't just check is_empty() since min/max might not have been set if
-  // there were no base buffer items added via update()
-  if (min_value_ == nullptr) {
-    min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
-  } else {
-    if (C()(*other.min_value_, *min_value_))
-      *min_value_ = conditional_forward<FwdSk>(*other.min_value_);
-  }
-
-  if (max_value_ == nullptr) {
-    max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
-  } else {
-    if (C()(*max_value_, *other.max_value_))
-      *max_value_ = conditional_forward<FwdSk>(*other.max_value_);
-  }
-  */
 }
 
 template<typename T, typename C, typename A>
diff --git a/quantiles/test/quantiles_sketch_test.cpp b/quantiles/test/quantiles_sketch_test.cpp
index 8860e4a..1c8764e 100644
--- a/quantiles/test/quantiles_sketch_test.cpp
+++ b/quantiles/test/quantiles_sketch_test.cpp
@@ -599,6 +599,139 @@ TEST_CASE("quantiles sketch", "[quantiles_sketch]") {
     REQUIRE(sketch2.get_max_value() == 999999.0f);
   }
 
+  SECTION("merge: two empty") {
+    quantiles_float_sketch sk1(128, 0);
+    quantiles_float_sketch sk2(64, 0);
+    sk1.merge(sk2);
+    REQUIRE(sk1.get_n() == 0);
+    REQUIRE(sk1.get_k() == 128);
+
+    sk2.merge(const_cast<const quantiles_float_sketch&>(sk1));
+    REQUIRE(sk2.get_n() == 0);
+    REQUIRE(sk2.get_k() == 64);
+  }
+
+  SECTION("merge: exact as input") {
+    const uint16_t k = 128;
+    quantiles_float_sketch sketch1(2 * k, 0);
+    quantiles_float_sketch sketch2(k, 0);
+
+    for (int i = 0; i < k / 2; i++) {
+      sketch1.update(static_cast<float>(i));
+      sketch2.update(static_cast<float>(i));
+    }
+
+    for (int i = 0; i < 100 * k; i++) {
+      sketch1.update(static_cast<float>(i));
+    }
+    
+    sketch1.merge(sketch2);
+    REQUIRE(sketch1.get_n() == 101 * k);
+    REQUIRE(sketch1.get_k() == 2 * k); // no reason to have shrunk
+    REQUIRE(sketch1.get_min_value() == 0.0f);
+    REQUIRE(sketch1.get_max_value() == static_cast<float>(100 * k - 1));
+  }
+
+  SECTION("merge: src estimation, tgt exact, tgt.k > src.k") {
+    const uint16_t k = 128;
+    quantiles_float_sketch sketch1(2 * k, 0);
+    quantiles_float_sketch sketch2(k, 0);
+
+    for (int i = 0; i < k / 2; i++) {
+      sketch1.update(static_cast<float>(i));
+      sketch2.update(static_cast<float>(i));
+    }
+    
+    for (int i = 0; i < 100 * k; i++) {
+      sketch2.update(static_cast<float>(i));
+    }
+
+    sketch1.merge(sketch2);
+    REQUIRE(sketch1.get_n() == 101 * k);
+    REQUIRE(sketch1.get_k() == k); // no reason to have shrunk
+    REQUIRE(sketch1.get_min_value() == 0.0f);
+    REQUIRE(sketch1.get_max_value() == static_cast<float>(100 * k - 1));
+  }
+
+  SECTION("merge: both estimation, tgt.k < src.k") {
+    const uint16_t k = 128;
+    quantiles_float_sketch sketch1(k, 0);
+    quantiles_float_sketch sketch2(2 * k, 0);
+
+    for (int i = 0; i < 100 * k; i++) {
+      sketch1.update(static_cast<float>(i));
+      sketch2.update(static_cast<float>(-i));
+    }
+    
+    sketch1.merge(sketch2);
+    REQUIRE(sketch1.get_n() == 200 * k);
+    REQUIRE(sketch1.get_k() == k); // no reason to have shrunk
+    REQUIRE(sketch1.get_min_value() == static_cast<float>(-100 * k + 1));
+    REQUIRE(sketch1.get_max_value() == static_cast<float>(100 * k - 1));
+    REQUIRE(sketch1.get_quantile(0.5) == Approx(0.0).margin(100 * k * RANK_EPS_FOR_K_128));
+  }
+
+  SECTION("merge: src estimation, tgt exact, equal k") {
+    const uint16_t k = 128;
+    quantiles_float_sketch sketch1(k, 0);
+    quantiles_float_sketch sketch2(k, 0);
+
+    for (int i = 0; i < k / 2; i++) {
+      sketch1.update(static_cast<float>(i));
+      sketch2.update(static_cast<float>(k - i - 1));
+    }
+    
+    for (int i = k; i < 100 * k; i++) {
+      sketch2.update(static_cast<float>(i));
+    }
+
+    sketch1.merge(sketch2);
+    REQUIRE(sketch1.get_n() == 100 * k);
+    REQUIRE(sketch1.get_k() == k);
+    REQUIRE(sketch1.get_min_value() == 0.0f);
+    REQUIRE(sketch1.get_max_value() == static_cast<float>(100 * k - 1));
+    float n = 100 * k - 1;
+    REQUIRE(sketch1.get_quantile(0.5) == Approx(n / 2).margin(n / 2 * RANK_EPS_FOR_K_128));
+  }
+
+  SECTION("merge: both estimation, no base buffer, same k") {
+    const uint16_t k = 128;
+    quantiles_float_sketch sketch1(k, 0);
+    quantiles_float_sketch sketch2(k, 0);
+
+    uint64_t n = 2 * k;
+    for (uint64_t i = 0; i < n; i++) {
+      sketch1.update(static_cast<float>(i));
+      sketch2.update(static_cast<float>(2 * n - i - 1));
+    }
+    
+    sketch1.merge(sketch2);
+    REQUIRE(sketch1.get_n() == 2 * n);
+    REQUIRE(sketch1.get_k() == k);
+    REQUIRE(sketch1.get_min_value() == 0.0f);
+    REQUIRE(sketch1.get_max_value() == static_cast<float>(2 * n - 1));
+    REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_128));
+  }
+
+  SECTION("merge: both estimation, no base buffer, tgt.k < src.k") {
+    const uint16_t k = 128;
+    quantiles_float_sketch sketch1(k, 0);
+    quantiles_float_sketch sketch2(2 * k, 0);
+
+    uint64_t n = 4 * k;
+    for (uint64_t i = 0; i < n; i++) {
+      sketch1.update(static_cast<float>(i));
+      sketch2.update(static_cast<float>(2 * n - i - 1));
+    }
+    
+    sketch1.merge(sketch2);
+    REQUIRE(sketch1.get_n() == 2 * n);
+    REQUIRE(sketch1.get_k() == k);
+    REQUIRE(sketch1.get_min_value() == 0.0f);
+    REQUIRE(sketch1.get_max_value() == static_cast<float>(2 * n - 1));
+    REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_128));
+  }
+
   SECTION("sketch of ints") {
     quantiles_sketch<int> sketch;
     REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org