You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/12/23 00:29:46 UTC

[datasketches-cpp] branch python_wrapper_improvement updated: added iterator, rearranged and simplified existing code

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch python_wrapper_improvement
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


The following commit(s) were added to refs/heads/python_wrapper_improvement by this push:
     new 52ed141  added iterator, rearranged and simplified existing code
52ed141 is described below

commit 52ed141e5e01595ccc3453c4d793abfc54959542
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Dec 22 16:29:41 2022 -0800

    added iterator, rearranged and simplified existing code
---
 python/src/req_wrapper.cpp | 275 +++++++++++++++++++--------------------------
 python/tests/req_test.py   |   7 ++
 2 files changed, 124 insertions(+), 158 deletions(-)

diff --git a/python/src/req_wrapper.cpp b/python/src/req_wrapper.cpp
index 6480ff6..1f9b01a 100644
--- a/python/src/req_wrapper.cpp
+++ b/python/src/req_wrapper.cpp
@@ -17,97 +17,15 @@
  * under the License.
  */
 
-#include "req_sketch.hpp"
-
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/numpy.h>
-#include <sstream>
 #include <vector>
 #include <stdexcept>
 
-namespace py = pybind11;
-
-namespace datasketches {
-
-namespace python {
-
-template<typename T>
-req_sketch<T> req_sketch_deserialize(py::bytes sk_bytes) {
-  std::string sk_str = sk_bytes; // implicit cast  
-  return req_sketch<T>::deserialize(sk_str.c_str(), sk_str.length());
-}
-
-template<typename T>
-py::object req_sketch_serialize(const req_sketch<T>& sk) {
-  auto ser_result = sk.serialize();
-  return py::bytes((char*)ser_result.data(), ser_result.size());
-}
-
-// maybe possible to disambiguate the static vs method rank error calls, but
-// this is easier for now
-template<typename T>
-double req_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
-  return req_sketch<T>::get_normalized_rank_error(k, pmf);
-}
-
-template<typename T>
-py::list req_sketch_get_quantiles(const req_sketch<T>& sk,
-                                  std::vector<double>& ranks,
-                                  bool inclusive) {
-  size_t n_quantiles = ranks.size();
-  auto result = sk.get_quantiles(ranks.data(), n_quantiles, inclusive);
-  // returning as std::vector<> would copy values to a list anyway
-  py::list list(n_quantiles);
-  for (size_t i = 0; i < n_quantiles; ++i) {
-      list[i] = result[i];
-  }
-  return list;
-}
-
-template<typename T>
-py::list req_sketch_get_pmf(const req_sketch<T>& sk,
-                            std::vector<T>& split_points,
-                            bool inclusive) {
-  size_t n_points = split_points.size();
-  auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
-  py::list list(n_points + 1);
-  for (size_t i = 0; i <= n_points; ++i) {
-    list[i] = result[i];
-  }
-  return list;
-}
-
-template<typename T>
-py::list req_sketch_get_cdf(const req_sketch<T>& sk,
-                            std::vector<T>& split_points,
-                            bool inclusive) {
-  size_t n_points = split_points.size();
-  auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
-  py::list list(n_points + 1);
-  for (size_t i = 0; i <= n_points; ++i) {
-    list[i] = result[i];
-  }
-  return list;
-}
-
-template<typename T>
-void req_sketch_update(req_sketch<T>& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
-  if (items.ndim() != 1) {
-    throw std::invalid_argument("input data must have only one dimension. Found: "
-          + std::to_string(items.ndim()));
-  }
-  
-  auto data = items.template unchecked<1>();
-  for (uint32_t i = 0; i < data.size(); ++i) {
-    sk.update(data(i));
-  }
-}
-
-}
-}
+#include "req_sketch.hpp"
 
-namespace dspy = datasketches::python;
+namespace py = pybind11;
 
 template<typename T>
 void bind_req_sketch(py::module &m, const char* name) {
@@ -117,96 +35,137 @@ void bind_req_sketch(py::module &m, const char* name) {
     .def(py::init<uint16_t, bool>(), py::arg("k")=12, py::arg("is_hra")=true)
     .def(py::init<const req_sketch<T>&>())
     .def("update", (void (req_sketch<T>::*)(const T&)) &req_sketch<T>::update, py::arg("item"),
-         "Updates the sketch with the given value")
-    .def("update", &dspy::req_sketch_update<T>, py::arg("array"),
-         "Updates the sketch with the values in the given array")
+        "Updates the sketch with the given value")
+    .def(
+        "update",
+        [](req_sketch<T>& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
+          if (items.ndim() != 1) {
+            throw std::invalid_argument("input data must have only one dimension. Found: "
+              + std::to_string(items.ndim()));
+          }
+          auto array = items.template unchecked<1>();
+          for (uint32_t i = 0; i < array.size(); ++i) sk.update(array(i));
+        },
+        py::arg("array"),
+        "Updates the sketch with the values in the given array"
+    )
     .def("merge", (void (req_sketch<T>::*)(const req_sketch<T>&)) &req_sketch<T>::merge, py::arg("sketch"),
-         "Merges the provided sketch into the this one")
+        "Merges the provided sketch into this one")
     .def("__str__", &req_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
-         "Produces a string summary of the sketch")
+        "Produces a string summary of the sketch")
     .def("to_string", &req_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
-         "Produces a string summary of the sketch")
+        "Produces a string summary of the sketch")
     .def("is_hra", &req_sketch<T>::is_HRA,
-         "Returns True if the sketch is in High Rank Accuracy mode, otherwise False")
+        "Returns True if the sketch is in High Rank Accuracy mode, otherwise False")
     .def("is_empty", &req_sketch<T>::is_empty,
-         "Returns True if the sketch is empty, otherwise False")
+        "Returns True if the sketch is empty, otherwise False")
     .def("get_k", &req_sketch<T>::get_k,
-         "Returns the configured parameter k")
+        "Returns the configured parameter k")
     .def("get_n", &req_sketch<T>::get_n,
-         "Returns the length of the input stream")
+        "Returns the length of the input stream")
     .def("get_num_retained", &req_sketch<T>::get_num_retained,
-         "Returns the number of retained items (samples) in the sketch")
+        "Returns the number of retained items (samples) in the sketch")
     .def("is_estimation_mode", &req_sketch<T>::is_estimation_mode,
-         "Returns True if the sketch is in estimation mode, otherwise False")
+        "Returns True if the sketch is in estimation mode, otherwise False")
     .def("get_min_value", &req_sketch<T>::get_min_item,
-         "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
+        "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
     .def("get_max_value", &req_sketch<T>::get_max_item,
-         "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
+        "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
     .def("get_quantile", &req_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
-         "Returns an approximation to the data value "
-         "associated with the given normalized rank in a hypothetical sorted "
-         "version of the input stream so far.\n"
-         "For req_floats_sketch: if the sketch is empty this returns nan. "
-         "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
-    .def("get_quantiles", &dspy::req_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
-         "This returns an array that could have been generated by using get_quantile() for each "
-         "normalized rank separately.\n"
-         "If the sketch is empty this returns an empty vector.\n"
-         "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
+        "Returns an approximation to the data value "
+        "associated with the given normalized rank in a hypothetical sorted "
+        "version of the input stream so far.\n"
+        "For req_floats_sketch: if the sketch is empty this returns nan. "
+        "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
+    .def(
+        "get_quantiles",
+        [](const req_sketch<T>& sk, const std::vector<double>& ranks, bool inclusive) {
+          return sk.get_quantiles(ranks.data(), ranks.size(), inclusive);
+        },
+        py::arg("ranks"), py::arg("inclusive")=false,
+        "This returns an array that could have been generated by using get_quantile() for each "
+        "normalized rank separately.\n"
+        "If the sketch is empty this returns an empty vector.\n"
+        "Deprecated. Will be removed in the next major version. Use get_quantile() instead."
+    )
     .def("get_rank", &req_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
-         "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
-         "The resulting approximation has a probabilistic guarantee that can be obtained from the "
-         "get_normalized_rank_error(False) function.\n"
-         "With the parameter inclusive=true the weight of the given value is included into the rank."
-         "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
-         "If the sketch is empty this returns nan.")
-    .def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
-         "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
-         "given a set of split points (values).\n"
-         "The resulting approximations have a probabilistic guarantee that can be obtained from the "
-         "get_normalized_rank_error(True) function.\n"
-         "If the sketch is empty this returns an empty vector.\n"
-         "split_points is an array of m unique, monotonically increasing float values "
-         "that divide the real number line into m+1 consecutive disjoint intervals.\n"
-         "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
-         "exclusive of the right split point, with the exception that the last interval will include "
-         "the maximum value.\n"
-         "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
-         "inclusive of the right split point.\n"
-         "It is not necessary to include either the min or max values in these split points.")
-    .def("get_cdf", &dspy::req_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
-         "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
-         "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
-         "The resulting approximations have a probabilistic guarantee that can be obtained from the "
-         "get_normalized_rank_error(True) function.\n"
-         "If the sketch is empty this returns an empty vector.\n"
-         "split_points is an array of m unique, monotonically increasing float values "
-         "that divide the real number line into m+1 consecutive disjoint intervals.\n"
-         "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
-         "exclusive of the right split point, with the exception that the last interval will include "
-         "the maximum value.\n"
-         "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
-         "inclusive of the right split point.\n"
-         "It is not necessary to include either the min or max values in these split points.")
+        "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
+        "The resulting approximation has a probabilistic guarantee that can be obtained from the "
+        "get_normalized_rank_error(False) function.\n"
+        "With the parameter inclusive=true the weight of the given value is included into the rank."
+        "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
+        "If the sketch is empty this returns nan.")
+    .def(
+        "get_pmf",
+        [](const req_sketch<T>& sk, const std::vector<T>& split_points, bool inclusive) {
+          return sk.get_PMF(split_points.data(), split_points.size(), inclusive);
+        },
+        py::arg("split_points"), py::arg("inclusive")=false,
+        "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
+        "given a set of split points (values).\n"
+        "The resulting approximations have a probabilistic guarantee that can be obtained from the "
+        "get_normalized_rank_error(True) function.\n"
+        "If the sketch is empty this returns an empty vector.\n"
+        "split_points is an array of m unique, monotonically increasing float values "
+        "that divide the real number line into m+1 consecutive disjoint intervals.\n"
+        "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
+        "exclusive of the right split point, with the exception that the last interval will include "
+        "the maximum value.\n"
+        "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
+        "inclusive of the right split point.\n"
+        "It is not necessary to include either the min or max values in these split points."
+    )
+    .def(
+        "get_cdf",
+        [](const req_sketch<T>& sk, const std::vector<T>& split_points, bool inclusive) {
+          return sk.get_CDF(split_points.data(), split_points.size(), inclusive);
+        },
+        py::arg("split_points"), py::arg("inclusive")=false,
+        "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
+        "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
+        "The resulting approximations have a probabilistic guarantee that can be obtained from the "
+        "get_normalized_rank_error(True) function.\n"
+        "If the sketch is empty this returns an empty vector.\n"
+        "split_points is an array of m unique, monotonically increasing float values "
+        "that divide the real number line into m+1 consecutive disjoint intervals.\n"
+        "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
+        "exclusive of the right split point, with the exception that the last interval will include "
+        "the maximum value.\n"
+        "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
+        "inclusive of the right split point.\n"
+        "It is not necessary to include either the min or max values in these split points."
+    )
     .def("get_rank_lower_bound", &req_sketch<T>::get_rank_lower_bound, py::arg("rank"), py::arg("num_std_dev"),
-         "Returns an approximate lower bound on the given normalized rank.\n"
-         "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
-         "the number of standard deviations must be 1, 2, or 3.")
+        "Returns an approximate lower bound on the given normalized rank.\n"
+        "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
+        "the number of standard deviations must be 1, 2, or 3.")
     .def("get_rank_upper_bound", &req_sketch<T>::get_rank_upper_bound, py::arg("rank"), py::arg("num_std_dev"),
-         "Returns an approximate upper bound on the given normalized rank.\n"
-         "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
-         "the number of standard deviations must be 1, 2, or 3.")
+        "Returns an approximate upper bound on the given normalized rank.\n"
+        "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
+        "the number of standard deviations must be 1, 2, or 3.")
     .def_static("get_RSE", &req_sketch<T>::get_RSE,
-         py::arg("k"), py::arg("rank"), py::arg("is_hra"), py::arg("n"),
-         "Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). "
-         "Derived from Lemma 12 in http://arxiv.org/abs/2004.01668v2, but the constant factors have been "
-         "modified based on empirical measurements, for a given value of parameter k.\n"
-         "Normalized rank must be a value between 0.0 and 1.0 (inclusive). If is_hra is True, uses high "
-         "rank accuracy mode, else low rank accuracy. N is an estimate of the total number of points "
-         "provided to the sketch.")
-    .def("serialize", &dspy::req_sketch_serialize<T>, "Serializes the sketch into a bytes object")
-    .def_static("deserialize", &dspy::req_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
-    ;
+        py::arg("k"), py::arg("rank"), py::arg("is_hra"), py::arg("n"),
+        "Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). "
+        "Derived from Lemma 12 in http://arxiv.org/abs/2004.01668v2, but the constant factors have been "
+        "modified based on empirical measurements, for a given value of parameter k.\n"
+        "Normalized rank must be a value between 0.0 and 1.0 (inclusive). If is_hra is True, uses high "
+        "rank accuracy mode, else low rank accuracy. N is an estimate of the total number of points "
+        "provided to the sketch.")
+    .def(
+        "serialize",
+        [](const req_sketch<T>& sk) {
+          auto bytes = sk.serialize();
+          return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
+        },
+        "Serializes the sketch into a bytes object"
+    )
+    .def_static(
+        "deserialize",
+        [](const std::string& bytes) { return req_sketch<T>::deserialize(bytes.data(), bytes.size()); },
+        py::arg("bytes"),
+        "Deserializes the sketch from a bytes object"
+    )
+    .def("__iter__", [](const req_sketch<T>& s) { return py::make_iterator(s.begin(), s.end()); });
 }
 
 void init_req(py::module &m) {
diff --git a/python/tests/req_test.py b/python/tests/req_test.py
index 658cd57..2c35f3c 100644
--- a/python/tests/req_test.py
+++ b/python/tests/req_test.py
@@ -79,6 +79,13 @@ class reqTest(unittest.TestCase):
       self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
       self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))
 
+      total_weight = 0
+      for tuple in req:
+        item = tuple[0]
+        weight = tuple[1]
+        total_weight = total_weight + weight
+      self.assertEqual(total_weight, req.get_n())
+
     def test_req_ints_sketch(self):
         k = 100
         n = 10


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org