You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/12/23 20:15:11 UTC
[datasketches-cpp] 02/02: rearranged code to simplify
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch python_wrapper_improvement
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit 98a46081820f00a6ab7372ea871607ae63cc644b
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Dec 23 12:15:03 2022 -0800
rearranged code to simplify
---
python/src/fi_wrapper.cpp | 114 ++++++++++++++++++++-------------------------
python/src/hll_wrapper.cpp | 56 ++++++++++------------
2 files changed, 75 insertions(+), 95 deletions(-)
diff --git a/python/src/fi_wrapper.cpp b/python/src/fi_wrapper.cpp
index 1c217e1..bdb49a4 100644
--- a/python/src/fi_wrapper.cpp
+++ b/python/src/fi_wrapper.cpp
@@ -17,62 +17,11 @@
* under the License.
*/
-#include "frequent_items_sketch.hpp"
-
#include <pybind11/pybind11.h>
-#include <sstream>
-
-namespace py = pybind11;
-namespace datasketches {
-namespace python {
-
-template<typename T>
-frequent_items_sketch<T> fi_sketch_deserialize(py::bytes skBytes) {
- std::string skStr = skBytes; // implicit cast
- return frequent_items_sketch<T>::deserialize(skStr.c_str(), skStr.length());
-}
-
-template<typename T>
-py::object fi_sketch_serialize(const frequent_items_sketch<T>& sk) {
- auto serResult = sk.serialize();
- return py::bytes((char*)serResult.data(), serResult.size());
-}
-
-// maybe possible to disambiguate the static vs method get_epsilon calls, but
-// this is easier for now
-template<typename T>
-double fi_sketch_get_generic_epsilon(uint8_t lg_max_map_size) {
- return frequent_items_sketch<T>::get_epsilon(lg_max_map_size);
-}
-
-template<typename T>
-py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
- frequent_items_error_type err_type,
- uint64_t threshold = 0) {
- if (threshold == 0) { threshold = sk.get_maximum_error(); }
-
- py::list list;
- auto items = sk.get_frequent_items(err_type, threshold);
- for (auto iter = items.begin(); iter != items.end(); ++iter) {
- py::tuple t = py::make_tuple(iter->get_item(),
- iter->get_estimate(),
- iter->get_lower_bound(),
- iter->get_upper_bound());
- list.append(t);
- }
- return list;
-}
-
-template<typename T>
-size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
- return sk.get_serialized_size_bytes();
-}
-
-}
-}
+#include "frequent_items_sketch.hpp"
-namespace dspy = datasketches::python;
+namespace py = pybind11;
template<typename T>
void bind_fi_sketch(py::module &m, const char* name) {
@@ -86,7 +35,6 @@ void bind_fi_sketch(py::module &m, const char* name) {
"Produces a string summary of the sketch")
.def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1,
"Updates the sketch with the given string and, optionally, a weight")
- .def("get_frequent_items", &dspy::fi_sketch_get_frequent_items<T>, py::arg("err_type"), py::arg("threshold")=0)
.def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge,
"Merges the given sketch into this one")
.def("is_empty", &frequent_items_sketch<T>::is_empty,
@@ -105,15 +53,55 @@ void bind_fi_sketch(py::module &m, const char* name) {
"Returns the guaranteed upper bound weight (frequency) of the given item.")
.def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon,
"Returns the epsilon value used by the sketch to compute error")
- .def_static("get_epsilon_for_lg_size", &dspy::fi_sketch_get_generic_epsilon<T>, py::arg("lg_max_map_size"),
- "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
- .def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
- "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
- .def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
- "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
- .def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
- .def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
- ;
+ .def(
+ "get_frequent_items",
+ [](const frequent_items_sketch<T>& sk, frequent_items_error_type err_type, uint64_t threshold) {
+ if (threshold == 0) threshold = sk.get_maximum_error();
+ py::list list;
+ auto rows = sk.get_frequent_items(err_type, threshold);
+ for (auto row: rows) {
+ list.append(py::make_tuple(
+ row.get_item(),
+ row.get_estimate(),
+ row.get_lower_bound(),
+ row.get_upper_bound())
+ );
+ }
+ return list;
+ },
+ py::arg("err_type"), py::arg("threshold")=0
+ )
+ .def_static(
+ "get_epsilon_for_lg_size",
+ [](uint8_t lg_max_map_size) { return frequent_items_sketch<T>::get_epsilon(lg_max_map_size); },
+ py::arg("lg_max_map_size"),
+ "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)"
+ )
+ .def_static(
+ "get_apriori_error",
+ &frequent_items_sketch<T>::get_apriori_error,
+ py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
+ "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight."
+ )
+ .def(
+ "get_serialized_size_bytes",
+ [](const frequent_items_sketch<T>& sk) { return sk.get_serialized_size_bytes(); },
+ "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at."
+ )
+ .def(
+ "serialize",
+ [](const frequent_items_sketch<T>& sk) {
+ auto bytes = sk.serialize();
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
+ },
+ "Serializes the sketch into a bytes object"
+ )
+ .def_static(
+ "deserialize",
+ [](const std::string& bytes) { return frequent_items_sketch<T>::deserialize(bytes.data(), bytes.size()); },
+ py::arg("bytes"),
+ "Reads a bytes object and returns the corresponding frequent_strings_sketch"
+ );
}
void init_fi(py::module &m) {
diff --git a/python/src/hll_wrapper.cpp b/python/src/hll_wrapper.cpp
index 0491074..24da90a 100644
--- a/python/src/hll_wrapper.cpp
+++ b/python/src/hll_wrapper.cpp
@@ -17,34 +17,11 @@
* under the License.
*/
-#include "hll.hpp"
-
#include <pybind11/pybind11.h>
-namespace py = pybind11;
-
-namespace datasketches {
-namespace python {
-
-hll_sketch hll_sketch_deserialize(py::bytes skBytes) {
- std::string skStr = skBytes; // implicit cast
- return hll_sketch::deserialize(skStr.c_str(), skStr.length());
-}
-
-py::object hll_sketch_serialize_compact(const hll_sketch& sk) {
- auto serResult = sk.serialize_compact();
- return py::bytes((char*)serResult.data(), serResult.size());
-}
-
-py::object hll_sketch_serialize_updatable(const hll_sketch& sk) {
- auto serResult = sk.serialize_updatable();
- return py::bytes((char*)serResult.data(), serResult.size());
-}
-
-}
-}
+#include "hll.hpp"
-namespace dspy = datasketches::python;
+namespace py = pybind11;
void init_hll(py::module &m) {
using namespace datasketches;
@@ -59,12 +36,6 @@ void init_hll(py::module &m) {
.def(py::init<uint8_t>(), py::arg("lg_k"))
.def(py::init<uint8_t, target_hll_type>(), py::arg("lg_k"), py::arg("tgt_type"))
.def(py::init<uint8_t, target_hll_type, bool>(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false)
- .def_static("deserialize", &dspy::hll_sketch_deserialize,
- "Reads a bytes object and returns the corresponding hll_sketch")
- .def("serialize_compact", &dspy::hll_sketch_serialize_compact,
- "Serializes the sketch into a bytes object, compressiong the exception table if HLL_4")
- .def("serialize_updatable", &dspy::hll_sketch_serialize_updatable,
- "Serializes the sketch into a bytes object")
.def("__str__", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
"Produces a string summary of the sketch")
@@ -101,7 +72,28 @@ void init_hll(py::module &m) {
.def_static("get_rel_err", &hll_sketch::get_rel_err,
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
"Retuns the a priori relative error bound for the given parameters")
- ;
+ .def(
+ "serialize_compact",
+ [](const hll_sketch& sk) {
+ auto bytes = sk.serialize_compact();
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
+ },
+ "Serializes the sketch into a bytes object, compressiong the exception table if HLL_4"
+ )
+ .def(
+ "serialize_updatable",
+ [](const hll_sketch& sk) {
+ auto bytes = sk.serialize_updatable();
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
+ },
+ "Serializes the sketch into a bytes object"
+ )
+ .def_static(
+ "deserialize",
+ [](const std::string& bytes) { return hll_sketch::deserialize(bytes.data(), bytes.size()); },
+ py::arg("bytes"),
+ "Reads a bytes object and returns the corresponding hll_sketch"
+ );
py::class_<hll_union>(m, "hll_union")
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org