You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/15 09:04:23 UTC

[incubator-datasketches-cpp] 02/02: add varopt to python, with changes to support human-readable to_string() even if type has no operator<< defined

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 4f91eba2e7b56d19a3120d5508705e09c7f764c5
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Sat Feb 15 01:04:08 2020 -0800

    add varopt to python, with changes to support human-readable to_string() even if type has no operator<< defined
---
 python/CMakeLists.txt                    |   2 +
 python/src/datasketches.cpp              |   2 +
 python/src/vo_wrapper.cpp                | 120 +++++++++++++++++++++++++++++++
 sampling/include/var_opt_sketch.hpp      |   5 ++
 sampling/include/var_opt_sketch_impl.hpp |  16 ++++-
 sampling/include/var_opt_union_impl.hpp  |   1 -
 6 files changed, 143 insertions(+), 3 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 1136382..d1a3f9f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -34,6 +34,7 @@ target_link_libraries(python
     cpc
     fi
     theta
+    sampling
     pybind11::module
 )
 
@@ -55,4 +56,5 @@ target_sources(python
     src/cpc_wrapper.cpp
     src/fi_wrapper.cpp
     src/theta_wrapper.cpp
+    src/vo_wrapper.cpp
 )
diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp
index 158f97c..f8c138a 100644
--- a/python/src/datasketches.cpp
+++ b/python/src/datasketches.cpp
@@ -26,6 +26,7 @@ void init_kll(py::module& m);
 void init_fi(py::module& m);
 void init_cpc(py::module& m);
 void init_theta(py::module& m);
+void init_vo(py::module& m);
 
 PYBIND11_MODULE(datasketches, m) {
   init_hll(m);
@@ -33,4 +34,5 @@ PYBIND11_MODULE(datasketches, m) {
   init_fi(m);
   init_cpc(m);
   init_theta(m);
+  init_vo(m);
 }
diff --git a/python/src/vo_wrapper.cpp b/python/src/vo_wrapper.cpp
new file mode 100644
index 0000000..a2346e5
--- /dev/null
+++ b/python/src/vo_wrapper.cpp
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "var_opt_sketch.hpp"
+#include "var_opt_union.hpp"
+
+#include <pybind11/pybind11.h>
+#include <sstream>
+
+namespace py = pybind11;
+
+namespace datasketches {
+namespace python {
+
+// Copies the sketch's samples into a Python list of (item, weight) tuples.
+template<typename T>
+py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
+  py::list samples;
+  for (auto& entry : sk) {
+    samples.append(py::make_tuple(entry.first, entry.second));
+  }
+  return samples;
+}
+
+// Returns the sketch summary; when print_items is true, appends one line per sample.
+template<typename T>
+std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
+  if (!print_items) {
+    return sk.to_string();
+  }
+  std::ostringstream os;
+  sk.to_stream(os);
+  os << "### VarOpt Sketch Items" << std::endl;
+  int idx = 0;
+  for (auto& sample : sk) {
+    // sample.first is an arbitrary py::object: render it through Python's
+    // str() and then cast the resulting py::str to a C++ std::string
+    py::str rendered(sample.first);
+    std::string item_str = py::cast<std::string>(rendered);
+    // sample.second is the weight, which is guaranteed to be a double
+    os << idx << ": " << item_str << "\twt = " << sample.second << std::endl;
+    ++idx;
+  }
+  return os.str();
+}
+
+template<typename T>
+std::string vo_union_to_string(const var_opt_union<T>& u) {
+  // Returns only the union's own to_string() output: there is no direct access
+  // to the gadget, so the item list of arbitrary python objects is not printed
+  return u.to_string();
+}
+
+}
+}
+
+namespace dspy = datasketches::python;
+
+// Registers var_opt_sketch<T> in module m as a Python class named `name`.
+template<typename T>
+void bind_vo_sketch(py::module &m, const char* name) {
+  using namespace datasketches;
+
+  py::class_<var_opt_sketch<T>>(m, name)
+    .def(py::init<uint32_t>(), py::arg("k"))
+    .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false) // str() passes no arguments
+    .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false)
+    .def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0)
+    .def_property_readonly("k", &var_opt_sketch<T>::get_k)
+    .def_property_readonly("n", &var_opt_sketch<T>::get_n)
+    .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples)
+    .def("get_samples", &dspy::vo_sketch_get_samples<T>)
+    .def("is_empty", &var_opt_sketch<T>::is_empty)
+    // Serialization is left unbound: not yet clear how to serialize arbitrary python objects portably
+    //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
+    //.def("serialize", &dspy::vo_sketch_serialize<T>)
+    //.def_static("deserialize", &dspy::vo_sketch_deserialize<T>)
+    ;
+}
+
+// Registers var_opt_union<T> in module m as a Python class named `name`.
+template<typename T>
+void bind_vo_union(py::module &m, const char* name) {
+  using namespace datasketches;
+
+  py::class_<var_opt_union<T>>(m, name)
+    .def(py::init<uint32_t>(), py::arg("max_k"))
+    .def("__str__", &dspy::vo_union_to_string<T>)
+    .def("to_string", &dspy::vo_union_to_string<T>)
+    .def("update", (void (var_opt_union<T>::*)(const var_opt_sketch<T>& sk)) &var_opt_union<T>::update, py::arg("sketch"))
+    .def("get_result", &var_opt_union<T>::get_result)
+    .def("reset", &var_opt_union<T>::reset)
+    // Serialization is left unbound: not yet clear how to serialize arbitrary python objects portably
+    //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
+    //.def("serialize", &dspy::vo_union_serialize<T>)
+    //.def_static("deserialize", &dspy::vo_union_deserialize<T>)
+    ;
+}
+
+
+void init_vo(py::module &m) {  // registers the VarOpt sketch and union bindings for py::object items
+  bind_vo_sketch<py::object>(m, "varopt_object_sketch");
+  bind_vo_union<py::object>(m, "varopt_object_union");
+}
diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index 7230537..0f044bd 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -90,6 +90,11 @@ class var_opt_sketch {
     std::ostream& to_stream(std::ostream& os) const;
     std::string to_string() const;
 
+    // These will only work for T with a defined operator<<()
+    // Kept separate to allow to_string() to compile for all types T
+    std::ostream& items_to_stream(std::ostream& os) const;
+    std::string items_to_string() const;
+
     subset_summary estimate_subset_sum(std::function<bool(T)> predicate) const;
 
     class const_iterator;
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 4fea686..9673950 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -687,6 +687,13 @@ std::ostream& var_opt_sketch<T,S,A>::to_stream(std::ostream& os) const {
   os << "   weight_r     : " << total_wt_r_ << std::endl;
   os << "   Current size : " << curr_items_alloc_ << std::endl;
   os << "   Resize factor: " << (1 << rf_) << std::endl;
+  os << "### END SKETCH SUMMARY" << std::endl;
+
+  return os;
+}
+
+template<typename T, typename S, typename A>
+std::ostream& var_opt_sketch<T,S,A>::items_to_stream(std::ostream& os) const {
   os << "### Sketch Items" << std::endl;
 
   uint32_t print_length = (n_ < k_ ? n_ : k_ + 1);
@@ -697,8 +704,6 @@ std::ostream& var_opt_sketch<T,S,A>::to_stream(std::ostream& os) const {
       os << i << ": " << data_[i] << "\twt = " << weights_[i] << std::endl;
     }
   }
-  
-  os << "### END SKETCH SUMMARY" << std::endl;
 
   return os;
 }
@@ -710,6 +715,13 @@ std::string var_opt_sketch<T,S,A>::to_string() const {
   return ss.str();
 }
 
+template <typename T, typename S, typename A>
+std::string var_opt_sketch<T,S,A>::items_to_string() const {
+  std::ostringstream ss;  // in-memory buffer that receives the items_to_stream() output
+  items_to_stream(ss);
+  return ss.str();
+}
+
 template<typename T, typename S, typename A>
 void var_opt_sketch<T,S,A>::update(const T& item, double weight, bool mark) {
   if (weight <= 0.0) { 
diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp
index dd87ca8..adb21d4 100644
--- a/sampling/include/var_opt_union_impl.hpp
+++ b/sampling/include/var_opt_union_impl.hpp
@@ -313,7 +313,6 @@ std::string var_opt_union<T,S,A>::to_string() const {
   return ss.str();
 }
 
-
 template<typename T, typename S, typename A>
 void var_opt_union<T,S,A>::update(var_opt_sketch<T,S,A>& sk) {
   merge_into(sk);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org