You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2019/06/25 19:23:02 UTC

[incubator-datasketches-cpp] branch pybind11 updated: add kll support to python with pybind11

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch pybind11
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/pybind11 by this push:
     new adca8ba  add kll support to python with pybind11
adca8ba is described below

commit adca8babab1d5e30f4325fd086596f31fc9e1d58
Author: jmalkin <jm...@users.noreply.github.com>
AuthorDate: Tue Jun 25 12:15:40 2019 -0700

    add kll support to python with pybind11
---
 python/CMakeLists.txt       |  19 ++----
 python/src/datasketches.cpp |   4 +-
 python/src/kll_wrapper.cpp  | 162 +++++++++++++++++---------------------------
 3 files changed, 71 insertions(+), 114 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 51969ce..b37171c 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,5 +1,4 @@
-#find_package(Python3 REQUIRED COMPONENTS Development)
-
+# TODO: Can we force python version >= 3.0?
 if (MSVC)
   set(PYBIND11_CPP_STANDARD /std:c++11)
 else()
@@ -16,25 +15,16 @@ target_link_libraries(python
   PRIVATE
     common
     hll
-    #kll
+    kll
     cpc
     fi
-    #${Python3_LIBRARIES}
+    #theta
     pybind11::module
 )
 
-#target_include_directories(python
-#  PRIVATE
-#    ${Python3_INCLUDE_DIRS}
-#)
-
 set_target_properties(python PROPERTIES
   PREFIX ""
   OUTPUT_NAME datasketches
-#  POSITION_INDEPENDENT_CODE ON
-#  LINKER_LANGUAGE CXX
-#  CXX_STANDARD 11
-#  CXX_STANDARD_REQUIRED YES
 )
 
 # ensure we make a .so on Mac rather than .dylib
@@ -46,7 +36,8 @@ target_sources(python
   PRIVATE
     src/datasketches.cpp
     src/hll_wrapper.cpp
-    #src/kll_wrapper.cpp
+    src/kll_wrapper.cpp
     src/cpc_wrapper.cpp
     src/fi_wrapper.cpp
+    #src/theta_wrapper.cpp
 )
diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp
index 1e46402..5e97f57 100644
--- a/python/src/datasketches.cpp
+++ b/python/src/datasketches.cpp
@@ -22,14 +22,14 @@
 namespace py = pybind11;
 
 void init_hll(py::module& m);
-//void init_kll(py::module& m);
+void init_kll(py::module& m);
 void init_fi(py::module& m);
 void init_cpc(py::module& m);
 //void init_theta(py::module& m);
 
 PYBIND11_MODULE(datasketches, m) {
   init_hll(m);
-  //init_kll(m);
+  init_kll(m);
   init_fi(m);
   init_cpc(m);
   //init_theta(m);
diff --git a/python/src/kll_wrapper.cpp b/python/src/kll_wrapper.cpp
index 9a60185..7219f06 100644
--- a/python/src/kll_wrapper.cpp
+++ b/python/src/kll_wrapper.cpp
@@ -18,115 +18,85 @@
  */
 
 #include "kll_sketch.hpp"
-#include <boost/python.hpp>
 
-namespace bpy = boost::python;
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sstream>
+
+namespace py = pybind11;
 
 namespace datasketches {
 namespace python {
 
 template<typename T>
-kll_sketch<T>* KllSketch_deserialize(bpy::object obj) {
-  PyObject* skBytes = obj.ptr();
-  if (!PyBytes_Check(skBytes)) {
-    PyErr_SetString(PyExc_TypeError, "Attmpted to deserialize non-bytes object");
-    bpy::throw_error_already_set();
-    return nullptr;
-  }
-  
-  size_t len = PyBytes_GET_SIZE(skBytes);
-  char* sketchImg = PyBytes_AS_STRING(skBytes);
-  auto sk = kll_sketch<T>::deserialize(sketchImg, len);
-  return sk.release();
+kll_sketch<T> KllSketch_deserialize(py::bytes skBytes) {
+  std::string skStr = skBytes; // implicit cast  
+  return kll_sketch<T>::deserialize(skStr.c_str(), skStr.length());
 }
 
 template<typename T>
-bpy::object KllSketch_serialize(const kll_sketch<T>& sk) {
+py::object KllSketch_serialize(const kll_sketch<T>& sk) {
   auto serResult = sk.serialize();
-  PyObject* sketchBytes = PyBytes_FromStringAndSize((char*)serResult.first.get(), serResult.second);
-  return bpy::object{bpy::handle<>(sketchBytes)};
-}
-
-template<typename T>
-double KllSketch_sketchNormalizedRankError(const kll_sketch<T>& sk,
-                                           bool pmf) {
-  return sk.get_normalized_rank_error(pmf);
+  return py::bytes((char*)serResult.first.get(), serResult.second);
 }
 
+// maybe possible to disambiguate the static vs method rank error calls, but
+// this is easier for now
 template<typename T>
 double KllSketch_generalNormalizedRankError(uint16_t k, bool pmf) {
   return kll_sketch<T>::get_normalized_rank_error(k, pmf);
 }
 
 template<typename T>
-bpy::list KllSketch_getQuantiles(const kll_sketch<T>& sk,
-                                 bpy::list& fractions) {
-  size_t nQuantiles = len(fractions);
-  double* frac = new double[nQuantiles];
-  for (int i = 0; i < nQuantiles; ++i) {
-    frac[i] = bpy::extract<double>(fractions[i]);
-  }
-  std::unique_ptr<T[]> result = sk.get_quantiles(frac, nQuantiles);
+py::list KllSketch_getQuantiles(const kll_sketch<T>& sk,
+                                std::vector<double>& fractions) {
+  size_t nQuantiles = fractions.size();
+  std::unique_ptr<T[]> result = sk.get_quantiles(&fractions[0], nQuantiles);
 
-  PyObject* list = PyList_New(nQuantiles);
+  // returning as std::vector<> would copy values to a list anyway
+  py::list list(nQuantiles);
   for (int i = 0; i < nQuantiles; ++i) {
-    if (std::is_same<T, int>::value)        
-      PyList_SET_ITEM(list, i, PyLong_FromLong(result[i]));
-    else if (std::is_same<T, float>::value)
-      PyList_SET_ITEM(list, i, PyFloat_FromDouble(result[i]));
+      list[i] = result[i];
   }
 
-  delete [] frac;
-  return bpy::list{bpy::handle<>(list)};
+  return list;
 }
 
 template<typename T>
-bpy::list KllSketch_getPMF(const kll_sketch<T>& sk,
-                           bpy::list& split_points) {
-  size_t nPoints = len(split_points);
-  T* splitPoints = new T[nPoints];
-  for (int i = 0; i < nPoints; ++i) {
-    splitPoints[i] = bpy::extract<T>(split_points[i]);
-  }
-  std::unique_ptr<double[]> result = sk.get_PMF(splitPoints, nPoints);
+py::list KllSketch_getPMF(const kll_sketch<T>& sk,
+                          std::vector<T>& split_points) {
+  size_t nPoints = split_points.size();
+  std::unique_ptr<double[]> result = sk.get_PMF(&split_points[0], nPoints);
 
-  PyObject* pmf = PyList_New(nPoints);
+  py::list list(nPoints);
   for (int i = 0; i < nPoints; ++i) {
-    PyList_SET_ITEM(pmf, i, PyFloat_FromDouble(result[i]));
+    list[i] = result[i];
   }
 
-  delete [] splitPoints;
-  return bpy::list{bpy::handle<>(pmf)};
+  return list;
 }
 
 template<typename T>
-bpy::list KllSketch_getCDF(const kll_sketch<T>& sk,
-                           bpy::list& split_points) {
-  size_t nPoints = len(split_points);
-  T* splitPoints = new T[nPoints];
-  for (int i = 0; i < nPoints; ++i) {
-    splitPoints[i] = bpy::extract<T>(split_points[i]);
-  }
-  std::unique_ptr<double[]> result = sk.get_CDF(splitPoints, nPoints);
+py::list KllSketch_getCDF(const kll_sketch<T>& sk,
+                          std::vector<T>& split_points) {
+  size_t nPoints = split_points.size();
+  std::unique_ptr<double[]> result = sk.get_CDF(&split_points[0], nPoints);
 
-  PyObject* cdf = PyList_New(nPoints);
+  py::list list(nPoints);
   for (int i = 0; i < nPoints; ++i) {
-    PyList_SET_ITEM(cdf, i, PyFloat_FromDouble(result[i]));
+    list[i] = result[i];
   }
 
-  delete [] splitPoints;
-  return bpy::list{bpy::handle<>(cdf)};
-}
-
-template<typename T>
-uint32_t KllSketch_getSerializedSizeBytes(const kll_sketch<T>& sk) {
-  return sk.get_serialized_size_bytes();
+  return list;
 }
 
 template<typename T>
+//std::string KllSketch_toString(const kll_sketch<T>& sk, bool print_levels, bool print_items) {
 std::string KllSketch_toString(const kll_sketch<T>& sk) {
   std::ostringstream ss;
-  ss << sk;
+  // kll_sketch::to_stream does not currently pay attention to the flags
+  //sk.to_stream(ss, print_levels, print_items);
+  sk.to_stream(ss);
   return ss.str();
 }
 
@@ -136,42 +106,38 @@ std::string KllSketch_toString(const kll_sketch<T>& sk) {
 namespace dspy = datasketches::python;
 
 template<typename T>
-void bind_kll_sketch(const char* name)
-{
+void bind_kll_sketch(py::module &m, const char* name) {
   using namespace datasketches;
 
-  bpy::class_<kll_sketch<T>, boost::noncopyable>(name, bpy::init<uint16_t>())
-    .def(bpy::init<const kll_sketch<T>&>())
+  py::class_<kll_sketch<T>>(m, name)
+    .def(py::init<uint16_t>())
+    .def(py::init<const kll_sketch<T>&>())
     .def("update", &kll_sketch<T>::update)
     .def("merge", &kll_sketch<T>::merge)
     .def("__str__", &dspy::KllSketch_toString<T>)
-    .def("isEmpty", &kll_sketch<T>::is_empty)
-    .def("getN", &kll_sketch<T>::get_n)
-    .def("getNumRetained", &kll_sketch<T>::get_num_retained)
-    .def("isEstimationMode", &kll_sketch<T>::is_estimation_mode)
-    .def("getMinValue", &kll_sketch<T>::get_min_value)
-    .def("getMaxValue", &kll_sketch<T>::get_max_value)
-    .def("getQuantile", &kll_sketch<T>::get_quantile)
-    .def("getQuantiles", &dspy::KllSketch_getQuantiles<T>)
-    .def("getRank", &kll_sketch<T>::get_rank)
-    .def("getPMF", &dspy::KllSketch_getPMF<T>)
-    .def("getCDF", &dspy::KllSketch_getCDF<T>)
-    .def("normalizedRankError", &dspy::KllSketch_sketchNormalizedRankError<T>)
-    .def("getNormalizedRankError", &dspy::KllSketch_generalNormalizedRankError<T>)
-    .staticmethod("getNormalizedRankError")
-    .def("getSerializedSizeBytes", &dspy::KllSketch_getSerializedSizeBytes<T>)
-    .def("getSizeofItem", &kll_sketch<T>::get_sizeof_item)
-    .staticmethod("getSizeofItem")
-    .def("getMaxSerializedSizeBytes", &kll_sketch<T>::get_max_serialized_size_bytes)
-    .staticmethod("getMaxSerializedSizeBytes")
+    .def("is_empty", &kll_sketch<T>::is_empty)
+    .def("get_n", &kll_sketch<T>::get_n)
+    .def("get_num_retained", &kll_sketch<T>::get_num_retained)
+    .def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode)
+    .def("get_min_value", &kll_sketch<T>::get_min_value)
+    .def("get_max_value", &kll_sketch<T>::get_max_value)
+    .def("get_quantile", &kll_sketch<T>::get_quantile)
+    .def("get_quantiles", &dspy::KllSketch_getQuantiles<T>)
+    .def("get_rank", &kll_sketch<T>::get_rank)
+    .def("get_pmf", &dspy::KllSketch_getPMF<T>)
+    .def("get_cdf", &dspy::KllSketch_getCDF<T>)
+    .def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error)
+    .def_static("get_normalized_rank_error", &dspy::KllSketch_generalNormalizedRankError<T>)
+    // can't yet get this one to work
+    //.def("get_serialized_size_bytes", &kll_sketch<T>::get_serialized_size_bytes)
+    // this doesn't seem to be defined in the class
+    //.def_static("get_max_serialized_size_bytes", &kll_sketch<T>::get_max_serialized_size_bytes)
     .def("serialize", &dspy::KllSketch_serialize<T>)
-    .def("deserialize", &dspy::KllSketch_deserialize<T>, bpy::return_value_policy<bpy::manage_new_object>())
-    .staticmethod("deserialize")   
+    .def_static("deserialize", &dspy::KllSketch_deserialize<T>)
     ;
 }
 
-void export_kll()
-{
-  bind_kll_sketch<int>("KllIntSketch");
-  bind_kll_sketch<float>("KllFloatSketch");
-}
\ No newline at end of file
+void init_kll(py::module &m) {
+  bind_kll_sketch<int>(m, "kll_int_sketch");
+  bind_kll_sketch<float>(m, "kll_float_sketch");
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org