You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/17 07:17:30 UTC

[incubator-datasketches-cpp] branch sampling updated: add python unit tests/example, ensure properly using placement constructor when copying data items

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/sampling by this push:
     new e72ac26  add python unit tests/example, ensure properly using placement constructor when copying data items
e72ac26 is described below

commit e72ac26e15e5fea9478ff90be66ca53bd5538fcb
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Sun Feb 16 23:17:13 2020 -0800

    add python unit tests/example, ensure properly using placement constructor when copying data items
---
 python/src/vo_wrapper.cpp                | 21 +++++--
 python/tests/vo_test.py                  | 99 ++++++++++++++++++++++++++++++++
 sampling/include/var_opt_sketch_impl.hpp | 16 +++---
 sampling/include/var_opt_union_impl.hpp  |  6 +-
 4 files changed, 127 insertions(+), 15 deletions(-)

diff --git a/python/src/vo_wrapper.cpp b/python/src/vo_wrapper.cpp
index a2346e5..7047eca 100644
--- a/python/src/vo_wrapper.cpp
+++ b/python/src/vo_wrapper.cpp
@@ -21,6 +21,7 @@
 #include "var_opt_union.hpp"
 
 #include <pybind11/pybind11.h>
+#include <pybind11/functional.h>
 #include <sstream>
 
 namespace py = pybind11;
@@ -39,6 +40,17 @@ py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
 }
 
 template<typename T>
+py::dict vo_sketch_estimate_subset_sum(const var_opt_sketch<T>& sk, const std::function<bool(T)> func) {
+  subset_summary summary = sk.estimate_subset_sum(func);
+  py::dict d;
+  d["estimate"] = summary.estimate;
+  d["lower_bound"] = summary.lower_bound;
+  d["upper_bound"] = summary.upper_bound;
+  d["total_sketch_weight"] = summary.total_sketch_weight;
+  return d;
+}
+
+template<typename T>
 std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
   if (print_items) {
     std::ostringstream ss;
@@ -78,14 +90,15 @@ void bind_vo_sketch(py::module &m, const char* name) {
 
   py::class_<var_opt_sketch<T>>(m, name)
     .def(py::init<uint32_t>(), py::arg("k"))
-    .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items"))
-    .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items"))
+    .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false)
+    .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false)
     .def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0)
     .def_property_readonly("k", &var_opt_sketch<T>::get_k)
     .def_property_readonly("n", &var_opt_sketch<T>::get_n)
     .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples)
     .def("get_samples", &dspy::vo_sketch_get_samples<T>)
     .def("is_empty", &var_opt_sketch<T>::is_empty)
+    .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>)
     // As of writing, not yet clear how to serialize arbitrary python objects,
     // especially in any sort of language-portable way
     //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
@@ -115,6 +128,6 @@ void bind_vo_union(py::module &m, const char* name) {
 
 
 void init_vo(py::module &m) {
-  bind_vo_sketch<py::object>(m, "varopt_object_sketch");
-  bind_vo_union<py::object>(m, "varopt_object_uunion");
+  bind_vo_sketch<py::object>(m, "var_opt_sketch");
+  bind_vo_union<py::object>(m, "var_opt_union");
 }
diff --git a/python/tests/vo_test.py b/python/tests/vo_test.py
new file mode 100644
index 0000000..a42ee20
--- /dev/null
+++ b/python/tests/vo_test.py
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ 
+import unittest
+from datasketches import var_opt_sketch, var_opt_union
+
+class VoTest(unittest.TestCase):
+  def test_vo_example(self):
+    k = 50  # a small value so we can easily fill the sketch
+    vo = var_opt_sketch(k)
+
+    # varopt sampling reduces to standard reservoir sampling
+    # if the items are all equally weighted, although the
+    # algorithm will be significantly slower than an optimized
+    # reservoir sampler
+    n = 5 * k
+    for i in range(0, n):
+      vo.update(i)
+
+    # we can also add a heavy item, using a negative weight for
+    # easy filtering later.  keep in mind that "heavy" is a
+    # relative concept, so using a fixed multiple of n may not
+    # be considered a heavy item for larger values of n
+    vo.update(-1, 1000 * n)
+    self.assertEqual(k, vo.k)
+    self.assertEqual(k, vo.num_samples)
+    self.assertEqual(n + 1, vo.n)
+    self.assertFalse(vo.is_empty())
+
+    # we can easily get the list of items in the sample
+    items = vo.get_samples()
+    self.assertEqual(len(items), k)
+
+    # we can also apply a predicate to the sketch to get an estimate
+    # (with optimially minimal variance) of the subset sum of items
+    # matching that predicate among the entire population
+
+    # we'll use a lambda here, but any function operating on a single
+    # item which returns a boolean value should work
+    summary = vo.estimate_subset_sum(lambda x: x < 0)
+    self.assertEqual(summary['estimate'], 1000 * n)
+    self.assertEqual(summary['total_sketch_weight'], 1001 * n)
+
+    # a regular function is similarly handled
+    def geq_zero(x):
+      return x >= 0
+    summary = vo.estimate_subset_sum(geq_zero)
+    self.assertEqual(summary['estimate'], n)
+    self.assertEqual(summary['total_sketch_weight'], 1001 * n)
+
+    # next we'll create a second, smaller sketch with
+    # only heavier items relative to the previous sketch,
+    # but with the sketch in sampling mode
+    k2 = 5
+    vo2 = var_opt_sketch(k2)
+    # for weight, use the estimate of all items >=0 from before
+    wt = summary['estimate']
+    for i in range(0, k2 + 1):
+      vo2.update((2 * n) + i, wt)
+
+    # now union the sketches, demonstrating how the
+    # union's k may not be equal to that of either
+    # input value
+    union = var_opt_union(k)
+    union.update(vo)
+    union.update(vo2)
+
+    result = union.get_result()
+    self.assertEqual(n + k2 + 2, result.n)
+    self.assertFalse(result.is_empty())
+    self.assertGreater(result.k, k2)
+    self.assertLess(result.k, k)
+
+    # we can compare what information is available from both
+    # the union and a sketch.
+    print(union)
+
+    # if we want to print the list of itmes, there must be a
+    # __str__() method for each item (which need not be the same
+    # type; they're all generic python objects when used from
+    # pythoh), otherwise you may trigger an exception
+    print(result.to_string(True))
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 9673950..8f8c8a8 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -62,13 +62,13 @@ var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other) :
     if (other.filled_data_) {
       // copy everything
       for (size_t i = 0; i < curr_items_alloc_; ++i)
-        data_[i] = other.data_[i];
+        A().construct(&data_[i], T(other.data_[i]));
     } else {
       // skip gap or anything unused at the end
       for (size_t i = 0; i < h_; ++i)
-        data_[i] = other.data_[i];
+        A().construct(&data_[i], T(other.data_[i]));
       for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
-        data_[i] = other.data_[i];
+        A().construct(&data_[i], T(other.data_[i]));
     }
     
     weights_ = AllocDouble().allocate(curr_items_alloc_);
@@ -101,19 +101,19 @@ var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other, bool as_sketc
     if (other.filled_data_) {
       // copy everything
       for (size_t i = 0; i < curr_items_alloc_; ++i)
-        data_[i] = other.data_[i];
+        A().construct(&data_[i], T(other.data_[i]));
     } else {
       // skip gap or anything unused at the end
       for (size_t i = 0; i < h_; ++i)
-        data_[i] = other.data_[i];
+        A().construct(&data_[i], T(other.data_[i]));
       for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
-        data_[i] = other.data_[i];
+        A().construct(&data_[i], T(other.data_[i]));
     }
-    
+
     weights_ = AllocDouble().allocate(curr_items_alloc_);
     // doubles so can successfully copy regardless of the internal state
     std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
-    
+
     if (!as_sketch && other.marks_ != nullptr) {
       marks_ = AllocBool().allocate(curr_items_alloc_);
       std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp
index adb21d4..a882dd9 100644
--- a/sampling/include/var_opt_union_impl.hpp
+++ b/sampling/include/var_opt_union_impl.hpp
@@ -483,7 +483,7 @@ void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk)
   // Addedndum (Jan 2020): Cleanup at end of method assumes R count is 0
   size_t final_idx = gadget_.get_num_samples();
   for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
-    data[next_r_pos] = gadget_.data_[idx];
+    A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
     wts[next_r_pos]  = gadget_.weights_[idx];
     ++result_r;
     --next_r_pos;
@@ -494,13 +494,13 @@ void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk)
   // insert H region items
   for (size_t idx = 0; idx < gadget_.h_; ++idx) {
     if (gadget_.marks_[idx]) {
-      data[next_r_pos] = gadget_.data_[idx];
+      A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
       wts[next_r_pos] = -1.0;
       transferred_weight += gadget_.weights_[idx];
       ++result_r;
       --next_r_pos;
     } else {
-      data[result_h] = gadget_.data_[idx];
+      A().construct(&data[result_h], T(gadget_.data_[idx]));
       wts[result_h] = gadget_.weights_[idx];
       ++result_h;
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org