You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/17 07:17:30 UTC
[incubator-datasketches-cpp] branch sampling updated: add python
unit tests/example,
ensure properly using placement constructor when copying data items
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
The following commit(s) were added to refs/heads/sampling by this push:
new e72ac26 add python unit tests/example, ensure properly using placement constructor when copying data items
e72ac26 is described below
commit e72ac26e15e5fea9478ff90be66ca53bd5538fcb
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Sun Feb 16 23:17:13 2020 -0800
add python unit tests/example, ensure properly using placement constructor when copying data items
---
python/src/vo_wrapper.cpp | 21 +++++--
python/tests/vo_test.py | 99 ++++++++++++++++++++++++++++++++
sampling/include/var_opt_sketch_impl.hpp | 16 +++---
sampling/include/var_opt_union_impl.hpp | 6 +-
4 files changed, 127 insertions(+), 15 deletions(-)
diff --git a/python/src/vo_wrapper.cpp b/python/src/vo_wrapper.cpp
index a2346e5..7047eca 100644
--- a/python/src/vo_wrapper.cpp
+++ b/python/src/vo_wrapper.cpp
@@ -21,6 +21,7 @@
#include "var_opt_union.hpp"
#include <pybind11/pybind11.h>
+#include <pybind11/functional.h>
#include <sstream>
namespace py = pybind11;
@@ -39,6 +40,17 @@ py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
}
template<typename T>
+py::dict vo_sketch_estimate_subset_sum(const var_opt_sketch<T>& sk, const std::function<bool(T)> func) {
+ subset_summary summary = sk.estimate_subset_sum(func);
+ py::dict d;
+ d["estimate"] = summary.estimate;
+ d["lower_bound"] = summary.lower_bound;
+ d["upper_bound"] = summary.upper_bound;
+ d["total_sketch_weight"] = summary.total_sketch_weight;
+ return d;
+}
+
+template<typename T>
std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
if (print_items) {
std::ostringstream ss;
@@ -78,14 +90,15 @@ void bind_vo_sketch(py::module &m, const char* name) {
py::class_<var_opt_sketch<T>>(m, name)
.def(py::init<uint32_t>(), py::arg("k"))
- .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items"))
- .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items"))
+ .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false)
+ .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false)
.def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0)
.def_property_readonly("k", &var_opt_sketch<T>::get_k)
.def_property_readonly("n", &var_opt_sketch<T>::get_n)
.def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples)
.def("get_samples", &dspy::vo_sketch_get_samples<T>)
.def("is_empty", &var_opt_sketch<T>::is_empty)
+ .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>)
// As of writing, not yet clear how to serialize arbitrary python objects,
// especially in any sort of language-portable way
//.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
@@ -115,6 +128,6 @@ void bind_vo_union(py::module &m, const char* name) {
void init_vo(py::module &m) {
- bind_vo_sketch<py::object>(m, "varopt_object_sketch");
- bind_vo_union<py::object>(m, "varopt_object_uunion");
+ bind_vo_sketch<py::object>(m, "var_opt_sketch");
+ bind_vo_union<py::object>(m, "var_opt_union");
}
diff --git a/python/tests/vo_test.py b/python/tests/vo_test.py
new file mode 100644
index 0000000..a42ee20
--- /dev/null
+++ b/python/tests/vo_test.py
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+from datasketches import var_opt_sketch, var_opt_union
+
+class VoTest(unittest.TestCase):
+ def test_vo_example(self):
+ k = 50 # a small value so we can easily fill the sketch
+ vo = var_opt_sketch(k)
+
+ # varopt sampling reduces to standard reservoir sampling
+ # if the items are all equally weighted, although the
+ # algorithm will be significantly slower than an optimized
+ # reservoir sampler
+ n = 5 * k
+ for i in range(0, n):
+ vo.update(i)
+
+ # we can also add a heavy item, using a negative weight for
+ # easy filtering later. keep in mind that "heavy" is a
+ # relative concept, so using a fixed multiple of n may not
+ # be considered a heavy item for larger values of n
+ vo.update(-1, 1000 * n)
+ self.assertEqual(k, vo.k)
+ self.assertEqual(k, vo.num_samples)
+ self.assertEqual(n + 1, vo.n)
+ self.assertFalse(vo.is_empty())
+
+ # we can easily get the list of items in the sample
+ items = vo.get_samples()
+ self.assertEqual(len(items), k)
+
+ # we can also apply a predicate to the sketch to get an estimate
+ # (with optimially minimal variance) of the subset sum of items
+ # matching that predicate among the entire population
+
+ # we'll use a lambda here, but any function operating on a single
+ # item which returns a boolean value should work
+ summary = vo.estimate_subset_sum(lambda x: x < 0)
+ self.assertEqual(summary['estimate'], 1000 * n)
+ self.assertEqual(summary['total_sketch_weight'], 1001 * n)
+
+ # a regular function is similarly handled
+ def geq_zero(x):
+ return x >= 0
+ summary = vo.estimate_subset_sum(geq_zero)
+ self.assertEqual(summary['estimate'], n)
+ self.assertEqual(summary['total_sketch_weight'], 1001 * n)
+
+ # next we'll create a second, smaller sketch with
+ # only heavier items relative to the previous sketch,
+ # but with the sketch in sampling mode
+ k2 = 5
+ vo2 = var_opt_sketch(k2)
+ # for weight, use the estimate of all items >=0 from before
+ wt = summary['estimate']
+ for i in range(0, k2 + 1):
+ vo2.update((2 * n) + i, wt)
+
+ # now union the sketches, demonstrating how the
+ # union's k may not be equal to that of either
+ # input value
+ union = var_opt_union(k)
+ union.update(vo)
+ union.update(vo2)
+
+ result = union.get_result()
+ self.assertEqual(n + k2 + 2, result.n)
+ self.assertFalse(result.is_empty())
+ self.assertGreater(result.k, k2)
+ self.assertLess(result.k, k)
+
+ # we can compare what information is available from both
+ # the union and a sketch.
+ print(union)
+
+ # if we want to print the list of itmes, there must be a
+ # __str__() method for each item (which need not be the same
+ # type; they're all generic python objects when used from
+ # pythoh), otherwise you may trigger an exception
+ print(result.to_string(True))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 9673950..8f8c8a8 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -62,13 +62,13 @@ var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other) :
if (other.filled_data_) {
// copy everything
for (size_t i = 0; i < curr_items_alloc_; ++i)
- data_[i] = other.data_[i];
+ A().construct(&data_[i], T(other.data_[i]));
} else {
// skip gap or anything unused at the end
for (size_t i = 0; i < h_; ++i)
- data_[i] = other.data_[i];
+ A().construct(&data_[i], T(other.data_[i]));
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
- data_[i] = other.data_[i];
+ A().construct(&data_[i], T(other.data_[i]));
}
weights_ = AllocDouble().allocate(curr_items_alloc_);
@@ -101,19 +101,19 @@ var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other, bool as_sketc
if (other.filled_data_) {
// copy everything
for (size_t i = 0; i < curr_items_alloc_; ++i)
- data_[i] = other.data_[i];
+ A().construct(&data_[i], T(other.data_[i]));
} else {
// skip gap or anything unused at the end
for (size_t i = 0; i < h_; ++i)
- data_[i] = other.data_[i];
+ A().construct(&data_[i], T(other.data_[i]));
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
- data_[i] = other.data_[i];
+ A().construct(&data_[i], T(other.data_[i]));
}
-
+
weights_ = AllocDouble().allocate(curr_items_alloc_);
// doubles so can successfully copy regardless of the internal state
std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
-
+
if (!as_sketch && other.marks_ != nullptr) {
marks_ = AllocBool().allocate(curr_items_alloc_);
std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp
index adb21d4..a882dd9 100644
--- a/sampling/include/var_opt_union_impl.hpp
+++ b/sampling/include/var_opt_union_impl.hpp
@@ -483,7 +483,7 @@ void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk)
// Addedndum (Jan 2020): Cleanup at end of method assumes R count is 0
size_t final_idx = gadget_.get_num_samples();
for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
- data[next_r_pos] = gadget_.data_[idx];
+ A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
wts[next_r_pos] = gadget_.weights_[idx];
++result_r;
--next_r_pos;
@@ -494,13 +494,13 @@ void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk)
// insert H region items
for (size_t idx = 0; idx < gadget_.h_; ++idx) {
if (gadget_.marks_[idx]) {
- data[next_r_pos] = gadget_.data_[idx];
+ A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
wts[next_r_pos] = -1.0;
transferred_weight += gadget_.weights_[idx];
++result_r;
--next_r_pos;
} else {
- data[result_h] = gadget_.data_[idx];
+ A().construct(&data[result_h], T(gadget_.data_[idx]));
wts[result_h] = gadget_.weights_[idx];
++result_h;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org