You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/06/08 18:57:50 UTC
[incubator-datasketches-cpp] 02/02: rename vector_of_kll_{floats,
ints}_sketches, add merge and collapse methods
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch python_vector_kll
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit 2b7325f253ff00f71419b44742161863348f2f9e
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Mon Jun 8 11:57:24 2020 -0700
rename vector_of_kll_{floats,ints}_sketches, add merge and collapse methods
---
python/src/kll_wrapper.cpp | 39 +++++++++++++++++++++++++++++++--
python/tests/kll_test.py | 54 +++++++++++++++++++---------------------------
2 files changed, 59 insertions(+), 34 deletions(-)
diff --git a/python/src/kll_wrapper.cpp b/python/src/kll_wrapper.cpp
index e691e79..d5986b6 100644
--- a/python/src/kll_wrapper.cpp
+++ b/python/src/kll_wrapper.cpp
@@ -37,6 +37,10 @@ class vector_of_kll_sketches {
static const uint32_t DEFAULT_D = 1;
explicit vector_of_kll_sketches(uint32_t k = DEFAULT_K, uint32_t d = DEFAULT_D);
+ vector_of_kll_sketches(const vector_of_kll_sketches& other);
+ vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
+ vector_of_kll_sketches<T,C,S>& operator=(const vector_of_kll_sketches& other);
+ vector_of_kll_sketches<T,C,S>& operator=(vector_of_kll_sketches&& other);
// container parameters
inline uint32_t get_k() const;
@@ -96,6 +100,37 @@ d_(d)
}
template<typename T, typename C, typename S>
+vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
+ k_(other.k_),
+ d_(other.d_),
+ sketches_(other.sketches_)
+{}
+
+template<typename T, typename C, typename S>
+vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
+ k_(other.k_),
+ d_(other.d_),
+ sketches_(std::move(other.sketches_))
+{}
+
+template<typename T, typename C, typename S>
+vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(const vector_of_kll_sketches& other) {
+ vector_of_kll_sketches<T,C,S> copy(other);
+ std::swap(k_, copy.k_);
+ std::swap(d_, copy.d_);
+ std::swap(sketches_, copy.sketches_);
+ return *this;
+}
+
+template<typename T, typename C, typename S>
+vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(vector_of_kll_sketches&& other) {
+ std::swap(k_, other.k_);
+ std::swap(d_, other.d_);
+ std::swap(sketches_, other.sketches_);
+ return *this;
+}
+
+template<typename T, typename C, typename S>
uint32_t vector_of_kll_sketches<T,C,S>::get_k() const {
return k_;
}
@@ -540,6 +575,6 @@ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
void init_kll(py::module &m) {
bind_kll_sketch<int>(m, "kll_ints_sketch");
bind_kll_sketch<float>(m, "kll_floats_sketch");
- bind_vector_of_kll_sketches<int>(m, "kll_intarray_sketches");
- bind_vector_of_kll_sketches<float>(m, "kll_floatarray_sketches");
+ bind_vector_of_kll_sketches<int>(m, "vector_of_kll_ints_sketches");
+ bind_vector_of_kll_sketches<float>(m, "vector_of_kll_floats_sketches");
}
diff --git a/python/tests/kll_test.py b/python/tests/kll_test.py
index b26fd59..9ce0523 100644
--- a/python/tests/kll_test.py
+++ b/python/tests/kll_test.py
@@ -17,8 +17,8 @@
import unittest
from datasketches import (kll_ints_sketch, kll_floats_sketch,
- kll_intarray_sketches,
- kll_floatarray_sketches)
+ vector_of_kll_ints_sketches,
+ vector_of_kll_floats_sketches)
import numpy as np
@@ -112,7 +112,7 @@ class KllTest(unittest.TestCase):
self.assertTrue(isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch))
def test_kll_floats_sketch(self):
- # alraedy tested ints and it's templatized, so just make sure it instantiates properly
+ # already tested ints and it's templatized, so just make sure it instantiates properly
k = 75
kll = kll_floats_sketch(k)
self.assertTrue(kll.is_empty())
@@ -125,7 +125,7 @@ class KllSketchesTest(unittest.TestCase):
n = 2 ** 20
# create a sketch and inject ~1 million N(0,1) points
- kll = kll_floatarray_sketches(k, d)
+ kll = vector_of_kll_floats_sketches(k, d)
# Track the min/max for each sketch to test later
smin = np.zeros(d) + np.inf
smax = np.zeros(d) - np.inf
@@ -159,49 +159,39 @@ class KllSketchesTest(unittest.TestCase):
self.assertEqual(cdf.shape[0], pts.shape[0])
self.assertEqual(cdf.shape[1], pts.shape[1]+1)
- #err = kll.normalized_rank_error(False) # method not implemented
- #self.assertEqual(err, kll.get_normalized_rank_error(k, False))
-
# and a few basic queries about the sketch
self.assertFalse(np.all(kll.is_empty()))
self.assertTrue(np.all(kll.is_estimation_mode()))
self.assertTrue(np.all(kll.get_n() == n))
self.assertTrue(np.all(kll.get_num_retained() < n))
- # Merging not yet implemented
- # merging itself will double the number of items the sketch has seen
- #kll.merge(kll)
- #self.assertEqual(kll.get_n(), 2*n)
+ # we can combine sketches across all dimensions and get the result
+ result = kll.collapse()
+ self.assertEqual(result.get_n(), d * n)
+
+ # merging a copy of itself will double the number of items the sketch has seen
+ kll_copy = vector_of_kll_floats_sketches(kll)
+ kll.merge(kll_copy)
+ np.testing.assert_equal(kll.get_n(), 2*n)
# we can then serialize and reconstruct the sketch
kll_bytes = kll.serialize() # serializes each sketch as a list
- # store values we are interested in
- oldkll_num_retained = kll.get_num_retained()
- oldkll_min_values = kll.get_min_values()
- oldkll_max_values = kll.get_max_values()
- oldkll_quantiles = kll.get_quantiles(0.7)
- oldkll_ranks = kll.get_ranks(0.0)
- # deserialize the sketches
+ new_kll = vector_of_kll_floats_sketches(k, d)
for s in range(len(kll_bytes)):
- kll.deserialize(kll_bytes[s], s)
- np.testing.assert_allclose(oldkll_num_retained, kll.get_num_retained())
- np.testing.assert_allclose(oldkll_min_values, kll.get_min_values())
- np.testing.assert_allclose(oldkll_max_values, kll.get_max_values())
- np.testing.assert_allclose(oldkll_quantiles, kll.get_quantiles(0.7))
- np.testing.assert_allclose(oldkll_ranks, kll.get_ranks(0.0))
+ new_kll.deserialize(kll_bytes[s], s)
+
+ # everything should be exactly equal
+ np.testing.assert_equal(kll.get_num_retained(), new_kll.get_num_retained())
+ np.testing.assert_equal(kll.get_min_values(), new_kll.get_min_values())
+ np.testing.assert_equal(kll.get_max_values(), new_kll.get_max_values())
+ np.testing.assert_equal(kll.get_quantiles(0.7), new_kll.get_quantiles(0.7))
+ np.testing.assert_equal(kll.get_ranks(0.0), new_kll.get_ranks(0.0))
def test_kll_ints_sketches(self):
# already tested floats and it's templatized, so just make sure it instantiates properly
k = 100
d = 5
- kll = kll_intarray_sketches(k, d)
- self.assertTrue(np.all(kll.is_empty()))
-
- def test_kll_floats_sketches(self):
- # already tested in the example
- k = 75
- d = 3
- kll = kll_floatarray_sketches(k, d)
+ kll = vector_of_kll_ints_sketches(k, d)
self.assertTrue(np.all(kll.is_empty()))
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org