You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2019/09/19 20:31:46 UTC
[incubator-datasketches-cpp] 02/02: add theta python tests, clean up typos in cpc/hll, ensure default seed supplied in theta wrapper
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch py_test
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit a8d95e010b96a7a1e586ff2d115fc5c0e2b16b70
Author: jmalkin <jm...@users.noreply.github.com>
AuthorDate: Thu Sep 19 13:31:33 2019 -0700
add theta python tests, clean up typos in cpc/hll, ensure default seed supplied in theta wrapper
---
python/src/theta_wrapper.cpp | 6 ++-
python/tests/cpc_test.py | 7 +--
python/tests/hll_test.py | 7 +--
python/tests/theta_test.py | 121 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 133 insertions(+), 8 deletions(-)
diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp
index cf4fe5c..9ec2335 100644
--- a/python/src/theta_wrapper.cpp
+++ b/python/src/theta_wrapper.cpp
@@ -112,13 +112,15 @@ void init_theta(py::module &m) {
.def("update", (void (update_theta_sketch::*)(double)) &update_theta_sketch::update, py::arg("datum"))
.def("update", (void (update_theta_sketch::*)(const std::string&)) &update_theta_sketch::update, py::arg("datum"))
.def("compact", &update_theta_sketch::compact, py::arg("ordered")=true)
- .def_static("deserialize", &dspy::update_theta_sketch_deserialize)
+ .def_static("deserialize", &dspy::update_theta_sketch_deserialize,
+ py::arg("bytes"), py::arg("seed")=update_theta_sketch::builder::DEFAULT_SEED)
;
py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
.def(py::init<const compact_theta_sketch&>())
.def(py::init<const theta_sketch&, bool>())
- .def_static("deserialize", &dspy::compact_theta_sketch_deserialize)
+ .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
+ py::arg("bytes"), py::arg("seed")=update_theta_sketch::builder::DEFAULT_SEED)
;
py::class_<theta_union>(m, "theta_union")
diff --git a/python/tests/cpc_test.py b/python/tests/cpc_test.py
index 2504e87..9029ecf 100644
--- a/python/tests/cpc_test.py
+++ b/python/tests/cpc_test.py
@@ -23,7 +23,7 @@ class CpcTest(unittest.TestCase):
k = 12 # 2^k = 4096 rows in the table
n = 1 << 18 # ~256k unique values
- # create a couple sketchrd and inject some values
+ # create a couple sketches and inject some values
# we'll have 1/4 of the values overlap
cpc = cpc_sketch(k)
cpc2 = cpc_sketch(k)
@@ -49,8 +49,9 @@ class CpcTest(unittest.TestCase):
union.update(cpc2)
result = union.get_result()
- # since CPC is deterministic, we have checked and know the
- # exact answer is within one standard deviation of the estimate
+ # since our process here (including post-union CPC) is
+ # deterministic, we have checked and know the exact
+ # answer is within one standard deviation of the estimate
self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
diff --git a/python/tests/hll_test.py b/python/tests/hll_test.py
index 420c00a..0fb727e 100644
--- a/python/tests/hll_test.py
+++ b/python/tests/hll_test.py
@@ -23,7 +23,7 @@ class HllTest(unittest.TestCase):
k = 12 # 2^k = 4096 rows in the table
n = 1 << 18 # ~256k unique values
- # create a couple sketchrd and inject some values
+ # create a couple sketches and inject some values
# we'll have 1/4 of the values overlap
hll = hll_sketch(k, tgt_hll_type.HLL_8)
hll2 = hll_sketch(k, tgt_hll_type.HLL_6)
@@ -50,8 +50,9 @@ class HllTest(unittest.TestCase):
result = union.get_result()
self.assertEqual(result.get_estimate(), union.get_estimate())
- # since HLL is deterministic, we have checked and know the
- # exact answer is within one standard deviation of the estimate
+ # since our process here (including post-union HLL) is
+ # deterministic, we have checked and know the exact
+ # answer is within one standard deviation of the estimate
self.assertLessEqual(union.get_lower_bound(1), 7 * n / 4)
self.assertGreaterEqual(union.get_upper_bound(1), 7 * n / 4)
diff --git a/python/tests/theta_test.py b/python/tests/theta_test.py
new file mode 100644
index 0000000..537e19f
--- /dev/null
+++ b/python/tests/theta_test.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+
+from datasketches import theta_sketch, update_theta_sketch
+from datasketches import compact_theta_sketch, theta_union
+from datasketches import theta_intersection, theta_a_not_b
+
+class ThetaTest(unittest.TestCase):
+ def test_theta_basic_example(self):
+ k = 12 # 2^k = 4096 rows in the table
+ n = 1 << 18 # ~256k unique values
+
+ # create a sketch and inject some values
+ sk = self.generate_theta_sketch(n, k)
+
+ # we can check that the upper and lower bounds bracket the
+ # estimate, without needing to know the exact value.
+ self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
+ self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
+
+ # because this sketch is deterministically generated, we can
+ # also compare against the exact value
+ self.assertLessEqual(sk.get_lower_bound(1), n)
+ self.assertGreaterEqual(sk.get_upper_bound(1), n)
+
+ # serialize for storage and reconstruct
+ sk_bytes = sk.serialize()
+ new_sk = update_theta_sketch.deserialize(sk_bytes)
+
+ # estimate remains unchanged
+ self.assertFalse(sk.is_empty())
+ self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
+
+ def test_theta_set_operations(self):
+ k = 12 # 2^k = 4096 rows in the table
+ n = 1 << 18 # ~256k unique values
+
+ # we'll have 1/4 of the values overlap
+ offset = int(3 * n / 4) # '/' yields a float, so cast back to int
+
+ # create a couple sketches and inject some values
+ sk1 = self.generate_theta_sketch(n, k)
+ sk2 = self.generate_theta_sketch(n, k, offset)
+
+ # UNIONS
+ # create a union object
+ union = theta_union(k)
+ union.update(sk1)
+ union.update(sk2)
+
+ # getting the result from a union returns a compact_theta_sketch;
+ # compact theta sketches can be used in additional unions or
+ # set operations, but cannot accept further item updates
+ result = union.get_result()
+ self.assertTrue(isinstance(result, compact_theta_sketch))
+
+ # since our process here is deterministic, we have
+ # checked and know the exact answer is within one
+ # standard deviation of the estimate
+ self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
+ self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
+
+
+ # INTERSECTIONS
+ # create an intersection object
+ intersect = theta_intersection() # no lg_k
+ intersect.update(sk1)
+ intersect.update(sk2)
+
+ # has_result() indicates the intersection has been used,
+ # although the result may be the empty set
+ self.assertTrue(intersect.has_result())
+
+ # as with unions, the result is a compact sketch
+ result = intersect.get_result()
+ self.assertTrue(isinstance(result, compact_theta_sketch))
+
+ # we know the sets overlap by 1/4, so the exact intersection size is n/4
+ self.assertLessEqual(result.get_lower_bound(1), n / 4)
+ self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
+
+
+ # A NOT B
+ # create an a_not_b object
+ anb = theta_a_not_b() # no lg_k
+ result = anb.compute(sk1, sk2)
+
+ # as with unions, the result is a compact sketch
+ self.assertTrue(isinstance(result, compact_theta_sketch))
+
+ # we know the sets overlap by 1/4, so the remainder is 3/4
+ self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
+ self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
+
+
+ def generate_theta_sketch(self, n, k, offset=0):
+ sk = update_theta_sketch(k)
+ for i in range(0, n):
+ sk.update(i + offset)
+ return sk
+
+if __name__ == '__main__':
+ unittest.main()
+
+
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org