You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2019/09/19 20:31:46 UTC

[incubator-datasketches-cpp] 02/02: add theta python tests, clean up typos in cpc/hll, ensure default seed supplied in theta wrapper

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch py_test
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit a8d95e010b96a7a1e586ff2d115fc5c0e2b16b70
Author: jmalkin <jm...@users.noreply.github.com>
AuthorDate: Thu Sep 19 13:31:33 2019 -0700

    add theta python tests, clean up typos in cpc/hll, ensure default seed supplied in theta wrapper
---
 python/src/theta_wrapper.cpp |   6 ++-
 python/tests/cpc_test.py     |   7 +--
 python/tests/hll_test.py     |   7 +--
 python/tests/theta_test.py   | 121 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 133 insertions(+), 8 deletions(-)

diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp
index cf4fe5c..9ec2335 100644
--- a/python/src/theta_wrapper.cpp
+++ b/python/src/theta_wrapper.cpp
@@ -112,13 +112,15 @@ void init_theta(py::module &m) {
     .def("update", (void (update_theta_sketch::*)(double)) &update_theta_sketch::update, py::arg("datum"))
     .def("update", (void (update_theta_sketch::*)(const std::string&)) &update_theta_sketch::update, py::arg("datum"))
     .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true)
-    .def_static("deserialize", &dspy::update_theta_sketch_deserialize)
+    .def_static("deserialize", &dspy::update_theta_sketch_deserialize,
+        py::arg("bytes"), py::arg("seed")=update_theta_sketch::builder::DEFAULT_SEED)
   ;
 
   py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
     .def(py::init<const compact_theta_sketch&>())
     .def(py::init<const theta_sketch&, bool>())
-    .def_static("deserialize", &dspy::compact_theta_sketch_deserialize)
+    .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
+        py::arg("bytes"), py::arg("seed")=update_theta_sketch::builder::DEFAULT_SEED)
   ;
 
   py::class_<theta_union>(m, "theta_union")
diff --git a/python/tests/cpc_test.py b/python/tests/cpc_test.py
index 2504e87..9029ecf 100644
--- a/python/tests/cpc_test.py
+++ b/python/tests/cpc_test.py
@@ -23,7 +23,7 @@ class CpcTest(unittest.TestCase):
     k = 12      # 2^k = 4096 rows in the table
     n = 1 << 18 # ~256k unique values
 
-    # create a couple sketchrd and inject some values
+    # create a couple sketches and inject some values
     # we'll have 1/4 of the values overlap
     cpc  = cpc_sketch(k)
     cpc2 = cpc_sketch(k)
@@ -49,8 +49,9 @@ class CpcTest(unittest.TestCase):
     union.update(cpc2)
     result = union.get_result()
 
-    # since CPC is deterministic, we have checked and know the
-    # exact answer is within one standard deviation of the estimate
+    # since our process here (including post-union CPC) is
+    # deterministic, we have checked and know the exact
+    # answer is within one standard deviation of the estimate
     self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
     self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
      
diff --git a/python/tests/hll_test.py b/python/tests/hll_test.py
index 420c00a..0fb727e 100644
--- a/python/tests/hll_test.py
+++ b/python/tests/hll_test.py
@@ -23,7 +23,7 @@ class HllTest(unittest.TestCase):
         k = 12      # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
 
-        # create a couple sketchrd and inject some values
+        # create a couple sketches and inject some values
         # we'll have 1/4 of the values overlap
         hll  = hll_sketch(k, tgt_hll_type.HLL_8)
         hll2 = hll_sketch(k, tgt_hll_type.HLL_6)
@@ -50,8 +50,9 @@ class HllTest(unittest.TestCase):
         result = union.get_result()
         self.assertEqual(result.get_estimate(), union.get_estimate())
 
-        # since HLL is deterministic, we have checked and know the
-        # exact answer is within one standard deviation of the estimate
+        # since our process here (including post-union HLL) is
+        # deterministic, we have checked and know the exact
+        # answer is within one standard deviation of the estimate
         self.assertLessEqual(union.get_lower_bound(1), 7 * n / 4)
         self.assertGreaterEqual(union.get_upper_bound(1), 7 * n / 4)
 
diff --git a/python/tests/theta_test.py b/python/tests/theta_test.py
new file mode 100644
index 0000000..537e19f
--- /dev/null
+++ b/python/tests/theta_test.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ 
+import unittest
+
+from datasketches import theta_sketch, update_theta_sketch
+from datasketches import compact_theta_sketch, theta_union
+from datasketches import theta_intersection, theta_a_not_b
+
+class ThetaTest(unittest.TestCase):
+    def test_theta_basic_example(self):
+        """Build a sketch, check its bounds, and round-trip it through serialization."""
+        k = 12      # 2^k = 4096 rows in the table
+        n = 1 << 18 # ~256k unique values
+
+        # create a sketch and inject some values
+        sk = self.generate_theta_sketch(n, k)
+
+        # we can check that the upper and lower bounds bracket the
+        # estimate, without needing to know the exact value.
+        self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
+        self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
+
+        # because this sketch is deterministically generated, we can
+        # also compare against the exact value
+        self.assertLessEqual(sk.get_lower_bound(1), n)
+        self.assertGreaterEqual(sk.get_upper_bound(1), n)
+
+        # serialize for storage and reconstruct
+        sk_bytes = sk.serialize()
+        new_sk = update_theta_sketch.deserialize(sk_bytes)
+
+        # estimate remains unchanged
+        self.assertFalse(sk.is_empty())
+        self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
+
+    def test_theta_set_operations(self):
+        """Exercise union, intersection and A-not-B on two partially overlapping sketches."""
+        k = 12      # 2^k = 4096 rows in the table
+        n = 1 << 18 # ~256k unique values
+
+        # we'll have 1/4 of the values overlap
+        offset = 3 * n // 4 # floor division keeps this an int
+
+        # create a couple sketches and inject some values
+        sk1 = self.generate_theta_sketch(n, k)
+        sk2 = self.generate_theta_sketch(n, k, offset)
+
+        # UNIONS
+        # create a union object
+        union = theta_union(k)
+        union.update(sk1)
+        union.update(sk2)
+
+        # getting result from union returns a compact_theta_sketch
+        # compact theta sketches can be used in additional unions
+        # or set operations but cannot accept further item updates
+        result = union.get_result()
+        self.assertIsInstance(result, compact_theta_sketch)
+
+        # since our process here is deterministic, we have
+        # checked and know the exact answer is within one
+        # standard deviation of the estimate
+        self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
+
+        # INTERSECTIONS
+        # create an intersection object
+        intersect = theta_intersection() # no lg_k
+        intersect.update(sk1)
+        intersect.update(sk2)
+
+        # has_result() indicates the intersection has been used,
+        # although the result may be the empty set
+        self.assertTrue(intersect.has_result())
+
+        # as with unions, the result is a compact sketch
+        result = intersect.get_result()
+        self.assertIsInstance(result, compact_theta_sketch)
+
+        # we know the sets overlap by 1/4
+        self.assertLessEqual(result.get_lower_bound(1), n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
+
+        # A NOT B
+        # create an a_not_b object
+        anb = theta_a_not_b() # no lg_k
+        result = anb.compute(sk1, sk2)
+
+        # as with unions, the result is a compact sketch
+        self.assertIsInstance(result, compact_theta_sketch)
+
+        # we know the sets overlap by 1/4, so the remainder is 3/4
+        self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
+
+    def generate_theta_sketch(self, n, k, offset=0):
+        """Return an update_theta_sketch(k) fed the n integers offset .. offset+n-1."""
+        sk = update_theta_sketch(k)
+        for i in range(0, n):
+            sk.update(i + offset)
+        return sk
+        
+if __name__ == '__main__':
+    unittest.main()
+
+  
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org