You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2019/09/19 20:31:44 UTC

[incubator-datasketches-cpp] branch py_test updated (d9499b8 -> a8d95e0)

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a change to branch py_test
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git.


    from d9499b8  python tests: add frequent items, clarify kll example
     new e073ccd  add cpc python test example
     new a8d95e0  add theta python tests, clean up typos in cpc/hll, ensure default seed supplied in theta wrapper

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 python/src/theta_wrapper.cpp |   6 ++-
 python/tests/cpc_test.py     |  64 +++++++++++++++++++++++
 python/tests/hll_test.py     |   7 +--
 python/tests/theta_test.py   | 121 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 193 insertions(+), 5 deletions(-)
 create mode 100644 python/tests/cpc_test.py
 create mode 100644 python/tests/theta_test.py


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 02/02: add theta python tests, clean up typos in cpc/hll, ensure default seed supplied in theta wrapper

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch py_test
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit a8d95e010b96a7a1e586ff2d115fc5c0e2b16b70
Author: jmalkin <jm...@users.noreply.github.com>
AuthorDate: Thu Sep 19 13:31:33 2019 -0700

    add theta python tests, clean up typos in cpc/hll, ensure default seed supplied in theta wrapper
---
 python/src/theta_wrapper.cpp |   6 ++-
 python/tests/cpc_test.py     |   7 +--
 python/tests/hll_test.py     |   7 +--
 python/tests/theta_test.py   | 121 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 133 insertions(+), 8 deletions(-)

diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp
index cf4fe5c..9ec2335 100644
--- a/python/src/theta_wrapper.cpp
+++ b/python/src/theta_wrapper.cpp
@@ -112,13 +112,15 @@ void init_theta(py::module &m) {
     .def("update", (void (update_theta_sketch::*)(double)) &update_theta_sketch::update, py::arg("datum"))
     .def("update", (void (update_theta_sketch::*)(const std::string&)) &update_theta_sketch::update, py::arg("datum"))
     .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true)
-    .def_static("deserialize", &dspy::update_theta_sketch_deserialize)
+    .def_static("deserialize", &dspy::update_theta_sketch_deserialize,
+        py::arg("bytes"), py::arg("seed")=update_theta_sketch::builder::DEFAULT_SEED)
   ;
 
   py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
     .def(py::init<const compact_theta_sketch&>())
     .def(py::init<const theta_sketch&, bool>())
-    .def_static("deserialize", &dspy::compact_theta_sketch_deserialize)
+    .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
+        py::arg("bytes"), py::arg("seed")=update_theta_sketch::builder::DEFAULT_SEED)
   ;
 
   py::class_<theta_union>(m, "theta_union")
diff --git a/python/tests/cpc_test.py b/python/tests/cpc_test.py
index 2504e87..9029ecf 100644
--- a/python/tests/cpc_test.py
+++ b/python/tests/cpc_test.py
@@ -23,7 +23,7 @@ class CpcTest(unittest.TestCase):
     k = 12      # 2^k = 4096 rows in the table
     n = 1 << 18 # ~256k unique values
 
-    # create a couple sketchrd and inject some values
+    # create a couple sketches and inject some values
     # we'll have 1/4 of the values overlap
     cpc  = cpc_sketch(k)
     cpc2 = cpc_sketch(k)
@@ -49,8 +49,9 @@ class CpcTest(unittest.TestCase):
     union.update(cpc2)
     result = union.get_result()
 
-    # since CPC is deterministic, we have checked and know the
-    # exact answer is within one standard deviation of the estimate
+    # since our process here (including post-union CPC) is
+    # deterministic, we have checked and know the exact
+    # answer is within one standard deviation of the estimate
     self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
     self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
      
diff --git a/python/tests/hll_test.py b/python/tests/hll_test.py
index 420c00a..0fb727e 100644
--- a/python/tests/hll_test.py
+++ b/python/tests/hll_test.py
@@ -23,7 +23,7 @@ class HllTest(unittest.TestCase):
         k = 12      # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
 
-        # create a couple sketchrd and inject some values
+        # create a couple sketches and inject some values
         # we'll have 1/4 of the values overlap
         hll  = hll_sketch(k, tgt_hll_type.HLL_8)
         hll2 = hll_sketch(k, tgt_hll_type.HLL_6)
@@ -50,8 +50,9 @@ class HllTest(unittest.TestCase):
         result = union.get_result()
         self.assertEqual(result.get_estimate(), union.get_estimate())
 
-        # since HLL is deterministic, we have checked and know the
-        # exact answer is within one standard deviation of the estimate
+        # since our process here (including post-union HLL) is
+        # deterministic, we have checked and know the exact
+        # answer is within one standard deviation of the estimate
         self.assertLessEqual(union.get_lower_bound(1), 7 * n / 4)
         self.assertGreaterEqual(union.get_upper_bound(1), 7 * n / 4)
 
diff --git a/python/tests/theta_test.py b/python/tests/theta_test.py
new file mode 100644
index 0000000..537e19f
--- /dev/null
+++ b/python/tests/theta_test.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ 
+import unittest
+
+from datasketches import theta_sketch, update_theta_sketch
+from datasketches import compact_theta_sketch, theta_union
+from datasketches import theta_intersection, theta_a_not_b
+
+class ThetaTest(unittest.TestCase):
+    def test_theta_basic_example(self):
+        k = 12      # 2^k = 4096 rows in the table
+        n = 1 << 18 # ~256k unique values
+
+        # create a sketch and inject some values
+        sk = self.generate_theta_sketch(n, k)
+
+        # we can check that the upper and lower bounds bracket the
+        # estimate, without needing to know the exact value.
+        self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
+        self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
+
+        # because this sketch is deterministically generated, we can
+        # also compare against the exact value
+        self.assertLessEqual(sk.get_lower_bound(1), n)
+        self.assertGreaterEqual(sk.get_upper_bound(1), n)
+
+        # serialize for storage and reconstruct
+        sk_bytes = sk.serialize()
+        new_sk = update_theta_sketch.deserialize(sk_bytes)
+
+        # estimate remains unchanged
+        self.assertFalse(sk.is_empty())
+        self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
+
+    def test_theta_set_operations(self):
+        k = 12      # 2^k = 4096 rows in the table
+        n = 1 << 18 # ~256k unique values
+
+        # we'll have 1/4 of the values overlap
+        offset = int(3 * n / 4) # it's a float w/o cast
+
+        # create a couple sketches and inject some values
+        sk1 = self.generate_theta_sketch(n, k)
+        sk2 = self.generate_theta_sketch(n, k, offset)
+
+        # UNIONS
+        # create a union object
+        union = theta_union(k)
+        union.update(sk1)
+        union.update(sk2)
+
+        # getting result from union returns a compact_theta_sketch
+        # compact theta sketches can be used in additional unions
+        # or set operations but cannot accept further item updates
+        result = union.get_result()
+        self.assertTrue(isinstance(result, compact_theta_sketch))
+
+        # since our process here is deterministic, we have
+        # checked and know the exact answer is within one
+        # standard deviation of the estimate
+        self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
+
+
+        # INTERSECTIONS
+        # create an intersection object
+        intersect = theta_intersection() # no lg_k
+        intersect.update(sk1)
+        intersect.update(sk2)
+
+        # has_result() indicates the intersection has been used,
+        # although the result may be the empty set
+        self.assertTrue(intersect.has_result())
+
+        # as with unions, the result is a compact sketch
+        result = intersect.get_result()
+        self.assertTrue(isinstance(result, compact_theta_sketch))
+
+        # we know the sets overlap by 1/4
+        self.assertLessEqual(result.get_lower_bound(1), n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
+
+
+        # A NOT B
+        # create an a_not_b object
+        anb = theta_a_not_b() # no lg_k
+        result = anb.compute(sk1, sk2)
+
+        # as with unions, the result is a compact sketch
+        self.assertTrue(isinstance(result, compact_theta_sketch))
+
+        # we know the sets overlap by 1/4, so the remainder is 3/4
+        self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
+
+
+    def generate_theta_sketch(self, n, k, offset=0):
+      sk = update_theta_sketch(k)
+      for i in range(0, n):
+        sk.update(i + offset)
+      return sk
+        
+if __name__ == '__main__':
+    unittest.main()
+
+  
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 01/02: add cpc python test example

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch py_test
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit e073ccd38502273a080183d08aa93bca409c30a3
Author: jmalkin <jm...@users.noreply.github.com>
AuthorDate: Wed Sep 18 22:50:03 2019 -0700

    add cpc python test example
---
 python/tests/cpc_test.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/python/tests/cpc_test.py b/python/tests/cpc_test.py
new file mode 100644
index 0000000..2504e87
--- /dev/null
+++ b/python/tests/cpc_test.py
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+  
+import unittest
+from datasketches import cpc_sketch, cpc_union
+
+class CpcTest(unittest.TestCase):
+  def test_cpc_example(self):
+    k = 12      # 2^k = 4096 rows in the table
+    n = 1 << 18 # ~256k unique values
+
+    # create a couple sketchrd and inject some values
+    # we'll have 1/4 of the values overlap
+    cpc  = cpc_sketch(k)
+    cpc2 = cpc_sketch(k)
+    offset = int(3 * n / 4) # it's a float w/o cast
+    # because we hash on the bits, not an abstract numeric value,
+    # cpc.update(1) and cpc.update(1.0) give different results.
+    for i in range(0, n):
+        cpc.update(i)
+        cpc2.update(i + offset)
+        
+    # although we provide get_composite_estimate() and get_estimate(),
+    # the latter will always give the best available estimate.  we
+    # recommend using get_estimate().
+    # we can check that the upper and lower bounds bracket the
+    # estimate, without needing to know the exact value.
+    self.assertLessEqual(cpc.get_lower_bound(1), cpc.get_estimate())
+    self.assertGreaterEqual(cpc.get_upper_bound(1), cpc.get_estimate())
+
+    # unioning uses a separate class, but we need to get_result()
+    # to query the unioned sketches
+    union = cpc_union(k)
+    union.update(cpc)
+    union.update(cpc2)
+    result = union.get_result()
+
+    # since CPC is deterministic, we have checked and know the
+    # exact answer is within one standard deviation of the estimate
+    self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
+    self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
+     
+    # serialize for storage and reconstruct
+    sk_bytes = result.serialize()
+    new_cpc = cpc_sketch.deserialize(sk_bytes)
+    self.assertFalse(new_cpc.is_empty())
+
+if __name__ == '__main__':
+    unittest.main()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org