You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@datasketches.apache.org by al...@apache.org on 2023/02/21 20:24:03 UTC

[datasketches-cpp] branch python_cleanup created (now c403a7d)

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a change to branch python_cleanup
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


      at c403a7d  minor cleanup, no functional changes

This branch includes the following new commits:

     new c403a7d  minor cleanup, no functional changes

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org

[datasketches-cpp] 01/01: minor cleanup, no functional changes

Posted by al...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch python_cleanup
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit c403a7d8e692bb99d10348b2e81235af539c2e9e
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Feb 21 12:23:56 2023 -0800

    minor cleanup, no functional changes
---
 python/src/cpc_wrapper.cpp   |  2 +-
 python/src/hll_wrapper.cpp   | 10 +++++-----
 python/src/theta_wrapper.cpp |  8 ++++----
 python/src/tuple_wrapper.cpp |  8 ++++----
 python/tests/cpc_test.py     | 20 ++++++++++----------
 python/tests/hll_test.py     | 39 +++++++++++++++++++--------------------
 python/tests/theta_test.py   | 22 ++++++++++------------
 python/tests/tuple_test.py   | 28 +++++++++++++---------------
 8 files changed, 66 insertions(+), 71 deletions(-)

diff --git a/python/src/cpc_wrapper.cpp b/python/src/cpc_wrapper.cpp
index ddd3d99..9ce4fd7 100644
--- a/python/src/cpc_wrapper.cpp
+++ b/python/src/cpc_wrapper.cpp
@@ -43,7 +43,7 @@ void init_cpc(py::module &m) {
     .def<void (cpc_sketch::*)(const std::string&)>("update", &cpc_sketch::update, py::arg("datum"),
          "Updates the sketch with the given string")
     .def("is_empty", &cpc_sketch::is_empty,
-         "Returns True if the sketch is empty, otherwise Dalse")
+         "Returns True if the sketch is empty, otherwise False")
     .def("get_estimate", &cpc_sketch::get_estimate,
          "Estimate of the distinct count of the input stream")
     .def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"),
diff --git a/python/src/hll_wrapper.cpp b/python/src/hll_wrapper.cpp
index 24da90a..52690b2 100644
--- a/python/src/hll_wrapper.cpp
+++ b/python/src/hll_wrapper.cpp
@@ -59,7 +59,7 @@ void init_hll(py::module &m) {
     .def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes,
          "Returns the size of the serialized sketch when compressing the exception table if HLL_4")
     .def("reset", &hll_sketch::reset,
-         "Resets the sketch to the empty state in coupon colleciton mode")
+         "Resets the sketch to the empty state in coupon collection mode")
     .def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"),
          "Updates the sketch with the given integral value")
     .def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"),
@@ -68,17 +68,17 @@ void init_hll(py::module &m) {
          "Updates the sketch with the given string value")
     .def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes,
          py::arg("lg_k"), py::arg("tgt_type"),
-         "Provides a likely upper bound on serialization size for the given paramters")
+         "Provides a likely upper bound on serialization size for the given parameters")
     .def_static("get_rel_err", &hll_sketch::get_rel_err,
          py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
-         "Retuns the a priori relative error bound for the given parameters")
+         "Returns the a priori relative error bound for the given parameters")
     .def(
         "serialize_compact",
         [](const hll_sketch& sk) {
           auto bytes = sk.serialize_compact();
           return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
         },
-        "Serializes the sketch into a bytes object, compressiong the exception table if HLL_4"
+        "Serializes the sketch into a bytes object, compressing the exception table if HLL_4"
     )
     .def(
         "serialize_updatable",
@@ -121,6 +121,6 @@ void init_hll(py::module &m) {
          "Updates the union with the given string value")
     .def_static("get_rel_err", &hll_union::get_rel_err,
          py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
-         "Retuns the a priori relative error bound for the given parameters")
+         "Returns the a priori relative error bound for the given parameters")
     ;
 }
diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp
index 60b7cbb..f242ce5 100644
--- a/python/src/theta_wrapper.cpp
+++ b/python/src/theta_wrapper.cpp
@@ -52,7 +52,7 @@ void init_theta(py::module &m) {
     .def("get_theta64", &theta_sketch::get_theta64,
          "Returns theta as 64-bit value")
     .def("get_num_retained", &theta_sketch::get_num_retained,
-         "Retunrs the number of items currently in the sketch")
+         "Returns the number of items currently in the sketch")
     .def("get_seed_hash", &theta_sketch::get_seed_hash,
          "Returns a hash of the seed used in the sketch")
     .def("is_ordered", &theta_sketch::is_ordered,
@@ -128,7 +128,7 @@ void init_theta(py::module &m) {
         "compute",
         &theta_a_not_b::compute<const theta_sketch&, const theta_sketch&>,
         py::arg("a"), py::arg("b"), py::arg("ordered")=true,
-        "Returns a sketch with the result of appying the A-not-B operation on the given inputs"
+        "Returns a sketch with the result of applying the A-not-B operation on the given inputs"
     )
   ;
   
@@ -153,14 +153,14 @@ void init_theta(py::module &m) {
         py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
         "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
         "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
-        "to be similar sith a confidence of 97.7% and returns True, otherwise False.")
+        "to be similar with a confidence of 97.7% and returns True, otherwise False.")
     .def_static(
         "dissimilarity_test",
         &theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, const theta_sketch&>,
         py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
         "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
         "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
-        "to be dissimilar sith a confidence of 97.7% and returns True, otherwise False."
+        "to be dissimilar with a confidence of 97.7% and returns True, otherwise False."
     )
   ;     
 }
diff --git a/python/src/tuple_wrapper.cpp b/python/src/tuple_wrapper.cpp
index 88a9f71..706621c 100644
--- a/python/src/tuple_wrapper.cpp
+++ b/python/src/tuple_wrapper.cpp
@@ -86,7 +86,7 @@ void init_tuple(py::module &m) {
     .def("get_theta64", &py_tuple_sketch::get_theta64,
          "Returns theta as 64-bit value")
     .def("get_num_retained", &py_tuple_sketch::get_num_retained,
-         "Retunrs the number of items currently in the sketch")
+         "Returns the number of items currently in the sketch")
     .def("get_seed_hash", [](const py_tuple_sketch& sk) { return sk.get_seed_hash(); }, // why does regular call not work??
          "Returns a hash of the seed used in the sketch")
     .def("is_ordered", &py_tuple_sketch::is_ordered,
@@ -177,7 +177,7 @@ void init_tuple(py::module &m) {
         "compute",
         &py_tuple_a_not_b::compute<const py_tuple_sketch&, const py_tuple_sketch&>,
         py::arg("a"), py::arg("b"), py::arg("ordered")=true,
-        "Returns a sketch with the result of appying the A-not-B operation on the given inputs"
+        "Returns a sketch with the result of applying the A-not-B operation on the given inputs"
     )
   ;
 
@@ -202,14 +202,14 @@ void init_tuple(py::module &m) {
         py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
         "Tests similarity of an actual sketch against an expected sketch. Computes the lower bound of the Jaccard "
         "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
-        "to be similar sith a confidence of 97.7% and returns True, otherwise False.")
+        "to be similar with a confidence of 97.7% and returns True, otherwise False.")
     .def_static(
         "dissimilarity_test",
         &py_tuple_jaccard_similarity::dissimilarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
         py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
         "Tests dissimilarity of an actual sketch against an expected sketch. Computes the upper bound of the Jaccard "
         "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
-        "to be dissimilar sith a confidence of 97.7% and returns True, otherwise False."
+        "to be dissimilar with a confidence of 97.7% and returns True, otherwise False."
     )
   ;
 }
diff --git a/python/tests/cpc_test.py b/python/tests/cpc_test.py
index 9029ecf..762413d 100644
--- a/python/tests/cpc_test.py
+++ b/python/tests/cpc_test.py
@@ -14,26 +14,26 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-  
+
 import unittest
 from datasketches import cpc_sketch, cpc_union
 
 class CpcTest(unittest.TestCase):
   def test_cpc_example(self):
-    k = 12      # 2^k = 4096 rows in the table
-    n = 1 << 18 # ~256k unique values
+    lgk = 12    # 2^k = 4096 rows in the table
+    n = 1 << 18 # ~256k distinct values
 
     # create a couple sketches and inject some values
     # we'll have 1/4 of the values overlap
-    cpc  = cpc_sketch(k)
-    cpc2 = cpc_sketch(k)
+    cpc  = cpc_sketch(lgk)
+    cpc2 = cpc_sketch(lgk)
     offset = int(3 * n / 4) # it's a float w/o cast
     # because we hash on the bits, not an abstract numeric value,
     # cpc.update(1) and cpc.update(1.0) give different results.
     for i in range(0, n):
         cpc.update(i)
         cpc2.update(i + offset)
-        
+
     # although we provide get_composite_estimate() and get_estimate(),
     # the latter will always give the best available estimate.  we
     # recommend using get_estimate().
@@ -42,9 +42,9 @@ class CpcTest(unittest.TestCase):
     self.assertLessEqual(cpc.get_lower_bound(1), cpc.get_estimate())
     self.assertGreaterEqual(cpc.get_upper_bound(1), cpc.get_estimate())
 
-    # unioning uses a separate class, but we need to get_result()
-    # tp query the unioned sketches
-    union = cpc_union(k)
+    # union is a separate class, so we need to get_result()
+    # to query the unioned sketches
+    union = cpc_union(lgk)
     union.update(cpc)
     union.update(cpc2)
     result = union.get_result()
@@ -54,7 +54,7 @@ class CpcTest(unittest.TestCase):
     # answer is within one standard deviation of the estimate
     self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
     self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
-     
+
     # serialize for storage and reconstruct
     sk_bytes = result.serialize()
     new_cpc = cpc_sketch.deserialize(sk_bytes)
diff --git a/python/tests/hll_test.py b/python/tests/hll_test.py
index 9573626..32d762e 100644
--- a/python/tests/hll_test.py
+++ b/python/tests/hll_test.py
@@ -14,34 +14,34 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
- 
+
 import unittest
 from datasketches import hll_sketch, hll_union, tgt_hll_type
 
 class HllTest(unittest.TestCase):
     def test_hll_example(self):
-        k = 12      # 2^k = 4096 rows in the table
+        lgk = 12    # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
 
         # create a couple sketches and inject some values
         # we'll have 1/4 of the values overlap
-        hll  = hll_sketch(k, tgt_hll_type.HLL_8)
-        hll2 = hll_sketch(k, tgt_hll_type.HLL_6)
+        hll  = hll_sketch(lgk, tgt_hll_type.HLL_8)
+        hll2 = hll_sketch(lgk, tgt_hll_type.HLL_6)
         offset = int(3 * n / 4) # it's a float w/o cast
         # because we hash on the bits, not an abstract numeric value,
         # hll.update(1) and hll.update(1.0) give different results.
         for i in range(0, n):
             hll.update(i)
             hll2.update(i + offset)
-        
+
         # we can check that the upper and lower bounds bracket the
         # estimate, without needing to know the exact value.
         self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
         self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())
 
-        # unioning uses a separate class, and we can either get a result
+        # union is a separate class, and we can either get a result
         # sketch or query the union object directly
-        union = hll_union(k)
+        union = hll_union(lgk)
         union.update(hll)
         union.update(hll2)
         result = union.get_result()
@@ -59,7 +59,7 @@ class HllTest(unittest.TestCase):
         new_hll = hll_sketch.deserialize(sk_bytes)
 
         # the sketch can self-report its configuration and status
-        self.assertEqual(new_hll.lg_config_k, k)
+        self.assertEqual(new_hll.lg_config_k, lgk)
         self.assertEqual(new_hll.tgt_type, tgt_hll_type.HLL_4)
         self.assertFalse(new_hll.is_empty())
 
@@ -68,16 +68,16 @@ class HllTest(unittest.TestCase):
         self.assertTrue(new_hll.is_empty())
 
     def test_hll_sketch(self):
-        k = 8
+        lgk = 8
         n = 117
-        hll = self.generate_sketch(n, k, tgt_hll_type.HLL_6)
+        hll = self.generate_sketch(n, lgk, tgt_hll_type.HLL_6)
         hll.update('string data')
         hll.update(3.14159) # double data
 
         self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
         self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())
 
-        self.assertEqual(hll.lg_config_k, k)
+        self.assertEqual(hll.lg_config_k, lgk)
         self.assertEqual(hll.tgt_type, tgt_hll_type.HLL_6)
 
         bytes_compact = hll.serialize_compact()
@@ -98,13 +98,13 @@ class HllTest(unittest.TestCase):
         self.assertTrue(hll.is_empty())
 
     def test_hll_union(self):
-        k = 7
+        lgk = 7
         n = 53
-        union = hll_union(k)
+        union = hll_union(lgk)
 
-        sk = self.generate_sketch(n, k, tgt_hll_type.HLL_4, 0)
+        sk = self.generate_sketch(n, lgk, tgt_hll_type.HLL_4, 0)
         union.update(sk)
-        sk = self.generate_sketch(3 * n, k, tgt_hll_type.HLL_4, n)
+        sk = self.generate_sketch(3 * n, lgk, tgt_hll_type.HLL_4, n)
         union.update(sk)
         union.update('string data')
         union.update(1.4142136)
@@ -112,19 +112,18 @@ class HllTest(unittest.TestCase):
         self.assertLessEqual(union.get_lower_bound(1), union.get_estimate())
         self.assertGreaterEqual(union.get_upper_bound(1), union.get_estimate())
 
-        self.assertEqual(union.lg_config_k, k)
+        self.assertEqual(union.lg_config_k, lgk)
         self.assertFalse(union.is_empty())
 
         sk = union.get_result()
         self.assertTrue(isinstance(sk, hll_sketch))
         self.assertEqual(sk.tgt_type, tgt_hll_type.HLL_4)
         
-    def generate_sketch(self, n, k, sk_type=tgt_hll_type.HLL_4, st_idx=0):
-        sk = hll_sketch(k, sk_type)
+    def generate_sketch(self, n, lgk, sk_type=tgt_hll_type.HLL_4, st_idx=0):
+        sk = hll_sketch(lgk, sk_type)
         for i in range(st_idx, st_idx + n):
             sk.update(i)
         return sk
-        
-        
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/tests/theta_test.py b/python/tests/theta_test.py
index 3f0f697..b3ca2da 100644
--- a/python/tests/theta_test.py
+++ b/python/tests/theta_test.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
- 
+
 import unittest
 
 from datasketches import theta_sketch, update_theta_sketch
@@ -24,11 +24,11 @@ from datasketches import theta_jaccard_similarity
 
 class ThetaTest(unittest.TestCase):
     def test_theta_basic_example(self):
-        k = 12      # 2^k = 4096 rows in the table
+        lgk = 12    # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
 
         # create a sketch and inject some values
-        sk = self.generate_theta_sketch(n, k)
+        sk = self.generate_theta_sketch(n, lgk)
 
         # we can check that the upper and lower bounds bracket the
         # estimate, without needing to know the exact value.
@@ -55,19 +55,19 @@ class ThetaTest(unittest.TestCase):
         self.assertEqual(count, new_sk.get_num_retained())
 
     def test_theta_set_operations(self):
-        k = 12      # 2^k = 4096 rows in the table
+        lgk = 12    # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
 
         # we'll have 1/4 of the values overlap
         offset = int(3 * n / 4) # it's a float w/o cast
 
         # create a couple sketches and inject some values
-        sk1 = self.generate_theta_sketch(n, k)
-        sk2 = self.generate_theta_sketch(n, k, offset)
+        sk1 = self.generate_theta_sketch(n, lgk)
+        sk2 = self.generate_theta_sketch(n, lgk, offset)
 
         # UNIONS
         # create a union object
-        union = theta_union(k)
+        union = theta_union(lgk)
         union.update(sk1)
         union.update(sk2)
 
@@ -138,13 +138,11 @@ class ThetaTest(unittest.TestCase):
         self.assertTrue(theta_jaccard_similarity.similarity_test(sk1, result, 0.7))
 
 
-    def generate_theta_sketch(self, n, k, offset=0):
-      sk = update_theta_sketch(k)
+    def generate_theta_sketch(self, n, lgk, offset=0):
+      sk = update_theta_sketch(lgk)
       for i in range(0, n):
         sk.update(i + offset)
       return sk
-        
+
 if __name__ == '__main__':
     unittest.main()
-
-  
\ No newline at end of file
diff --git a/python/tests/tuple_test.py b/python/tests/tuple_test.py
index 0e94e2e..2a298ef 100644
--- a/python/tests/tuple_test.py
+++ b/python/tests/tuple_test.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
- 
+
 import unittest
 
 from datasketches import update_tuple_sketch
@@ -27,12 +27,12 @@ from datasketches import update_theta_sketch
 
 class TupleTest(unittest.TestCase):
     def test_tuple_basic_example(self):
-        k = 12      # 2^k = 4096 rows in the table
+        lgk = 12    # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
 
         # create a sketch and inject some values -- summary is 2 so we can sum them
         # and know the reuslt
-        sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=2)
+        sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=2)
 
         # we can check that the upper and lower bounds bracket the
         # estimate, without needing to know the exact value.
@@ -66,7 +66,7 @@ class TupleTest(unittest.TestCase):
 
         # we can even create a tuple sketch from an existing theta sketch
         # as long as we provide a summary to use
-        theta_sk = update_theta_sketch(k)
+        theta_sk = update_theta_sketch(lgk)
         for i in range(n, 2*n):
           theta_sk.update(i)
         cts = compact_tuple_sketch(theta_sk, 5)
@@ -77,19 +77,19 @@ class TupleTest(unittest.TestCase):
 
 
     def test_tuple_set_operations(self):
-        k = 12      # 2^k = 4096 rows in the table
+        lgk = 12    # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
 
         # we'll have 1/4 of the values overlap
         offset = int(3 * n / 4) # it's a float w/o cast
 
         # create a couple sketches and inject some values, with different summaries
-        sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=5)
-        sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=7, offset=offset)
+        sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=5)
+        sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=7, offset=offset)
 
         # UNIONS
         # create a union object
-        union = tuple_union(MaxIntPolicy(), k)
+        union = tuple_union(MaxIntPolicy(), lgk)
         union.update(sk1)
         union.update(sk2)
 
@@ -179,12 +179,12 @@ class TupleTest(unittest.TestCase):
         self.assertLess(jac[0], jac[1])
         self.assertLess(jac[1], jac[2])
 
-        # checks for sketch equivalency
+        # checks for sketch equivalence
         self.assertTrue(tuple_jaccard_similarity.exactly_equal(sk1, sk1))
         self.assertFalse(tuple_jaccard_similarity.exactly_equal(sk1, sk2))
 
         # we can apply a check for similarity or dissimilarity at a
-        # given threshhold, at 97.7% confidence.
+        # given threshold, at 97.7% confidence.
 
         # check that the Jaccard Index is at most (upper bound) 0.2.
         # exact result would be 1/7
@@ -196,13 +196,11 @@ class TupleTest(unittest.TestCase):
 
 
     # Generates a basic tuple sketch with a fixed value for each update
-    def generate_tuple_sketch(self, policy, n, k, value, offset=0):
-      sk = update_tuple_sketch(policy, k)
+    def generate_tuple_sketch(self, policy, n, lgk, value, offset=0):
+      sk = update_tuple_sketch(policy, lgk)
       for i in range(0, n):
         sk.update(i + offset, value)
       return sk
-        
+
 if __name__ == '__main__':
     unittest.main()
-
-  
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org