You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2023/01/28 00:58:25 UTC

[datasketches-cpp] 01/01: Clean up compact tuple creation, add docstrings, define default seed as a named constant

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch py_tuple
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 4afb55d65a440943c3df199176020f0e7830077d
Author: Jon Malkin <78...@users.noreply.github.com>
AuthorDate: Fri Jan 27 16:58:08 2023 -0800

    Clean up compact tuple creation, add docstrings, define default seed as a named constant
---
 python/datasketches/TupleWrapper.py | 113 +++++++++++++++++++++++++-----------
 python/datasketches/__init__.py     |  10 ++--
 python/src/tuple_wrapper.cpp        |  25 ++++----
 python/tests/tuple_test.py          |  12 ++--
 4 files changed, 106 insertions(+), 54 deletions(-)

diff --git a/python/datasketches/TupleWrapper.py b/python/datasketches/TupleWrapper.py
index cb32802..155016f 100644
--- a/python/datasketches/TupleWrapper.py
+++ b/python/datasketches/TupleWrapper.py
@@ -21,51 +21,62 @@ from .TuplePolicy import TuplePolicy
 from _datasketches import _tuple_sketch, _compact_tuple_sketch, _update_tuple_sketch
 from _datasketches import  _tuple_union, _tuple_intersection
 from _datasketches import _tuple_a_not_b, _tuple_jaccard_similarity
-from _datasketches import PyObjectSerDe
+from _datasketches import PyObjectSerDe, theta_sketch
 
 class tuple_sketch(ABC):
+  """An abstract base class representing a Tuple Sketch."""
   _gadget: _tuple_sketch
 
   def __str__(self, print_items:bool=False):
     return self._gadget.to_string(print_items)
 
   def is_empty(self):
+    """Returns True if the sketch is empty, otherwise False."""
     return self._gadget.is_empty()
 
   def get_estimate(self):
+    """Returns an estimate of the distinct count of the input stream."""
     return self._gadget.get_estimate()
 
   def get_upper_bound(self, num_std_devs:int):
+    """Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}."""
     return self._gadget.get_upper_bound(num_std_devs)
 
   def get_lower_bound(self, num_std_devs:int):
+    """Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}."""
     return self._gadget.get_lower_bound(num_std_devs)
 
   def is_estimation_mode(self):
+    """Returns True if the sketch is in estimation mode, otherwise False."""
     return self._gadget.is_estimation_mode()
 
   def get_theta(self):
+    """Returns theta (the effective sampling rate) as a fraction from 0 to 1."""
     return self._gadget.get_theta()
 
   def get_theta64(self):
+    """Returns theta as a 64-bit integer value."""
     return self._gadget.get_theta64()
 
   def get_num_retained(self):
+    """Returns the number of items currently in the sketch."""
     return self._gadget.get_num_retained()
 
   def get_seed_hash(self):
+    """Returns a hash of the seed used in the sketch."""
     return self._gadget.get_seed_hash()
 
   def is_ordered(self):
+    """Returns True if the sketch entries are sorted, otherwise False."""
     return self._gadget.is_ordered()
 
   def __iter__(self):
     return self._gadget.__iter__()
 
-  #.def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
-
 
 class compact_tuple_sketch(tuple_sketch):
+  """An instance of a Tuple Sketch that has been compacted and can no longer accept updates."""
+
   def __init__(self, other:tuple_sketch, ordered:bool = True):
     if other == None:
       self._gadget = None
@@ -73,89 +84,123 @@ class compact_tuple_sketch(tuple_sketch):
       self._gadget = _compact_tuple_sketch(other, ordered)
 
   def serialize(self, serde:PyObjectSerDe):
+    """Serializes the sketch into a bytes object with the provided SerDe."""
     return self._gadget.serialize(serde)
 
-  # TODO: define seed from constant
-  @staticmethod
-  def deserialize(data:bytes, serde:PyObjectSerDe, seed:int=9001):
-    cpp_sk = _compact_tuple_sketch.deserialize(data, serde, seed)
-    # TODO: this seems inefficinet -- is there some sort of _wrap()
-    # approach that might work better?
-    sk = compact_tuple_sketch(None, True)
-    sk._gadget = cpp_sk
-    return sk
+  @classmethod
+  def from_theta_sketch(cls, sketch:theta_sketch, summary, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Creates a compact Tuple Sketch from a Theta Sketch using a fixed summary value."""
+    self = cls.__new__(cls)
+    self._gadget = _compact_tuple_sketch(sketch, summary, seed)
+    return self
+
+  @classmethod
+  def deserialize(cls, data:bytes, serde:PyObjectSerDe, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Reads a bytes object and uses the provided SerDe to return the corresponding compact_tuple_sketch."""
+    self = cls.__new__(cls)
+    self._gadget = _compact_tuple_sketch.deserialize(data, serde, seed)
+    return self
 
 
 class update_tuple_sketch(tuple_sketch):
-  # TODO: define seed from constant
-  def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = 9001):
+  """An instance of a Tuple Sketch that is available for updates. Requires a Policy object to handle Summary values."""
+
+  def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
     self._policy = policy
     self._gadget = _update_tuple_sketch(self._policy, lg_k, p, seed)
 
-  # TODO: do we need multiple update formats?
-  def update(self, datum, summary):
-    self._gadget.update(datum, summary)
+  def update(self, datum, value):
+    """Updates the sketch with the provided item and summary value."""
+    self._gadget.update(datum, value)
 
   def compact(self, ordered:bool = True) -> compact_tuple_sketch:
+    """Returns a compacted form of the sketch, optionally sorting it."""
     return self._gadget.compact(ordered)
 
+  def reset(self):
+    """Resets the sketch to the initial empty state."""
+    self._gadget.reset()
+
+
 class tuple_union:
-  # TODO: define seed from constant
-  def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = 9001):
+  """An object that can merge Tuple Sketches. Requires a Policy object to handle merging Summaries."""
+  _policy: TuplePolicy
+
+  def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
     self._policy = policy
     self._gadget = _tuple_union(self._policy, lg_k, p, seed)
 
   def update(self, sketch:tuple_sketch):
-      self._gadget.update(sketch._gadget)
+    """Updates the union with the given sketch."""
+    self._gadget.update(sketch._gadget)
 
   def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
-    sk = compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
-    return sk
+    """Returns the sketch corresponding to the union result, optionally sorted."""
+    return compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
 
   def reset(self):
+    """Resets the union to the initial empty state."""
     self._gadget.reset()
 
 
 class tuple_intersection:
-  # TODO: define seed from constant
-  def __init__(self, policy:TuplePolicy, seed:int = 9001):
+  """An object that can intersect Tuple Sketches. Requires a Policy object to handle intersecting Summaries."""
+  _policy: TuplePolicy
+
+  def __init__(self, policy:TuplePolicy, seed:int = _tuple_sketch.DEFAULT_SEED):
     self._policy = policy
     self._gadget = _tuple_intersection(self._policy, seed)
 
   def update(self, sketch:tuple_sketch):
+    """Intersects the provided sketch with the current intersection state."""
     self._gadget.update(sketch._gadget)
 
   def has_result(self) -> bool:
+    """Returns True if the intersection has a valid result, otherwise False."""
     return self._gadget.has_result()
 
   def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
-    sk = compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
-    return sk
+    """Returns the sketch corresponding to the intersection result, optionally sorted."""
+    return compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
 
 
 class tuple_a_not_b:
-  def __init__(self, seed:int = 9001):
+  """An object that can perform the A-not-B operation between two sketches."""
+  def __init__(self, seed:int = _tuple_sketch.DEFAULT_SEED):
     self._gadget = _tuple_a_not_b(seed)
   
   def compute(self, a:tuple_sketch, b:tuple_sketch, ordered:bool=True) -> compact_tuple_sketch:
-    sk = compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget))
-    return sk
+    """Returns a sketch with the result of applying the A-not-B operation on the given inputs."""
+    return compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget))
 
 
 class tuple_jaccard_similarity:
-  # TODO: define seed from constant
   @staticmethod  
-  def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=9001):
+  def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches."""
     return _tuple_jaccard_similarity.jaccard(a._gadget, b._gadget, seed)
 
   @staticmethod
-  def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=9001):
+  def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Returns True if sketches a and b are equivalent, otherwise False."""
     return _tuple_jaccard_similarity.exactly_equal(a._gadget, b._gadget, seed)
 
   @staticmethod
-  def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=9001):
+  def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Tests similarity of an actual sketch against an expected sketch.
+
+    Computes the lower bound of the Jaccard index J_{LB} of the actual and expected sketches.
+    If J_{LB} >= threshold, then the sketches are considered to be similar with a confidence of
+    97.7% and returns True, otherwise False.
+    """
     return _tuple_jaccard_similarity.similarity_test(actual._gadget, expected._gadget, threshold, seed)
 
   @staticmethod
-  def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=9001):
+  def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Tests dissimilarity of an actual sketch against an expected sketch.
+
+    Computes the upper bound of the Jaccard index J_{UB} of the actual and expected sketches.
+    If J_{UB} <= threshold, then the sketches are considered to be dissimilar with a confidence of
+    97.7% and returns True, otherwise False.
+    """
     return _tuple_jaccard_similarity.dissimilarity_test(actual._gadget, expected._gadget, threshold, seed)
diff --git a/python/datasketches/__init__.py b/python/datasketches/__init__.py
index 71b2d2c..fb7636e 100644
--- a/python/datasketches/__init__.py
+++ b/python/datasketches/__init__.py
@@ -15,6 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 
+"""The Apache DataSketches Library for Python
+
+Provided under the Apache License, Version 2.0
+<http://www.apache.org/licenses/LICENSE-2.0>
+"""
+
 name = 'datasketches'
 
 from _datasketches import *
@@ -27,8 +33,4 @@ from .TuplePolicy import *
 # the C++ object. Currently, the native python portion of
 # a class derived from a C++ class may be garbage collected
 # even though a pointer to the C++ portion remains valid.
-#
-# These wrappers should exactly implement the target API
-# for the pybind11 interface so they can be removed if
-# that issue is ever fixed.
 from .TupleWrapper import *
diff --git a/python/src/tuple_wrapper.cpp b/python/src/tuple_wrapper.cpp
index 027623b..88a9f71 100644
--- a/python/src/tuple_wrapper.cpp
+++ b/python/src/tuple_wrapper.cpp
@@ -47,13 +47,16 @@ void init_tuple(py::module &m) {
     .def("__call__", &tuple_policy::operator(), py::arg("summary"), py::arg("update"))
   ;
 
-  // only needed temporarily -- can remove once everything is working
+  // potentially useful for debugging but not needed as a permanent
+  // object type in the library
+  /*
   py::class_<tuple_policy_holder>(m, "TuplePolicyHolder")
     .def(py::init<std::shared_ptr<tuple_policy>>(), py::arg("policy"))
     .def("create", &tuple_policy_holder::create, "Creates a new Summary object")
     .def("update", &tuple_policy_holder::update, py::arg("summary"), py::arg("update"),
          "Updates the provided summary using the data in update")
   ;
+  */
 
   using py_tuple_sketch = tuple_sketch<py::object>;
   using py_update_tuple = update_tuple_sketch<py::object, py::object, tuple_policy_holder>;
@@ -89,6 +92,7 @@ void init_tuple(py::module &m) {
     .def("is_ordered", &py_tuple_sketch::is_ordered,
          "Returns True if the sketch entries are sorted, otherwise False")
     .def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
+    .def_property_readonly_static("DEFAULT_SEED", [](py::object /* self */) { return DEFAULT_SEED; });
   ;
 
   py::class_<py_compact_tuple, py_tuple_sketch>(m, "_compact_tuple_sketch")
@@ -123,16 +127,17 @@ void init_tuple(py::module &m) {
     )
     .def(py::init<const py_update_tuple&>())
     .def("update", static_cast<void (py_update_tuple::*)(int64_t, py::object&)>(&py_update_tuple::update),
-         py::arg("datum"), py::arg("summary"),
-         "Updates the sketch with the given integral value")
+         py::arg("datum"), py::arg("value"),
+         "Updates the sketch with the given integral item and summary value")
     .def("update", static_cast<void (py_update_tuple::*)(double, py::object&)>(&py_update_tuple::update),
-         py::arg("datum"), py::arg("summary"),
-         "Updates the sketch with the given floating point value")
+         py::arg("datum"), py::arg("value"),
+         "Updates the sketch with the given floating point item and summary value")
     .def("update", static_cast<void (py_update_tuple::*)(const std::string&, py::object&)>(&py_update_tuple::update),
-         py::arg("datum"), py::arg("summary"),
-         "Updates the sketch with the given string")
+         py::arg("datum"), py::arg("value"),
+         "Updates the sketch with the given string item and summary value")
     .def("compact", &py_update_tuple::compact, py::arg("ordered")=true,
          "Returns a compacted form of the sketch, optionally sorting it")
+    .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state")
   ;
 
   py::class_<py_tuple_union>(m, "_tuple_union")
@@ -159,7 +164,7 @@ void init_tuple(py::module &m) {
         }),
         py::arg("policy"), py::arg("seed")=DEFAULT_SEED)
     .def("update", &py_tuple_intersection::update<const py_tuple_sketch&>, py::arg("sketch"),
-         "Intersections the provided sketch with the current intersection state")
+         "Intersects the provided sketch with the current intersection state")
     .def("get_result", &py_tuple_intersection::get_result, py::arg("ordered")=true,
          "Returns the sketch corresponding to the intersection result")
     .def("has_result", &py_tuple_intersection::has_result,
@@ -195,14 +200,14 @@ void init_tuple(py::module &m) {
         "similarity_test",
         &py_tuple_jaccard_similarity::similarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
         py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
-        "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
+        "Tests similarity of an actual sketch against an expected sketch. Computes the lower bound of the Jaccard "
         "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
         "to be similar sith a confidence of 97.7% and returns True, otherwise False.")
     .def_static(
         "dissimilarity_test",
         &py_tuple_jaccard_similarity::dissimilarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
         py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
-        "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
+        "Tests dissimilarity of an actual sketch against an expected sketch. Computes the upper bound of the Jaccard "
         "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
         "to be dissimilar sith a confidence of 97.7% and returns True, otherwise False."
     )
diff --git a/python/tests/tuple_test.py b/python/tests/tuple_test.py
index 60732e7..0e94e2e 100644
--- a/python/tests/tuple_test.py
+++ b/python/tests/tuple_test.py
@@ -32,7 +32,7 @@ class TupleTest(unittest.TestCase):
 
         # create a sketch and inject some values -- summary is 2 so we can sum them
         # and know the reuslt
-        sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=2)
+        sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=2)
 
         # we can check that the upper and lower bounds bracket the
         # estimate, without needing to know the exact value.
@@ -84,8 +84,8 @@ class TupleTest(unittest.TestCase):
         offset = int(3 * n / 4) # it's a float w/o cast
 
         # create a couple sketches and inject some values, with different summaries
-        sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=5)
-        sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=7, offset=offset)
+        sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=5)
+        sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=7, offset=offset)
 
         # UNIONS
         # create a union object
@@ -195,11 +195,11 @@ class TupleTest(unittest.TestCase):
         self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7))
 
 
-    # Generates a basic tuple sketch using a fixed integer summary of 2
-    def generate_tuple_sketch(self, policy, n, k, summary, offset=0):
+    # Generates a basic tuple sketch with a fixed value for each update
+    def generate_tuple_sketch(self, policy, n, k, value, offset=0):
       sk = update_tuple_sketch(policy, k)
       for i in range(0, n):
-        sk.update(i + offset, summary)
+        sk.update(i + offset, value)
       return sk
         
 if __name__ == '__main__':


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org