You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2023/01/28 00:58:24 UTC

[datasketches-cpp] branch py_tuple created (now 4afb55d)

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a change to branch py_tuple
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


      at 4afb55d  Clean up compact tuple creation, add docstrings, define default seed as a named constant

This branch includes the following new commits:

     new 4afb55d  Clean up compact tuple creation, add docstrings, define default seed as a named constant

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[datasketches-cpp] 01/01: Clean up compact tuple creation, add docstrings, define default seed as a named constant

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch py_tuple
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 4afb55d65a440943c3df199176020f0e7830077d
Author: Jon Malkin <78...@users.noreply.github.com>
AuthorDate: Fri Jan 27 16:58:08 2023 -0800

    Clean up compact tuple creation, add docstrings, define default seed as a named constant
---
 python/datasketches/TupleWrapper.py | 113 +++++++++++++++++++++++++-----------
 python/datasketches/__init__.py     |  10 ++--
 python/src/tuple_wrapper.cpp        |  25 ++++----
 python/tests/tuple_test.py          |  12 ++--
 4 files changed, 106 insertions(+), 54 deletions(-)

diff --git a/python/datasketches/TupleWrapper.py b/python/datasketches/TupleWrapper.py
index cb32802..155016f 100644
--- a/python/datasketches/TupleWrapper.py
+++ b/python/datasketches/TupleWrapper.py
@@ -21,51 +21,62 @@ from .TuplePolicy import TuplePolicy
 from _datasketches import _tuple_sketch, _compact_tuple_sketch, _update_tuple_sketch
 from _datasketches import  _tuple_union, _tuple_intersection
 from _datasketches import _tuple_a_not_b, _tuple_jaccard_similarity
-from _datasketches import PyObjectSerDe
+from _datasketches import PyObjectSerDe, theta_sketch
 
 class tuple_sketch(ABC):
+  """An abstract base class representing a Tuple Sketch."""
   _gadget: _tuple_sketch
 
   def __str__(self, print_items:bool=False):
     return self._gadget.to_string(print_items)
 
   def is_empty(self):
+    """Returns True if the sketch is empty, otherwise False."""
     return self._gadget.is_empty()
 
   def get_estimate(self):
+    """Returns an estimate of the distinct count of the input stream."""
     return self._gadget.get_estimate()
 
   def get_upper_bound(self, num_std_devs:int):
+    """Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}."""
     return self._gadget.get_upper_bound(num_std_devs)
 
   def get_lower_bound(self, num_std_devs:int):
+    """Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}."""
     return self._gadget.get_lower_bound(num_std_devs)
 
   def is_estimation_mode(self):
+    """Returns True if the sketch is in estimation mode, otherwise False."""
     return self._gadget.is_estimation_mode()
 
   def get_theta(self):
+    """Returns theta (the effective sampling rate) as a fraction from 0 to 1."""
     return self._gadget.get_theta()
 
   def get_theta64(self):
+    """Returns theta as a 64-bit integer value."""
     return self._gadget.get_theta64()
 
   def get_num_retained(self):
+    """Returns the number of items currently in the sketch."""
     return self._gadget.get_num_retained()
 
   def get_seed_hash(self):
+    """Returns a hash of the seed used in the sketch."""
     return self._gadget.get_seed_hash()
 
   def is_ordered(self):
+    """Returns True if the sketch entries are sorted, otherwise False."""
     return self._gadget.is_ordered()
 
   def __iter__(self):
     return self._gadget.__iter__()
 
-  #.def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
-
 
 class compact_tuple_sketch(tuple_sketch):
+  """An instance of a Tuple Sketch that has been compacted and can no longer accept updates."""
+
   def __init__(self, other:tuple_sketch, ordered:bool = True):
     if other == None:
       self._gadget = None
@@ -73,89 +84,123 @@ class compact_tuple_sketch(tuple_sketch):
       self._gadget = _compact_tuple_sketch(other, ordered)
 
   def serialize(self, serde:PyObjectSerDe):
+    """Serializes the sketch into a bytes object with the provided SerDe."""
     return self._gadget.serialize(serde)
 
-  # TODO: define seed from constant
-  @staticmethod
-  def deserialize(data:bytes, serde:PyObjectSerDe, seed:int=9001):
-    cpp_sk = _compact_tuple_sketch.deserialize(data, serde, seed)
-    # TODO: this seems inefficinet -- is there some sort of _wrap()
-    # approach that might work better?
-    sk = compact_tuple_sketch(None, True)
-    sk._gadget = cpp_sk
-    return sk
+  @classmethod
+  def from_theta_sketch(cls, sketch:theta_sketch, summary, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Creates a compact Tuple Sketch from a Theta Sketch using a fixed summary value."""
+    self = cls.__new__(cls)
+    self._gadget = _compact_tuple_sketch(sketch, summary, seed)
+    return self
+
+  @classmethod
+  def deserialize(cls, data:bytes, serde:PyObjectSerDe, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Reads a bytes object and uses the provided SerDe to return the corresponding compact_tuple_sketch."""
+    self = cls.__new__(cls)
+    self._gadget = _compact_tuple_sketch.deserialize(data, serde, seed)
+    return self
 
 
 class update_tuple_sketch(tuple_sketch):
-  # TODO: define seed from constant
-  def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = 9001):
+  """An instance of a Tuple Sketch that is available for updates. Requires a Policy object to handle Summary values."""
+
+  def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
     self._policy = policy
     self._gadget = _update_tuple_sketch(self._policy, lg_k, p, seed)
 
-  # TODO: do we need multiple update formats?
-  def update(self, datum, summary):
-    self._gadget.update(datum, summary)
+  def update(self, datum, value):
+    """Updates the sketch with the provided item and summary value."""
+    self._gadget.update(datum, value)
 
   def compact(self, ordered:bool = True) -> compact_tuple_sketch:
+    """Returns a compacted form of the sketch, optionally sorting it."""
     return self._gadget.compact(ordered)
 
+  def reset(self):
+    """Resets the sketch to the initial empty state."""
+    self._gadget.reset()
+
+
 class tuple_union:
-  # TODO: define seed from constant
-  def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = 9001):
+  """An object that can merge Tuple Sketches. Requires a Policy object to handle merging Summaries."""
+  _policy: TuplePolicy
+
+  def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
     self._policy = policy
     self._gadget = _tuple_union(self._policy, lg_k, p, seed)
 
   def update(self, sketch:tuple_sketch):
-      self._gadget.update(sketch._gadget)
+    """Updates the union with the given sketch."""
+    self._gadget.update(sketch._gadget)
 
   def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
-    sk = compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
-    return sk
+    """Returns the sketch corresponding to the union result, optionally sorted."""
+    return compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
 
   def reset(self):
+    """Resets the union to the initial empty state."""
     self._gadget.reset()
 
 
 class tuple_intersection:
-  # TODO: define seed from constant
-  def __init__(self, policy:TuplePolicy, seed:int = 9001):
+  """An object that can intersect Tuple Sketches. Requires a Policy object to handle intersecting Summaries."""
+  _policy: TuplePolicy
+
+  def __init__(self, policy:TuplePolicy, seed:int = _tuple_sketch.DEFAULT_SEED):
     self._policy = policy
     self._gadget = _tuple_intersection(self._policy, seed)
 
   def update(self, sketch:tuple_sketch):
+    """Intersects the provided sketch with the current intersection state."""
     self._gadget.update(sketch._gadget)
 
   def has_result(self) -> bool:
+    """Returns True if the intersection has a valid result, otherwise False."""
     return self._gadget.has_result()
 
   def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
-    sk = compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
-    return sk
+    """Returns the sketch corresponding to the intersection result, optionally sorted."""
+    return compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
 
 
 class tuple_a_not_b:
-  def __init__(self, seed:int = 9001):
+  """An object that can perform the A-not-B operation between two sketches."""
+  def __init__(self, seed:int = _tuple_sketch.DEFAULT_SEED):
     self._gadget = _tuple_a_not_b(seed)
   
   def compute(self, a:tuple_sketch, b:tuple_sketch, ordered:bool=True) -> compact_tuple_sketch:
-    sk = compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget))
-    return sk
+    """Returns a sketch with the result of applying the A-not-B operation on the given inputs."""
+    return compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget))
 
 
 class tuple_jaccard_similarity:
-  # TODO: define seed from constant
   @staticmethod  
-  def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=9001):
+  def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches."""
     return _tuple_jaccard_similarity.jaccard(a._gadget, b._gadget, seed)
 
   @staticmethod
-  def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=9001):
+  def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Returns True if sketches a and b are equivalent, otherwise False."""
     return _tuple_jaccard_similarity.exactly_equal(a._gadget, b._gadget, seed)
 
   @staticmethod
-  def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=9001):
+  def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Tests similarity of an actual sketch against an expected sketch.
+
+    Computes the lower bound of the Jaccard index J_{LB} of the actual and expected sketches.
+    If J_{LB} >= threshold, then the sketches are considered to be similar with a confidence of
+    97.7% and returns True, otherwise False.
+    """
     return _tuple_jaccard_similarity.similarity_test(actual._gadget, expected._gadget, threshold, seed)
 
   @staticmethod
-  def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=9001):
+  def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
+    """Tests dissimilarity of an actual sketch against an expected sketch.
+
+    Computes the upper bound of the Jaccard index J_{UB} of the actual and expected sketches.
+    If J_{UB} <= threshold, then the sketches are considered to be dissimilar with a confidence of
+    97.7% and returns True, otherwise False.
+    """
     return _tuple_jaccard_similarity.dissimilarity_test(actual._gadget, expected._gadget, threshold, seed)
diff --git a/python/datasketches/__init__.py b/python/datasketches/__init__.py
index 71b2d2c..fb7636e 100644
--- a/python/datasketches/__init__.py
+++ b/python/datasketches/__init__.py
@@ -15,6 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 
+"""The Apache DataSketches Library for Python
+
+Provided under the Apache License, Version 2.0
+<http://www.apache.org/licenses/LICENSE-2.0>
+"""
+
 name = 'datasketches'
 
 from _datasketches import *
@@ -27,8 +33,4 @@ from .TuplePolicy import *
 # the C++ object. Currently, the native python portion of
 # a class derived from a C++ class may be garbage collected
 # even though a pointer to the C++ portion remains valid.
-#
-# These wrappers should exactly implement the target API
-# for the pybind11 interface so they can be removed if
-# that issue is ever fixed.
 from .TupleWrapper import *
diff --git a/python/src/tuple_wrapper.cpp b/python/src/tuple_wrapper.cpp
index 027623b..88a9f71 100644
--- a/python/src/tuple_wrapper.cpp
+++ b/python/src/tuple_wrapper.cpp
@@ -47,13 +47,16 @@ void init_tuple(py::module &m) {
     .def("__call__", &tuple_policy::operator(), py::arg("summary"), py::arg("update"))
   ;
 
-  // only needed temporarily -- can remove once everything is working
+  // potentially useful for debugging but not needed as a permanent
+  // object type in the library
+  /*
   py::class_<tuple_policy_holder>(m, "TuplePolicyHolder")
     .def(py::init<std::shared_ptr<tuple_policy>>(), py::arg("policy"))
     .def("create", &tuple_policy_holder::create, "Creates a new Summary object")
     .def("update", &tuple_policy_holder::update, py::arg("summary"), py::arg("update"),
          "Updates the provided summary using the data in update")
   ;
+  */
 
   using py_tuple_sketch = tuple_sketch<py::object>;
   using py_update_tuple = update_tuple_sketch<py::object, py::object, tuple_policy_holder>;
@@ -89,6 +92,7 @@ void init_tuple(py::module &m) {
     .def("is_ordered", &py_tuple_sketch::is_ordered,
          "Returns True if the sketch entries are sorted, otherwise False")
     .def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
+    .def_property_readonly_static("DEFAULT_SEED", [](py::object /* self */) { return DEFAULT_SEED; });
   ;
 
   py::class_<py_compact_tuple, py_tuple_sketch>(m, "_compact_tuple_sketch")
@@ -123,16 +127,17 @@ void init_tuple(py::module &m) {
     )
     .def(py::init<const py_update_tuple&>())
     .def("update", static_cast<void (py_update_tuple::*)(int64_t, py::object&)>(&py_update_tuple::update),
-         py::arg("datum"), py::arg("summary"),
-         "Updates the sketch with the given integral value")
+         py::arg("datum"), py::arg("value"),
+         "Updates the sketch with the given integral item and summary value")
     .def("update", static_cast<void (py_update_tuple::*)(double, py::object&)>(&py_update_tuple::update),
-         py::arg("datum"), py::arg("summary"),
-         "Updates the sketch with the given floating point value")
+         py::arg("datum"), py::arg("value"),
+         "Updates the sketch with the given floating point item and summary value")
     .def("update", static_cast<void (py_update_tuple::*)(const std::string&, py::object&)>(&py_update_tuple::update),
-         py::arg("datum"), py::arg("summary"),
-         "Updates the sketch with the given string")
+         py::arg("datum"), py::arg("value"),
+         "Updates the sketch with the given string item and summary value")
     .def("compact", &py_update_tuple::compact, py::arg("ordered")=true,
          "Returns a compacted form of the sketch, optionally sorting it")
+    .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state")
   ;
 
   py::class_<py_tuple_union>(m, "_tuple_union")
@@ -159,7 +164,7 @@ void init_tuple(py::module &m) {
         }),
         py::arg("policy"), py::arg("seed")=DEFAULT_SEED)
     .def("update", &py_tuple_intersection::update<const py_tuple_sketch&>, py::arg("sketch"),
-         "Intersections the provided sketch with the current intersection state")
+         "Intersects the provided sketch with the current intersection state")
     .def("get_result", &py_tuple_intersection::get_result, py::arg("ordered")=true,
          "Returns the sketch corresponding to the intersection result")
     .def("has_result", &py_tuple_intersection::has_result,
@@ -195,14 +200,14 @@ void init_tuple(py::module &m) {
         "similarity_test",
         &py_tuple_jaccard_similarity::similarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
         py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
-        "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
+        "Tests similarity of an actual sketch against an expected sketch. Computes the lower bound of the Jaccard "
         "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
         "to be similar sith a confidence of 97.7% and returns True, otherwise False.")
     .def_static(
         "dissimilarity_test",
         &py_tuple_jaccard_similarity::dissimilarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
         py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
-        "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
+        "Tests dissimilarity of an actual sketch against an expected sketch. Computes the upper bound of the Jaccard "
         "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
         "to be dissimilar sith a confidence of 97.7% and returns True, otherwise False."
     )
diff --git a/python/tests/tuple_test.py b/python/tests/tuple_test.py
index 60732e7..0e94e2e 100644
--- a/python/tests/tuple_test.py
+++ b/python/tests/tuple_test.py
@@ -32,7 +32,7 @@ class TupleTest(unittest.TestCase):
 
         # create a sketch and inject some values -- summary is 2 so we can sum them
         # and know the reuslt
-        sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=2)
+        sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=2)
 
         # we can check that the upper and lower bounds bracket the
         # estimate, without needing to know the exact value.
@@ -84,8 +84,8 @@ class TupleTest(unittest.TestCase):
         offset = int(3 * n / 4) # it's a float w/o cast
 
         # create a couple sketches and inject some values, with different summaries
-        sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=5)
-        sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=7, offset=offset)
+        sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=5)
+        sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=7, offset=offset)
 
         # UNIONS
         # create a union object
@@ -195,11 +195,11 @@ class TupleTest(unittest.TestCase):
         self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7))
 
 
-    # Generates a basic tuple sketch using a fixed integer summary of 2
-    def generate_tuple_sketch(self, policy, n, k, summary, offset=0):
+    # Generates a basic tuple sketch with a fixed value for each update
+    def generate_tuple_sketch(self, policy, n, k, value, offset=0):
       sk = update_tuple_sketch(policy, k)
       for i in range(0, n):
-        sk.update(i + offset, summary)
+        sk.update(i + offset, value)
       return sk
         
 if __name__ == '__main__':


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org