You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2023/01/28 00:58:25 UTC
[datasketches-cpp] 01/01: Clean up compact tuple creation, add docstrings, define default seed as a named constant
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch py_tuple
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit 4afb55d65a440943c3df199176020f0e7830077d
Author: Jon Malkin <78...@users.noreply.github.com>
AuthorDate: Fri Jan 27 16:58:08 2023 -0800
Clean up compact tuple creation, add docstrings, define default seed as a named constant
---
python/datasketches/TupleWrapper.py | 113 +++++++++++++++++++++++++-----------
python/datasketches/__init__.py | 10 ++--
python/src/tuple_wrapper.cpp | 25 ++++----
python/tests/tuple_test.py | 12 ++--
4 files changed, 106 insertions(+), 54 deletions(-)
diff --git a/python/datasketches/TupleWrapper.py b/python/datasketches/TupleWrapper.py
index cb32802..155016f 100644
--- a/python/datasketches/TupleWrapper.py
+++ b/python/datasketches/TupleWrapper.py
@@ -21,51 +21,62 @@ from .TuplePolicy import TuplePolicy
from _datasketches import _tuple_sketch, _compact_tuple_sketch, _update_tuple_sketch
from _datasketches import _tuple_union, _tuple_intersection
from _datasketches import _tuple_a_not_b, _tuple_jaccard_similarity
-from _datasketches import PyObjectSerDe
+from _datasketches import PyObjectSerDe, theta_sketch
class tuple_sketch(ABC):
+ """An abstract base class representing a Tuple Sketch."""
_gadget: _tuple_sketch
def __str__(self, print_items:bool=False):
return self._gadget.to_string(print_items)
def is_empty(self):
+ """Returns True if the sketch is empty, otherwise False."""
return self._gadget.is_empty()
def get_estimate(self):
+ """Returns an estimate of the distinct count of the input stream."""
return self._gadget.get_estimate()
def get_upper_bound(self, num_std_devs:int):
+ """Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}."""
return self._gadget.get_upper_bound(num_std_devs)
def get_lower_bound(self, num_std_devs:int):
+ """Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}."""
return self._gadget.get_lower_bound(num_std_devs)
def is_estimation_mode(self):
+ """Returns True if the sketch is in estimation mode, otherwise False."""
return self._gadget.is_estimation_mode()
def get_theta(self):
+ """Returns theta (the effective sampling rate) as a fraction from 0 to 1."""
return self._gadget.get_theta()
def get_theta64(self):
+ """Returns theta as a 64-bit integer value."""
return self._gadget.get_theta64()
def get_num_retained(self):
+ """Returns the number of items currently in the sketch."""
return self._gadget.get_num_retained()
def get_seed_hash(self):
+ """Returns a hash of the seed used in the sketch."""
return self._gadget.get_seed_hash()
def is_ordered(self):
+ """Returns True if the sketch entries are sorted, otherwise False."""
return self._gadget.is_ordered()
def __iter__(self):
return self._gadget.__iter__()
- #.def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
-
class compact_tuple_sketch(tuple_sketch):
+ """An instance of a Tuple Sketch that has been compacted and can no longer accept updates."""
+
def __init__(self, other:tuple_sketch, ordered:bool = True):
if other == None:
self._gadget = None
@@ -73,89 +84,123 @@ class compact_tuple_sketch(tuple_sketch):
self._gadget = _compact_tuple_sketch(other, ordered)
def serialize(self, serde:PyObjectSerDe):
+ """Serializes the sketch into a bytes object with the provided SerDe."""
return self._gadget.serialize(serde)
- # TODO: define seed from constant
- @staticmethod
- def deserialize(data:bytes, serde:PyObjectSerDe, seed:int=9001):
- cpp_sk = _compact_tuple_sketch.deserialize(data, serde, seed)
- # TODO: this seems inefficinet -- is there some sort of _wrap()
- # approach that might work better?
- sk = compact_tuple_sketch(None, True)
- sk._gadget = cpp_sk
- return sk
+ @classmethod
+ def from_theta_sketch(cls, sketch:theta_sketch, summary, seed:int=_tuple_sketch.DEFAULT_SEED):
+ """Creates a compact Tuple Sketch from a Theta Sketch using a fixed summary value."""
+ self = cls.__new__(cls)
+ self._gadget = _compact_tuple_sketch(sketch, summary, seed)
+ return self
+
+ @classmethod
+ def deserialize(cls, data:bytes, serde:PyObjectSerDe, seed:int=_tuple_sketch.DEFAULT_SEED):
+ """Reads a bytes object and uses the provided SerDe to return the corresponding compact_tuple_sketch."""
+ self = cls.__new__(cls)
+ self._gadget = _compact_tuple_sketch.deserialize(data, serde, seed)
+ return self
class update_tuple_sketch(tuple_sketch):
- # TODO: define seed from constant
- def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = 9001):
+ """An instance of a Tuple Sketch that is available for updates. Requires a Policy object to handle Summary values."""
+
+ def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
self._policy = policy
self._gadget = _update_tuple_sketch(self._policy, lg_k, p, seed)
- # TODO: do we need multiple update formats?
- def update(self, datum, summary):
- self._gadget.update(datum, summary)
+ def update(self, datum, value):
+ """Updates the sketch with the provided item and summary value."""
+ self._gadget.update(datum, value)
def compact(self, ordered:bool = True) -> compact_tuple_sketch:
+ """Returns a compacted form of the sketch, optionally sorting it."""
return self._gadget.compact(ordered)
+ def reset(self):
+ """Resets the sketch to the initial empty state."""
+ self._gadget.reset()
+
+
class tuple_union:
- # TODO: define seed from constant
- def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = 9001):
+ """An object that can merge Tuple Sketches. Requires a Policy object to handle merging Summaries."""
+ _policy: TuplePolicy
+
+ def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
self._policy = policy
self._gadget = _tuple_union(self._policy, lg_k, p, seed)
def update(self, sketch:tuple_sketch):
- self._gadget.update(sketch._gadget)
+ """Updates the union with the given sketch."""
+ self._gadget.update(sketch._gadget)
def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
- sk = compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
- return sk
+ """Returns the sketch corresponding to the union result, optionally sorted."""
+ return compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
def reset(self):
+ """Resets the union to the initial empty state."""
self._gadget.reset()
class tuple_intersection:
- # TODO: define seed from constant
- def __init__(self, policy:TuplePolicy, seed:int = 9001):
+ """An object that can intersect Tuple Sketches. Requires a Policy object to handle intersecting Summaries."""
+ _policy: TuplePolicy
+
+ def __init__(self, policy:TuplePolicy, seed:int = _tuple_sketch.DEFAULT_SEED):
self._policy = policy
self._gadget = _tuple_intersection(self._policy, seed)
def update(self, sketch:tuple_sketch):
+ """Intersects the provided sketch with the current intersection state."""
self._gadget.update(sketch._gadget)
def has_result(self) -> bool:
+ """Returns True if the intersection has a valid result, otherwise False."""
return self._gadget.has_result()
def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
- sk = compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
- return sk
+ """Returns the sketch corresponding to the intersection result, optionally sorted."""
+ return compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
class tuple_a_not_b:
- def __init__(self, seed:int = 9001):
+ """An object that can perform the A-not-B operation between two sketches."""
+ def __init__(self, seed:int = _tuple_sketch.DEFAULT_SEED):
self._gadget = _tuple_a_not_b(seed)
def compute(self, a:tuple_sketch, b:tuple_sketch, ordered:bool=True) -> compact_tuple_sketch:
- sk = compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget))
- return sk
+ """Returns a sketch with the result of applying the A-not-B operation on the given inputs."""
+ return compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget))
class tuple_jaccard_similarity:
- # TODO: define seed from constant
@staticmethod
- def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=9001):
+ def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
+ """Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches."""
return _tuple_jaccard_similarity.jaccard(a._gadget, b._gadget, seed)
@staticmethod
- def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=9001):
+ def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
+ """Returns True if sketch_a and sketch_b are equivalent, otherwise False."""
return _tuple_jaccard_similarity.exactly_equal(a._gadget, b._gadget, seed)
@staticmethod
- def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=9001):
+ def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
+ """Tests similarity of an actual sketch against an expected sketch.
+
+ Computes the lower bound of the Jaccard index J_{LB} of the actual and expected sketches.
+ If J_{LB} >= threshold, then the sketches are considered to be similar with a confidence of
+ 97.7% and returns True, otherwise False.
+ """
return _tuple_jaccard_similarity.similarity_test(actual._gadget, expected._gadget, threshold, seed)
@staticmethod
- def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=9001):
+ def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
+ """Tests dissimilarity of an actual sketch against an expected sketch.
+
+ Computes the upper bound of the Jaccard index J_{UB} of the actual and expected sketches.
+ If J_{UB} <= threshold, then the sketches are considered to be dissimilar with a confidence of
+ 97.7% and returns True, otherwise False.
+ """
return _tuple_jaccard_similarity.dissimilarity_test(actual._gadget, expected._gadget, threshold, seed)
diff --git a/python/datasketches/__init__.py b/python/datasketches/__init__.py
index 71b2d2c..fb7636e 100644
--- a/python/datasketches/__init__.py
+++ b/python/datasketches/__init__.py
@@ -15,6 +15,12 @@
# specific language governing permissions and limitations
# under the License.
+"""The Apache DataSketches Library for Python
+
+Provided under the Apache License, Version 2.0
+<http://www.apache.org/licenses/LICENSE-2.0>
+"""
+
name = 'datasketches'
from _datasketches import *
@@ -27,8 +33,4 @@ from .TuplePolicy import *
# the C++ object. Currently, the native python portion of
# a class derived from a C++ class may be garbage collected
# even though a pointer to the C++ portion remains valid.
-#
-# These wrappers should exactly implement the target API
-# for the pybind11 interface so they can be removed if
-# that issue is ever fixed.
from .TupleWrapper import *
diff --git a/python/src/tuple_wrapper.cpp b/python/src/tuple_wrapper.cpp
index 027623b..88a9f71 100644
--- a/python/src/tuple_wrapper.cpp
+++ b/python/src/tuple_wrapper.cpp
@@ -47,13 +47,16 @@ void init_tuple(py::module &m) {
.def("__call__", &tuple_policy::operator(), py::arg("summary"), py::arg("update"))
;
- // only needed temporarily -- can remove once everything is working
+ // potentially useful for debugging but not needed as a permanent
+ // object type in the library
+ /*
py::class_<tuple_policy_holder>(m, "TuplePolicyHolder")
.def(py::init<std::shared_ptr<tuple_policy>>(), py::arg("policy"))
.def("create", &tuple_policy_holder::create, "Creates a new Summary object")
.def("update", &tuple_policy_holder::update, py::arg("summary"), py::arg("update"),
"Updates the provided summary using the data in update")
;
+ */
using py_tuple_sketch = tuple_sketch<py::object>;
using py_update_tuple = update_tuple_sketch<py::object, py::object, tuple_policy_holder>;
@@ -89,6 +92,7 @@ void init_tuple(py::module &m) {
.def("is_ordered", &py_tuple_sketch::is_ordered,
"Returns True if the sketch entries are sorted, otherwise False")
.def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
+ .def_property_readonly_static("DEFAULT_SEED", [](py::object /* self */) { return DEFAULT_SEED; });
;
py::class_<py_compact_tuple, py_tuple_sketch>(m, "_compact_tuple_sketch")
@@ -123,16 +127,17 @@ void init_tuple(py::module &m) {
)
.def(py::init<const py_update_tuple&>())
.def("update", static_cast<void (py_update_tuple::*)(int64_t, py::object&)>(&py_update_tuple::update),
- py::arg("datum"), py::arg("summary"),
- "Updates the sketch with the given integral value")
+ py::arg("datum"), py::arg("value"),
+ "Updates the sketch with the given integral item and summary value")
.def("update", static_cast<void (py_update_tuple::*)(double, py::object&)>(&py_update_tuple::update),
- py::arg("datum"), py::arg("summary"),
- "Updates the sketch with the given floating point value")
+ py::arg("datum"), py::arg("value"),
+ "Updates the sketch with the given floating point item and summary value")
.def("update", static_cast<void (py_update_tuple::*)(const std::string&, py::object&)>(&py_update_tuple::update),
- py::arg("datum"), py::arg("summary"),
- "Updates the sketch with the given string")
+ py::arg("datum"), py::arg("value"),
+ "Updates the sketch with the given string item and summary value")
.def("compact", &py_update_tuple::compact, py::arg("ordered")=true,
"Returns a compacted form of the sketch, optionally sorting it")
+ .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state")
;
py::class_<py_tuple_union>(m, "_tuple_union")
@@ -159,7 +164,7 @@ void init_tuple(py::module &m) {
}),
py::arg("policy"), py::arg("seed")=DEFAULT_SEED)
.def("update", &py_tuple_intersection::update<const py_tuple_sketch&>, py::arg("sketch"),
- "Intersections the provided sketch with the current intersection state")
+ "Intersects the provided sketch with the current intersection state")
.def("get_result", &py_tuple_intersection::get_result, py::arg("ordered")=true,
"Returns the sketch corresponding to the intersection result")
.def("has_result", &py_tuple_intersection::has_result,
@@ -195,14 +200,14 @@ void init_tuple(py::module &m) {
"similarity_test",
&py_tuple_jaccard_similarity::similarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
- "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
+ "Tests similarity of an actual sketch against an expected sketch. Computes the lower bound of the Jaccard "
"index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
"to be similar sith a confidence of 97.7% and returns True, otherwise False.")
.def_static(
"dissimilarity_test",
&py_tuple_jaccard_similarity::dissimilarity_test<const py_tuple_sketch&, const py_tuple_sketch&>,
py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
- "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
+ "Tests dissimilarity of an actual sketch against an expected sketch. Computes the upper bound of the Jaccard "
"index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
"to be dissimilar sith a confidence of 97.7% and returns True, otherwise False."
)
diff --git a/python/tests/tuple_test.py b/python/tests/tuple_test.py
index 60732e7..0e94e2e 100644
--- a/python/tests/tuple_test.py
+++ b/python/tests/tuple_test.py
@@ -32,7 +32,7 @@ class TupleTest(unittest.TestCase):
# create a sketch and inject some values -- summary is 2 so we can sum them
# and know the reuslt
- sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=2)
+ sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=2)
# we can check that the upper and lower bounds bracket the
# estimate, without needing to know the exact value.
@@ -84,8 +84,8 @@ class TupleTest(unittest.TestCase):
offset = int(3 * n / 4) # it's a float w/o cast
# create a couple sketches and inject some values, with different summaries
- sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=5)
- sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=7, offset=offset)
+ sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=5)
+ sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=7, offset=offset)
# UNIONS
# create a union object
@@ -195,11 +195,11 @@ class TupleTest(unittest.TestCase):
self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7))
- # Generates a basic tuple sketch using a fixed integer summary of 2
- def generate_tuple_sketch(self, policy, n, k, summary, offset=0):
+ # Generates a basic tuple sketch with a fixed value for each update
+ def generate_tuple_sketch(self, policy, n, k, value, offset=0):
sk = update_tuple_sketch(policy, k)
for i in range(0, n):
- sk.update(i + offset, summary)
+ sk.update(i + offset, value)
return sk
if __name__ == '__main__':
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org