You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/12 18:08:11 UTC
(datasketches-python) 01/01: Allow compression for theta sketches
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch theta_compressed
in repository https://gitbox.apache.org/repos/asf/datasketches-python.git
commit 9dc7e7c8ba153027475544f92661ef9d78f94511
Author: Jon Malkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 12 10:07:51 2024 -0800
Allow compression for theta sketches
---
src/theta_wrapper.cpp | 8 ++++----
tests/theta_test.py | 10 ++++++++++
2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/src/theta_wrapper.cpp b/src/theta_wrapper.cpp
index 1e4eee5..2915e17 100644
--- a/src/theta_wrapper.cpp
+++ b/src/theta_wrapper.cpp
@@ -104,11 +104,11 @@ void init_theta(nb::module_ &m) {
.def("__copy__", [](const compact_theta_sketch& sk){ return compact_theta_sketch(sk); })
.def(
"serialize",
- [](const compact_theta_sketch& sk) {
- auto bytes = sk.serialize();
+ [](const compact_theta_sketch& sk, bool compress) {
+ auto bytes = compress ? sk.serialize_compressed() : sk.serialize();
return nb::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
- },
- "Serializes the sketch into a bytes object"
+ }, nb::arg("compress")=false,
+ "Serializes the sketch into a bytes object, optionally compressing the data"
)
.def_static(
"deserialize",
diff --git a/tests/theta_test.py b/tests/theta_test.py
index 8cdb2a7..1d2da0f 100644
--- a/tests/theta_test.py
+++ b/tests/theta_test.py
@@ -48,6 +48,16 @@ class ThetaTest(unittest.TestCase):
self.assertFalse(sk.is_empty())
self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
+ # can also serialize in a compressed format
+ sk_compresed_bytes = sk.compact().serialize(compress=True)
+ self.assertLess(len(sk_compresed_bytes), len(sk_bytes))
+ sk_from_compressed = compact_theta_sketch.deserialize(sk_compresed_bytes)
+
+ # compressed and non-compressed sketches should match
+ self.assertEqual(sk_from_compressed.get_estimate(), new_sk.get_estimate())
+ self.assertEqual(sk_from_compressed.get_upper_bound(1), new_sk.get_upper_bound(1))
+ self.assertEqual(sk_from_compressed.get_lower_bound(1), new_sk.get_lower_bound(1))
+
# check that printing works as expected
self.assertGreater(len(sk.to_string(True)), 0)
self.assertEqual(len(sk.__str__()), len(sk.to_string()))
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org