You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/08/21 21:20:27 UTC
[incubator-datasketches-cpp] branch tuple_sketch updated: simpler
way to convert theta sketch to tuple sketch
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
The following commit(s) were added to refs/heads/tuple_sketch by this push:
new 385fbe7 simpler way to convert theta sketch to tuple sketch
385fbe7 is described below
commit 385fbe759119f9ec2f8a33c8f544c18fb9941392
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Aug 21 14:20:16 2020 -0700
simpler way to convert theta sketch to tuple sketch
---
tuple/CMakeLists.txt | 3 ---
tuple/include/tuple_sketch.hpp | 3 +++
tuple/include/tuple_sketch_impl.hpp | 15 +++++++++++
tuple/test/CMakeLists.txt | 1 -
tuple/test/tuple_a_not_b_test.cpp | 44 ++++++++++++++++++++++++++++++
tuple/test/tuple_intersection_test.cpp | 33 +++++++++++++++++++++++
tuple/test/tuple_union_test.cpp | 49 ++++++++++++++++++++++++++++++++++
7 files changed, 144 insertions(+), 4 deletions(-)
diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt
index 2a452f7..fce73ab 100644
--- a/tuple/CMakeLists.txt
+++ b/tuple/CMakeLists.txt
@@ -44,7 +44,6 @@ list(APPEND tuple_HEADERS "include/theta_set_difference_base.hpp;include/theta_s
list(APPEND tuple_HEADERS "include/theta_sketch_experimental.hpp;include/theta_sketch_experimental_impl.hpp")
list(APPEND tuple_HEADERS "include/theta_union_experimental.hpp;include/theta_union_experimental_impl.hpp")
list(APPEND tuple_HEADERS "include/array_of_doubles_sketch.hpp;include/array_of_doubles_sketch_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_to_tuple_sketch_adapter.hpp;include/theta_to_tuple_sketch_adapter_impl.hpp")
install(TARGETS tuple
EXPORT ${PROJECT_NAME}
@@ -77,6 +76,4 @@ target_sources(tuple
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_experimental_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_sketch.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_sketch_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_to_tuple_sketch_adapter.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_to_tuple_sketch_adapter_impl.hpp
)
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index e5e000d..16ed425 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -31,6 +31,7 @@ namespace datasketches {
template<typename S, typename A> class tuple_sketch;
template<typename S, typename U, typename P, typename A> class update_tuple_sketch;
template<typename S, typename A> class compact_tuple_sketch;
+template<typename A> class theta_sketch_experimental;
template<
typename Summary,
@@ -371,6 +372,8 @@ public:
compact_tuple_sketch& operator=(const compact_tuple_sketch&) = default;
compact_tuple_sketch& operator=(compact_tuple_sketch&&) = default;
+ compact_tuple_sketch(const theta_sketch_experimental<AllocU64>& other, const Summary& summary, bool ordered = true);
+
virtual Allocator get_allocator() const;
virtual bool is_empty() const;
virtual bool is_ordered() const;
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index f99b647..a95f122 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -270,6 +270,21 @@ entries_(other.get_allocator())
}
template<typename S, typename A>
+compact_tuple_sketch<S, A>::compact_tuple_sketch(const theta_sketch_experimental<AllocU64>& other, const S& summary, bool ordered):
+is_empty_(other.is_empty()),
+is_ordered_(other.is_ordered() || ordered),
+seed_hash_(other.get_seed_hash()),
+theta_(other.get_theta64()),
+entries_(other.get_allocator())
+{
+ entries_.reserve(other.get_num_retained());
+ for (uint64_t hash: other) {
+ entries_.push_back(Entry(hash, summary));
+ }
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end(), comparator());
+}
+
+template<typename S, typename A>
A compact_tuple_sketch<S, A>::get_allocator() const {
return entries_.get_allocator();
}
diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt
index 17be15b..53a3a7e 100644
--- a/tuple/test/CMakeLists.txt
+++ b/tuple/test/CMakeLists.txt
@@ -46,5 +46,4 @@ target_sources(tuple_test
theta_sketch_experimental_test.cpp
theta_union_experimental_test.cpp
array_of_doubles_sketch_test.cpp
- mixed_union_test.cpp
)
diff --git a/tuple/test/tuple_a_not_b_test.cpp b/tuple/test/tuple_a_not_b_test.cpp
index 7e44d2e..1c56102 100644
--- a/tuple/test/tuple_a_not_b_test.cpp
+++ b/tuple/test/tuple_a_not_b_test.cpp
@@ -21,6 +21,7 @@
#include <catch.hpp>
#include <tuple_a_not_b.hpp>
+#include <theta_sketch_experimental.hpp>
namespace datasketches {
@@ -101,6 +102,49 @@ TEST_CASE("tuple a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
REQUIRE(result.get_estimate() == 500.0);
}
+// needed until promotion of experimental to replace existing theta sketch
+using update_theta_sketch = update_theta_sketch_experimental<>;
+
+TEST_CASE("mixed a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
+ auto a = update_tuple_sketch<float>::builder().build();
+ int value = 0;
+ for (int i = 0; i < 1000; i++) a.update(value++, 1);
+
+ auto b = update_theta_sketch::builder().build();
+ value = 500;
+ for (int i = 0; i < 1000; i++) b.update(value++);
+
+ tuple_a_not_b<float> a_not_b;
+
+ // unordered inputs, ordered result
+ auto result = a_not_b.compute(a, compact_tuple_sketch<float>(b, 1, false));
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE_FALSE(result.is_estimation_mode());
+ REQUIRE(result.is_ordered());
+ REQUIRE(result.get_estimate() == 500.0);
+
+ // unordered inputs, unordered result
+ result = a_not_b.compute(a, compact_tuple_sketch<float>(b, 1, false), false);
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE_FALSE(result.is_estimation_mode());
+ REQUIRE_FALSE(result.is_ordered());
+ REQUIRE(result.get_estimate() == 500.0);
+
+ // ordered inputs
+ result = a_not_b.compute(a.compact(), compact_tuple_sketch<float>(b.compact(), 1));
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE_FALSE(result.is_estimation_mode());
+ REQUIRE(result.is_ordered());
+ REQUIRE(result.get_estimate() == 500.0);
+
+ // A is ordered, so the result is ordered regardless
+ result = a_not_b.compute(a.compact(), compact_tuple_sketch<float>(b, 1, false), false);
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE_FALSE(result.is_estimation_mode());
+ REQUIRE(result.is_ordered());
+ REQUIRE(result.get_estimate() == 500.0);
+}
+
TEST_CASE("tuple a-not-b: exact mode disjoint", "[tuple_a_not_b]") {
auto a = update_tuple_sketch<float>::builder().build();
int value = 0;
diff --git a/tuple/test/tuple_intersection_test.cpp b/tuple/test/tuple_intersection_test.cpp
index 00591e3..9796cb3 100644
--- a/tuple/test/tuple_intersection_test.cpp
+++ b/tuple/test/tuple_intersection_test.cpp
@@ -21,6 +21,7 @@
#include <catch.hpp>
#include <tuple_intersection.hpp>
+#include <theta_sketch_experimental.hpp>
namespace datasketches {
@@ -135,6 +136,38 @@ TEST_CASE("tuple intersection: exact mode disjoint", "[tuple_intersection]") {
}
}
+// needed until promotion of experimental to replace existing theta sketch
+using update_theta_sketch = update_theta_sketch_experimental<>;
+
+TEST_CASE("mixed intersection: exact mode half overlap", "[tuple_intersection]") {
+ auto sketch1 = update_tuple_sketch<float>::builder().build();
+ int value = 0;
+ for (int i = 0; i < 1000; i++) sketch1.update(value++, 1);
+
+ auto sketch2 = update_theta_sketch::builder().build();
+ value = 500;
+ for (int i = 0; i < 1000; i++) sketch2.update(value++);
+
+ { // unordered
+ tuple_intersection_float intersection;
+ intersection.update(sketch1);
+ intersection.update(compact_tuple_sketch<float>(sketch2, 1, false));
+ auto result = intersection.get_result();
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE_FALSE(result.is_estimation_mode());
+ REQUIRE(result.get_estimate() == 500.0);
+ }
+ { // ordered
+ tuple_intersection_float intersection;
+ intersection.update(sketch1.compact());
+ intersection.update(compact_tuple_sketch<float>(sketch2.compact(), 1));
+ auto result = intersection.get_result();
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE_FALSE(result.is_estimation_mode());
+ REQUIRE(result.get_estimate() == 500.0);
+ }
+}
+
TEST_CASE("tuple intersection: estimation mode half overlap", "[tuple_intersection]") {
auto sketch1 = update_tuple_sketch<float>::builder().build();
int value = 0;
diff --git a/tuple/test/tuple_union_test.cpp b/tuple/test/tuple_union_test.cpp
index 00127ea..281b37c 100644
--- a/tuple/test/tuple_union_test.cpp
+++ b/tuple/test/tuple_union_test.cpp
@@ -21,6 +21,7 @@
#include <catch.hpp>
#include <tuple_union.hpp>
+#include <theta_sketch_experimental.hpp>
namespace datasketches {
@@ -36,6 +37,22 @@ TEST_CASE("tuple_union float: empty", "[tuple union]") {
REQUIRE(result.get_estimate() == 0);
}
+// needed until promotion of experimental to replace existing theta sketch
+using update_theta_sketch = update_theta_sketch_experimental<>;
+
+TEST_CASE("tupe_union float: empty theta sketch", "[tuple union]") {
+ auto update_sketch = update_theta_sketch::builder().build();
+
+ auto u = tuple_union<float>::builder().build();
+ u.update(compact_tuple_sketch<float>(update_sketch, 0));
+ auto result = u.get_result();
+// std::cout << result.to_string(true);
+ REQUIRE(result.is_empty());
+ REQUIRE(result.get_num_retained() == 0);
+ REQUIRE(!result.is_estimation_mode());
+ REQUIRE(result.get_estimate() == 0);
+}
+
TEST_CASE("tuple_union float: non-empty no retained entries", "[tuple union]") {
auto update_sketch = update_tuple_sketch<float>::builder().set_p(0.001).build();
// std::cout << update_sketch.to_string();
@@ -135,4 +152,36 @@ TEST_CASE("tuple_union float: seed mismatch", "[tuple union]") {
REQUIRE_THROWS_AS(u.update(update_sketch), std::invalid_argument);
}
+TEST_CASE("tuple_union float: full overlap with theta sketch", "[tuple union]") {
+ auto u = tuple_union<float>::builder().build();
+
+ // tuple update
+ auto update_tuple = update_tuple_sketch<float>::builder().build();
+ for (unsigned i = 0; i < 10; ++i) update_tuple.update(i, 1);
+ u.update(update_tuple);
+
+ // tuple compact
+ auto compact_tuple = update_tuple.compact();
+ u.update(compact_tuple);
+
+ // theta update
+ auto update_theta = update_theta_sketch::builder().build();
+ for (unsigned i = 0; i < 10; ++i) update_theta.update(i);
+ u.update(compact_tuple_sketch<float>(update_theta, 1));
+
+ // theta compact
+ auto compact_theta = update_theta.compact();
+ u.update(compact_tuple_sketch<float>(compact_theta, 1));
+
+ auto result = u.get_result();
+// std::cout << result.to_string(true);
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE(result.get_num_retained() == 10);
+ REQUIRE(!result.is_estimation_mode());
+ REQUIRE(result.get_estimate() == 10);
+ for (const auto& entry: result) {
+ REQUIRE(entry.second == 4);
+ }
+}
+
} /* namespace datasketches */
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org