You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/08/21 21:20:27 UTC

[incubator-datasketches-cpp] branch tuple_sketch updated: simpler way to convert theta sketch to tuple sketch

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/tuple_sketch by this push:
     new 385fbe7  simpler way to convert theta sketch to tuple sketch
385fbe7 is described below

commit 385fbe759119f9ec2f8a33c8f544c18fb9941392
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Aug 21 14:20:16 2020 -0700

    simpler way to convert theta sketch to tuple sketch
---
 tuple/CMakeLists.txt                   |  3 ---
 tuple/include/tuple_sketch.hpp         |  3 +++
 tuple/include/tuple_sketch_impl.hpp    | 15 +++++++++++
 tuple/test/CMakeLists.txt              |  1 -
 tuple/test/tuple_a_not_b_test.cpp      | 44 ++++++++++++++++++++++++++++++
 tuple/test/tuple_intersection_test.cpp | 33 +++++++++++++++++++++++
 tuple/test/tuple_union_test.cpp        | 49 ++++++++++++++++++++++++++++++++++
 7 files changed, 144 insertions(+), 4 deletions(-)

diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt
index 2a452f7..fce73ab 100644
--- a/tuple/CMakeLists.txt
+++ b/tuple/CMakeLists.txt
@@ -44,7 +44,6 @@ list(APPEND tuple_HEADERS "include/theta_set_difference_base.hpp;include/theta_s
 list(APPEND tuple_HEADERS "include/theta_sketch_experimental.hpp;include/theta_sketch_experimental_impl.hpp")
 list(APPEND tuple_HEADERS "include/theta_union_experimental.hpp;include/theta_union_experimental_impl.hpp")
 list(APPEND tuple_HEADERS "include/array_of_doubles_sketch.hpp;include/array_of_doubles_sketch_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_to_tuple_sketch_adapter.hpp;include/theta_to_tuple_sketch_adapter_impl.hpp")
 
 install(TARGETS tuple
   EXPORT ${PROJECT_NAME}
@@ -77,6 +76,4 @@ target_sources(tuple
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_experimental_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_sketch.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_sketch_impl.hpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_to_tuple_sketch_adapter.hpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_to_tuple_sketch_adapter_impl.hpp
 )
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index e5e000d..16ed425 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -31,6 +31,7 @@ namespace datasketches {
 template<typename S, typename A> class tuple_sketch;
 template<typename S, typename U, typename P, typename A> class update_tuple_sketch;
 template<typename S, typename A> class compact_tuple_sketch;
+template<typename A> class theta_sketch_experimental;
 
 template<
   typename Summary,
@@ -371,6 +372,8 @@ public:
   compact_tuple_sketch& operator=(const compact_tuple_sketch&) = default;
   compact_tuple_sketch& operator=(compact_tuple_sketch&&) = default;
 
+  compact_tuple_sketch(const theta_sketch_experimental<AllocU64>& other, const Summary& summary, bool ordered = true);
+
   virtual Allocator get_allocator() const;
   virtual bool is_empty() const;
   virtual bool is_ordered() const;
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index f99b647..a95f122 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -270,6 +270,21 @@ entries_(other.get_allocator())
 }
 
 template<typename S, typename A>
+compact_tuple_sketch<S, A>::compact_tuple_sketch(const theta_sketch_experimental<AllocU64>& other, const S& summary, bool ordered):
+is_empty_(other.is_empty()),
+is_ordered_(other.is_ordered() || ordered),
+seed_hash_(other.get_seed_hash()),
+theta_(other.get_theta64()),
+entries_(other.get_allocator())
+{
+  entries_.reserve(other.get_num_retained());
+  for (uint64_t hash: other) {
+    entries_.push_back(Entry(hash, summary));
+  }
+  if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end(), comparator());
+}
+
+template<typename S, typename A>
 A compact_tuple_sketch<S, A>::get_allocator() const {
   return entries_.get_allocator();
 }
diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt
index 17be15b..53a3a7e 100644
--- a/tuple/test/CMakeLists.txt
+++ b/tuple/test/CMakeLists.txt
@@ -46,5 +46,4 @@ target_sources(tuple_test
     theta_sketch_experimental_test.cpp
     theta_union_experimental_test.cpp
     array_of_doubles_sketch_test.cpp
-    mixed_union_test.cpp
 )
diff --git a/tuple/test/tuple_a_not_b_test.cpp b/tuple/test/tuple_a_not_b_test.cpp
index 7e44d2e..1c56102 100644
--- a/tuple/test/tuple_a_not_b_test.cpp
+++ b/tuple/test/tuple_a_not_b_test.cpp
@@ -21,6 +21,7 @@
 
 #include <catch.hpp>
 #include <tuple_a_not_b.hpp>
+#include <theta_sketch_experimental.hpp>
 
 namespace datasketches {
 
@@ -101,6 +102,49 @@ TEST_CASE("tuple a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
   REQUIRE(result.get_estimate() == 500.0);
 }
 
+// needed until promotion of experimental to replace existing theta sketch
+using update_theta_sketch = update_theta_sketch_experimental<>;
+
+TEST_CASE("mixed a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
+  auto a = update_tuple_sketch<float>::builder().build();
+  int value = 0;
+  for (int i = 0; i < 1000; i++) a.update(value++, 1);
+
+  auto b = update_theta_sketch::builder().build();
+  value = 500;
+  for (int i = 0; i < 1000; i++) b.update(value++);
+
+  tuple_a_not_b<float> a_not_b;
+
+  // unordered inputs, ordered result
+  auto result = a_not_b.compute(a, compact_tuple_sketch<float>(b, 1, false));
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE_FALSE(result.is_estimation_mode());
+  REQUIRE(result.is_ordered());
+  REQUIRE(result.get_estimate() == 500.0);
+
+  // unordered inputs, unordered result
+  result = a_not_b.compute(a, compact_tuple_sketch<float>(b, 1, false), false);
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE_FALSE(result.is_estimation_mode());
+  REQUIRE_FALSE(result.is_ordered());
+  REQUIRE(result.get_estimate() == 500.0);
+
+  // ordered inputs
+  result = a_not_b.compute(a.compact(), compact_tuple_sketch<float>(b.compact(), 1));
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE_FALSE(result.is_estimation_mode());
+  REQUIRE(result.is_ordered());
+  REQUIRE(result.get_estimate() == 500.0);
+
+  // A is ordered, so the result is ordered regardless
+  result = a_not_b.compute(a.compact(), compact_tuple_sketch<float>(b, 1, false), false);
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE_FALSE(result.is_estimation_mode());
+  REQUIRE(result.is_ordered());
+  REQUIRE(result.get_estimate() == 500.0);
+}
+
 TEST_CASE("tuple a-not-b: exact mode disjoint", "[tuple_a_not_b]") {
   auto a = update_tuple_sketch<float>::builder().build();
   int value = 0;
diff --git a/tuple/test/tuple_intersection_test.cpp b/tuple/test/tuple_intersection_test.cpp
index 00591e3..9796cb3 100644
--- a/tuple/test/tuple_intersection_test.cpp
+++ b/tuple/test/tuple_intersection_test.cpp
@@ -21,6 +21,7 @@
 
 #include <catch.hpp>
 #include <tuple_intersection.hpp>
+#include <theta_sketch_experimental.hpp>
 
 namespace datasketches {
 
@@ -135,6 +136,38 @@ TEST_CASE("tuple intersection: exact mode disjoint", "[tuple_intersection]") {
   }
 }
 
+// needed until promotion of experimental to replace existing theta sketch
+using update_theta_sketch = update_theta_sketch_experimental<>;
+
+TEST_CASE("mixed intersection: exact mode half overlap", "[tuple_intersection]") {
+  auto sketch1 = update_tuple_sketch<float>::builder().build();
+  int value = 0;
+  for (int i = 0; i < 1000; i++) sketch1.update(value++, 1);
+
+  auto sketch2 = update_theta_sketch::builder().build();
+  value = 500;
+  for (int i = 0; i < 1000; i++) sketch2.update(value++);
+
+  { // unordered
+    tuple_intersection_float intersection;
+    intersection.update(sketch1);
+    intersection.update(compact_tuple_sketch<float>(sketch2, 1, false));
+    auto result = intersection.get_result();
+    REQUIRE_FALSE(result.is_empty());
+    REQUIRE_FALSE(result.is_estimation_mode());
+    REQUIRE(result.get_estimate() == 500.0);
+  }
+  { // ordered
+    tuple_intersection_float intersection;
+    intersection.update(sketch1.compact());
+    intersection.update(compact_tuple_sketch<float>(sketch2.compact(), 1));
+    auto result = intersection.get_result();
+    REQUIRE_FALSE(result.is_empty());
+    REQUIRE_FALSE(result.is_estimation_mode());
+    REQUIRE(result.get_estimate() == 500.0);
+  }
+}
+
 TEST_CASE("tuple intersection: estimation mode half overlap", "[tuple_intersection]") {
   auto sketch1 = update_tuple_sketch<float>::builder().build();
   int value = 0;
diff --git a/tuple/test/tuple_union_test.cpp b/tuple/test/tuple_union_test.cpp
index 00127ea..281b37c 100644
--- a/tuple/test/tuple_union_test.cpp
+++ b/tuple/test/tuple_union_test.cpp
@@ -21,6 +21,7 @@
 
 #include <catch.hpp>
 #include <tuple_union.hpp>
+#include <theta_sketch_experimental.hpp>
 
 namespace datasketches {
 
@@ -36,6 +37,22 @@ TEST_CASE("tuple_union float: empty", "[tuple union]") {
   REQUIRE(result.get_estimate() == 0);
 }
 
+// needed until promotion of experimental to replace existing theta sketch
+using update_theta_sketch = update_theta_sketch_experimental<>;
+
+TEST_CASE("tupe_union float: empty theta sketch", "[tuple union]") {
+  auto update_sketch = update_theta_sketch::builder().build();
+
+  auto u = tuple_union<float>::builder().build();
+  u.update(compact_tuple_sketch<float>(update_sketch, 0));
+  auto result = u.get_result();
+//  std::cout << result.to_string(true);
+  REQUIRE(result.is_empty());
+  REQUIRE(result.get_num_retained() == 0);
+  REQUIRE(!result.is_estimation_mode());
+  REQUIRE(result.get_estimate() == 0);
+}
+
 TEST_CASE("tuple_union float: non-empty no retained entries", "[tuple union]") {
   auto update_sketch = update_tuple_sketch<float>::builder().set_p(0.001).build();
 //  std::cout << update_sketch.to_string();
@@ -135,4 +152,36 @@ TEST_CASE("tuple_union float: seed mismatch", "[tuple union]") {
   REQUIRE_THROWS_AS(u.update(update_sketch), std::invalid_argument);
 }
 
+TEST_CASE("tuple_union float: full overlap with theta sketch", "[tuple union]") {
+  auto u = tuple_union<float>::builder().build();
+
+  // tuple update
+  auto update_tuple = update_tuple_sketch<float>::builder().build();
+  for (unsigned i = 0; i < 10; ++i) update_tuple.update(i, 1);
+  u.update(update_tuple);
+
+  // tuple compact
+  auto compact_tuple = update_tuple.compact();
+  u.update(compact_tuple);
+
+  // theta update
+  auto update_theta = update_theta_sketch::builder().build();
+  for (unsigned i = 0; i < 10; ++i) update_theta.update(i);
+  u.update(compact_tuple_sketch<float>(update_theta, 1));
+
+  // theta compact
+  auto compact_theta = update_theta.compact();
+  u.update(compact_tuple_sketch<float>(compact_theta, 1));
+
+  auto result = u.get_result();
+//  std::cout << result.to_string(true);
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE(result.get_num_retained() == 10);
+  REQUIRE(!result.is_estimation_mode());
+  REQUIRE(result.get_estimate() == 10);
+  for (const auto& entry: result) {
+    REQUIRE(entry.second == 4);
+  }
+}
+
 } /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org