You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2021/05/19 21:33:15 UTC

[datasketches-cpp] branch jaccard_seed created (now 1d3b7e9)

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a change to branch jaccard_seed
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git.


      at 1d3b7e9  added seed

This branch includes the following new commits:

     new 1d3b7e9  added seed

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[datasketches-cpp] 01/01: added seed

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch jaccard_seed
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 1d3b7e96d8a745e79cc4aba3da0485ab92a390fd
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Wed May 19 14:32:59 2021 -0700

    added seed
---
 theta/include/theta_jaccard_similarity_base.hpp | 26 +++++++-----
 theta/test/theta_jaccard_similarity_test.cpp    | 56 +++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/theta/include/theta_jaccard_similarity_base.hpp b/theta/include/theta_jaccard_similarity_base.hpp
index f6248e6..5e8dcf5 100644
--- a/theta/include/theta_jaccard_similarity_base.hpp
+++ b/theta/include/theta_jaccard_similarity_base.hpp
@@ -46,20 +46,21 @@ public:
    *
    * @param sketch_a given sketch A
    * @param sketch_b given sketch B
+   * @param seed for the hash function that was used to create the sketch
    * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
    * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
    */
   template<typename SketchA, typename SketchB>
-  static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
+  static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
     if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
     if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
     if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
 
-    auto union_ab = compute_union(sketch_a, sketch_b);
+    auto union_ab = compute_union(sketch_a, sketch_b, seed);
     if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
 
     // intersection
-    Intersection i;
+    Intersection i(seed);
     i.update(sketch_a);
     i.update(sketch_b);
     i.update(union_ab); // ensures that intersection is a subset of the union
@@ -76,15 +77,16 @@ public:
    * Returns true if the two given sketches are equivalent.
    * @param sketch_a the given sketch A
    * @param sketch_b the given sketch B
+   * @param seed for the hash function that was used to create the sketch
    * @return true if the two given sketches are exactly equal
    */
   template<typename SketchA, typename SketchB>
-  static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
+  static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
     if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
     if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
     if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
 
-    auto union_ab = compute_union(sketch_a, sketch_b);
+    auto union_ab = compute_union(sketch_a, sketch_b, seed);
     if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
     return false;
   }
@@ -99,12 +101,13 @@ public:
    * @param actual the sketch to be tested
    * @param expected the reference sketch that is considered to be correct
    * @param threshold a real value between zero and one
+   * @param seed for the hash function that was used to create the sketch
    * @return true if the similarity of the two sketches is greater than the given threshold
    * with at least 97.7% confidence
    */
   template<typename SketchA, typename SketchB>
-  static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
-    auto jc = jaccard(actual, expected);
+  static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
+    auto jc = jaccard(actual, expected, seed);
     return jc[0] >= threshold;
   }
 
@@ -118,23 +121,24 @@ public:
    * @param actual the sketch to be tested
    * @param expected the reference sketch that is considered to be correct
    * @param threshold a real value between zero and one
+   * @param seed for the hash function that was used to create the sketch
    * @return true if the dissimilarity of the two sketches is greater than the given threshold
    * with at least 97.7% confidence
    */
   template<typename SketchA, typename SketchB>
-  static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
-    auto jc = jaccard(actual, expected);
+  static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
+    auto jc = jaccard(actual, expected, seed);
     return jc[2] <= threshold;
   }
 
 private:
 
   template<typename SketchA, typename SketchB>
-  static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) {
+  static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
     const auto count_a = sketch_a.get_num_retained();
     const auto count_b = sketch_b.get_num_retained();
     const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
-    auto u = typename Union::builder().set_lg_k(lg_k).build();
+    auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
     u.update(sketch_a);
     u.update(sketch_b);
     return u.get_result(false);
diff --git a/theta/test/theta_jaccard_similarity_test.cpp b/theta/test/theta_jaccard_similarity_test.cpp
index d40a0ce..83e7c6d 100644
--- a/theta/test/theta_jaccard_similarity_test.cpp
+++ b/theta/test/theta_jaccard_similarity_test.cpp
@@ -100,6 +100,28 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
   REQUIRE(jc[2] == Approx(0.33).margin(0.01));
 }
 
+TEST_CASE("theta jaccard: half overlap estimation mode custom seed", "[theta_sketch]") {
+  const uint64_t seed = 123;
+  auto sk_a = update_theta_sketch::builder().set_seed(seed).build();
+  auto sk_b = update_theta_sketch::builder().set_seed(seed).build();
+  for (int i = 0; i < 10000; ++i) {
+    sk_a.update(i);
+    sk_b.update(i + 5000);
+  }
+
+  // update sketches
+  auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b, seed);
+  REQUIRE(jc[0] == Approx(0.33).margin(0.01));
+  REQUIRE(jc[1] == Approx(0.33).margin(0.01));
+  REQUIRE(jc[2] == Approx(0.33).margin(0.01));
+
+  // compact sketches
+  jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact(), seed);
+  REQUIRE(jc[0] == Approx(0.33).margin(0.01));
+  REQUIRE(jc[1] == Approx(0.33).margin(0.01));
+  REQUIRE(jc[2] == Approx(0.33).margin(0.01));
+}
+
 /**
  * The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
  * underlying sketch is about +/- 1.56%.
@@ -120,6 +142,23 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
   REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
 }
 
+TEST_CASE("theta jaccard: similarity test custom seed", "[theta_sketch]") {
+  const int8_t min_lg_k = 12;
+  const int u1 = 1 << 20;
+  const int u2 = static_cast<int>(u1 * 0.95);
+  const double threshold = 0.943;
+  const uint64_t seed = 1234;
+
+  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
+  for (int i = 0; i < u1; ++i) expected.update(i);
+
+  auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
+  for (int i = 0; i < u2; ++i) actual.update(i);
+
+  REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold, seed));
+  REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold, seed));
+}
+
 /**
  * The distribution is much looser here, about +/- 14%. This is due to the fact that intersections loose accuracy
  * as the ratio of intersection to the union becomes a small number.
@@ -140,4 +179,21 @@ TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
   REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
 }
 
+TEST_CASE("theta jaccard: dissimilarity test custom seed", "[theta_sketch]") {
+  const int8_t min_lg_k = 12;
+  const int u1 = 1 << 20;
+  const int u2 = static_cast<int>(u1 * 0.05);
+  const double threshold = 0.061;
+  const uint64_t seed = 1234;
+
+  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
+  for (int i = 0; i < u1; ++i) expected.update(i);
+
+  auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
+  for (int i = 0; i < u2; ++i) actual.update(i);
+
+  REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold, seed));
+  REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
+}
+
 } /* namespace datasketches */

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org