You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/14 07:34:49 UTC

[incubator-datasketches-cpp] branch sampling updated: [WIP, almost done] improve serialization coverage, update tests to allow for equivalence when random selection invovled in get_result()

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/sampling by this push:
     new 4b96ede  [WIP, almost done] improve serialization coverage, update tests to allow for equivalence when random selection invovled in get_result()
4b96ede is described below

commit 4b96ede7f25572593d170f6353009699e8a23167
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Thu Feb 13 23:34:34 2020 -0800

    [WIP, almost done] improve serialization coverage, update tests to allow for equivalence when random selection invovled in get_result()
---
 sampling/test/var_opt_union_test.cpp | 261 ++++++++++++++++-------------------
 1 file changed, 122 insertions(+), 139 deletions(-)

diff --git a/sampling/test/var_opt_union_test.cpp b/sampling/test/var_opt_union_test.cpp
index f0a519f..45f92ab 100644
--- a/sampling/test/var_opt_union_test.cpp
+++ b/sampling/test/var_opt_union_test.cpp
@@ -66,8 +66,10 @@ class var_opt_union_test: public CppUnit::TestFixture {
     return sk;
   }
 
+  // if exact_compare = false, checks for equivalence -- specific R region values may differ but
+  // R region weights must match
   template<typename T, typename S, typename A>
-  void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2) {
+  void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2, bool exact_compare = true) {
     CPPUNIT_ASSERT_EQUAL_MESSAGE("sketches have different values of k",
       sk1.get_k(), sk2.get_k());
     CPPUNIT_ASSERT_EQUAL_MESSAGE("sketches have different values of n",
@@ -82,8 +84,10 @@ class var_opt_union_test: public CppUnit::TestFixture {
     while ((it1 != sk1.end()) && (it2 != sk2.end())) {
       const std::pair<const T&, const double> p1 = *it1;
       const std::pair<const T&, const double> p2 = *it2;
-      CPPUNIT_ASSERT_EQUAL_MESSAGE("data values differ at sample " + std::to_string(i),
-        p1.first, p2.first); 
+      if (exact_compare) {
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("data values differ at sample " + std::to_string(i),
+          p1.first, p2.first); 
+      }
       CPPUNIT_ASSERT_EQUAL_MESSAGE("weight values differ at sample " + std::to_string(i),
         p1.second, p2.second);
       ++i;
@@ -95,6 +99,39 @@ class var_opt_union_test: public CppUnit::TestFixture {
       (it1 == sk1.end()) && (it2 == sk2.end()));
   }
 
+  // compare serialization and deserializationi results, crossing methods to ensure that
+  // the resulting binary images are compatible.
+  // if exact_compare = false, checks for equivalence -- specific R region values may differ but
+  // R region weights must match
+  template<typename T, typename S, typename A>
+  void compare_serialization_deserialization(var_opt_union<T,S,A>& vo_union, bool exact_compare = true) {
+    std::vector<uint8_t> bytes = vo_union.serialize();
+
+    var_opt_union<T> u_from_bytes = var_opt_union<T>::deserialize(bytes.data(), bytes.size());
+    var_opt_sketch<T> sk1 = vo_union.get_result();
+    var_opt_sketch<T> sk2 = u_from_bytes.get_result();
+    check_if_equal(sk1, sk2, exact_compare);
+
+    std::string str(bytes.begin(), bytes.end());
+    std::stringstream ss;
+    ss.str(str);
+
+    var_opt_union<T> u_from_stream = var_opt_union<T>::deserialize(ss);
+    sk2 = u_from_stream.get_result();
+    check_if_equal(sk1, sk2, exact_compare);
+
+    ss.seekg(0); // didn't put anything so only reset read position
+    vo_union.serialize(ss);
+    u_from_stream = var_opt_union<T>::deserialize(ss);
+    sk2 = u_from_stream.get_result();
+    check_if_equal(sk1, sk2, exact_compare);
+
+    std::string str_from_stream = ss.str();
+    var_opt_union<T> u_from_str = var_opt_union<T>::deserialize(str_from_stream.c_str(), str_from_stream.size());
+    sk2 = u_from_str.get_result();
+    check_if_equal(sk1, sk2, exact_compare);
+  }
+
   void bad_prelongs() {
     var_opt_sketch<int> sk = create_unweighted_sketch(32, 33);
     var_opt_union<int> u(32);
@@ -196,7 +233,7 @@ class var_opt_union_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_EQUAL(k, result.get_k());
   }
 
-void heavy_sampling_sketch() {
+  void heavy_sampling_sketch() {
     uint64_t n1 = 20;
     uint32_t k1 = 10;
     uint64_t n2 = 6;
@@ -224,142 +261,88 @@ void heavy_sampling_sketch() {
     result = u.get_result();
     CPPUNIT_ASSERT_EQUAL((uint64_t) 0, result.get_n());
     CPPUNIT_ASSERT_EQUAL(k1, result.get_k()); // union reset so empty result reflects max_k
-}
-
-void identical_sampling_sketches() {
-  uint32_t k = 20;
-  uint64_t n = 50;
-  var_opt_sketch<int> sk = create_unweighted_sketch(k, n);
-
-  var_opt_union<int> u(k);
-  u.update(sk);
-  u.update(sk);
-
-  var_opt_sketch<int> result = u.get_result();
-  double expected_wt = 2.0 * n;
-  subset_summary ss = result.estimate_subset_sum([](int x){return true;});
-  CPPUNIT_ASSERT_EQUAL(2 * n, result.get_n());
-  CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt, ss.total_sketch_weight, EPS);
-
-  // add another sketch, such that sketch_tau < outer_tau
-  sk = create_unweighted_sketch(k, k + 1); // tau = (k + 1) / k
-  u.update(sk);
-  result = u.get_result();
-  expected_wt = (2.0 * n) + k + 1;
-  ss = result.estimate_subset_sum([](int x){return true;});
-  CPPUNIT_ASSERT_EQUAL((2 * n) + k + 1, result.get_n());
-  CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt, ss.total_sketch_weight, EPS);
-}
-
-void small_sampling_sketch() {
-  uint32_t k_small = 16;
-  uint32_t k_max = 128;
-  uint64_t n1 = 32;
-  uint64_t n2 = 64;
-
-  var_opt_sketch<float> sk(k_small);
-  for (int i = 0; i < n1; ++i) { sk.update(i); }
-  sk.update(-1, n1 * n1); // add a heavy item
-
-  var_opt_union<float> u(k_max);
-  u.update(sk);
-
-  // another one, but different n to get a different per-item weight
-  var_opt_sketch<float> sk2(k_small);
-  for (int i = 0; i < n2; ++i) { sk2.update(i); }
-  u.update(sk2);
-
-  // should trigger migrate_marked_items_by_decreasing_k()
-  var_opt_sketch<float> result = u.get_result();
-  CPPUNIT_ASSERT_EQUAL(n1 + n2 + 1, result.get_n());
+  }
+
+  void identical_sampling_sketches() {
+    uint32_t k = 20;
+    uint64_t n = 50;
+    var_opt_sketch<int> sk = create_unweighted_sketch(k, n);
+
+    var_opt_union<int> u(k);
+    u.update(sk);
+    u.update(sk);
+
+    var_opt_sketch<int> result = u.get_result();
+    double expected_wt = 2.0 * n;
+    subset_summary ss = result.estimate_subset_sum([](int x){return true;});
+    CPPUNIT_ASSERT_EQUAL(2 * n, result.get_n());
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt, ss.total_sketch_weight, EPS);
+
+    // add another sketch, such that sketch_tau < outer_tau
+    sk = create_unweighted_sketch(k, k + 1); // tau = (k + 1) / k
+    u.update(sk);
+    result = u.get_result();
+    expected_wt = (2.0 * n) + k + 1;
+    ss = result.estimate_subset_sum([](int x){return true;});
+    CPPUNIT_ASSERT_EQUAL((2 * n) + k + 1, result.get_n());
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt, ss.total_sketch_weight, EPS);
+  }
+
+  void small_sampling_sketch() {
+    uint32_t k_small = 16;
+    uint32_t k_max = 128;
+    uint64_t n1 = 32;
+    uint64_t n2 = 64;
+
+    var_opt_sketch<float> sk(k_small);
+    for (int i = 0; i < n1; ++i) { sk.update(i); }
+    sk.update(-1, n1 * n1); // add a heavy item
+
+    var_opt_union<float> u(k_max);
+    u.update(sk);
+
+    // another one, but different n to get a different per-item weight
+    var_opt_sketch<float> sk2(k_small);
+    for (int i = 0; i < n2; ++i) { sk2.update(i); }
+    u.update(sk2);
+
+    // should trigger migrate_marked_items_by_decreasing_k()
+    var_opt_sketch<float> result = u.get_result();
+    CPPUNIT_ASSERT_EQUAL(n1 + n2 + 1, result.get_n());
   
-  double expected_wt = 1.0 * (n1 + n2); // n1 + n2 light items, ignore the heavy one
-  subset_summary ss = result.estimate_subset_sum([](float x){return x >= 0;});
-  CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt, ss.estimate, EPS);
-  CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt + (n1 * n1), ss.total_sketch_weight, EPS);
-  CPPUNIT_ASSERT_LESS(k_max, result.get_k());
-}
-
-void serialize_empty() {
-  var_opt_union<std::string> u(100);
-
-  std::vector<uint8_t> bytes = u.serialize();
-
-  var_opt_union<std::string> u_from_bytes = var_opt_union<std::string>::deserialize(bytes.data(), bytes.size());
-  var_opt_sketch<std::string> sk1 = u.get_result();
-  var_opt_sketch<std::string> sk2 = u_from_bytes.get_result();
-  check_if_equal(sk1, sk2);
-
-  std::string str(bytes.begin(), bytes.end());
-  std::stringstream ss;
-  ss.str(str);
-
-  var_opt_union<std::string> u_from_stream = var_opt_union<std::string>::deserialize(ss);
-  sk2 = u_from_stream.get_result();
-  check_if_equal(sk1, sk2);
-
-  ss.seekg(0); // didn't put anything so only reset read position
-  u.serialize(ss);
-  u_from_stream = var_opt_union<std::string>::deserialize(ss);
-  sk2 = u_from_stream.get_result();
-  check_if_equal(sk1, sk2); 
-}
-
-void serialize_exact() {
-  uint32_t k = 100;
-  var_opt_union<int> u(k);
-  var_opt_sketch<int> sk = create_unweighted_sketch(k, k / 2);
-  u.update(sk);
-
-  std::vector<uint8_t> bytes = u.serialize();
-
-  var_opt_union<int> u_from_bytes = var_opt_union<int>::deserialize(bytes.data(), bytes.size());
-  var_opt_sketch<int> sk1 = u.get_result();
-  var_opt_sketch<int> sk2 = u_from_bytes.get_result();
-  check_if_equal(sk1, sk2);
-
-  std::string str(bytes.begin(), bytes.end());
-  std::stringstream ss;
-  ss.str(str);
-
-  var_opt_union<int> u_from_stream = var_opt_union<int>::deserialize(ss);
-  sk2 = u_from_stream.get_result();
-  check_if_equal(sk1, sk2);
-
-  ss.seekg(0); // didn't put anything so only reset read position
-  u.serialize(ss);
-  u_from_stream = var_opt_union<int>::deserialize(ss);
-  sk2 = u_from_stream.get_result();
-  check_if_equal(sk1, sk2); 
-}
-
-void serialize_sampling() {
-  uint32_t k = 100;
-  var_opt_union<int> u(k);
-  var_opt_sketch<int> sk = create_unweighted_sketch(k, 2 * k);
-  u.update(sk);
-
-  std::vector<uint8_t> bytes = u.serialize();
-
-  var_opt_union<int> u_from_bytes = var_opt_union<int>::deserialize(bytes.data(), bytes.size());
-  var_opt_sketch<int> sk1 = u.get_result();
-  var_opt_sketch<int> sk2 = u_from_bytes.get_result();
-  check_if_equal(sk1, sk2);
-
-  std::string str(bytes.begin(), bytes.end());
-  std::stringstream ss;
-  ss.str(str);
-
-  var_opt_union<int> u_from_stream = var_opt_union<int>::deserialize(ss);
-  sk2 = u_from_stream.get_result();
-  check_if_equal(sk1, sk2);
-
-  ss.seekg(0); // didn't put anything so only reset read position
-  u.serialize(ss);
-  u_from_stream = var_opt_union<int>::deserialize(ss);
-  sk2 = u_from_stream.get_result();
-  check_if_equal(sk1, sk2); 
-}
+    double expected_wt = 1.0 * (n1 + n2); // n1 + n2 light items, ignore the heavy one
+    subset_summary ss = result.estimate_subset_sum([](float x){return x >= 0;});
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt, ss.estimate, EPS);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt + (n1 * n1), ss.total_sketch_weight, EPS);
+    CPPUNIT_ASSERT_LESS(k_max, result.get_k());
+
+    // check tha tmark information is preserved as expected
+    compare_serialization_deserialization(u, false);
+  }
+
+  void serialize_empty() {
+    var_opt_union<std::string> u(100);
+
+    compare_serialization_deserialization(u);
+  }
+
+  void serialize_exact() {
+    uint32_t k = 100;
+    var_opt_union<int> u(k);
+    var_opt_sketch<int> sk = create_unweighted_sketch(k, k / 2);
+    u.update(sk);
+
+    compare_serialization_deserialization(u);
+  }
+
+  void serialize_sampling() {
+    uint32_t k = 100;
+    var_opt_union<int> u(k);
+    var_opt_sketch<int> sk = create_unweighted_sketch(k, 2 * k);
+    u.update(sk);
+
+    compare_serialization_deserialization(u);
+  }
 
 /**********************************************************/
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org