You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/08 08:43:17 UTC

[incubator-datasketches-cpp] branch sampling updated (c071c30 -> d73bca4)

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a change to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git.


    from c071c30  remove reference to debug header
     new dfdd040  [WIP] adding var_opt_sketch tests, fixing issues along the way
     new d73bca4  [WIP] finish porting relevant java unit tests, fixing bugs along the way

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 sampling/include/var_opt_sketch.hpp      |  16 +-
 sampling/include/var_opt_sketch_impl.hpp | 199 ++++++-----
 sampling/include/var_opt_union_impl.hpp  |   1 +
 sampling/test/var_opt_sketch_test.cpp    | 558 ++++++++++++++++++++++++++++++-
 4 files changed, 660 insertions(+), 114 deletions(-)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 02/02: [WIP] finish porting relevant java unit tests, fixing bugs along the way

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit d73bca4848008f2a9374e4c2eb57d894ad847b61
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Sat Feb 8 00:42:52 2020 -0800

    [WIP] finish porting relevant java unit tests, fixing bugs along the way
---
 sampling/include/var_opt_sketch.hpp      |   8 +-
 sampling/include/var_opt_sketch_impl.hpp |  46 ++---
 sampling/test/var_opt_sketch_test.cpp    | 304 ++++++++++++++++++++++++++++++-
 3 files changed, 321 insertions(+), 37 deletions(-)

diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index da4f339..9a35ee8 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -38,10 +38,10 @@ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template
  * A struct to hold the result of subset sum queries
  */
 struct subset_summary {
-  const double lower_bound;
-  const double estimate;
-  const double upper_bound;
-  const double total_sketch_weight;
+  double lower_bound;
+  double estimate;
+  double upper_bound;
+  double total_sketch_weight;
 };
 
 enum resize_factor { X1 = 0, X2, X4, X8 };
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 9810242..2e83671 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -582,13 +582,6 @@ bool var_opt_sketch<T,S,A>::is_empty() const {
 
 template<typename T, typename S, typename A>
 void var_opt_sketch<T,S,A>::reset() {
-  n_ = 0;
-  h_ = 0;
-  m_ = 0;
-  r_ = 0;
-  num_marks_in_h_ = 0;
-  total_wt_r_ = 0.0;
-
   uint32_t prev_alloc = curr_items_alloc_;
 
   uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
@@ -598,31 +591,38 @@ void var_opt_sketch<T,S,A>::reset() {
     ++curr_items_alloc_;
   }
   
+  if (filled_data_) {
+    // destroy everything
+    for (size_t i = 0; i < curr_items_alloc_; ++i) 
+      A().destroy(data_ + i);      
+  } else {
+    // skip gap or anything unused at the end
+    for (size_t i = 0; i < h_; ++i)
+      A().destroy(data_+ i);
+    
+    for (size_t i = h_ + 1; i < curr_items_alloc_; ++i)
+      A().destroy(data_ + i);
+  }
+
   if (curr_items_alloc_ < prev_alloc) {
     bool is_gadget = (marks_ != nullptr);
   
-    if (filled_data_) {
-      // destroy everything
-      for (size_t i = 0; i < curr_items_alloc_; ++i) 
-        A().destroy(data_ + i);      
-    } else {
-      // skip gap or anything unused at the end
-      for (size_t i = 0; i < h_; ++i)
-        A().destroy(data_+ i);
-    
-      for (size_t i = h_ + 1; i < curr_items_alloc_; ++i)
-        A().destroy(data_ + i);
-    }
     A().deallocate(data_, curr_items_alloc_);
     AllocDouble().deallocate(weights_, curr_items_alloc_);
   
     if (marks_ != nullptr)
-    AllocBool().deallocate(marks_, curr_items_alloc_);
+      AllocBool().deallocate(marks_, curr_items_alloc_);
 
     allocate_data_arrays(curr_items_alloc_, is_gadget);
-  } else {
-    filled_data_ = false;
   }
+  
+  n_ = 0;
+  h_ = 0;
+  m_ = 0;
+  r_ = 0;
+  num_marks_in_h_ = 0;
+  total_wt_r_ = 0.0;
+  filled_data_ = false;
 }
 
 template<typename T, typename S, typename A>
@@ -898,7 +898,7 @@ void var_opt_sketch<T,S,A>::grow_data_arrays() {
     double* tmp_weights = AllocDouble().allocate(curr_items_alloc_);
 
     for (int i = 0; i < prev_size; ++i) {
-      tmp_data[i] = std::move(data_[i]);
+      A().construct(&tmp_data[i], std::move(data_[i]));
       A().destroy(data_ + i);
       tmp_weights[i] = std::move(weights_[i]); // primitive double, but for consistency
     }
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index d78d3b6..78bcb27 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -40,7 +40,7 @@ namespace datasketches {
 
 class var_opt_sketch_test: public CppUnit::TestFixture {
 
-  static constexpr double EPS = 1e-10;
+  static constexpr double EPS = 1e-13;
 
   CPPUNIT_TEST_SUITE(var_opt_sketch_test);
   CPPUNIT_TEST(invalid_k);
@@ -53,15 +53,16 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
   CPPUNIT_TEST(invalid_weight);
   CPPUNIT_TEST(corrupt_serialized_weight);
   CPPUNIT_TEST(cumulative_weight);
-  // CPPUNIT_TEST(under_full_sketch_serialization);
-  // CPPUNIT_TEST(end_of_warmup_sketch_serialization);
-  // CPPUNIT_TEST(full_sketch_serialization);
-  // CPPUNIT_TEST(pseudo_light_update);
-  // CPPUNIT_TEST(pseudo_heavy_update);
+  CPPUNIT_TEST(under_full_sketch_serialization);
+  CPPUNIT_TEST(end_of_warmup_sketch_serialization);
+  CPPUNIT_TEST(full_sketch_serialization);
+  CPPUNIT_TEST(string_serialization);
+  CPPUNIT_TEST(pseudo_light_update);
+  CPPUNIT_TEST(pseudo_heavy_update);
   // CPPUNIT_TEST(decrease_k_with_under_full_sketch);
   // CPPUNIT_TEST(decrease_k_with_full_sketch);
-  // CPPUNIT_TEST(reset);
-  // CPPUNIT_TEST(estimate_subset_sum);
+  CPPUNIT_TEST(reset);
+  CPPUNIT_TEST(estimate_subset_sum);
   // CPPUNIT_TEST(binary_compatibility);
   
   // CPPUNIT_TEST(empty);
@@ -76,6 +77,35 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     return sk;
   }
 
+  template<typename T, typename S, typename A>
+  void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2) {
+    CPPUNIT_ASSERT_EQUAL_MESSAGE("sketches have different values of k",
+      sk1.get_k(), sk2.get_k());
+    CPPUNIT_ASSERT_EQUAL_MESSAGE("sketches have different values of n",
+      sk1.get_n(), sk2.get_n());
+    CPPUNIT_ASSERT_EQUAL_MESSAGE("sketches have different sample counts",
+      sk1.get_num_samples(), sk2.get_num_samples());
+      
+    auto it1 = sk1.begin();
+    auto it2 = sk2.begin();
+    size_t i = 0;
+
+    while ((it1 != sk1.end()) && (it2 != sk2.end())) {
+      const std::pair<const T&, const double> p1 = *it1;
+      const std::pair<const T&, const double> p2 = *it2;
+      CPPUNIT_ASSERT_EQUAL_MESSAGE("data values differ at sample " + std::to_string(i),
+        p1.first, p2.first); 
+      CPPUNIT_ASSERT_EQUAL_MESSAGE("weight values differ at sample " + std::to_string(i),
+        p1.second, p2.second);
+      ++i;
+      ++it1;
+      ++it2;
+    }
+
+    CPPUNIT_ASSERT_MESSAGE("iterators did not end at the same time",
+      (it1 == sk1.end()) && (it2 == sk2.end()));
+  }
+
   void invalid_k() {
     std::cerr << "start invalid_k()" << std::endl;
     {
@@ -287,14 +317,13 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     for (size_t i = 0; i < n; ++i) {
       // generate weights aboev and below 1.0 using w ~ exp(5*N(0,1))
       // which covers about 10 orders of magnitude
-
       double w = std::exp(5 * N(rand));
       input_sum += w;
       sk.update(i, w);
     }
 
     double output_sum = 0.0;
-    for (auto& it : sk) { // pair<int, weight>
+    for (auto& it : sk) { // std::pair<int, weight>
       output_sum += it.second;
     }
     
@@ -304,6 +333,261 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     std::cerr << "end cumulative_weight()" << std::endl;
   }
 
+  void under_full_sketch_serialization() {
+    std::cerr << "start under_full_sketch_serialization()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(100, 10); // need n < k
+
+    auto bytes = sk.serialize();
+    var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
+    check_if_equal(sk, sk_from_bytes);
+
+    std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+    sk.serialize(ss);
+    var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
+    check_if_equal(sk, sk_from_stream);
+    }
+    std::cerr << "end under_full_sketch_serialization()" << std::endl;
+  }
+
+  void end_of_warmup_sketch_serialization() {
+    std::cerr << "start end_of_warmup_sketch_serialization()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(2843, 2843); // need n == k
+    auto bytes = sk.serialize();
+
+    // ensure still only 3 preamble longs
+    CPPUNIT_ASSERT_EQUAL_MESSAGE("Wrong number of preamble longs at end of warmup mode",
+      3, bytes.data()[0] & 0x3f); // PREAMBLE_LONGS_WARMUP
+
+    var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
+    check_if_equal(sk, sk_from_bytes);
+
+    std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+    sk.serialize(ss);
+    var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
+    check_if_equal(sk, sk_from_stream);
+    }
+    std::cerr << "end end_of_warmup_sketch_serialization()" << std::endl;
+  }
+
+  void full_sketch_serialization() {
+    std::cerr << "start full_sketch_serialization()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(32, 32);
+    sk.update(100, 100.0);
+    sk.update(101, 101.0);
+
+    // first 2 entries should be heavy and in heap order (smallest at root)
+    auto it = sk.begin();
+    const std::pair<const int, const double> p1 = *it;
+    ++it;
+    const std::pair<const int, const double> p2 = *it;
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(100.0, p1.second, EPS);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(101.0, p2.second, EPS);
+    CPPUNIT_ASSERT_EQUAL(100, p1.first);
+    CPPUNIT_ASSERT_EQUAL(101, p2.first);
+
+    // check for 4 preamble longs
+    auto bytes = sk.serialize();
+    CPPUNIT_ASSERT_EQUAL_MESSAGE("Wrong number of preamble longs at end of warmup mode",
+      4, bytes.data()[0] & 0x3f); // PREAMBLE_LONGS_WARMUP
+
+    var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
+    check_if_equal(sk, sk_from_bytes);
+
+    std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+    sk.serialize(ss);
+    var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
+    check_if_equal(sk, sk_from_stream);
+    }
+    std::cerr << "end full_sketch_serialization()" << std::endl;
+  }
+
+  void string_serialization() {
+    std::cerr << "start string_serialization()" << std::endl;
+    {
+    var_opt_sketch<std::string> sk(5);
+    sk.update("a", 1.0);
+    sk.update("bc", 1.0);
+    sk.update("def", 1.0);
+    sk.update("ghij", 1.0);
+    sk.update("klmno", 1.0);
+    sk.update("heavy item", 100.0);
+
+    auto bytes = sk.serialize();
+    var_opt_sketch<std::string> sk_from_bytes = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
+    check_if_equal(sk, sk_from_bytes);
+
+    std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+    sk.serialize(ss);
+    var_opt_sketch<std::string> sk_from_stream = var_opt_sketch<std::string>::deserialize(ss);
+    check_if_equal(sk, sk_from_stream);
+
+    }
+    std::cerr << "end string_serialization()" << std::endl;
+  }
+
+  void pseudo_light_update() {
+    std::cerr << "start pseudo_light_update()" << std::endl;
+    {
+    uint32_t k = 1024;
+    var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
+    sk.update(0, 1.0); // k+2nd update
+
+    // check the first weight, assuming all k items are unweighted
+    // (and consequently in R).
+    // Expected: (k + 2) / |R| = (k + 2) / k
+    auto it = sk.begin();
+    double wt = (*it).second;
+    CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("weight corruption in pseudo_light_update()",
+      ((1.0 * (k + 2)) / k), wt, EPS);
+    }
+    std::cerr << "end pseudo_light_update()" << std::endl;
+  }
+
+  void pseudo_heavy_update() {
+    std::cerr << "start pseudo_heavy_update()" << std::endl;
+    {
+    uint32_t k = 1024;
+    double wt_scale = 10.0 * k;
+    var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
+
+    // Next k-1 updates should be update_pseudo_heavy_general()
+    // Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
+    // added k-1 heavy items, leaving only 1 item left in R
+    for (int i = 1; i <= k; ++i) {
+      sk.update(-i, k + (i * wt_scale));
+    }
+
+    auto it = sk.begin();
+
+    // Expected: lightest "heavy" item (first one out): k + 2*wt_scale
+    double wt = (*it).second;
+    CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("weight corruption in pseudo_heavy_update()",
+      1.0 * (k + (2 * wt_scale)), wt, EPS);
+
+    // we don't know which R item is left, but there should be only one, at the end
+    // of the sample set.
+    // Expected: k+1 + (min "heavy" item) / |R| = ((k+1) + (k*wt_scale)) / 1 = wt_scale + 2k + 1
+    while (it != sk.end()) {
+      wt = (*it).second;
+      ++it;
+    }
+    CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("weight corruption in pseudo_light_update()",
+      1.0 + wt_scale + (2 * k), wt, EPS);
+    }
+    std::cerr << "end pseudo_heavy_update()" << std::endl;
+  }
+
+  void reset() {
+    std::cerr << "start reset()" << std::endl;
+    {
+    uint32_t k = 1024;
+    uint64_t n1 = 20;
+    uint64_t n2 = 2 * k;
+    var_opt_sketch<std::string> sk(k);
+
+    // reset from sampling mode
+    for (int i = 0; i < n2; ++i) {
+      sk.update(std::to_string(i), 100.0 + i);
+    }
+    CPPUNIT_ASSERT_EQUAL(n2, sk.get_n());
+    CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
+
+    sk.reset();
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
+    CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
+
+    // reset from exact mode
+    for (int i = 0; i < n1; ++i)
+      sk.update(std::to_string(i));
+    CPPUNIT_ASSERT_EQUAL(n1, sk.get_n());
+    CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
+
+    sk.reset();
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
+    CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
+    }
+    std::cerr << "end reset()" << std::endl;
+  }
+
+  void estimate_subset_sum() {
+    std::cerr << "start estimate_subset_sum()" << std::endl;
+    {
+    uint32_t k = 10;
+    var_opt_sketch<int> sk(k);
+
+    // empty sketch -- all zeros
+    subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, summary.estimate, 0.0);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, summary.total_sketch_weight, 0.0);
+
+    // add items, keeping in exact mode
+    double total_weight = 0.0;
+    for (int i = 1; i <= (k - 1); ++i) {
+      sk.update(i, 1.0 * i);
+      total_weight += 1.0 * i;
+    }
+
+    summary = sk.estimate_subset_sum([](int){ return true; });
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.estimate, 0.0);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.lower_bound, 0.0);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.upper_bound, 0.0);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.total_sketch_weight, 0.0);
+
+    // add a few more items, pushing to sampling mode
+    for (int i = k; i <= (k + 1); ++i) {
+      sk.update(i, 1.0 * i);
+      total_weight += 1.0 * i;
+    }
+
+    // predicate always true so estimate == upper bound
+    summary = sk.estimate_subset_sum([](int){ return true; });
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.estimate, EPS);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.upper_bound, EPS);
+    CPPUNIT_ASSERT_LESS(total_weight, summary.lower_bound); // lower_bound < total_weight
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.total_sketch_weight, EPS);
+
+    // predicate always false so estimate == lower bound == 0.0
+    summary = sk.estimate_subset_sum([](int){ return false; });
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, summary.estimate, EPS);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, summary.lower_bound, EPS);
+    CPPUNIT_ASSERT_GREATER(0.0, summary.upper_bound); // upper_bound > 0.0
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.total_sketch_weight, EPS);
+
+    // finally, a non-degenerate predicate
+    // insert negative items with identical weights, filter for negative weights only
+    for (int i = 1; i <= (k + 1); ++i) {
+      sk.update(-i, 1.0 * i);
+      total_weight += 1.0 * i;
+    }
+
+    summary= sk.estimate_subset_sum([](int x) { return x < 0; });
+    CPPUNIT_ASSERT_GREATEREQUAL(summary.lower_bound, summary.estimate); // estimate >= lower_bound)
+    CPPUNIT_ASSERT_LESSEQUAL(summary.upper_bound, summary.estimate); // estimate <= upper_bound)
+
+    // allow pretty generous bounds when testing
+    CPPUNIT_ASSERT(summary.lower_bound < (total_weight / 1.4));
+    CPPUNIT_ASSERT(summary.upper_bound > (total_weight / 2.6));
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.total_sketch_weight, EPS);
+
+    // and another data type, keeping it in exact mode for simplicity
+    var_opt_sketch<bool> sk2(k);
+    total_weight = 0.0;
+    for (int i = 1; i <= (k - 1); ++i) {
+      sk2.update((i % 2) == 0, 1.0 * i);
+      total_weight += i;
+    }
+
+    summary = sk2.estimate_subset_sum([](bool b){ return !b; });
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(summary.lower_bound, summary.estimate, 0.0);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(summary.upper_bound, summary.estimate, 0.0);
+    CPPUNIT_ASSERT_LESS(total_weight, summary.estimate); // exact mode, so know it must be strictly less
+    }
+    std::cerr << "end estimate_subset_sum()" << std::endl;
+  }
+
   /**********************************************************************/
 
   void vo_union() {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 01/02: [WIP] adding var_opt_sketch tests, fixing issues along the way

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit dfdd0406aba810de70601e541d3378c47217c039
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Wed Feb 5 12:47:15 2020 -0800

    [WIP] adding var_opt_sketch tests, fixing issues along the way
---
 sampling/include/var_opt_sketch.hpp      |   8 +-
 sampling/include/var_opt_sketch_impl.hpp | 153 ++++++++---------
 sampling/include/var_opt_union_impl.hpp  |   1 +
 sampling/test/var_opt_sketch_test.cpp    | 274 ++++++++++++++++++++++++++++++-
 4 files changed, 349 insertions(+), 87 deletions(-)

diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index bf22a42..da4f339 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -44,14 +44,16 @@ struct subset_summary {
   const double total_sketch_weight;
 };
 
+enum resize_factor { X1 = 0, X2, X4, X8 };
+
 template <typename T, typename S, typename A> class var_opt_union; // forward declaration
 
 template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
 class var_opt_sketch {
 
   public:
-    enum resize_factor { X1 = 0, X2, X4, X8 };
     static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
+    static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
 
     var_opt_sketch(uint32_t k, resize_factor rf = DEFAULT_RESIZE_FACTOR);
     var_opt_sketch(const var_opt_sketch& other);
@@ -188,7 +190,9 @@ class var_opt_sketch {
     // validation
     static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
     static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
-    
+    // next method sets current_items_alloc_
+    void validate_and_set_current_size(int preamble_longs);
+
     // things to move to common utils and share among sketches
     static uint32_t get_adjusted_size(int max_size, int resize_target);
     static uint32_t starting_sub_multiple(int lg_target, int lg_rf, int lg_min);
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 26d4af3..9810242 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -25,7 +25,7 @@
 #include "bounds_binomial_proportions.hpp"
 
 #include <memory>
-#include <iostream>
+#include <sstream>
 #include <cmath>
 #include <random>
 #include <algorithm>
@@ -140,8 +140,8 @@ var_opt_sketch<T,S,A>::var_opt_sketch(var_opt_sketch&& other) noexcept :
 template<typename T, typename S, typename A>
 var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget) :
   k_(k), h_(0), m_(0), r_(0), n_(0), total_wt_r_(0.0), rf_(rf) {
-  if (k < 1) {
-    throw std::invalid_argument("k must be at least 1");
+  if (k == 0 || k_ > MAX_K) {
+    throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
   }
 
   uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
@@ -169,18 +169,20 @@ var_opt_sketch<T,S,A>::~var_opt_sketch() {
         A().destroy(data_+ i);
       }
     
-      for (size_t i = h_ + 1; i < curr_items_alloc_; ++i) {
+      for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
         A().destroy(data_ + i);
       }
     }
     A().deallocate(data_, curr_items_alloc_);
   }
 
-  if (weights_ != nullptr)
+  if (weights_ != nullptr) {
     AllocDouble().deallocate(weights_, curr_items_alloc_);
+  }
   
-  if (marks_ != nullptr)
+  if (marks_ != nullptr) {
     AllocBool().deallocate(marks_, curr_items_alloc_);
+  }
 }
 
 template<typename T, typename S, typename A>
@@ -229,52 +231,27 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
   is.read((char*)&h_, sizeof(uint32_t));
   is.read((char*)&r_, sizeof(uint32_t));
 
-  // initial data validation
-  // read fourth prelong, if needed, and determine vector sizes
-  if (n_ <= k_) {
-    if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
-      throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
-       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
-    }
-    if (n_ != h_) {
-      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
-       "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
-    }
-    if (r_ > 0) {
-      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
-       "Found r = " + std::to_string(r_));
-    }
-
-    uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
-    uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
-    int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
-    curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
-    if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
-      ++curr_items_alloc_;
-    }
-  } else { // n_ > k_
-    if (preamble_longs != PREAMBLE_LONGS_FULL) { 
-      throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
-       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
-    }
-    if (h_ + r_ != k_) {
-      throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
-       "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
-    }
+  validate_and_set_current_size(preamble_longs);
 
+  // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
+  if (preamble_longs == PREAMBLE_LONGS_FULL) { 
     is.read((char*)&total_wt_r_, sizeof(total_wt_r_));
     if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
       throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
        "Found r = " + std::to_string(r_) + ", R region weight = " + std::to_string(total_wt_r_));
     }
-
-    curr_items_alloc_ = k_ + 1;
   }
 
   allocate_data_arrays(curr_items_alloc_, is_gadget);
 
   // read the first h_ weights
   is.read((char*)weights_, h_ * sizeof(double));
+  for (size_t i = 0; i < h_; ++i) {
+    if (!(weights_[i] > 0.0)) {
+      throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(weights_[i]));
+    }
+  }
+
   std::fill(&weights_[h_], &weights_[curr_items_alloc_], -1.0);
 
   // read the first h_ marks as packed bytes iff we have a gadget
@@ -306,52 +283,26 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
   ptr += copy_from_mem(ptr, &h_, sizeof(h_));
   ptr += copy_from_mem(ptr, &r_, sizeof(r_));
 
-  // initial data validation
-  // read fourth prelong, if needed, and determine vector sizes
-  if (n_ <= k_) {
-    if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
-      throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
-       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
-    }
-    if (n_ != h_) {
-      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
-       "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
-    }
-    if (r_ > 0) {
-      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
-       "Found r = " + std::to_string(r_));
-    }
-
-    uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
-    uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
-    int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
-    curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
-    if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
-      ++curr_items_alloc_;
-    }
-  } else { // n_ > k_
-    if (preamble_longs != PREAMBLE_LONGS_FULL) { 
-      throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
-       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
-    }
-    if (h_ + r_ != k_) {
-      throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
-       "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
-    }
-
+  validate_and_set_current_size(preamble_longs);
+  
+  // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
+  if (preamble_longs == PREAMBLE_LONGS_FULL) {
     ptr += copy_from_mem(ptr, &total_wt_r_, sizeof(total_wt_r_));
     if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
       throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
        "Found r = " + std::to_string(r_) + ", R region weight = " + std::to_string(total_wt_r_));
     }
-
-    curr_items_alloc_ = k_ + 1;
   }
 
   allocate_data_arrays(curr_items_alloc_, is_gadget);
 
   // read the first h_ weights, fill in rest of array with -1.0
   ptr += copy_from_mem(ptr, weights_, h_ * sizeof(double));
+  for (size_t i = 0; i < h_; ++i) {
+    if (!(weights_[i] > 0.0)) {
+      throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(weights_[i]));
+    }
+  }
   std::fill(&weights_[h_], &weights_[curr_items_alloc_], -1.0);
 
   // read the first h_ marks as packed bytes iff we have a gadget
@@ -668,7 +619,7 @@ void var_opt_sketch<T,S,A>::reset() {
     if (marks_ != nullptr)
     AllocBool().deallocate(marks_, curr_items_alloc_);
 
-    allocate_data_ararys(curr_items_alloc_, is_gadget);
+    allocate_data_arrays(curr_items_alloc_, is_gadget);
   } else {
     filled_data_ = false;
   }
@@ -736,7 +687,7 @@ std::string var_opt_sketch<T,S,A>::to_string() const {
 template<typename T, typename S, typename A>
 void var_opt_sketch<T,S,A>::update(const T& item, double weight, bool mark) {
   if (weight <= 0.0) { 
-    throw std::invalid_argument("Items weights must be strictly positive. Found: "
+    throw std::invalid_argument("Item weights must be strictly positive. Found: "
                                 + std::to_string(weight));
   }
   ++n_;
@@ -1268,17 +1219,17 @@ void var_opt_sketch<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t
   
   if (is_empty) {
     if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
-      throw new std::invalid_argument("Possible corruption: Preamble longs must be "
+      throw std::invalid_argument("Possible corruption: Preamble longs must be "
         + std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
         + std::to_string(preamble_longs));
     }
   } else {
     if (preamble_longs != PREAMBLE_LONGS_WARMUP
         && preamble_longs != PREAMBLE_LONGS_FULL) {
-      throw new std::invalid_argument("Possible corruption: Preamble longs must be "
+      throw std::invalid_argument("Possible corruption: Preamble longs must be "
         + std::to_string(PREAMBLE_LONGS_WARMUP) + " or "
         + std::to_string(PREAMBLE_LONGS_FULL)
-        + "for a non-empty sketch. Found: " + std::to_string(preamble_longs));
+        + " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
     }
   }
 }
@@ -1299,6 +1250,48 @@ void var_opt_sketch<T,S,A>::check_family_and_serialization_version(uint8_t famil
 }
 
 template<typename T, typename S, typename A>
+void var_opt_sketch<T, S, A>::validate_and_set_current_size(int preamble_longs) {
+  if (k_ == 0 || k_ > MAX_K) {
+    throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
+  }
+
+  if (n_ <= k_) {
+    if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
+      throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
+       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+    }
+    if (n_ != h_) {
+      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
+       "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
+    }
+    if (r_ > 0) {
+      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
+       "Found r = " + std::to_string(r_));
+    }
+
+    uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
+    uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
+    int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
+    curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
+    if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
+      ++curr_items_alloc_;
+    }
+  } else { // n_ > k_
+    if (preamble_longs != PREAMBLE_LONGS_FULL) { 
+      throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
+       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+    }
+    if (h_ + r_ != k_) {
+      throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
+       "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
+    }
+
+    curr_items_alloc_ = k_ + 1;
+  }
+}
+
+
+template<typename T, typename S, typename A>
 subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(std::function<bool(T)> predicate) const {
   if (n_ == 0) {
     return {0.0, 0.0, 0.0, 0.0};
diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp
index 9fe2323..77845a0 100644
--- a/sampling/include/var_opt_union_impl.hpp
+++ b/sampling/include/var_opt_union_impl.hpp
@@ -23,6 +23,7 @@
 #include "var_opt_union.hpp"
 
 #include <cmath>
+#include <sstream>
 
 namespace datasketches {
 
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index 203cfc9..d78d3b6 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -17,15 +17,18 @@
  * under the License.
  */
 
-#include <cppunit/TestFixture.h>
-#include <cppunit/extensions/HelperMacros.h>
-
 #include <var_opt_sketch.hpp>
 #include <var_opt_union.hpp>
 
+#include <cppunit/TestFixture.h>
+#include <cppunit/extensions/HelperMacros.h>
+
+#include <vector>
 #include <string>
 #include <sstream>
 #include <iostream>
+#include <cmath>
+#include <random>
 
 #ifdef TEST_BINARY_INPUT_PATH
 static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -37,11 +40,272 @@ namespace datasketches {
 
 class var_opt_sketch_test: public CppUnit::TestFixture {
 
+  static constexpr double EPS = 1e-10;
+
   CPPUNIT_TEST_SUITE(var_opt_sketch_test);
-  CPPUNIT_TEST(empty);
-  CPPUNIT_TEST(vo_union);
+  CPPUNIT_TEST(invalid_k);
+  CPPUNIT_TEST(bad_ser_ver);
+  CPPUNIT_TEST(bad_family);
+  CPPUNIT_TEST(bad_prelongs);
+  CPPUNIT_TEST(malformed_preamble);
+  CPPUNIT_TEST(empty_sketch);
+  CPPUNIT_TEST(non_empty_degenerate_sketch);
+  CPPUNIT_TEST(invalid_weight);
+  CPPUNIT_TEST(corrupt_serialized_weight);
+  CPPUNIT_TEST(cumulative_weight);
+  // CPPUNIT_TEST(under_full_sketch_serialization);
+  // CPPUNIT_TEST(end_of_warmup_sketch_serialization);
+  // CPPUNIT_TEST(full_sketch_serialization);
+  // CPPUNIT_TEST(pseudo_light_update);
+  // CPPUNIT_TEST(pseudo_heavy_update);
+  // CPPUNIT_TEST(decrease_k_with_under_full_sketch);
+  // CPPUNIT_TEST(decrease_k_with_full_sketch);
+  // CPPUNIT_TEST(reset);
+  // CPPUNIT_TEST(estimate_subset_sum);
+  // CPPUNIT_TEST(binary_compatibility);
+  
+  // CPPUNIT_TEST(empty);
+  // CPPUNIT_TEST(vo_union);
   CPPUNIT_TEST_SUITE_END();
 
+  var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
+    var_opt_sketch<int> sk(k);
+    for (uint64_t i = 0; i < n; ++i) {
+      sk.update(i, 1.0);
+    }
+    return sk;
+  }
+
+  void invalid_k() {
+    std::cerr << "start invalid_k()" << std::endl;
+    {
+    CPPUNIT_ASSERT_THROW_MESSAGE("constructor failed to catch invalid k = 0",
+      var_opt_sketch<int> sk(0),
+      std::invalid_argument);
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("constructor failed to catch invalid k < 0 (aka >= 2^31)",
+      var_opt_sketch<int> sk(1<<31),
+      std::invalid_argument);
+    }
+    std::cerr << "end invalid_k()" << std::endl;
+  }
+
+  void bad_ser_ver() {
+    std::cerr << "start bad_ser_ver()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
+    std::vector<uint8_t> bytes = sk.serialize();
+    bytes[1] = 0; // corrupt the serialization version byte
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad serialization version",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    // create a stringstream to check the same
+    std::stringstream ss;
+    std::string str(bytes.begin(), bytes.end());
+    ss.str(str);
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(stream) failed to catch bad serialization version",
+      var_opt_sketch<int>::deserialize(ss),
+      std::invalid_argument);
+    }
+    std::cerr << "end bad_ser_ver()" << std::endl;
+  }
+
+  void bad_family() {
+    std::cerr << "start bad_family()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
+    std::vector<uint8_t> bytes = sk.serialize();
+    bytes[2] = 0; // corrupt the family byte
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad family id",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    std::stringstream ss;
+    std::string str(bytes.begin(), bytes.end());
+    ss.str(str);
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(stream) failed to catch bad family id",
+      var_opt_sketch<int>::deserialize(ss),
+      std::invalid_argument);
+    }
+    std::cerr << "end bad_family()" << std::endl;
+  }
+
+  void bad_prelongs() {
+    std::cerr << "start bad_prelongs()" << std::endl;
+    {
+    // The nubmer of preamble longs shares bits with resize_factor, but the latter
+    // has no invalid values as it gets 2 bites for 4 enum values.
+    var_opt_sketch<int> sk = create_unweighted_sketch(32, 33);
+    std::vector<uint8_t> bytes = sk.serialize();
+
+    bytes[0] = 0; // corrupt the preamble longs byte to be too small
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    bytes[0] = 2; // corrupt the preamble longs byte to 2
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    bytes[0] = 5; // corrupt the preamble longs byte to be too large
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+    }
+    std::cerr << "end bad_prelongs()" << std::endl;
+  }
+
+  void malformed_preamble() {
+    std::cerr << "start malformed_preamble()" << std::endl;
+    {
+    uint32_t k = 50;
+    var_opt_sketch<int> sk = create_unweighted_sketch(k, k);
+    const std::vector<uint8_t> src_bytes = sk.serialize();
+
+    // we'll re-use the same bytes several times so we'll use copies
+    std::vector<uint8_t> bytes(src_bytes);
+
+    // no items in R, but preamble longs indicates full
+    bytes[0] = 4; // PREAMBLE_LONGS_FULL
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch sampling mode with no R items",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    // k = 0
+    bytes = src_bytes; 
+    *reinterpret_cast<int32_t*>(&bytes[4]) = 0;
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch sampling mode with k = 0",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    // negative H region count in Java (signed ints)
+    bytes = src_bytes; 
+    *reinterpret_cast<int32_t*>(&bytes[16]) = -1;
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch invalid H count",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    // negative R region count in Java (signed ints)
+    bytes = src_bytes; 
+    *reinterpret_cast<int32_t*>(&bytes[20]) = -128;
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch invalid R count",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+    }
+    std::cerr << "end malformed_preamble()" << std::endl;
+  }
+
+  void empty_sketch() {
+    std::cerr << "start empty_sketch()" << std::endl;
+    {
+    var_opt_sketch<std::string> sk(5);
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
+    CPPUNIT_ASSERT_EQUAL((uint32_t) 0, sk.get_num_samples());
+
+    std::vector<uint8_t> bytes = sk.serialize();
+    CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1 << 3), bytes.size()); // num bytes in PREAMBLE_LONGS_EMPTY
+
+    var_opt_sketch<std::string> loaded_sk = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 0, loaded_sk.get_n());
+    CPPUNIT_ASSERT_EQUAL((uint32_t) 0, loaded_sk.get_num_samples());
+    }
+    std::cerr << "end empty_sketch()" << std::endl;
+  }
+
+  void non_empty_degenerate_sketch() {
+    std::cerr << "start non_empty_degenerate_sketch()" << std::endl;
+    {
+    // Make an empty serialized sketch, then extend it to a
+    // PREAMBLE_LONGS_WARMUP-sized byte array, with no items.
+    // Then clear the empty flag so it will try to load the rest.
+    var_opt_sketch<std::string> sk(12, resize_factor::X2);
+    std::vector<uint8_t> bytes = sk.serialize();
+    while (bytes.size() < 24) { // PREAMBLE_LONGS_WARMUP * 8
+      bytes.push_back((uint8_t) 0);
+    }
+  
+    // ensure non-empty -- H and R region sizes already set to 0
+    bytes[3] = 0; // set flags bit to not-empty (other bits should already be 0)
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch non-empty sketch with no items",
+      var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+    }
+    std::cerr << "end non_empty_degenerate_sketch()" << std::endl;
+  }
+
+  void invalid_weight() {
+    std::cerr << "start invalid_weights()" << std::endl;
+    {
+    var_opt_sketch<std::string> sk(100, resize_factor::X2);
+    CPPUNIT_ASSERT_THROW_MESSAGE("update() accepted a negative weight",
+      sk.update("invalid_weight", -1.0),
+      std::invalid_argument);
+    }
+    std::cerr << "end invalid_weights()" << std::endl;
+  }
+
+  void corrupt_serialized_weight() {
+    std::cerr << "start corrupt_serialized_weight()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(100, 20);
+    auto bytes = sk.to_string();
+    
+    // weights are in the first double after the preamble
+    size_t preamble_bytes = (bytes[0] & 0x3f) << 3;
+    *reinterpret_cast<double*>(&bytes[preamble_bytes]) = -1.5;
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch negative item weight",
+      var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+    for (auto& b : bytes) { ss >> b; }
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch negative item weight",
+      var_opt_sketch<std::string>::deserialize(ss),
+      std::invalid_argument);
+    }
+    std::cerr << "end corrupt_serialized_weight()" << std::endl;
+  }
+
+  void cumulative_weight() {
+    std::cerr << "start cumulative_weight()" << std::endl;
+    {
+    uint32_t k = 256;
+    uint64_t n = 10 * k;
+    var_opt_sketch<int> sk(k);
+
+    std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
+    std::mt19937_64 rand(rd());
+    std::normal_distribution<double> N(0.0, 1.0);
+
+    double input_sum = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+      // generate weights aboev and below 1.0 using w ~ exp(5*N(0,1))
+      // which covers about 10 orders of magnitude
+
+      double w = std::exp(5 * N(rand));
+      input_sum += w;
+      sk.update(i, w);
+    }
+
+    double output_sum = 0.0;
+    for (auto& it : sk) { // pair<int, weight>
+      output_sum += it.second;
+    }
+    
+    double weight_ratio = output_sum / input_sum;
+    CPPUNIT_ASSERT(std::abs(weight_ratio - 1.0) < EPS);
+    }
+    std::cerr << "end cumulative_weight()" << std::endl;
+  }
+
+  /**********************************************************************/
+
   void vo_union() {
     int k = 10;
     var_opt_sketch<int> sk(k), sk2(k+3);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org