You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/08 08:43:18 UTC

[incubator-datasketches-cpp] 01/02: [WIP] adding var_opt_sketch tests, fixing issues along the way

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit dfdd0406aba810de70601e541d3378c47217c039
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Wed Feb 5 12:47:15 2020 -0800

    [WIP] adding var_opt_sketch tests, fixing issues along the way
---
 sampling/include/var_opt_sketch.hpp      |   8 +-
 sampling/include/var_opt_sketch_impl.hpp | 153 ++++++++---------
 sampling/include/var_opt_union_impl.hpp  |   1 +
 sampling/test/var_opt_sketch_test.cpp    | 274 ++++++++++++++++++++++++++++++-
 4 files changed, 349 insertions(+), 87 deletions(-)

diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index bf22a42..da4f339 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -44,14 +44,16 @@ struct subset_summary {
   const double total_sketch_weight;
 };
 
+enum resize_factor { X1 = 0, X2, X4, X8 };
+
 template <typename T, typename S, typename A> class var_opt_union; // forward declaration
 
 template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
 class var_opt_sketch {
 
   public:
-    enum resize_factor { X1 = 0, X2, X4, X8 };
     static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
+    static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
 
     var_opt_sketch(uint32_t k, resize_factor rf = DEFAULT_RESIZE_FACTOR);
     var_opt_sketch(const var_opt_sketch& other);
@@ -188,7 +190,9 @@ class var_opt_sketch {
     // validation
     static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
     static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
-    
+    // next method sets current_items_alloc_
+    void validate_and_set_current_size(int preamble_longs);
+
     // things to move to common utils and share among sketches
     static uint32_t get_adjusted_size(int max_size, int resize_target);
     static uint32_t starting_sub_multiple(int lg_target, int lg_rf, int lg_min);
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 26d4af3..9810242 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -25,7 +25,7 @@
 #include "bounds_binomial_proportions.hpp"
 
 #include <memory>
-#include <iostream>
+#include <sstream>
 #include <cmath>
 #include <random>
 #include <algorithm>
@@ -140,8 +140,8 @@ var_opt_sketch<T,S,A>::var_opt_sketch(var_opt_sketch&& other) noexcept :
 template<typename T, typename S, typename A>
 var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget) :
   k_(k), h_(0), m_(0), r_(0), n_(0), total_wt_r_(0.0), rf_(rf) {
-  if (k < 1) {
-    throw std::invalid_argument("k must be at least 1");
+  if (k == 0 || k_ > MAX_K) {
+    throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
   }
 
   uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
@@ -169,18 +169,20 @@ var_opt_sketch<T,S,A>::~var_opt_sketch() {
         A().destroy(data_+ i);
       }
     
-      for (size_t i = h_ + 1; i < curr_items_alloc_; ++i) {
+      for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
         A().destroy(data_ + i);
       }
     }
     A().deallocate(data_, curr_items_alloc_);
   }
 
-  if (weights_ != nullptr)
+  if (weights_ != nullptr) {
     AllocDouble().deallocate(weights_, curr_items_alloc_);
+  }
   
-  if (marks_ != nullptr)
+  if (marks_ != nullptr) {
     AllocBool().deallocate(marks_, curr_items_alloc_);
+  }
 }
 
 template<typename T, typename S, typename A>
@@ -229,52 +231,27 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
   is.read((char*)&h_, sizeof(uint32_t));
   is.read((char*)&r_, sizeof(uint32_t));
 
-  // initial data validation
-  // read fourth prelong, if needed, and determine vector sizes
-  if (n_ <= k_) {
-    if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
-      throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
-       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
-    }
-    if (n_ != h_) {
-      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
-       "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
-    }
-    if (r_ > 0) {
-      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
-       "Found r = " + std::to_string(r_));
-    }
-
-    uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
-    uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
-    int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
-    curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
-    if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
-      ++curr_items_alloc_;
-    }
-  } else { // n_ > k_
-    if (preamble_longs != PREAMBLE_LONGS_FULL) { 
-      throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
-       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
-    }
-    if (h_ + r_ != k_) {
-      throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
-       "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
-    }
+  validate_and_set_current_size(preamble_longs);
 
+  // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
+  if (preamble_longs == PREAMBLE_LONGS_FULL) { 
     is.read((char*)&total_wt_r_, sizeof(total_wt_r_));
     if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
       throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
        "Found r = " + std::to_string(r_) + ", R region weight = " + std::to_string(total_wt_r_));
     }
-
-    curr_items_alloc_ = k_ + 1;
   }
 
   allocate_data_arrays(curr_items_alloc_, is_gadget);
 
   // read the first h_ weights
   is.read((char*)weights_, h_ * sizeof(double));
+  for (size_t i = 0; i < h_; ++i) {
+    if (!(weights_[i] > 0.0)) {
+      throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(weights_[i]));
+    }
+  }
+
   std::fill(&weights_[h_], &weights_[curr_items_alloc_], -1.0);
 
   // read the first h_ marks as packed bytes iff we have a gadget
@@ -306,52 +283,26 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
   ptr += copy_from_mem(ptr, &h_, sizeof(h_));
   ptr += copy_from_mem(ptr, &r_, sizeof(r_));
 
-  // initial data validation
-  // read fourth prelong, if needed, and determine vector sizes
-  if (n_ <= k_) {
-    if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
-      throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
-       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
-    }
-    if (n_ != h_) {
-      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
-       "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
-    }
-    if (r_ > 0) {
-      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
-       "Found r = " + std::to_string(r_));
-    }
-
-    uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
-    uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
-    int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
-    curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
-    if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
-      ++curr_items_alloc_;
-    }
-  } else { // n_ > k_
-    if (preamble_longs != PREAMBLE_LONGS_FULL) { 
-      throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
-       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
-    }
-    if (h_ + r_ != k_) {
-      throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
-       "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
-    }
-
+  validate_and_set_current_size(preamble_longs);
+  
+  // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
+  if (preamble_longs == PREAMBLE_LONGS_FULL) {
     ptr += copy_from_mem(ptr, &total_wt_r_, sizeof(total_wt_r_));
     if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
       throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
        "Found r = " + std::to_string(r_) + ", R region weight = " + std::to_string(total_wt_r_));
     }
-
-    curr_items_alloc_ = k_ + 1;
   }
 
   allocate_data_arrays(curr_items_alloc_, is_gadget);
 
   // read the first h_ weights, fill in rest of array with -1.0
   ptr += copy_from_mem(ptr, weights_, h_ * sizeof(double));
+  for (size_t i = 0; i < h_; ++i) {
+    if (!(weights_[i] > 0.0)) {
+      throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(weights_[i]));
+    }
+  }
   std::fill(&weights_[h_], &weights_[curr_items_alloc_], -1.0);
 
   // read the first h_ marks as packed bytes iff we have a gadget
@@ -668,7 +619,7 @@ void var_opt_sketch<T,S,A>::reset() {
     if (marks_ != nullptr)
     AllocBool().deallocate(marks_, curr_items_alloc_);
 
-    allocate_data_ararys(curr_items_alloc_, is_gadget);
+    allocate_data_arrays(curr_items_alloc_, is_gadget);
   } else {
     filled_data_ = false;
   }
@@ -736,7 +687,7 @@ std::string var_opt_sketch<T,S,A>::to_string() const {
 template<typename T, typename S, typename A>
 void var_opt_sketch<T,S,A>::update(const T& item, double weight, bool mark) {
   if (weight <= 0.0) { 
-    throw std::invalid_argument("Items weights must be strictly positive. Found: "
+    throw std::invalid_argument("Item weights must be strictly positive. Found: "
                                 + std::to_string(weight));
   }
   ++n_;
@@ -1268,17 +1219,17 @@ void var_opt_sketch<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t
   
   if (is_empty) {
     if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
-      throw new std::invalid_argument("Possible corruption: Preamble longs must be "
+      throw std::invalid_argument("Possible corruption: Preamble longs must be "
         + std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
         + std::to_string(preamble_longs));
     }
   } else {
     if (preamble_longs != PREAMBLE_LONGS_WARMUP
         && preamble_longs != PREAMBLE_LONGS_FULL) {
-      throw new std::invalid_argument("Possible corruption: Preamble longs must be "
+      throw std::invalid_argument("Possible corruption: Preamble longs must be "
         + std::to_string(PREAMBLE_LONGS_WARMUP) + " or "
         + std::to_string(PREAMBLE_LONGS_FULL)
-        + "for a non-empty sketch. Found: " + std::to_string(preamble_longs));
+        + " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
     }
   }
 }
@@ -1299,6 +1250,48 @@ void var_opt_sketch<T,S,A>::check_family_and_serialization_version(uint8_t famil
 }
 
 template<typename T, typename S, typename A>
+void var_opt_sketch<T, S, A>::validate_and_set_current_size(int preamble_longs) {
+  if (k_ == 0 || k_ > MAX_K) {
+    throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
+  }
+
+  if (n_ <= k_) {
+    if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
+      throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
+       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+    }
+    if (n_ != h_) {
+      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
+       "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
+    }
+    if (r_ > 0) {
+      throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
+       "Found r = " + std::to_string(r_));
+    }
+
+    uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
+    uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
+    int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
+    curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
+    if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
+      ++curr_items_alloc_;
+    }
+  } else { // n_ > k_
+    if (preamble_longs != PREAMBLE_LONGS_FULL) { 
+      throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
+       "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+    }
+    if (h_ + r_ != k_) {
+      throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
+       "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
+    }
+
+    curr_items_alloc_ = k_ + 1;
+  }
+}
+
+
+template<typename T, typename S, typename A>
 subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(std::function<bool(T)> predicate) const {
   if (n_ == 0) {
     return {0.0, 0.0, 0.0, 0.0};
diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp
index 9fe2323..77845a0 100644
--- a/sampling/include/var_opt_union_impl.hpp
+++ b/sampling/include/var_opt_union_impl.hpp
@@ -23,6 +23,7 @@
 #include "var_opt_union.hpp"
 
 #include <cmath>
+#include <sstream>
 
 namespace datasketches {
 
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index 203cfc9..d78d3b6 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -17,15 +17,18 @@
  * under the License.
  */
 
-#include <cppunit/TestFixture.h>
-#include <cppunit/extensions/HelperMacros.h>
-
 #include <var_opt_sketch.hpp>
 #include <var_opt_union.hpp>
 
+#include <cppunit/TestFixture.h>
+#include <cppunit/extensions/HelperMacros.h>
+
+#include <vector>
 #include <string>
 #include <sstream>
 #include <iostream>
+#include <cmath>
+#include <random>
 
 #ifdef TEST_BINARY_INPUT_PATH
 static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -37,11 +40,272 @@ namespace datasketches {
 
 class var_opt_sketch_test: public CppUnit::TestFixture {
 
+  static constexpr double EPS = 1e-10;
+
   CPPUNIT_TEST_SUITE(var_opt_sketch_test);
-  CPPUNIT_TEST(empty);
-  CPPUNIT_TEST(vo_union);
+  CPPUNIT_TEST(invalid_k);
+  CPPUNIT_TEST(bad_ser_ver);
+  CPPUNIT_TEST(bad_family);
+  CPPUNIT_TEST(bad_prelongs);
+  CPPUNIT_TEST(malformed_preamble);
+  CPPUNIT_TEST(empty_sketch);
+  CPPUNIT_TEST(non_empty_degenerate_sketch);
+  CPPUNIT_TEST(invalid_weight);
+  CPPUNIT_TEST(corrupt_serialized_weight);
+  CPPUNIT_TEST(cumulative_weight);
+  // CPPUNIT_TEST(under_full_sketch_serialization);
+  // CPPUNIT_TEST(end_of_warmup_sketch_serialization);
+  // CPPUNIT_TEST(full_sketch_serialization);
+  // CPPUNIT_TEST(pseudo_light_update);
+  // CPPUNIT_TEST(pseudo_heavy_update);
+  // CPPUNIT_TEST(decrease_k_with_under_full_sketch);
+  // CPPUNIT_TEST(decrease_k_with_full_sketch);
+  // CPPUNIT_TEST(reset);
+  // CPPUNIT_TEST(estimate_subset_sum);
+  // CPPUNIT_TEST(binary_compatibility);
+  
+  // CPPUNIT_TEST(empty);
+  // CPPUNIT_TEST(vo_union);
   CPPUNIT_TEST_SUITE_END();
 
+  var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
+    var_opt_sketch<int> sk(k);
+    for (uint64_t i = 0; i < n; ++i) {
+      sk.update(i, 1.0);
+    }
+    return sk;
+  }
+
+  void invalid_k() {
+    std::cerr << "start invalid_k()" << std::endl;
+    {
+    CPPUNIT_ASSERT_THROW_MESSAGE("constructor failed to catch invalid k = 0",
+      var_opt_sketch<int> sk(0),
+      std::invalid_argument);
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("constructor failed to catch invalid k < 0 (aka >= 2^31)",
+      var_opt_sketch<int> sk(1<<31),
+      std::invalid_argument);
+    }
+    std::cerr << "end invalid_k()" << std::endl;
+  }
+
+  void bad_ser_ver() {
+    std::cerr << "start bad_ser_ver()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
+    std::vector<uint8_t> bytes = sk.serialize();
+    bytes[1] = 0; // corrupt the serialization version byte
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad serialization version",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    // create a stringstream to check the same
+    std::stringstream ss;
+    std::string str(bytes.begin(), bytes.end());
+    ss.str(str);
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(stream) failed to catch bad serialization version",
+      var_opt_sketch<int>::deserialize(ss),
+      std::invalid_argument);
+    }
+    std::cerr << "end bad_ser_ver()" << std::endl;
+  }
+
+  void bad_family() {
+    std::cerr << "start bad_family()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
+    std::vector<uint8_t> bytes = sk.serialize();
+    bytes[2] = 0; // corrupt the family byte
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad family id",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    std::stringstream ss;
+    std::string str(bytes.begin(), bytes.end());
+    ss.str(str);
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(stream) failed to catch bad family id",
+      var_opt_sketch<int>::deserialize(ss),
+      std::invalid_argument);
+    }
+    std::cerr << "end bad_family()" << std::endl;
+  }
+
+  void bad_prelongs() {
+    std::cerr << "start bad_prelongs()" << std::endl;
+    {
+    // The nubmer of preamble longs shares bits with resize_factor, but the latter
+    // has no invalid values as it gets 2 bites for 4 enum values.
+    var_opt_sketch<int> sk = create_unweighted_sketch(32, 33);
+    std::vector<uint8_t> bytes = sk.serialize();
+
+    bytes[0] = 0; // corrupt the preamble longs byte to be too small
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    bytes[0] = 2; // corrupt the preamble longs byte to 2
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    bytes[0] = 5; // corrupt the preamble longs byte to be too large
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+    }
+    std::cerr << "end bad_prelongs()" << std::endl;
+  }
+
+  void malformed_preamble() {
+    std::cerr << "start malformed_preamble()" << std::endl;
+    {
+    uint32_t k = 50;
+    var_opt_sketch<int> sk = create_unweighted_sketch(k, k);
+    const std::vector<uint8_t> src_bytes = sk.serialize();
+
+    // we'll re-use the same bytes several times so we'll use copies
+    std::vector<uint8_t> bytes(src_bytes);
+
+    // no items in R, but preamble longs indicates full
+    bytes[0] = 4; // PREAMBLE_LONGS_FULL
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch sampling mode with no R items",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    // k = 0
+    bytes = src_bytes; 
+    *reinterpret_cast<int32_t*>(&bytes[4]) = 0;
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch sampling mode with k = 0",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    // negative H region count in Java (signed ints)
+    bytes = src_bytes; 
+    *reinterpret_cast<int32_t*>(&bytes[16]) = -1;
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch invalid H count",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    // negative R region count in Java (signed ints)
+    bytes = src_bytes; 
+    *reinterpret_cast<int32_t*>(&bytes[20]) = -128;
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch invalid R count",
+      var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+    }
+    std::cerr << "end malformed_preamble()" << std::endl;
+  }
+
+  void empty_sketch() {
+    std::cerr << "start empty_sketch()" << std::endl;
+    {
+    var_opt_sketch<std::string> sk(5);
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
+    CPPUNIT_ASSERT_EQUAL((uint32_t) 0, sk.get_num_samples());
+
+    std::vector<uint8_t> bytes = sk.serialize();
+    CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1 << 3), bytes.size()); // num bytes in PREAMBLE_LONGS_EMPTY
+
+    var_opt_sketch<std::string> loaded_sk = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 0, loaded_sk.get_n());
+    CPPUNIT_ASSERT_EQUAL((uint32_t) 0, loaded_sk.get_num_samples());
+    }
+    std::cerr << "end empty_sketch()" << std::endl;
+  }
+
+  void non_empty_degenerate_sketch() {
+    std::cerr << "start non_empty_degenerate_sketch()" << std::endl;
+    {
+    // Make an empty serialized sketch, then extend it to a
+    // PREAMBLE_LONGS_WARMUP-sized byte array, with no items.
+    // Then clear the empty flag so it will try to load the rest.
+    var_opt_sketch<std::string> sk(12, resize_factor::X2);
+    std::vector<uint8_t> bytes = sk.serialize();
+    while (bytes.size() < 24) { // PREAMBLE_LONGS_WARMUP * 8
+      bytes.push_back((uint8_t) 0);
+    }
+  
+    // ensure non-empty -- H and R region sizes already set to 0
+    bytes[3] = 0; // set flags bit to not-empty (other bits should already be 0)
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch non-empty sketch with no items",
+      var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+    }
+    std::cerr << "end non_empty_degenerate_sketch()" << std::endl;
+  }
+
+  void invalid_weight() {
+    std::cerr << "start invalid_weights()" << std::endl;
+    {
+    var_opt_sketch<std::string> sk(100, resize_factor::X2);
+    CPPUNIT_ASSERT_THROW_MESSAGE("update() accepted a negative weight",
+      sk.update("invalid_weight", -1.0),
+      std::invalid_argument);
+    }
+    std::cerr << "end invalid_weights()" << std::endl;
+  }
+
+  void corrupt_serialized_weight() {
+    std::cerr << "start corrupt_serialized_weight()" << std::endl;
+    {
+    var_opt_sketch<int> sk = create_unweighted_sketch(100, 20);
+    auto bytes = sk.to_string();
+    
+    // weights are in the first double after the preamble
+    size_t preamble_bytes = (bytes[0] & 0x3f) << 3;
+    *reinterpret_cast<double*>(&bytes[preamble_bytes]) = -1.5;
+
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch negative item weight",
+      var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()),
+      std::invalid_argument);
+
+    std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+    for (auto& b : bytes) { ss >> b; }
+    CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch negative item weight",
+      var_opt_sketch<std::string>::deserialize(ss),
+      std::invalid_argument);
+    }
+    std::cerr << "end corrupt_serialized_weight()" << std::endl;
+  }
+
+  void cumulative_weight() {
+    std::cerr << "start cumulative_weight()" << std::endl;
+    {
+    uint32_t k = 256;
+    uint64_t n = 10 * k;
+    var_opt_sketch<int> sk(k);
+
+    std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
+    std::mt19937_64 rand(rd());
+    std::normal_distribution<double> N(0.0, 1.0);
+
+    double input_sum = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+      // generate weights aboev and below 1.0 using w ~ exp(5*N(0,1))
+      // which covers about 10 orders of magnitude
+
+      double w = std::exp(5 * N(rand));
+      input_sum += w;
+      sk.update(i, w);
+    }
+
+    double output_sum = 0.0;
+    for (auto& it : sk) { // pair<int, weight>
+      output_sum += it.second;
+    }
+    
+    double weight_ratio = output_sum / input_sum;
+    CPPUNIT_ASSERT(std::abs(weight_ratio - 1.0) < EPS);
+    }
+    std::cerr << "end cumulative_weight()" << std::endl;
+  }
+
+  /**********************************************************************/
+
   void vo_union() {
     int k = 10;
     var_opt_sketch<int> sk(k), sk2(k+3);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org