You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/08 08:43:18 UTC
[incubator-datasketches-cpp] 01/02: [WIP] adding var_opt_sketch
tests, fixing issues along the way
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit dfdd0406aba810de70601e541d3378c47217c039
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Wed Feb 5 12:47:15 2020 -0800
[WIP] adding var_opt_sketch tests, fixing issues along the way
---
sampling/include/var_opt_sketch.hpp | 8 +-
sampling/include/var_opt_sketch_impl.hpp | 153 ++++++++---------
sampling/include/var_opt_union_impl.hpp | 1 +
sampling/test/var_opt_sketch_test.cpp | 274 ++++++++++++++++++++++++++++++-
4 files changed, 349 insertions(+), 87 deletions(-)
diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index bf22a42..da4f339 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -44,14 +44,16 @@ struct subset_summary {
const double total_sketch_weight;
};
+enum resize_factor { X1 = 0, X2, X4, X8 };
+
template <typename T, typename S, typename A> class var_opt_union; // forward declaration
template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
class var_opt_sketch {
public:
- enum resize_factor { X1 = 0, X2, X4, X8 };
static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
+ static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
var_opt_sketch(uint32_t k, resize_factor rf = DEFAULT_RESIZE_FACTOR);
var_opt_sketch(const var_opt_sketch& other);
@@ -188,7 +190,9 @@ class var_opt_sketch {
// validation
static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
-
+ // next method sets current_items_alloc_
+ void validate_and_set_current_size(int preamble_longs);
+
// things to move to common utils and share among sketches
static uint32_t get_adjusted_size(int max_size, int resize_target);
static uint32_t starting_sub_multiple(int lg_target, int lg_rf, int lg_min);
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 26d4af3..9810242 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -25,7 +25,7 @@
#include "bounds_binomial_proportions.hpp"
#include <memory>
-#include <iostream>
+#include <sstream>
#include <cmath>
#include <random>
#include <algorithm>
@@ -140,8 +140,8 @@ var_opt_sketch<T,S,A>::var_opt_sketch(var_opt_sketch&& other) noexcept :
template<typename T, typename S, typename A>
var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget) :
k_(k), h_(0), m_(0), r_(0), n_(0), total_wt_r_(0.0), rf_(rf) {
- if (k < 1) {
- throw std::invalid_argument("k must be at least 1");
+ if (k == 0 || k_ > MAX_K) {
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
}
uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
@@ -169,18 +169,20 @@ var_opt_sketch<T,S,A>::~var_opt_sketch() {
A().destroy(data_+ i);
}
- for (size_t i = h_ + 1; i < curr_items_alloc_; ++i) {
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
A().destroy(data_ + i);
}
}
A().deallocate(data_, curr_items_alloc_);
}
- if (weights_ != nullptr)
+ if (weights_ != nullptr) {
AllocDouble().deallocate(weights_, curr_items_alloc_);
+ }
- if (marks_ != nullptr)
+ if (marks_ != nullptr) {
AllocBool().deallocate(marks_, curr_items_alloc_);
+ }
}
template<typename T, typename S, typename A>
@@ -229,52 +231,27 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
is.read((char*)&h_, sizeof(uint32_t));
is.read((char*)&r_, sizeof(uint32_t));
- // initial data validation
- // read fourth prelong, if needed, and determine vector sizes
- if (n_ <= k_) {
- if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
- throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
- "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
- }
- if (n_ != h_) {
- throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
- "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
- }
- if (r_ > 0) {
- throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
- "Found r = " + std::to_string(r_));
- }
-
- uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
- uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
- int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
- curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
- if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
- ++curr_items_alloc_;
- }
- } else { // n_ > k_
- if (preamble_longs != PREAMBLE_LONGS_FULL) {
- throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
- "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
- }
- if (h_ + r_ != k_) {
- throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
- "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
- }
+ validate_and_set_current_size(preamble_longs);
+ // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
+ if (preamble_longs == PREAMBLE_LONGS_FULL) {
is.read((char*)&total_wt_r_, sizeof(total_wt_r_));
if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
"Found r = " + std::to_string(r_) + ", R region weight = " + std::to_string(total_wt_r_));
}
-
- curr_items_alloc_ = k_ + 1;
}
allocate_data_arrays(curr_items_alloc_, is_gadget);
// read the first h_ weights
is.read((char*)weights_, h_ * sizeof(double));
+ for (size_t i = 0; i < h_; ++i) {
+ if (!(weights_[i] > 0.0)) {
+ throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(weights_[i]));
+ }
+ }
+
std::fill(&weights_[h_], &weights_[curr_items_alloc_], -1.0);
// read the first h_ marks as packed bytes iff we have a gadget
@@ -306,52 +283,26 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
ptr += copy_from_mem(ptr, &h_, sizeof(h_));
ptr += copy_from_mem(ptr, &r_, sizeof(r_));
- // initial data validation
- // read fourth prelong, if needed, and determine vector sizes
- if (n_ <= k_) {
- if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
- throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
- "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
- }
- if (n_ != h_) {
- throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
- "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
- }
- if (r_ > 0) {
- throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
- "Found r = " + std::to_string(r_));
- }
-
- uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
- uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
- int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
- curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
- if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
- ++curr_items_alloc_;
- }
- } else { // n_ > k_
- if (preamble_longs != PREAMBLE_LONGS_FULL) {
- throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
- "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
- }
- if (h_ + r_ != k_) {
- throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
- "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
- }
-
+ validate_and_set_current_size(preamble_longs);
+
+ // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
+ if (preamble_longs == PREAMBLE_LONGS_FULL) {
ptr += copy_from_mem(ptr, &total_wt_r_, sizeof(total_wt_r_));
if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
"Found r = " + std::to_string(r_) + ", R region weight = " + std::to_string(total_wt_r_));
}
-
- curr_items_alloc_ = k_ + 1;
}
allocate_data_arrays(curr_items_alloc_, is_gadget);
// read the first h_ weights, fill in rest of array with -1.0
ptr += copy_from_mem(ptr, weights_, h_ * sizeof(double));
+ for (size_t i = 0; i < h_; ++i) {
+ if (!(weights_[i] > 0.0)) {
+ throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(weights_[i]));
+ }
+ }
std::fill(&weights_[h_], &weights_[curr_items_alloc_], -1.0);
// read the first h_ marks as packed bytes iff we have a gadget
@@ -668,7 +619,7 @@ void var_opt_sketch<T,S,A>::reset() {
if (marks_ != nullptr)
AllocBool().deallocate(marks_, curr_items_alloc_);
- allocate_data_ararys(curr_items_alloc_, is_gadget);
+ allocate_data_arrays(curr_items_alloc_, is_gadget);
} else {
filled_data_ = false;
}
@@ -736,7 +687,7 @@ std::string var_opt_sketch<T,S,A>::to_string() const {
template<typename T, typename S, typename A>
void var_opt_sketch<T,S,A>::update(const T& item, double weight, bool mark) {
if (weight <= 0.0) {
- throw std::invalid_argument("Items weights must be strictly positive. Found: "
+ throw std::invalid_argument("Item weights must be strictly positive. Found: "
+ std::to_string(weight));
}
++n_;
@@ -1268,17 +1219,17 @@ void var_opt_sketch<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t
if (is_empty) {
if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
- throw new std::invalid_argument("Possible corruption: Preamble longs must be "
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
+ std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
+ std::to_string(preamble_longs));
}
} else {
if (preamble_longs != PREAMBLE_LONGS_WARMUP
&& preamble_longs != PREAMBLE_LONGS_FULL) {
- throw new std::invalid_argument("Possible corruption: Preamble longs must be "
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
+ std::to_string(PREAMBLE_LONGS_WARMUP) + " or "
+ std::to_string(PREAMBLE_LONGS_FULL)
- + "for a non-empty sketch. Found: " + std::to_string(preamble_longs));
+ + " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
}
}
}
@@ -1299,6 +1250,48 @@ void var_opt_sketch<T,S,A>::check_family_and_serialization_version(uint8_t famil
}
template<typename T, typename S, typename A>
+void var_opt_sketch<T, S, A>::validate_and_set_current_size(int preamble_longs) {
+ if (k_ == 0 || k_ > MAX_K) {
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
+ }
+
+ if (n_ <= k_) {
+ if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
+ throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
+ "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+ }
+ if (n_ != h_) {
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
+ "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
+ }
+ if (r_ > 0) {
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
+ "Found r = " + std::to_string(r_));
+ }
+
+ uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
+ uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
+ int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
+ ++curr_items_alloc_;
+ }
+ } else { // n_ > k_
+ if (preamble_longs != PREAMBLE_LONGS_FULL) {
+ throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
+ "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+ }
+ if (h_ + r_ != k_) {
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
+ "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
+ }
+
+ curr_items_alloc_ = k_ + 1;
+ }
+}
+
+
+template<typename T, typename S, typename A>
subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(std::function<bool(T)> predicate) const {
if (n_ == 0) {
return {0.0, 0.0, 0.0, 0.0};
diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp
index 9fe2323..77845a0 100644
--- a/sampling/include/var_opt_union_impl.hpp
+++ b/sampling/include/var_opt_union_impl.hpp
@@ -23,6 +23,7 @@
#include "var_opt_union.hpp"
#include <cmath>
+#include <sstream>
namespace datasketches {
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index 203cfc9..d78d3b6 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -17,15 +17,18 @@
* under the License.
*/
-#include <cppunit/TestFixture.h>
-#include <cppunit/extensions/HelperMacros.h>
-
#include <var_opt_sketch.hpp>
#include <var_opt_union.hpp>
+#include <cppunit/TestFixture.h>
+#include <cppunit/extensions/HelperMacros.h>
+
+#include <vector>
#include <string>
#include <sstream>
#include <iostream>
+#include <cmath>
+#include <random>
#ifdef TEST_BINARY_INPUT_PATH
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -37,11 +40,272 @@ namespace datasketches {
class var_opt_sketch_test: public CppUnit::TestFixture {
+ static constexpr double EPS = 1e-10;
+
CPPUNIT_TEST_SUITE(var_opt_sketch_test);
- CPPUNIT_TEST(empty);
- CPPUNIT_TEST(vo_union);
+ CPPUNIT_TEST(invalid_k);
+ CPPUNIT_TEST(bad_ser_ver);
+ CPPUNIT_TEST(bad_family);
+ CPPUNIT_TEST(bad_prelongs);
+ CPPUNIT_TEST(malformed_preamble);
+ CPPUNIT_TEST(empty_sketch);
+ CPPUNIT_TEST(non_empty_degenerate_sketch);
+ CPPUNIT_TEST(invalid_weight);
+ CPPUNIT_TEST(corrupt_serialized_weight);
+ CPPUNIT_TEST(cumulative_weight);
+ // CPPUNIT_TEST(under_full_sketch_serialization);
+ // CPPUNIT_TEST(end_of_warmup_sketch_serialization);
+ // CPPUNIT_TEST(full_sketch_serialization);
+ // CPPUNIT_TEST(pseudo_light_update);
+ // CPPUNIT_TEST(pseudo_heavy_update);
+ // CPPUNIT_TEST(decrease_k_with_under_full_sketch);
+ // CPPUNIT_TEST(decrease_k_with_full_sketch);
+ // CPPUNIT_TEST(reset);
+ // CPPUNIT_TEST(estimate_subset_sum);
+ // CPPUNIT_TEST(binary_compatibility);
+
+ // CPPUNIT_TEST(empty);
+ // CPPUNIT_TEST(vo_union);
CPPUNIT_TEST_SUITE_END();
+ var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
+ var_opt_sketch<int> sk(k);
+ for (uint64_t i = 0; i < n; ++i) {
+ sk.update(i, 1.0);
+ }
+ return sk;
+ }
+
+ void invalid_k() {
+ std::cerr << "start invalid_k()" << std::endl;
+ {
+ CPPUNIT_ASSERT_THROW_MESSAGE("constructor failed to catch invalid k = 0",
+ var_opt_sketch<int> sk(0),
+ std::invalid_argument);
+
+ CPPUNIT_ASSERT_THROW_MESSAGE("constructor failed to catch invalid k < 0 (aka >= 2^31)",
+ var_opt_sketch<int> sk(1<<31),
+ std::invalid_argument);
+ }
+ std::cerr << "end invalid_k()" << std::endl;
+ }
+
+ void bad_ser_ver() {
+ std::cerr << "start bad_ser_ver()" << std::endl;
+ {
+ var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
+ std::vector<uint8_t> bytes = sk.serialize();
+ bytes[1] = 0; // corrupt the serialization version byte
+
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad serialization version",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+
+ // create a stringstream to check the same
+ std::stringstream ss;
+ std::string str(bytes.begin(), bytes.end());
+ ss.str(str);
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(stream) failed to catch bad serialization version",
+ var_opt_sketch<int>::deserialize(ss),
+ std::invalid_argument);
+ }
+ std::cerr << "end bad_ser_ver()" << std::endl;
+ }
+
+ void bad_family() {
+ std::cerr << "start bad_family()" << std::endl;
+ {
+ var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
+ std::vector<uint8_t> bytes = sk.serialize();
+ bytes[2] = 0; // corrupt the family byte
+
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad family id",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+
+ std::stringstream ss;
+ std::string str(bytes.begin(), bytes.end());
+ ss.str(str);
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(stream) failed to catch bad family id",
+ var_opt_sketch<int>::deserialize(ss),
+ std::invalid_argument);
+ }
+ std::cerr << "end bad_family()" << std::endl;
+ }
+
+ void bad_prelongs() {
+ std::cerr << "start bad_prelongs()" << std::endl;
+ {
+ // The nubmer of preamble longs shares bits with resize_factor, but the latter
+ // has no invalid values as it gets 2 bites for 4 enum values.
+ var_opt_sketch<int> sk = create_unweighted_sketch(32, 33);
+ std::vector<uint8_t> bytes = sk.serialize();
+
+ bytes[0] = 0; // corrupt the preamble longs byte to be too small
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+
+ bytes[0] = 2; // corrupt the preamble longs byte to 2
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+
+ bytes[0] = 5; // corrupt the preamble longs byte to be too large
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+ }
+ std::cerr << "end bad_prelongs()" << std::endl;
+ }
+
+ void malformed_preamble() {
+ std::cerr << "start malformed_preamble()" << std::endl;
+ {
+ uint32_t k = 50;
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k);
+ const std::vector<uint8_t> src_bytes = sk.serialize();
+
+ // we'll re-use the same bytes several times so we'll use copies
+ std::vector<uint8_t> bytes(src_bytes);
+
+ // no items in R, but preamble longs indicates full
+ bytes[0] = 4; // PREAMBLE_LONGS_FULL
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch sampling mode with no R items",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+
+ // k = 0
+ bytes = src_bytes;
+ *reinterpret_cast<int32_t*>(&bytes[4]) = 0;
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch sampling mode with k = 0",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+
+ // negative H region count in Java (signed ints)
+ bytes = src_bytes;
+ *reinterpret_cast<int32_t*>(&bytes[16]) = -1;
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch invalid H count",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+
+ // negative R region count in Java (signed ints)
+ bytes = src_bytes;
+ *reinterpret_cast<int32_t*>(&bytes[20]) = -128;
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch invalid R count",
+ var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+ }
+ std::cerr << "end malformed_preamble()" << std::endl;
+ }
+
+ void empty_sketch() {
+ std::cerr << "start empty_sketch()" << std::endl;
+ {
+ var_opt_sketch<std::string> sk(5);
+ CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
+ CPPUNIT_ASSERT_EQUAL((uint32_t) 0, sk.get_num_samples());
+
+ std::vector<uint8_t> bytes = sk.serialize();
+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1 << 3), bytes.size()); // num bytes in PREAMBLE_LONGS_EMPTY
+
+ var_opt_sketch<std::string> loaded_sk = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
+ CPPUNIT_ASSERT_EQUAL((uint64_t) 0, loaded_sk.get_n());
+ CPPUNIT_ASSERT_EQUAL((uint32_t) 0, loaded_sk.get_num_samples());
+ }
+ std::cerr << "end empty_sketch()" << std::endl;
+ }
+
+ void non_empty_degenerate_sketch() {
+ std::cerr << "start non_empty_degenerate_sketch()" << std::endl;
+ {
+ // Make an empty serialized sketch, then extend it to a
+ // PREAMBLE_LONGS_WARMUP-sized byte array, with no items.
+ // Then clear the empty flag so it will try to load the rest.
+ var_opt_sketch<std::string> sk(12, resize_factor::X2);
+ std::vector<uint8_t> bytes = sk.serialize();
+ while (bytes.size() < 24) { // PREAMBLE_LONGS_WARMUP * 8
+ bytes.push_back((uint8_t) 0);
+ }
+
+ // ensure non-empty -- H and R region sizes already set to 0
+ bytes[3] = 0; // set flags bit to not-empty (other bits should already be 0)
+
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch non-empty sketch with no items",
+ var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+ }
+ std::cerr << "end non_empty_degenerate_sketch()" << std::endl;
+ }
+
+ void invalid_weight() {
+ std::cerr << "start invalid_weights()" << std::endl;
+ {
+ var_opt_sketch<std::string> sk(100, resize_factor::X2);
+ CPPUNIT_ASSERT_THROW_MESSAGE("update() accepted a negative weight",
+ sk.update("invalid_weight", -1.0),
+ std::invalid_argument);
+ }
+ std::cerr << "end invalid_weights()" << std::endl;
+ }
+
+ void corrupt_serialized_weight() {
+ std::cerr << "start corrupt_serialized_weight()" << std::endl;
+ {
+ var_opt_sketch<int> sk = create_unweighted_sketch(100, 20);
+ auto bytes = sk.to_string();
+
+ // weights are in the first double after the preamble
+ size_t preamble_bytes = (bytes[0] & 0x3f) << 3;
+ *reinterpret_cast<double*>(&bytes[preamble_bytes]) = -1.5;
+
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch negative item weight",
+ var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()),
+ std::invalid_argument);
+
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+ for (auto& b : bytes) { ss >> b; }
+ CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch negative item weight",
+ var_opt_sketch<std::string>::deserialize(ss),
+ std::invalid_argument);
+ }
+ std::cerr << "end corrupt_serialized_weight()" << std::endl;
+ }
+
+ void cumulative_weight() {
+ std::cerr << "start cumulative_weight()" << std::endl;
+ {
+ uint32_t k = 256;
+ uint64_t n = 10 * k;
+ var_opt_sketch<int> sk(k);
+
+ std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
+ std::mt19937_64 rand(rd());
+ std::normal_distribution<double> N(0.0, 1.0);
+
+ double input_sum = 0.0;
+ for (size_t i = 0; i < n; ++i) {
+ // generate weights aboev and below 1.0 using w ~ exp(5*N(0,1))
+ // which covers about 10 orders of magnitude
+
+ double w = std::exp(5 * N(rand));
+ input_sum += w;
+ sk.update(i, w);
+ }
+
+ double output_sum = 0.0;
+ for (auto& it : sk) { // pair<int, weight>
+ output_sum += it.second;
+ }
+
+ double weight_ratio = output_sum / input_sum;
+ CPPUNIT_ASSERT(std::abs(weight_ratio - 1.0) < EPS);
+ }
+ std::cerr << "end cumulative_weight()" << std::endl;
+ }
+
+ /**********************************************************************/
+
void vo_union() {
int k = 10;
var_opt_sketch<int> sk(k), sk2(k+3);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org