You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/08 08:43:19 UTC
[incubator-datasketches-cpp] 02/02: [WIP] finish porting relevant
java unit tests, fixing bugs along the way
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit d73bca4848008f2a9374e4c2eb57d894ad847b61
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Sat Feb 8 00:42:52 2020 -0800
[WIP] finish porting relevant java unit tests, fixing bugs along the way
---
sampling/include/var_opt_sketch.hpp | 8 +-
sampling/include/var_opt_sketch_impl.hpp | 46 ++---
sampling/test/var_opt_sketch_test.cpp | 304 ++++++++++++++++++++++++++++++-
3 files changed, 321 insertions(+), 37 deletions(-)
diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index da4f339..9a35ee8 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -38,10 +38,10 @@ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template
* A struct to hold the result of subset sum queries
*/
struct subset_summary {
- const double lower_bound;
- const double estimate;
- const double upper_bound;
- const double total_sketch_weight;
+ double lower_bound;
+ double estimate;
+ double upper_bound;
+ double total_sketch_weight;
};
enum resize_factor { X1 = 0, X2, X4, X8 };
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 9810242..2e83671 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -582,13 +582,6 @@ bool var_opt_sketch<T,S,A>::is_empty() const {
template<typename T, typename S, typename A>
void var_opt_sketch<T,S,A>::reset() {
- n_ = 0;
- h_ = 0;
- m_ = 0;
- r_ = 0;
- num_marks_in_h_ = 0;
- total_wt_r_ = 0.0;
-
uint32_t prev_alloc = curr_items_alloc_;
uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
@@ -598,31 +591,38 @@ void var_opt_sketch<T,S,A>::reset() {
++curr_items_alloc_;
}
+ if (filled_data_) {
+ // destroy everything
+ for (size_t i = 0; i < curr_items_alloc_; ++i)
+ A().destroy(data_ + i);
+ } else {
+ // skip gap or anything unused at the end
+ for (size_t i = 0; i < h_; ++i)
+ A().destroy(data_+ i);
+
+ for (size_t i = h_ + 1; i < curr_items_alloc_; ++i)
+ A().destroy(data_ + i);
+ }
+
if (curr_items_alloc_ < prev_alloc) {
bool is_gadget = (marks_ != nullptr);
- if (filled_data_) {
- // destroy everything
- for (size_t i = 0; i < curr_items_alloc_; ++i)
- A().destroy(data_ + i);
- } else {
- // skip gap or anything unused at the end
- for (size_t i = 0; i < h_; ++i)
- A().destroy(data_+ i);
-
- for (size_t i = h_ + 1; i < curr_items_alloc_; ++i)
- A().destroy(data_ + i);
- }
A().deallocate(data_, curr_items_alloc_);
AllocDouble().deallocate(weights_, curr_items_alloc_);
if (marks_ != nullptr)
- AllocBool().deallocate(marks_, curr_items_alloc_);
+ AllocBool().deallocate(marks_, curr_items_alloc_);
allocate_data_arrays(curr_items_alloc_, is_gadget);
- } else {
- filled_data_ = false;
}
+
+ n_ = 0;
+ h_ = 0;
+ m_ = 0;
+ r_ = 0;
+ num_marks_in_h_ = 0;
+ total_wt_r_ = 0.0;
+ filled_data_ = false;
}
template<typename T, typename S, typename A>
@@ -898,7 +898,7 @@ void var_opt_sketch<T,S,A>::grow_data_arrays() {
double* tmp_weights = AllocDouble().allocate(curr_items_alloc_);
for (int i = 0; i < prev_size; ++i) {
- tmp_data[i] = std::move(data_[i]);
+ A().construct(&tmp_data[i], std::move(data_[i]));
A().destroy(data_ + i);
tmp_weights[i] = std::move(weights_[i]); // primitive double, but for consistency
}
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index d78d3b6..78bcb27 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -40,7 +40,7 @@ namespace datasketches {
class var_opt_sketch_test: public CppUnit::TestFixture {
- static constexpr double EPS = 1e-10;
+ static constexpr double EPS = 1e-13;
CPPUNIT_TEST_SUITE(var_opt_sketch_test);
CPPUNIT_TEST(invalid_k);
@@ -53,15 +53,16 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
CPPUNIT_TEST(invalid_weight);
CPPUNIT_TEST(corrupt_serialized_weight);
CPPUNIT_TEST(cumulative_weight);
- // CPPUNIT_TEST(under_full_sketch_serialization);
- // CPPUNIT_TEST(end_of_warmup_sketch_serialization);
- // CPPUNIT_TEST(full_sketch_serialization);
- // CPPUNIT_TEST(pseudo_light_update);
- // CPPUNIT_TEST(pseudo_heavy_update);
+ CPPUNIT_TEST(under_full_sketch_serialization);
+ CPPUNIT_TEST(end_of_warmup_sketch_serialization);
+ CPPUNIT_TEST(full_sketch_serialization);
+ CPPUNIT_TEST(string_serialization);
+ CPPUNIT_TEST(pseudo_light_update);
+ CPPUNIT_TEST(pseudo_heavy_update);
// CPPUNIT_TEST(decrease_k_with_under_full_sketch);
// CPPUNIT_TEST(decrease_k_with_full_sketch);
- // CPPUNIT_TEST(reset);
- // CPPUNIT_TEST(estimate_subset_sum);
+ CPPUNIT_TEST(reset);
+ CPPUNIT_TEST(estimate_subset_sum);
// CPPUNIT_TEST(binary_compatibility);
// CPPUNIT_TEST(empty);
@@ -76,6 +77,35 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
return sk;
}
+ template<typename T, typename S, typename A>
+ void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2) {
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("sketches have different values of k",
+ sk1.get_k(), sk2.get_k());
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("sketches have different values of n",
+ sk1.get_n(), sk2.get_n());
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("sketches have different sample counts",
+ sk1.get_num_samples(), sk2.get_num_samples());
+
+ auto it1 = sk1.begin();
+ auto it2 = sk2.begin();
+ size_t i = 0;
+
+ while ((it1 != sk1.end()) && (it2 != sk2.end())) {
+ const std::pair<const T&, const double> p1 = *it1;
+ const std::pair<const T&, const double> p2 = *it2;
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("data values differ at sample " + std::to_string(i),
+ p1.first, p2.first);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("weight values differ at sample " + std::to_string(i),
+ p1.second, p2.second);
+ ++i;
+ ++it1;
+ ++it2;
+ }
+
+ CPPUNIT_ASSERT_MESSAGE("iterators did not end at the same time",
+ (it1 == sk1.end()) && (it2 == sk2.end()));
+ }
+
void invalid_k() {
std::cerr << "start invalid_k()" << std::endl;
{
@@ -287,14 +317,13 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
for (size_t i = 0; i < n; ++i) {
// generate weights aboev and below 1.0 using w ~ exp(5*N(0,1))
// which covers about 10 orders of magnitude
-
double w = std::exp(5 * N(rand));
input_sum += w;
sk.update(i, w);
}
double output_sum = 0.0;
- for (auto& it : sk) { // pair<int, weight>
+ for (auto& it : sk) { // std::pair<int, weight>
output_sum += it.second;
}
@@ -304,6 +333,261 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
std::cerr << "end cumulative_weight()" << std::endl;
}
+ void under_full_sketch_serialization() {
+ std::cerr << "start under_full_sketch_serialization()" << std::endl;
+ {
+ var_opt_sketch<int> sk = create_unweighted_sketch(100, 10); // need n < k
+
+ auto bytes = sk.serialize();
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
+ check_if_equal(sk, sk_from_bytes);
+
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+ sk.serialize(ss);
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
+ check_if_equal(sk, sk_from_stream);
+ }
+ std::cerr << "end under_full_sketch_serialization()" << std::endl;
+ }
+
+ void end_of_warmup_sketch_serialization() {
+ std::cerr << "start end_of_warmup_sketch_serialization()" << std::endl;
+ {
+ var_opt_sketch<int> sk = create_unweighted_sketch(2843, 2843); // need n == k
+ auto bytes = sk.serialize();
+
+ // ensure still only 3 preamble longs
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Wrong number of preamble longs at end of warmup mode",
+ 3, bytes.data()[0] & 0x3f); // PREAMBLE_LONGS_WARMUP
+
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
+ check_if_equal(sk, sk_from_bytes);
+
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+ sk.serialize(ss);
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
+ check_if_equal(sk, sk_from_stream);
+ }
+ std::cerr << "end end_of_warmup_sketch_serialization()" << std::endl;
+ }
+
+ void full_sketch_serialization() {
+ std::cerr << "start full_sketch_serialization()" << std::endl;
+ {
+ var_opt_sketch<int> sk = create_unweighted_sketch(32, 32);
+ sk.update(100, 100.0);
+ sk.update(101, 101.0);
+
+ // first 2 entries should be heavy and in heap order (smallest at root)
+ auto it = sk.begin();
+ const std::pair<const int, const double> p1 = *it;
+ ++it;
+ const std::pair<const int, const double> p2 = *it;
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(100.0, p1.second, EPS);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(101.0, p2.second, EPS);
+ CPPUNIT_ASSERT_EQUAL(100, p1.first);
+ CPPUNIT_ASSERT_EQUAL(101, p2.first);
+
+ // check for 4 preamble longs
+ auto bytes = sk.serialize();
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Wrong number of preamble longs at end of warmup mode",
+ 4, bytes.data()[0] & 0x3f); // PREAMBLE_LONGS_WARMUP
+
+ var_opt_sketch<int> sk_from_bytes = var_opt_sketch<int>::deserialize(bytes.data(), bytes.size());
+ check_if_equal(sk, sk_from_bytes);
+
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+ sk.serialize(ss);
+ var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
+ check_if_equal(sk, sk_from_stream);
+ }
+ std::cerr << "end full_sketch_serialization()" << std::endl;
+ }
+
+ void string_serialization() {
+ std::cerr << "start string_serialization()" << std::endl;
+ {
+ var_opt_sketch<std::string> sk(5);
+ sk.update("a", 1.0);
+ sk.update("bc", 1.0);
+ sk.update("def", 1.0);
+ sk.update("ghij", 1.0);
+ sk.update("klmno", 1.0);
+ sk.update("heavy item", 100.0);
+
+ auto bytes = sk.serialize();
+ var_opt_sketch<std::string> sk_from_bytes = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
+ check_if_equal(sk, sk_from_bytes);
+
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+ sk.serialize(ss);
+ var_opt_sketch<std::string> sk_from_stream = var_opt_sketch<std::string>::deserialize(ss);
+ check_if_equal(sk, sk_from_stream);
+
+ }
+ std::cerr << "end string_serialization()" << std::endl;
+ }
+
+ void pseudo_light_update() {
+ std::cerr << "start pseudo_light_update()" << std::endl;
+ {
+ uint32_t k = 1024;
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
+ sk.update(0, 1.0); // k+2nd update
+
+ // check the first weight, assuming all k items are unweighted
+ // (and consequently in R).
+ // Expected: (k + 2) / |R| = (k + 2) / k
+ auto it = sk.begin();
+ double wt = (*it).second;
+ CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("weight corruption in pseudo_light_update()",
+ ((1.0 * (k + 2)) / k), wt, EPS);
+ }
+ std::cerr << "end pseudo_light_update()" << std::endl;
+ }
+
+ void pseudo_heavy_update() {
+ std::cerr << "start pseudo_heavy_update()" << std::endl;
+ {
+ uint32_t k = 1024;
+ double wt_scale = 10.0 * k;
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
+
+ // Next k-1 updates should be update_pseudo_heavy_general()
+ // Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
+ // added k-1 heavy items, leaving only 1 item left in R
+ for (int i = 1; i <= k; ++i) {
+ sk.update(-i, k + (i * wt_scale));
+ }
+
+ auto it = sk.begin();
+
+ // Expected: lightest "heavy" item (first one out): k + 2*wt_scale
+ double wt = (*it).second;
+ CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("weight corruption in pseudo_heavy_update()",
+ 1.0 * (k + (2 * wt_scale)), wt, EPS);
+
+ // we don't know which R item is left, but there should be only one, at the end
+ // of the sample set.
+ // Expected: k+1 + (min "heavy" item) / |R| = ((k+1) + (k*wt_scale)) / 1 = wt_scale + 2k + 1
+ while (it != sk.end()) {
+ wt = (*it).second;
+ ++it;
+ }
+ CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("weight corruption in pseudo_light_update()",
+ 1.0 + wt_scale + (2 * k), wt, EPS);
+ }
+ std::cerr << "end pseudo_heavy_update()" << std::endl;
+ }
+
+ void reset() {
+ std::cerr << "start reset()" << std::endl;
+ {
+ uint32_t k = 1024;
+ uint64_t n1 = 20;
+ uint64_t n2 = 2 * k;
+ var_opt_sketch<std::string> sk(k);
+
+ // reset from sampling mode
+ for (int i = 0; i < n2; ++i) {
+ sk.update(std::to_string(i), 100.0 + i);
+ }
+ CPPUNIT_ASSERT_EQUAL(n2, sk.get_n());
+ CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
+
+ sk.reset();
+ CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
+ CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
+
+ // reset from exact mode
+ for (int i = 0; i < n1; ++i)
+ sk.update(std::to_string(i));
+ CPPUNIT_ASSERT_EQUAL(n1, sk.get_n());
+ CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
+
+ sk.reset();
+ CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
+ CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
+ }
+ std::cerr << "end reset()" << std::endl;
+ }
+
+ void estimate_subset_sum() {
+ std::cerr << "start estimate_subset_sum()" << std::endl;
+ {
+ uint32_t k = 10;
+ var_opt_sketch<int> sk(k);
+
+ // empty sketch -- all zeros
+ subset_summary summary = sk.estimate_subset_sum([](int){ return true; });
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, summary.estimate, 0.0);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, summary.total_sketch_weight, 0.0);
+
+ // add items, keeping in exact mode
+ double total_weight = 0.0;
+ for (int i = 1; i <= (k - 1); ++i) {
+ sk.update(i, 1.0 * i);
+ total_weight += 1.0 * i;
+ }
+
+ summary = sk.estimate_subset_sum([](int){ return true; });
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.estimate, 0.0);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.lower_bound, 0.0);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.upper_bound, 0.0);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.total_sketch_weight, 0.0);
+
+ // add a few more items, pushing to sampling mode
+ for (int i = k; i <= (k + 1); ++i) {
+ sk.update(i, 1.0 * i);
+ total_weight += 1.0 * i;
+ }
+
+ // predicate always true so estimate == upper bound
+ summary = sk.estimate_subset_sum([](int){ return true; });
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.estimate, EPS);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.upper_bound, EPS);
+ CPPUNIT_ASSERT_LESS(total_weight, summary.lower_bound); // lower_bound < total_weight
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.total_sketch_weight, EPS);
+
+ // predicate always false so estimate == lower bound == 0.0
+ summary = sk.estimate_subset_sum([](int){ return false; });
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, summary.estimate, EPS);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, summary.lower_bound, EPS);
+ CPPUNIT_ASSERT_GREATER(0.0, summary.upper_bound); // upper_bound > 0.0
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.total_sketch_weight, EPS);
+
+ // finally, a non-degenerate predicate
+ // insert negative items with identical weights, filter for negative weights only
+ for (int i = 1; i <= (k + 1); ++i) {
+ sk.update(-i, 1.0 * i);
+ total_weight += 1.0 * i;
+ }
+
+ summary= sk.estimate_subset_sum([](int x) { return x < 0; });
+ CPPUNIT_ASSERT_GREATEREQUAL(summary.lower_bound, summary.estimate); // estimate >= lower_bound)
+ CPPUNIT_ASSERT_LESSEQUAL(summary.upper_bound, summary.estimate); // estimate <= upper_bound)
+
+ // allow pretty generous bounds when testing
+ CPPUNIT_ASSERT(summary.lower_bound < (total_weight / 1.4));
+ CPPUNIT_ASSERT(summary.upper_bound > (total_weight / 2.6));
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(total_weight, summary.total_sketch_weight, EPS);
+
+ // and another data type, keeping it in exact mode for simplicity
+ var_opt_sketch<bool> sk2(k);
+ total_weight = 0.0;
+ for (int i = 1; i <= (k - 1); ++i) {
+ sk2.update((i % 2) == 0, 1.0 * i);
+ total_weight += i;
+ }
+
+ summary = sk2.estimate_subset_sum([](bool b){ return !b; });
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(summary.lower_bound, summary.estimate, 0.0);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(summary.upper_bound, summary.estimate, 0.0);
+ CPPUNIT_ASSERT_LESS(total_weight, summary.estimate); // exact mode, so know it must be strictly less
+ }
+ std::cerr << "end estimate_subset_sum()" << std::endl;
+ }
+
/**********************************************************************/
void vo_union() {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org