You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/01/27 22:37:00 UTC
[incubator-datasketches-cpp] 02/02: [WIP] var opt union feature
complete, very minimal testing
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit 1ca5bad43704d0acbf70f21801e9ec86d756fe71
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Mon Jan 27 14:35:46 2020 -0800
[WIP] var opt union feature complete, very minimal testing
---
common/include/bounds_binomial_proportions.hpp | 291 +++++++++++++++++
sampling/include/var_opt_sketch.hpp | 104 ++++--
sampling/include/var_opt_sketch_impl.hpp | 420 +++++++++++++++++++++----
sampling/include/var_opt_union.hpp | 156 +++++++++
sampling/include/var_opt_union_impl.hpp | 366 +++++++++++++++++++++
sampling/test/counting_allocator.hpp | 109 -------
sampling/test/var_opt_sketch_test.cpp | 115 ++-----
7 files changed, 1286 insertions(+), 275 deletions(-)
diff --git a/common/include/bounds_binomial_proportions.hpp b/common/include/bounds_binomial_proportions.hpp
new file mode 100644
index 0000000..8b00cbd
--- /dev/null
+++ b/common/include/bounds_binomial_proportions.hpp
@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
+#define _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
+
+#include <cmath>
+#include <stdexcept>
+
+namespace datasketches {
+
+/**
+ * Confidence intervals for binomial proportions.
+ *
+ * <p>This class computes an approximation to the Clopper-Pearson confidence interval
+ * for a binomial proportion. Exact Clopper-Pearson intervals are strictly
+ * conservative, but these approximations are not.</p>
+ *
+ * <p>The main inputs are numbers <i>n</i> and <i>k</i>, which are not the same as other things
+ * that are called <i>n</i> and <i>k</i> in our sketching library. There is also a third
+ * parameter, numStdDev, that specifies the desired confidence level.</p>
+ * <ul>
+ * <li><i>n</i> is the number of independent randomized trials. It is given and therefore known.
+ * </li>
+ * <li><i>p</i> is the probability of a trial being a success. It is unknown.</li>
+ * <li><i>k</i> is the number of trials (out of <i>n</i>) that turn out to be successes. It is
+ * a random variable governed by a binomial distribution. After any given
+ * batch of <i>n</i> independent trials, the random variable <i>k</i> has a specific
+ * value which is observed and is therefore known.</li>
+ * <li><i>pHat</i> = <i>k</i> / <i>n</i> is an unbiased estimate of the unknown success
+ * probability <i>p</i>.</li>
+ * </ul>
+ *
+ * <p>Alternatively, consider a coin with unknown heads probability <i>p</i>. Where
+ * <i>n</i> is the number of independent flips of that coin, and <i>k</i> is the number
+ * of times that the coin comes up heads during a given batch of <i>n</i> flips.
+ * This class computes a frequentist confidence interval [lowerBoundOnP, upperBoundOnP] for the
+ * unknown <i>p</i>.</p>
+ *
+ * <p>Conceptually, the desired confidence level is specified by a tail probability delta.</p>
+ *
+ * <p>Ideally, over a large ensemble of independent batches of trials,
+ * the fraction of batches in which the true <i>p</i> lies below lowerBoundOnP would be at most
+ * delta, and the fraction of batches in which the true <i>p</i> lies above upperBoundOnP
+ * would also be at most delta.
+ *
+ * <p>Setting aside the philosophical difficulties attaching to that statement, it isn't quite
+ * true because we are approximating the Clopper-Pearson interval.</p>
+ *
+ * <p>Finally, we point out that in this class's interface, the confidence parameter delta is
+ * not specified directly, but rather through a "number of standard deviations" numStdDev.
+ * The library effectively converts that to a delta via delta = normalCDF (-1.0 * numStdDev).</p>
+ *
+ * <p>It is perhaps worth emphasizing that the library is NOT merely adding and subtracting
+ * numStdDev standard deviations to the estimate. It is doing something better, that to some
+ * extent accounts for the fact that the binomial distribution has a non-gaussian shape.</p>
+ *
+ * <p>In particular, it is using an approximation to the inverse of the incomplete beta function
+ * that appears as formula 26.5.22 on page 945 of the "Handbook of Mathematical Functions"
+ * by Abramowitz and Stegun.</p>
+ *
+ * @author Kevin Lang
+ * @author Jon Malkin
+ */
+class bounds_binomial_proportions { // confidence intervals for binomial proportions
+
+public:
+ /**
+ * Computes lower bound of approximate Clopper-Pearson confidence interval for a binomial
+ * proportion.
+ *
+ * <p>Implementation Notes:<br>
+ * The approximateLowerBoundOnP is defined with respect to the right tail of the binomial
+ * distribution.</p>
+ * <ul>
+ * <li>We want to solve for the <i>p</i> for which sum<sub><i>j,k,n</i></sub>bino(<i>j;n,p</i>)
+ * = delta.</li>
+ * <li>We now restate that in terms of the left tail.</li>
+ * <li>We want to solve for the p for which sum<sub><i>j,0,(k-1)</i></sub>bino(<i>j;n,p</i>)
+ * = 1 - delta.</li>
+ * <li>Define <i>x</i> = 1-<i>p</i>.</li>
+ * <li>We want to solve for the <i>x</i> for which I<sub><i>x(n-k+1,k)</i></sub> = 1 - delta.</li>
+ * <li>We specify 1-delta via numStdDevs through the right tail of the standard normal
+ * distribution.</li>
+ * <li>Smaller values of numStdDevs correspond to bigger values of 1-delta and hence to smaller
+ * values of delta. In fact, usefully small values of delta correspond to negative values of
+ * numStdDevs.</li>
+ * <li>return <i>p</i> = 1-<i>x</i>.</li>
+ * </ul>
+ *
+ * @param n is the number of trials. Must be non-negative.
+ * @param k is the number of successes. Must be non-negative, and cannot exceed n.
+ * @param num_std_devs the number of standard deviations defining the confidence interval
+ * @return the lower bound of the approximate Clopper-Pearson confidence interval for the
+ * unknown success probability.
+ */
+ static double approximate_lower_bound_on_p(long n, long k, double num_std_devs) {
+ check_inputs(n, k);
+ if (n == 0) { return 0.0; } // the coin was never flipped, so we know nothing
+ else if (k == 0) { return 0.0; }
+ else if (k == 1) { return (exact_lower_bound_on_p_k_eq_1(n, delta_of_num_stdevs(num_std_devs))); }
+ else if (k == n) { return (exact_lower_bound_on_p_k_eq_n(n, delta_of_num_stdevs(num_std_devs))); }
+ else {
+ double x = abramowitz_stegun_formula_26p5p22((n - k) + 1, k, (-1.0 * num_std_devs));
+ return (1.0 - x); // which is p
+ }
+ }
+
+ /**
+ * Computes upper bound of approximate Clopper-Pearson confidence interval for a binomial
+ * proportion.
+ *
+ * <p>Implementation Notes:<br>
+ * The approximateUpperBoundOnP is defined with respect to the left tail of the binomial
+ * distribution.</p>
+ * <ul>
+ * <li>We want to solve for the <i>p</i> for which sum<sub><i>j,0,k</i></sub>bino(<i>j;n,p</i>)
+ * = delta.</li>
+ * <li>Define <i>x</i> = 1-<i>p</i>.</li>
+ * <li>We want to solve for the <i>x</i> for which I<sub><i>x(n-k,k+1)</i></sub> = delta.</li>
+ * <li>We specify delta via numStdDevs through the right tail of the standard normal
+ * distribution.</li>
+ * <li>Bigger values of numStdDevs correspond to smaller values of delta.</li>
+ * <li>return <i>p</i> = 1-<i>x</i>.</li>
+ * </ul>
+ * @param n is the number of trials. Must be non-negative.
+ * @param k is the number of successes. Must be non-negative, and cannot exceed <i>n</i>.
+ * @param num_std_devs the number of standard deviations defining the confidence interval
+ * @return the upper bound of the approximate Clopper-Pearson confidence interval for the
+ * unknown success probability.
+ */
+ static double approximate_upper_bound_on_p(long n, long k, double num_std_devs) {
+ check_inputs(n, k);
+ if (n == 0) { return 1.0; } // the coin was never flipped, so we know nothing
+ else if (k == n) { return 1.0; }
+ else if (k == (n - 1)) {
+ return (exactU_upper_bound_on_p_k_eq_minusone(n, delta_of_num_stdevs(num_std_devs)));
+ }
+ else if (k == 0) {
+ return (exact_upper_bound_on_p_k_eq_zero(n, delta_of_num_stdevs(num_std_devs)));
+ }
+ else {
+ double x = abramowitz_stegun_formula_26p5p22(n - k, k + 1, num_std_devs);
+ return (1.0 - x); // which is p
+ }
+ }
+
+ /**
+ * Computes an estimate of an unknown binomial proportion.
+ * @param n is the number of trials. Must be non-negative.
+ * @param k is the number of successes. Must be non-negative, and cannot exceed n.
+ * @return the estimate of the unknown binomial proportion.
+ */
+ static double estimate_unknown_p(long n, long k) {
+ check_inputs(n, k);
+ if (n == 0) { return 0.5; } // the coin was never flipped, so we know nothing
+ else { return ((double) k / (double) n); }
+ }
+
+ /**
+ * Computes an approximation to the erf() function.
+ * @param x is the input to the erf function
+ * @return returns erf(x), accurate to roughly 7 decimal digits.
+ */
+ static double erf(double x) {
+ if (x < 0.0) { return (-1.0 * (erf_of_nonneg(-1.0 * x))); }
+ else { return (erf_of_nonneg(x)); }
+ }
+
+ /**
+ * Computes an approximation to normal_cdf(x).
+ * @param x is the input to the normal_cdf function
+ * @return returns the approximation to normalCDF(x).
+ */
+ static double normal_cdf(double x) {
+ return (0.5 * (1.0 + (erf(x / (sqrt(2.0))))));
+ }
+
+private:
+ static void check_inputs(long n, long k) {
+ if (n < 0) { throw std::invalid_argument("N must be non-negative"); }
+ if (k < 0) { throw std::invalid_argument("K must be non-negative"); }
+ if (k > n) { throw std::invalid_argument("K cannot exceed N"); }
+ }
+
+ //@formatter:off
+ // Abramowitz and Stegun formula 7.1.28, p. 88; Claims accuracy of about 7 decimal digits */
+ static double erf_of_nonneg(double x) {
+ // The constants that appear below, formatted for easy checking against the book.
+ // a1 = 0.07052 30784
+ // a3 = 0.00927 05272
+ // a5 = 0.00027 65672
+ // a2 = 0.04228 20123
+ // a4 = 0.00015 20143
+ // a6 = 0.00004 30638
+ static const double a1 = 0.0705230784;
+ static const double a3 = 0.0092705272;
+ static const double a5 = 0.0002765672;
+ static const double a2 = 0.0422820123;
+ static const double a4 = 0.0001520143;
+ static const double a6 = 0.0000430638;
+ const double x2 = x * x; // x squared, x cubed, etc.
+ const double x3 = x2 * x;
+ const double x4 = x2 * x2;
+ const double x5 = x2 * x3;
+ const double x6 = x3 * x3;
+ const double sum = ( 1.0
+ + (a1 * x)
+ + (a2 * x2)
+ + (a3 * x3)
+ + (a4 * x4)
+ + (a5 * x5)
+ + (a6 * x6) );
+ const double sum2 = sum * sum; // raise the sum to the 16th power
+ const double sum4 = sum2 * sum2;
+ const double sum8 = sum4 * sum4;
+ const double sum16 = sum8 * sum8;
+ return (1.0 - (1.0 / sum16));
+ }
+
+ static double delta_of_num_stdevs(double kappa) {
+ return (normal_cdf(-1.0 * kappa));
+ }
+
+ //@formatter:on
+ // Formula 26.5.22 on page 945 of Abramowitz & Stegun, which is an approximation
+ // of the inverse of the incomplete beta function I_x(a,b) = delta
+ // viewed as a scalar function of x.
+ // In other words, we specify delta, and it gives us x (with a and b held constant).
+ // However, delta is specified in an indirect way through yp which
+ // is the number of stdDevs that leaves delta probability in the right
+ // tail of a standard gaussian distribution.
+
+ // We point out that the variable names correspond to those in the book,
+ // and it is worth keeping it that way so that it will always be easy to verify
+ // that the formula was typed in correctly.
+
+ static double abramowitz_stegun_formula_26p5p22(double a, double b,
+ double yp) {
+ const double b2m1 = (2.0 * b) - 1.0;
+ const double a2m1 = (2.0 * a) - 1.0;
+ const double lambda = ((yp * yp) - 3.0) / 6.0;
+ const double htmp = (1.0 / a2m1) + (1.0 / b2m1);
+ const double h = 2.0 / htmp;
+ const double term1 = (yp * (sqrt(h + lambda))) / h;
+ const double term2 = (1.0 / b2m1) - (1.0 / a2m1);
+ const double term3 = (lambda + (5.0 / 6.0)) - (2.0 / (3.0 * h));
+ const double w = term1 - (term2 * term3);
+ const double xp = a / (a + (b * (exp(2.0 * w))));
+ return xp;
+ }
+
+ // Formulas for some special cases.
+
+ static double exact_upper_bound_on_p_k_eq_zero(double n, double delta) {
+ return (1.0 - pow(delta, (1.0 / n)));
+ }
+
+ static double exact_lower_bound_on_p_k_eq_n(double n, double delta) {
+ return (pow(delta, (1.0 / n)));
+ }
+
+ static double exact_lower_bound_on_p_k_eq_1(double n, double delta) {
+ return (1.0 - pow((1.0 - delta), (1.0 / n)));
+ }
+
+ static double exactU_upper_bound_on_p_k_eq_minusone(double n, double delta) {
+ return (pow((1.0 - delta), (1.0 / n)));
+ }
+
+};
+
+}
+
+#endif // _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index 83f89c7..bf22a42 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -22,19 +22,30 @@
#include "serde.hpp"
-#include <vector>
#include <iterator>
+#include <vector>
namespace datasketches {
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
-int num_allocs = 0;
-
/**
* author Kevin Lang
* author Jon Malkin
*/
+
+/**
+ * A struct to hold the result of subset sum queries
+ */
+struct subset_summary {
+ const double lower_bound;
+ const double estimate;
+ const double upper_bound;
+ const double total_sketch_weight;
+};
+
+template <typename T, typename S, typename A> class var_opt_union; // forward declaration
+
template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
class var_opt_sketch {
@@ -43,10 +54,15 @@ class var_opt_sketch {
static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
var_opt_sketch(uint32_t k, resize_factor rf = DEFAULT_RESIZE_FACTOR);
- static var_opt_sketch<T,S,A> deserialize(std::istream& is);
- static var_opt_sketch<T,S,A> deserialize(const void* bytes, size_t size);
+ var_opt_sketch(const var_opt_sketch& other);
+ var_opt_sketch(var_opt_sketch&& other) noexcept;
+ static var_opt_sketch deserialize(std::istream& is);
+ static var_opt_sketch deserialize(const void* bytes, size_t size);
- virtual ~var_opt_sketch();
+ ~var_opt_sketch();
+
+ var_opt_sketch& operator=(const var_opt_sketch& other);
+ var_opt_sketch& operator=(var_opt_sketch&& other);
void update(const T& item, double weight=1.0);
//void update(T&& item, double weight=1.0);
@@ -72,7 +88,11 @@ class var_opt_sketch {
std::ostream& to_stream(std::ostream& os) const;
std::string to_string() const;
- //estimate_subset_sum()
+ subset_summary estimate_subset_sum(std::function<bool(T)> predicate) const;
+
+ class const_iterator;
+ const_iterator begin() const;
+ const_iterator end() const;
private:
typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
@@ -88,6 +108,9 @@ class var_opt_sketch {
static const uint8_t EMPTY_FLAG_MASK = 4;
static const uint8_t GADGET_FLAG_MASK = 128;
+ // Number of standard deviations to use for subset sum error bounds
+ constexpr static const double DEFAULT_KAPPA = 2.0;
+
// TODO: should probably rearrange a bit to minimize gaps once aligned
uint32_t k_; // max size of sketch, in items
@@ -98,7 +121,7 @@ class var_opt_sketch {
uint64_t n_; // total number of items processed by sketch
double total_wt_r_; // total weight of items in reservoir-like area
- const resize_factor rf_; // resize factor
+ resize_factor rf_; // resize factor
uint32_t curr_items_alloc_; // currently allocated array size
bool filled_data_; // true if we've explciitly set all entries in data_
@@ -128,6 +151,10 @@ class var_opt_sketch {
var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, uint8_t preamble_longs, std::istream& is);
var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, uint8_t preamble_longs, const void* bytes, size_t size);
+ friend class var_opt_union<T,S,A>;
+ var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n);
+ var_opt_sketch(T* data, double* weights, size_t len, uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r);
+
// internal-use-only updates
void update(const T& item, double weight, bool mark);
void update_warmup_phase(const T& item, double weight, bool mark);
@@ -139,9 +166,9 @@ class var_opt_sketch {
double peek_min() const;
bool is_marked(int idx) const;
- int pick_random_slot_in_r() const;
- int choose_delete_slot(double wt_cand, int num_cand) const;
- int choose_weighted_delete_slot(double wt_cand, int num_cand) const;
+ uint32_t pick_random_slot_in_r() const;
+ uint32_t choose_delete_slot(double wt_cand, int num_cand) const;
+ uint32_t choose_weighted_delete_slot(double wt_cand, int num_cand) const;
void transition_from_warmup();
void convert_to_heap();
@@ -151,6 +178,7 @@ class var_opt_sketch {
void pop_min_to_m_region();
void grow_candidate_set(double wt_cands, int num_cands);
void decrease_k_by_1();
+ void strip_marks();
void force_set_k(int k); // used to resolve union gadget into sketch
void downsample_candidate_set(double wt_cands, int num_cands);
void swap_values(int src, int dst);
@@ -162,20 +190,19 @@ class var_opt_sketch {
static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
// things to move to common utils and share among sketches
- static int get_adjusted_size(int max_size, int resize_target);
- static int starting_sub_multiple(int lg_target, int lg_rf, int lg_min);
+ static uint32_t get_adjusted_size(int max_size, int resize_target);
+ static uint32_t starting_sub_multiple(int lg_target, int lg_rf, int lg_min);
+ static double pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate);
+ static double pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate);
static bool is_power_of_2(uint32_t v);
static uint32_t to_log_2(uint32_t v);
static uint32_t count_trailing_zeros(uint32_t v);
static uint32_t ceiling_power_of_2(uint32_t n);
- static int next_int(int max_value);
+ static uint32_t next_int(uint32_t max_value);
static double next_double_exclude_zero();
-
- class const_iterator;
- const_iterator begin() const;
- const_iterator end() const;
};
+/*
template<typename T, typename S, typename A>
class var_opt_sketch<T, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
public:
@@ -189,12 +216,49 @@ public:
private:
const T* items;
const double* weights;
+ const bool* marks;
const uint32_t h_count;
const uint32_t r_count;
+ const double total_wt_r;
const double r_item_wt;
+ double cum_weight; // used for weight correction in R
+ uint32_t final_idx;
uint32_t index;
- const_iterator(const T* items, const double* weights, const uint32_t h_count, const uint32_t r_count,
- const double total_wt_r, bool use_end=false);
+ const bool get_mark() const;
+ const_iterator(const T* items, const double* weights, const bool* marks_,
+ const uint32_t h_count, const uint32_t r_count, const double total_wt_r, bool use_end=false);
+};
+*/
+
+template<typename T, typename S, typename A>
+class var_opt_sketch<T, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
+public:
+ const_iterator(const const_iterator& other);
+ const_iterator& operator++();
+ const_iterator& operator++(int);
+ bool operator==(const const_iterator& other) const;
+ bool operator!=(const const_iterator& other) const;
+ const std::pair<const T&, const double> operator*() const;
+
+private:
+ friend class var_opt_sketch<T,S,A>;
+ friend class var_opt_union<T,S,A>;
+
+ // default iterator over full sketch
+ const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end);
+
+ // iterates over only one of the H or R region, optionally applying weight correction
+ // to R region (can correct for numerical precision issues)
+ const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region, bool weight_corr);
+
+ const bool get_mark() const;
+
+ const var_opt_sketch<T,S,A>* sk_;
+ double cum_r_weight_; // used for weight correction
+ double r_item_wt_;
+ size_t idx_;
+ size_t final_idx_;
+ bool weight_correction_;
};
} // namespace datasketches
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 9fac697..26d4af3 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -22,6 +22,7 @@
#include "var_opt_sketch.hpp"
#include "serde.hpp"
+#include "bounds_binomial_proportions.hpp"
#include <memory>
#include <iostream>
@@ -42,6 +43,101 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf) :
var_opt_sketch<T,S,A>(k, rf, false) {}
template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other) :
+ k_(other.k_),
+ h_(other.h_),
+ m_(other.m_),
+ r_(other.r_),
+ n_(other.n_),
+ total_wt_r_(other.total_wt_r_),
+ rf_(other.rf_),
+ curr_items_alloc_(other.curr_items_alloc_),
+ filled_data_(other.filled_data_),
+ data_(nullptr),
+ weights_(nullptr),
+ num_marks_in_h_(other.num_marks_in_h_),
+ marks_(nullptr)
+ {
+ data_ = A().allocate(curr_items_alloc_);
+ std::copy(&other.data_[0], &other.data_[curr_items_alloc_], data_);
+
+ weights_ = AllocDouble().allocate(curr_items_alloc_);
+ std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
+
+ if (other.marks_ != nullptr) {
+ marks_ = AllocBool().allocate(curr_items_alloc_);
+ std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
+ }
+ }
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n) :
+ k_(other.k_),
+ h_(other.h_),
+ m_(other.m_),
+ r_(other.r_),
+ n_(adjusted_n),
+ total_wt_r_(other.total_wt_r_),
+ rf_(other.rf_),
+ curr_items_alloc_(other.curr_items_alloc_),
+ filled_data_(other.filled_data_),
+ data_(nullptr),
+ weights_(nullptr),
+ num_marks_in_h_(other.num_marks_in_h_),
+ marks_(nullptr)
+ {
+ data_ = A().allocate(curr_items_alloc_);
+ std::copy(&other.data_[0], &other.data_[curr_items_alloc_], data_);
+
+ weights_ = AllocDouble().allocate(curr_items_alloc_);
+ std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
+
+ if (!as_sketch && other.marks_ != nullptr) {
+ marks_ = AllocBool().allocate(curr_items_alloc_);
+ std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
+ }
+ }
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::var_opt_sketch(T* data, double* weights, size_t len,
+ uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r) :
+ k_(k),
+ h_(h_count),
+ m_(0),
+ r_(r_count),
+ n_(n),
+ total_wt_r_(total_wt_r),
+ rf_(DEFAULT_RESIZE_FACTOR),
+ curr_items_alloc_(len),
+ filled_data_(n > k),
+ data_(data),
+ weights_(weights),
+ num_marks_in_h_(0),
+ marks_(nullptr)
+ {}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::var_opt_sketch(var_opt_sketch&& other) noexcept :
+ k_(other.k_),
+ h_(other.h_),
+ m_(other.m_),
+ r_(other.r_),
+ n_(other.n_),
+ total_wt_r_(other.total_wt_r_),
+ rf_(other.rf_),
+ curr_items_alloc_(other.curr_items_alloc_),
+ filled_data_(other.filled_data_),
+ data_(other.data_),
+ weights_(other.weights_),
+ num_marks_in_h_(other.num_marks_in_h_),
+ marks_(other.marks_)
+ {
+ other.data_ = nullptr;
+ other.weights_ = nullptr;
+ other.marks_ = nullptr;
+ }
+
+template<typename T, typename S, typename A>
var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget) :
k_(k), h_(0), m_(0), r_(0), n_(0), total_wt_r_(0.0), rf_(rf) {
if (k < 1) {
@@ -61,27 +157,67 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
template<typename T, typename S, typename A>
var_opt_sketch<T,S,A>::~var_opt_sketch() {
- if (filled_data_) {
- // destroy everything
- for (size_t i = 0; i < curr_items_alloc_; ++i) {
- A().destroy(data_ + i);
- }
- } else {
- // skip gap or anything unused at the end
- for (size_t i = 0; i < h_; ++i) {
- A().destroy(data_+ i);
- }
+ if (data_ != nullptr) {
+ if (filled_data_) {
+ // destroy everything
+ for (size_t i = 0; i < curr_items_alloc_; ++i) {
+ A().destroy(data_ + i);
+ }
+ } else {
+ // skip gap or anything unused at the end
+ for (size_t i = 0; i < h_; ++i) {
+ A().destroy(data_+ i);
+ }
- for (size_t i = h_ + 1; i < curr_items_alloc_; ++i) {
- A().destroy(data_ + i);
+ for (size_t i = h_ + 1; i < curr_items_alloc_; ++i) {
+ A().destroy(data_ + i);
+ }
}
+ A().deallocate(data_, curr_items_alloc_);
}
- A().deallocate(data_, curr_items_alloc_);
- AllocDouble().deallocate(weights_, curr_items_alloc_);
+
+ if (weights_ != nullptr)
+ AllocDouble().deallocate(weights_, curr_items_alloc_);
- if (marks_ != nullptr) {
+ if (marks_ != nullptr)
AllocBool().deallocate(marks_, curr_items_alloc_);
- }
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(const var_opt_sketch& other) {
+ var_opt_sketch<T,S,A> sk_copy(other);
+ std::swap(k_, sk_copy.k_);
+ std::swap(h_, sk_copy.h_);
+ std::swap(m_, sk_copy.m_);
+ std::swap(r_, sk_copy.r_);
+ std::swap(n_, sk_copy.n_);
+ std::swap(total_wt_r_, sk_copy.total_wt_r_);
+ std::swap(rf_, sk_copy.rf_);
+ std::swap(curr_items_alloc_, sk_copy.curr_items_alloc_);
+ std::swap(filled_data_, sk_copy.filled_data_);
+ std::swap(data_, sk_copy.data_);
+ std::swap(weights_, sk_copy.weights_);
+ std::swap(num_marks_in_h_, sk_copy.num_marks_in_h_);
+ std::swap(marks_, sk_copy.marks_);
+ return *this;
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(var_opt_sketch&& other) {
+ std::swap(k_, other.k_);
+ std::swap(h_, other.h_);
+ std::swap(m_, other.m_);
+ std::swap(r_, other.r_);
+ std::swap(n_, other.n_);
+ std::swap(total_wt_r_, other.total_wt_r_);
+ std::swap(rf_, other.rf_);
+ std::swap(curr_items_alloc_, other.curr_items_alloc_);
+ std::swap(filled_data_, other.filled_data_);
+ std::swap(data_, other.data_);
+ std::swap(weights_, other.weights_);
+ std::swap(num_marks_in_h_, other.num_marks_in_h_);
+ std::swap(marks_, other.marks_);
+ return *this;
}
template<typename T, typename S, typename A>
@@ -129,7 +265,7 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
is.read((char*)&total_wt_r_, sizeof(total_wt_r_));
if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
- "Found r = " + std::to_string(r_) + ", R regionw weight = " + std::to_string(total_wt_r_));
+ "Found r = " + std::to_string(r_) + ", R region weight = " + std::to_string(total_wt_r_));
}
curr_items_alloc_ = k_ + 1;
@@ -206,7 +342,7 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
ptr += copy_from_mem(ptr, &total_wt_r_, sizeof(total_wt_r_));
if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
- "Found r = " + std::to_string(r_) + ", R regionw weight = " + std::to_string(total_wt_r_));
+ "Found r = " + std::to_string(r_) + ", R region weight = " + std::to_string(total_wt_r_));
}
curr_items_alloc_ = k_ + 1;
@@ -279,7 +415,6 @@ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadg
template<typename T, typename S, typename A>
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
- std::cerr << "fixed-size version!";
if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
num_bytes += h_ * sizeof(double); // weights
@@ -294,7 +429,6 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
template<typename T, typename S, typename A>
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
- std::cerr << "other version!";
if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
num_bytes += h_ * sizeof(double); // weights
@@ -414,9 +548,6 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
// write the first h_ weights
os.write((char*)weights_, h_ * sizeof(double));
- //for (int i = 0; i < h_; ++i) {
- // os.write((char*)&weights_[i], sizeof(double));
- //}
// write the first h_ marks as packed bytes iff we have a gadget
if (marks_ != nullptr) {
@@ -648,7 +779,6 @@ void var_opt_sketch<T,S,A>::update_warmup_phase(const T& item, double weight, bo
}
// store items as they come in until full
- std::cerr << "h_=" << h_ << ":\t" << item << "\t(alloc: " << curr_items_alloc_ << ")\n";
A().construct(&data_[h_], T(item));
weights_[h_] = weight;
if (marks_ != nullptr) {
@@ -724,9 +854,72 @@ void var_opt_sketch<T,S,A>::update_heavy_r_eq1(const T& item, double weight, boo
grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
}
+/**
+ * Decreases sketch's value of k by 1, updating stored values as needed.
+ *
+ * <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by
+ * the unioning algorithm to force "marked" items out of H and into the reservoir region.</p>
+ */
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::decrease_k_by_1() {
+ if (k_ <= 1) {
+ throw std::logic_error("Cannot decrease k below 1 in union");
+ }
+
+ if ((h_ == 0) && (r_ == 0)) {
+ // exact mode, but no data yet; this reduction is somewhat gratuitous
+ --k_;
+ } else if ((h_ > 0) && (r_ == 0)) {
+ // exact mode, but we have some data
+ --k_;
+ if (h_ > k_) {
+ transition_from_warmup();
+ }
+ } else if ((h_ > 0) && (r_ > 0)) {
+ // reservoir mode, but we have some exact samples.
+ // Our strategy will be to pull an item out of H (which we are allowed to do since it's
+ // still just data), reduce k, and then re-insert the item
+
+ // first, slide the R zone to the left by 1, temporarily filling the gap
+ uint32_t old_gap_idx = h_;
+ uint32_t old_final_r_idx = (h_ + 1 + r_) - 1;
+
+ assert(old_final_r_idx == k_);
+ swap_values(old_final_r_idx, old_gap_idx);
+
+ // now we pull an item out of H; any item is ok, but if we grab the rightmost and then
+ // reduce h_, the heap invariant will be preserved (and the gap will be restored), plus
+ // the push() of the item that will probably happen later will be cheap.
+
+ uint32_t pulled_idx = h_ - 1;
+ T pulled_item = data_[pulled_idx];
+ double pulled_weight = weights_[pulled_idx];
+ bool pulled_mark = marks_[pulled_idx];
+
+ if (pulled_mark) { --num_marks_in_h_; }
+ weights_[pulled_idx] = -1.0; // to make bugs easier to spot
+
+ --h_;
+ --k_;
+ --n_; // will be re-incremented with the update
+
+ update(pulled_item, pulled_weight, pulled_mark);
+ } else if ((h_ == 0) && (r_ > 0)) {
+ // pure reservoir mode, so can simply eject a randomly chosen sample from the reservoir
+ assert(r_ >= 2);
+
+ uint32_t r_idx_to_delete = 1 + next_int(r_); // 1 for the gap
+ uint32_t rightmost_r_idx = (1 + r_) - 1;
+ swap_values(r_idx_to_delete, rightmost_r_idx);
+ weights_[rightmost_r_idx] = -1.0;
+
+ --k_;
+ --r_;
+ }
+}
+
template<typename T, typename S, typename A>
void var_opt_sketch<T,S,A>::allocate_data_arrays(uint32_t tgt_size, bool use_marks) {
- std::cerr << "allocate_data_arrays(" << tgt_size << ", " << use_marks << ")\n";
filled_data_ = false;
data_ = A().allocate(tgt_size);
@@ -813,12 +1006,11 @@ void var_opt_sketch<T,S,A>::convert_to_heap() {
restore_towards_leaves(j);
}
- // TODO: Remove this block
// validates heap, used for initial debugging
- for (int j = h_ - 1; j >= 1; --j) {
- int p = ((j + 1) / 2) - 1;
- assert(weights_[p] <= weights_[j]);
- }
+ //for (int j = h_ - 1; j >= 1; --j) {
+ // int p = ((j + 1) / 2) - 1;
+ // assert(weights_[p] <= weights_[j]);
+ //}
}
template<typename T, typename S, typename A>
@@ -984,7 +1176,7 @@ void var_opt_sketch<T,S,A>::downsample_candidate_set(double wt_cands, int num_ca
}
template<typename T, typename S, typename A>
-int var_opt_sketch<T,S,A>::choose_delete_slot(double wt_cands, int num_cands) const {
+uint32_t var_opt_sketch<T,S,A>::choose_delete_slot(double wt_cands, int num_cands) const {
assert(r_ > 0);
if (m_ == 0) {
@@ -1012,7 +1204,7 @@ int var_opt_sketch<T,S,A>::choose_delete_slot(double wt_cands, int num_cands) co
}
template<typename T, typename S, typename A>
-int var_opt_sketch<T,S,A>::choose_weighted_delete_slot(double wt_cands, int num_cands) const {
+uint32_t var_opt_sketch<T,S,A>::choose_weighted_delete_slot(double wt_cands, int num_cands) const {
assert(m_ >= 1);
int offset = h_;
@@ -1036,7 +1228,7 @@ int var_opt_sketch<T,S,A>::choose_weighted_delete_slot(double wt_cands, int num_
}
template<typename T, typename S, typename A>
-int var_opt_sketch<T,S,A>::pick_random_slot_in_r() const {
+uint32_t var_opt_sketch<T,S,A>::pick_random_slot_in_r() const {
assert(r_ > 0);
int offset = h_ + m_;
if (r_ == 1) {
@@ -1053,7 +1245,7 @@ double var_opt_sketch<T,S,A>::peek_min() const {
}
template<typename T, typename S, typename A>
-bool var_opt_sketch<T,S,A>::is_marked(int idx) const {
+inline bool var_opt_sketch<T,S,A>::is_marked(int idx) const {
return marks_ == nullptr ? false : marks_[idx];
}
@@ -1062,6 +1254,13 @@ double var_opt_sketch<T,S,A>::get_tau() const {
return r_ == 0 ? std::nan("1") : (total_wt_r_ / r_);
}
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::strip_marks() {
+ assert(marks_ != nullptr);
+ num_marks_in_h_ = 0;
+ AllocBool().deallocate(marks_, curr_items_alloc_);
+ marks_ = nullptr;
+}
template<typename T, typename S, typename A>
void var_opt_sketch<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
@@ -1099,44 +1298,127 @@ void var_opt_sketch<T,S,A>::check_family_and_serialization_version(uint8_t famil
+ std::to_string(FAMILY) + ". Found: " + std::to_string(family_id));
}
+template<typename T, typename S, typename A>
+subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(std::function<bool(T)> predicate) const {
+ if (n_ == 0) {
+ return {0.0, 0.0, 0.0, 0.0};
+ }
+
+ double total_wt_h = 0.0;
+ double h_true_wt = 0.0;
+ size_t idx = 0;
+ for (; idx < h_; ++idx) {
+ double wt = weights_[idx];
+ total_wt_h += wt;
+ if (predicate(data_[idx])) {
+ h_true_wt += wt;
+ }
+ }
+
+ // if only heavy items, we have an exact answre
+ if (r_ == 0) {
+ return {h_true_wt, h_true_wt, h_true_wt, h_true_wt};
+ }
+
+ uint64_t num_samples = n_ - h_;
+ assert(num_samples > 0);
+ double effective_sampling_rate = r_ / static_cast<double>(num_samples);
+ assert(effective_sampling_rate >= 0.0);
+ assert(effective_sampling_rate <= 1.0);
+
+ size_t r_true_count = 0;
+ ++idx; // skip the gap
+ for (; idx < (k_ + 1); ++idx) {
+ if (predicate(data_[idx])) {
+ ++r_true_count;
+ }
+ }
+
+ double lb_true_fraction = pseudo_hypergeometric_lb_on_p(r_, r_true_count, effective_sampling_rate);
+ double estimated_true_fraction = (1.0 * r_true_count) / r_;
+ double ub_true_fraction = pseudo_hypergeometric_ub_on_p(r_, r_true_count, effective_sampling_rate);
+
+ return { h_true_wt + (total_wt_r_ * lb_true_fraction),
+ h_true_wt + (total_wt_r_ * estimated_true_fraction),
+ h_true_wt + (total_wt_r_ * ub_true_fraction),
+ total_wt_h + total_wt_r_
+ };
+}
template<typename T, typename S, typename A>
typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::begin() const {
- return var_opt_sketch<T, S, A>::const_iterator(data_, weights_, h_, r_, total_wt_r_, false);
+ return var_opt_sketch<T, S, A>::const_iterator(*this, false);
}
template<typename T, typename S, typename A>
typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::end() const {
- return var_opt_sketch<T, S, A>::const_iterator(data_, weights_, h_, r_, total_wt_r_, true);
+ return var_opt_sketch<T, S, A>::const_iterator(*this, true);
}
// -------- var_opt_sketch::const_iterator implementation ---------
template<typename T, typename S, typename A>
-var_opt_sketch<T,S,A>::const_iterator::const_iterator(const T* items, const double* weights,
- const uint32_t h_count, const uint32_t r_count,
- const double total_wt_r, bool use_end) :
-items(items), weights(weights), h_count(h_count), r_count(r_count), r_item_wt(r_count > 0 ? total_wt_r/r_count : -1.0) {
+var_opt_sketch<T,S,A>::const_iterator::const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end) :
+ sk_(&sk),
+ cum_r_weight_(0.0),
+ r_item_wt_(sk.get_tau()),
+ weight_correction_(false)
+{
// index logic easier to read if not inline
- if (use_end) {
- index = (r_count > 0 ? h_count + r_count + 1 : h_count);
+ final_idx_ = (sk.r_ > 0 ? sk.h_ + sk.r_ + 1 : sk.h_);
+ if (is_end) {
+ idx_ = final_idx_;
+ sk_ = nullptr;
} else {
- index = (h_count == 0 && r_count > 0 ? 1 : 0); // skip the gap if at the start
+ idx_ = (sk.h_ == 0 && sk.r_ > 0 ? 1 : 0); // skip if gap is at start
}
+
+ // should only apply if sketch is empty
+ if (idx_ == final_idx_) { sk_ = nullptr; }
}
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::const_iterator::const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region, bool weight_corr) :
+ sk_(&sk),
+ cum_r_weight_(0.0),
+ r_item_wt_(sk.get_tau()),
+ weight_correction_(weight_corr)
+{
+ if (use_r_region) {
+ idx_ = sk.h_ + 1 + (is_end ? sk.r_ : 0);
+ final_idx_ = sk.h_ + 1 + sk.r_;
+ } else { // H region
+ // gap at start only if h_ == 0, so index always starts at 0
+ idx_ = (is_end ? sk.h_ : 0);
+ final_idx_ = sk.h_;
+ }
+
+ // unlike in full iterator case, may happen even if sketch is not empty
+ if (idx_ == final_idx_) { sk_ = nullptr; }
+}
+
+
template<typename T, typename S, typename A>
-var_opt_sketch<T, S, A>::const_iterator::const_iterator(const const_iterator& other):
-items(other.items), weights(other.weights), h_count(other.h_count), r_count(other.r_count), r_item_wt(other.r_item_wt), index(other.index)
+var_opt_sketch<T, S, A>::const_iterator::const_iterator(const const_iterator& other) :
+ sk_(other.sk_),
+ cum_r_weight_(other.cum_r_weight_),
+ r_item_wt_(other.r_item_wt_),
+ idx_(other.idx_),
+ final_idx_(other.final_idx_),
+ weight_correction_(other.weight_correction_)
{}
template<typename T, typename S, typename A>
typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++() {
- ++index;
- // check for the gap
- if (index == h_count && r_count > 0) {
- ++index;
+ ++idx_;
+
+ if (idx_ == final_idx_) {
+ sk_ = nullptr;
+ return *this;
+ } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
+ ++idx_;
}
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
return *this;
}
@@ -1149,8 +1431,9 @@ typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const
template<typename T, typename S, typename A>
bool var_opt_sketch<T, S, A>::const_iterator::operator==(const const_iterator& other) const {
- if (items != other.items || h_count != other.h_count || r_count != other.r_count) return false;
- return index == other.index;
+ if (sk_ != other.sk_) return false;
+ if (sk_ == nullptr) return true; // end (and we know other.sk_ is also null)
+ return idx_ == other.idx_;
}
template<typename T, typename S, typename A>
@@ -1160,7 +1443,20 @@ bool var_opt_sketch<T, S, A>::const_iterator::operator!=(const const_iterator& o
template<typename T, typename S, typename A>
const std::pair<const T&, const double> var_opt_sketch<T, S, A>::const_iterator::operator*() const {
- return std::pair<const T&, const double>(items[index], weights[index]);
+ double wt;
+ if (idx_ < sk_->h_) {
+ wt = sk_->weights_[idx_];
+ } else if (weight_correction_ && idx_ == final_idx_ - 1) {
+ wt = sk_->total_wt_r_ - cum_r_weight_;
+ } else {
+ wt = r_item_wt_;
+ }
+ return std::pair<const T&, const double>(sk_->data_[idx_], wt);
+}
+
+template<typename T, typename S, typename A>
+const bool var_opt_sketch<T, S, A>::const_iterator::get_mark() const {
+ return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
}
@@ -1177,7 +1473,7 @@ namespace random_utils {
* If so, returns max sampling size, otherwise passes through target size.
*/
template<typename T, typename S, typename A>
-int var_opt_sketch<T,S,A>::get_adjusted_size(int max_size, int resize_target) {
+uint32_t var_opt_sketch<T,S,A>::get_adjusted_size(int max_size, int resize_target) {
if (max_size - (resize_target << 1) < 0L) {
return max_size;
}
@@ -1185,13 +1481,25 @@ int var_opt_sketch<T,S,A>::get_adjusted_size(int max_size, int resize_target) {
}
template<typename T, typename S, typename A>
-int var_opt_sketch<T,S,A>::starting_sub_multiple(int lg_target, int lg_rf, int lg_min) {
+uint32_t var_opt_sketch<T,S,A>::starting_sub_multiple(int lg_target, int lg_rf, int lg_min) {
return (lg_target <= lg_min)
? lg_min : (lg_rf == 0) ? lg_target
: (lg_target - lg_min) % lg_rf + lg_min;
}
template<typename T, typename S, typename A>
+double var_opt_sketch<T,S,A>::pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate) {
+ double adjusted_kappa = DEFAULT_KAPPA * sqrt(1 - sampling_rate);
+ return bounds_binomial_proportions::approximate_upper_bound_on_p(n, k, adjusted_kappa);
+}
+
+template<typename T, typename S, typename A>
+double var_opt_sketch<T,S,A>::pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate) {
+ double adjusted_kappa = DEFAULT_KAPPA * sqrt(1 - sampling_rate);
+ return bounds_binomial_proportions::approximate_lower_bound_on_p(n, k, adjusted_kappa);
+}
+
+template<typename T, typename S, typename A>
bool var_opt_sketch<T,S,A>::is_power_of_2(uint32_t v) {
return v && !(v & (v - 1));
}
@@ -1207,8 +1515,8 @@ uint32_t var_opt_sketch<T,S,A>::to_log_2(uint32_t v) {
// Returns an integer in the range [0, max_value) -- excludes max_value
template<typename T, typename S, typename A>
-int var_opt_sketch<T,S,A>::next_int(int max_value) {
- std::uniform_int_distribution<> dist(0, max_value - 1);
+uint32_t var_opt_sketch<T,S,A>::next_int(uint32_t max_value) {
+ std::uniform_int_distribution<uint32_t> dist(0, max_value - 1);
return dist(random_utils::rand);
}
diff --git a/sampling/include/var_opt_union.hpp b/sampling/include/var_opt_union.hpp
new file mode 100644
index 0000000..a1a24ff
--- /dev/null
+++ b/sampling/include/var_opt_union.hpp
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef _VAR_OPT_UNION_HPP_
+#define _VAR_OPT_UNION_HPP_
+
+#include "var_opt_sketch.hpp"
+#include "serde.hpp"
+
+#include <vector>
+
+namespace datasketches {
+
+template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
+
+/**
+ * author Kevin Lang
+ * author Jon Malkin
+ */
+
+template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
+class var_opt_union {
+
+public:
+ explicit var_opt_union(uint32_t max_k);
+ var_opt_union(const var_opt_union& other);
+ var_opt_union(var_opt_union&& other) noexcept;
+ //static var_opt_union deserialize(std::istream& is);
+ //static var_opt_union deserialize(const void* bytes, size_t size);
+
+ ~var_opt_union();
+
+ var_opt_union& operator=(const var_opt_union& other);
+ var_opt_union& operator=(var_opt_union&& other);
+
+ void update(var_opt_sketch<T,S,A>& sk);
+ //void update(var_opt_sketch<T,S,A>>& sk);
+
+ void reset();
+
+ /**
+ * Gets the varopt sketch resulting from the union of any input sketches.
+ *
+ * @return A varopt sketch
+ */
+ var_opt_sketch<T,S,A> get_result() const;
+
+ //std::vector<uint8_t, AllocU8<A>> serialize(unsigned header_size_bytes = 0) const;
+ //void serialize(std::ostream& os) const;
+
+ std::ostream& to_stream(std::ostream& os) const;
+ std::string to_string() const;
+
+
+private:
+ typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T,S,A>> AllocSketch;
+
+ uint64_t n_; // cumulative over all input sketches
+
+ // outer tau is the largest tau of any input sketch
+ double outer_tau_numer_; // total weight of all input R-zones where tau = outer_tau
+
+ // total cardinality of the same R-zones, or zero if no input sketch was in estimation mode
+ uint64_t outer_tau_denom_;
+
+ const uint32_t max_k_;
+
+ var_opt_sketch<T,S,A> gadget_;
+
+ /*
+ IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
+ but in fact is NOT because it doesn't satisfy the mathematical definition
+ of a varopt sketch of the concatenated input streams. Therefore it could be different
+ from a true varopt sketch with that value of K, in which case it could easily provide
+ worse estimation accuracy for subset-sum queries.
+
+ This should not surprise you; the approximation guarantees of varopt sketches
+ do not apply to things that merely resemble varopt sketches.
+
+ However, even though the gadget is not a varopt sketch, the result
+ of the unioning process IS a varopt sketch. It is constructed by a
+ somewhat complicated "resolution" process which determines the largest K
+ that a valid varopt sketch could have given the available information,
+ then constructs a varopt sketch of that size and returns it.
+
+ However, the gadget itself is not touched during the resolution process,
+ and additional sketches could subsequently be merged into the union,
+ at which point a varopt result could again be requested.
+ */
+
+ /*
+ Explanation of "marked items" in the union's gadget:
+
+ The boolean value "true" in an pair indicates that the item
+ came from an input sketch's R zone, so it is already the result of sampling.
+
+ Therefore it must not wind up in the H zone of the final result, because
+ that would imply that the item is "exact".
+
+ However, it is okay for a marked item to hang out in the gadget's H zone for a while.
+
+ And once the item has moved to the gadget's R zone, the mark is never checked again,
+ so no effort is made to ensure that its value is preserved or even makes sense.
+ */
+
+ /*
+ Note: if the computer could perform exact real-valued arithmetic, the union could finalize
+ its result by reducing k until inner_tau > outer_tau. [Due to the vagaries of floating point
+ arithmetic, we won't attempt to detect and specially handle the inner_tau = outer_tau special
+ case.]
+
+ In fact, we won't even look at tau while while reducing k. Instead the logic will be based
+ on the more robust integer quantity num_marks_in_h_ in the gadget. It is conceivable that due
+ to round-off error we could end up with inner_tau slightly less than outer_tau, but that should
+ be fairly harmless since we will have achieved our goal of getting the marked items out of H.
+
+ Also, you might be wondering why we are bothering to maintain the numerator and denominator
+ separately instead of just having a single variable outer_tau. This allows us (in certain
+ cases) to add an input's entire R-zone weight into the result sketch, as opposed to subdividing
+ it then adding it back up. That would be a source of numerical inaccuracy. And even
+ more importantly, this design choice allows us to exactly re-construct the input sketch
+ when there is only one of them.
+ */
+ void merge_into(const var_opt_sketch<T,S,A>& sk);
+
+ double get_outer_tau() const;
+
+ var_opt_sketch<T,S,A> simple_gadget_coercer() const;
+
+ bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
+ bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const;
+ void mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const;
+ void migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& sk) const;
+};
+
+}
+
+#include "var_opt_union_impl.hpp"
+
+#endif // _VAR_OPT_UNION_HPP_
\ No newline at end of file
diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp
new file mode 100644
index 0000000..9fe2323
--- /dev/null
+++ b/sampling/include/var_opt_union_impl.hpp
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef _VAR_OPT_UNION_IMPL_HPP_
+#define _VAR_OPT_UNION_IMPL_HPP_
+
+#include "var_opt_union.hpp"
+
+#include <cmath>
+
+namespace datasketches {
+
+template<typename T, typename S, typename A>
+var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
+ n_(0),
+ outer_tau_numer_(0),
+ outer_tau_denom_(0.0),
+ max_k_(max_k),
+ gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
+{}
+
+template<typename T, typename S, typename A>
+var_opt_union<T,S,A>::var_opt_union(const var_opt_union& other) :
+ n_(other.n_),
+ outer_tau_numer_(other.outer_tau_numer_),
+ outer_tau_denom_(other.outer_tau_denom_),
+ max_k_(other.max_k_),
+ gadget_(other.gadget)
+{}
+
+template<typename T, typename S, typename A>
+var_opt_union<T,S,A>::var_opt_union(var_opt_union&& other) noexcept :
+ n_(other.n_),
+ outer_tau_numer_(other.outer_tau_numer_),
+ outer_tau_denom_(other.outer_tau_denom_),
+ max_k_(other.max_k_)
+{
+ gadget_ = std::move(other.gadget_);
+}
+
+template<typename T, typename S, typename A>
+var_opt_union<T,S,A>::~var_opt_union() {}
+
+template<typename T, typename S, typename A>
+var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(const var_opt_union& other) {
+ var_opt_union<T,S,A> union_copy(other);
+ std::swap(n_, union_copy.n_);
+ std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
+ std::swap(outer_tau_numer_, union_copy.outer_tau_denom_);
+ std::swap(max_k_, union_copy.max_k_);
+ std::swap(gadget_, union_copy.gadget_);
+ return *this;
+}
+
+template<typename T, typename S, typename A>
+var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
+ std::swap(n_, other.n_);
+ std::swap(outer_tau_numer_, other.outer_tau_numer_);
+ std::swap(outer_tau_numer_, other.outer_tau_denom_);
+ std::swap(max_k_, other.max_k_);
+ std::swap(gadget_, other.gadget_);
+ return *this;
+}
+
+template<typename T, typename S, typename A>
+void var_opt_union<T,S,A>::reset() {
+ if (gadget_ != nullptr)
+ gadget_.reset();
+}
+
+template<typename T, typename S, typename A>
+std::ostream& var_opt_union<T,S,A>::to_stream(std::ostream& os) const {
+ os << "### VarOpt Union SUMMARY: " << std::endl;
+ os << " . n : " << n_ << std::endl;
+ os << " Max k : " << max_k_ << std::endl;
+ os << " Gadget Summary: " << std::endl;
+ gadget_.to_stream(os);
+ os << "### END VarOpt Union SUMMARY: " << std::endl;
+
+ return os;
+}
+
+template<typename T, typename S, typename A>
+std::string var_opt_union<T,S,A>::to_string() const {
+ std::ostringstream ss;
+ to_stream(ss);
+ return ss.str();
+}
+
+template<typename T, typename S, typename A>
+void var_opt_union<T,S,A>::update(var_opt_sketch<T,S,A>& sk) {
+ merge_into(sk);
+}
+
+template<typename T, typename S, typename A>
+double var_opt_union<T,S,A>::get_outer_tau() const {
+ if (outer_tau_denom_ == 0) {
+ return 0.0;
+ } else {
+ return outer_tau_numer_ / outer_tau_denom_;
+ }
+}
+
+template<typename T, typename S, typename A>
+void var_opt_union<T,S,A>::merge_into(const var_opt_sketch<T,S,A>& sketch) {
+ if (sketch.n_ == 0) {
+ return;
+ }
+
+ n_ += sketch.n_;
+
+ // H region iterator
+ typename var_opt_sketch<T,S,A>::const_iterator h_itr(sketch, false, false, false);
+ typename var_opt_sketch<T,S,A>::const_iterator h_end(sketch, true, false, false);
+ while (h_itr != h_end) {
+ std::pair<const T&, const double> sample = *h_itr;
+ gadget_.update(sample.first, sample.second, false);
+ ++h_itr;
+ }
+
+ // Weight-correcitng R region iterator
+ typename var_opt_sketch<T,S,A>::const_iterator r_itr(sketch, false, true, true);
+ typename var_opt_sketch<T,S,A>::const_iterator r_end(sketch, true, true, true);
+ while (r_itr != r_end) {
+ std::pair<const T&, const double> sample = *r_itr;
+ gadget_.update(sample.first, sample.second, true);
+ ++r_itr;
+ }
+
+ // resolve tau
+ if (sketch.r_ > 0) {
+ double sketch_tau = sketch.get_tau();
+ double outer_tau = get_outer_tau();
+
+ if (outer_tau_denom_ == 0) {
+ // detect first estimation mode sketch and grab its tau
+ outer_tau_numer_ = sketch.total_wt_r_;
+ outer_tau_denom_ = sketch.r_;
+ } else if (sketch_tau > outer_tau) {
+ // switch to a bigger value of outer_tau
+ outer_tau_numer_ = sketch.total_wt_r_;
+ outer_tau_denom_ = sketch.r_;
+ } else if (sketch_tau == outer_tau) {
+ // Ok if previous equality test isn't quite perfect. Mistakes in either direction should
+ // be fairly benign.
+ // Without conceptually changing outer_tau, update number and denominator. In particular,
+ // add the total weight of the incoming reservoir to the running total.
+ outer_tau_numer_ += sketch.total_wt_r_;
+ outer_tau_denom_ += sketch.r_;
+ }
+
+ // do nothing if sketch's tau is smaller than outer_tau
+ }
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A> var_opt_union<T,S,A>::get_result() const {
+ // If no marked items in H, gadget is already valid mathematically. We can return what is
+ // basically just a copy of the gadget.
+ if (gadget_.num_marks_in_h_ == 0) {
+ return simple_gadget_coercer();
+ } else {
+ // Copy of gadget. This may produce needless copying in the
+ // pseudo-exact case below, but should simplify the code without
+ // needing to make the gadget a pointer
+ var_opt_sketch<T,S,A> gcopy(gadget_, false, n_);
+
+ // At this point, we know that marked items are present in H. So:
+ // 1. Result will necessarily be in estimation mode
+ // 2. Marked items currently in H need to be absorbed into reservoir (R)
+ bool is_pseudo_exact = detect_and_handle_subcase_of_pseudo_exact(gcopy);
+ if (!is_pseudo_exact) {
+ // continue with main logic
+ migrate_marked_items_by_decreasing_k(gcopy);
+ }
+ // sub-case was already detected and handled, so return the result
+ return gcopy;
+ }
+}
+
+/**
+ * When there are no marked items in H, the gadget is mathematically equivalent to a valid
+ * varopt sketch. This method simply returns a copy (without perserving marks).
+ *
+ * @return A shallow copy of the gadget as valid varopt sketch
+ */
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A> var_opt_union<T,S,A>::simple_gadget_coercer() const {
+ assert(gadget_.num_marks_in_h_ == 0);
+ return var_opt_sketch<T,S,A>(gadget_, true, n_);
+}
+
+// this is a condition checked in detect_and_handle_subcase_of_pseudo_exact()
+template<typename T, typename S, typename A>
+bool var_opt_union<T,S,A>::there_exist_unmarked_h_items_lighter_than_target(double threshold) const {
+ for (int i = 0; i < gadget_.h_; ++i) {
+ if ((gadget_.weights_[i] < threshold) && !gadget_.marks_[i]) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template<typename T, typename S, typename A>
+bool var_opt_union<T,S,A>::detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const {
+ // gadget is seemingly exact
+ bool condition1 = gadget_.r_ == 0;
+
+ // but there are marked items in H, so only _pseudo_ exact
+ bool condition2 = gadget_.num_marks_in_h_ > 0;
+
+ // if gadget is pseudo-exact and the number of marks equals outer_tau_denom, then we can deduce
+ // from the bookkeeping logic of mergeInto() that all estimation mode input sketches must
+ // have had the same tau, so we can throw all of the marked items into a common reservoir.
+ bool condition3 = gadget_.num_marks_in_h_ == outer_tau_denom_;
+
+ if (!(condition1 && condition2 && condition3)) {
+ return false;
+ } else {
+
+ // explicitly enforce rule that items in H should not be lighter than the sketch's tau
+ bool anti_condition4 = there_exist_unmarked_h_items_lighter_than_target(gadget_.get_tau());
+ if (anti_condition4) {
+ return false;
+ } else {
+ // conditions 1 through 4 hold
+ mark_moving_gadget_coercer(sk);
+ return true;
+ }
+ }
+}
+
+/**
+ * This coercer directly transfers marked items from the gadget's H into the result's R.
+ * Deciding whether that is a valid thing to do is the responsibility of the caller. Currently,
+ * this is only used for a subcase of pseudo-exact, but later it might be used by other
+ * subcases as well.
+ *
+ * @param sk Copy of the gadget, modified with marked items moved to the reservoir
+ */
+template<typename T, typename S, typename A>
+void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const {
+ uint32_t result_k = gadget_.h_ + gadget_.r_;
+
+ uint32_t result_h = 0;
+ uint32_t result_r = 0;
+ size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
+
+ typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
+ double* wts = AllocDouble().allocate(result_k + 1);
+ T* data = A().allocate(result_k + 1);
+
+ // insert R region items, ignoring weights
+ // Currently (May 2017) this next block is unreachable; this coercer is used only in the
+ // pseudo-exact case in which case there are no items natively in R, only marked items in H
+ // that will be moved into R as part of the coercion process.
+ // Addedndum (Jan 2020): Cleanup at end of method assumes R count is 0
+ size_t final_idx = gadget_.get_num_samples();
+ for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
+ data[next_r_pos] = gadget_.data_[idx];
+ wts[next_r_pos] = gadget_.weights_[idx];
+ ++result_r;
+ --next_r_pos;
+ }
+
+ double transferred_weight = 0;
+
+ // insert H region items
+ for (size_t idx = 0; idx < gadget_.h_; ++idx) {
+ if (gadget_.marks_[idx]) {
+ data[next_r_pos] = gadget_.data_[idx];
+ wts[next_r_pos] = -1.0;
+ transferred_weight += gadget_.weights_[idx];
+ ++result_r;
+ --next_r_pos;
+ } else {
+ data[result_h] = gadget_.data_[idx];
+ wts[result_h] = gadget_.weights_[idx];
+ ++result_h;
+ }
+ }
+
+ assert((result_h + result_r) == result_k);
+ assert(fabs(transferred_weight - outer_tau_numer_) < 1e-10);
+
+ double result_r_weight = gadget_.total_wt_r_ + transferred_weight;
+ uint64_t result_n = n_;
+
+ // explicitly set weight value for the gap
+ //data[result_h] = nullptr; invalid not a pointer
+ wts[result_h] = -1.0;
+
+ // clean up arrays in input sketch, replace with new values
+ typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
+ AllocBool().deallocate(sk.marks_, sk.curr_items_alloc_);
+ AllocDouble().deallocate(sk.weights_, sk.curr_items_alloc_);
+ for (size_t i = 0; i < result_k; ++i) { A().destroy(sk.data_ + i); } // assumes everything in H region, no gap
+ A().deallocate(sk.data_, sk.curr_items_alloc_);
+
+ sk.data_ = data;
+ sk.weights_ = wts;
+ sk.marks_ = nullptr;
+ sk.num_marks_in_h_ = 0;
+ sk.curr_items_alloc_ = result_k + 1;
+ sk.k_ = result_k;
+ sk.n_ = result_n;
+ sk.h_ = result_h;
+ sk.r_ = result_r;
+ sk.total_wt_r_ = result_r_weight;
+}
+
+// this is basically a continuation of get_result(), but modifying the input gadget copy
+template<typename T, typename S, typename A>
+void var_opt_union<T,S,A>::migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& gcopy) const {
+ uint32_t r_count = gcopy.r_;
+ uint32_t h_count = gcopy.h_;
+ uint32_t k = gcopy.k_;
+
+ assert(gcopy.num_marks_in_h_ > 0); // ensured by caller
+ // either full (of samples), or in pseudo-exact mode, or both
+ assert((r_count == 0) || (k == (h_count + r_count)));
+
+ // if non-full and pseudo-exact, change k so that gcopy is full
+ if ((r_count == 0) && (h_count < k)) {
+ gcopy.k_ = h_count; // may leve extra space allocated but that's ok
+ }
+
+ // Now k equals the number of samples, so reducing k will increase tau.
+ // Also, we know that there are at least 2 samples because 0 or 1 would have been handled
+ // by the earlier logic in get_result()
+ assert(gcopy.k_ >= 2);
+ gcopy.decrease_k_by_1();
+
+ // gcopy is now in estimation mode, just like the final result must be (due to marked items)
+ assert(gcopy.r_ > 0);
+ assert(gcopy.get_tau() > 0.0);
+
+ // keep reducing k until all marked items have been absorbed into the reservoir
+ while (gcopy.num_marks_in_h_ > 0) {
+ assert(gcopy.k_ >= 2); // because h_ and r_ are both at least 1
+ gcopy.decrease_k_by_1();
+ }
+
+ gcopy.strip_marks();
+}
+
+
+} // namespace datasketches
+
+#endif // _VAR_OPT_UNION_IMPL_HPP_
diff --git a/sampling/test/counting_allocator.hpp b/sampling/test/counting_allocator.hpp
deleted file mode 100644
index e4c546f..0000000
--- a/sampling/test/counting_allocator.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef COUNTING_ALLOCATOR_H
-#define COUNTING_ALLOCATOR_H
-
-#include <new>
-#include <utility>
-
-namespace datasketches {
-
-// this relies on global variables to track allocated memory
-extern long long int total_allocated_memory;
-extern long long int total_objects_constructed;
-
-template <class T> class counting_allocator {
-public:
- typedef T value_type;
- typedef value_type* pointer;
- typedef const value_type* const_pointer;
- typedef value_type& reference;
- typedef const value_type& const_reference;
- typedef std::size_t size_type;
- typedef std::ptrdiff_t difference_type;
-
- template <class U>
- struct rebind { typedef counting_allocator<U> other; };
-
- counting_allocator() {}
- counting_allocator(const counting_allocator&) {}
- template <class U>
- counting_allocator(const counting_allocator<U>&) {}
- ~counting_allocator() {}
-
- pointer address(reference x) const { return &x; }
- const_pointer address(const_reference x) const {
- return &x;
- }
-
- pointer allocate(size_type n, const_pointer = 0) {
- void* p = malloc(n * sizeof(T));
- if (!p) throw std::bad_alloc();
- total_allocated_memory += n * sizeof(T);
- return static_cast<pointer>(p);
- }
-
- void deallocate(pointer p, size_type n) {
- if (p) free(p);
- total_allocated_memory -= n * sizeof(T);
- }
-
- size_type max_size() const {
- return static_cast<size_type>(-1) / sizeof(T);
- }
-
- template<typename... Args>
- void construct(pointer p, Args&&... args) {
- new(p) value_type(std::forward<Args>(args)...);
- ++total_objects_constructed;
- }
- void destroy(pointer p) {
- p->~value_type();
- --total_objects_constructed;
- }
-
-private:
- void operator=(const counting_allocator&);
-};
-
-template<> class counting_allocator<void> {
-public:
- typedef void value_type;
- typedef void* pointer;
- typedef const void* const_pointer;
-
- template <class U>
- struct rebind { typedef counting_allocator<U> other; };
-};
-
-
-template <class T>
-inline bool operator==(const counting_allocator<T>&, const counting_allocator<T>&) {
- return true;
-}
-
-template <class T>
-inline bool operator!=(const counting_allocator<T>&, const counting_allocator<T>&) {
- return false;
-}
-
-}
-
-#endif
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index ba75ba3..7251eb7 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -23,6 +23,7 @@
#include "counting_allocator.hpp"
#include <var_opt_sketch.hpp>
+#include <var_opt_union.hpp>
#include <string>
#include <sstream>
@@ -36,16 +37,35 @@ static std::string testBinaryInputPath = "test/";
namespace datasketches {
-long long int total_allocated_memory;
-long long int total_objects_constructed;
-
class var_opt_sketch_test: public CppUnit::TestFixture {
CPPUNIT_TEST_SUITE(var_opt_sketch_test);
CPPUNIT_TEST(empty);
- CPPUNIT_TEST(mem_test);
+ CPPUNIT_TEST(vo_union);
CPPUNIT_TEST_SUITE_END();
+ void vo_union() {
+ int k = 10;
+ var_opt_sketch<int> sk(k), sk2(k+3);
+
+ for (int i = 0; i < 10*k; ++i) {
+ sk.update(i);
+ sk2.update(i);
+ }
+ sk.update(-1, 10000.0);
+ sk2.update(-2, 4000.0);
+ std::cerr << sk.to_string() << std::endl;
+
+ var_opt_union<int> vou(k+3);
+ std::cerr << vou.to_string() << std::endl;
+ vou.update(sk);
+ vou.update(sk2);
+ std::cerr << vou.to_string() << std::endl;
+
+ var_opt_sketch<int> r = vou.get_result();
+ std::cerr << "-----------------------" << std::endl << r.to_string() << std::endl;
+ }
+
void empty() {
int k = 10;
@@ -64,7 +84,7 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
var_opt_sketch<int> sk2 = var_opt_sketch<int>::deserialize(ss);
std::cout << sk2.to_string() << std::endl;;
}
- std::cerr << "num allocs: " << num_allocs << "\n";
+
{
var_opt_sketch<std::string> sk(k);
std::cout << "Expected size: " << sk.get_serialized_size_bytes() << std::endl;
@@ -112,91 +132,6 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
}
}
- void print_mem_stats() {
- std::cout << "construct(): " << total_objects_constructed << std::endl;
- std::cout << "memory used: " << total_allocated_memory << std::endl;
- }
-
- void mem_test() {
- int k = 10;
-
- total_allocated_memory = 0;
- total_objects_constructed = 0;
-
- typedef var_opt_sketch<std::string, serde<std::string>, counting_allocator<std::string>> var_opt_sketch_a;
- var_opt_sketch_a* s = new (counting_allocator<var_opt_sketch_a>().allocate(1)) var_opt_sketch_a(k);
-
- std::string x[26];
- x[0] = std::string("a");
- x[1] = std::string("b");
- x[2] = std::string("c");
- x[3] = std::string("d");
- x[4] = std::string("e");
- x[5] = std::string("f");
- x[6] = std::string("g");
- x[7] = std::string("h");
- x[8] = std::string("i");
- x[9] = std::string("j");
- x[10] = std::string("k");
- x[11] = std::string("l");
- x[12] = std::string("m");
- x[13] = std::string("n");
- x[14] = std::string("o");
- x[15] = std::string("p");
- x[16] = std::string("q");
- x[17] = std::string("r");
- x[18] = std::string("s");
- x[19] = std::string("t");
- x[20] = std::string("u");
- x[21] = std::string("v");
- x[22] = std::string("w");
- x[23] = std::string("x");
- x[24] = std::string("y");
- x[25] = std::string("z");
-
- for (int i=0; i <5; ++i)
- s->update(x[i]);
- print_mem_stats();
-
- for (int i=5; i <11; ++i)
- s->update(x[i]);
- print_mem_stats();
-
- s->update(x[11], 10000);
- print_mem_stats();
-
- for (int i=12; i <26; ++i)
- s->update(x[i]);
- print_mem_stats();
-
- std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
- s->serialize(ss);
- std::cout << s->to_string() << std::endl;
- print_mem_stats();
-
- counting_allocator<var_opt_sketch_a>().destroy(s);
- counting_allocator<var_opt_sketch_a>().deallocate(s, 1);
- print_mem_stats();
-
- {
- auto sk = var_opt_sketch_a::deserialize(ss);
- print_mem_stats();
-
- sk.update("qrs");
- print_mem_stats();
-
- sk.update("zyx");
- print_mem_stats();
-
- std::cout << sk.to_string() << std::endl;
-
- auto vec = sk.serialize();
- var_opt_sketch_a sk2 = var_opt_sketch_a::deserialize(vec.data(), vec.size());
- std::cout << sk2.to_string() << std::endl;
- }
- print_mem_stats();
- }
-
};
CPPUNIT_TEST_SUITE_REGISTRATION(var_opt_sketch_test);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org