You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/01/18 07:33:50 UTC
[incubator-datasketches-cpp] 01/01: [WIP,
incomplete with a bunch of debug code] checkpointing varopt with
basic sketch framework mostly coded. Lacks querying, unit tests,
and unioning so lots left to do
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit e788651786616f26742b102e3064094a8615ea16
Author: Jon Malkin <jm...@apache.org>
AuthorDate: Fri Jan 17 23:33:30 2020 -0800
[WIP, incomplete with a bunch of debug code] checkpointing varopt with basic sketch framework mostly coded. Lacks querying, unit tests, and unioning so lots left to do
---
CMakeLists.txt | 3 +-
NOTICE | 4 +
sampling/CMakeLists.txt | 50 ++
sampling/include/var_opt_sketch.hpp | 204 +++++
sampling/include/var_opt_sketch_impl.hpp | 1252 ++++++++++++++++++++++++++++++
sampling/test/CMakeLists.txt | 46 ++
sampling/test/counting_allocator.hpp | 109 +++
sampling/test/var_opt_sketch_test.cpp | 204 +++++
8 files changed, 1871 insertions(+), 1 deletion(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 820bb55..7b40cdf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,12 +80,13 @@ add_subdirectory(cpc)
add_subdirectory(kll)
add_subdirectory(fi)
add_subdirectory(theta)
+add_subdirectory(sampling)
if (WITH_PYTHON)
add_subdirectory(python)
endif()
-target_link_libraries(datasketches PUBLIC hll cpc kll fi theta)
+target_link_libraries(datasketches PUBLIC hll cpc kll fi theta sampling)
set_target_properties(datasketches PROPERTIES
LINKER_LANGUAGE CXX
diff --git a/NOTICE b/NOTICE
index 34204eb..9c06f60 100644
--- a/NOTICE
+++ b/NOTICE
@@ -16,6 +16,10 @@ This product contains code written by Austin Appleby and placed in the public do
The original: https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
* common/include/MurmurHash3.h
+This product contains code published by Sean Eron Anderson and placed in the public domain.
+The original: https://graphics.stanford.edu/~seander/bithacks.html
+ * common/include/CommonUtil.h
+
Apache 2.0 License
==================
diff --git a/sampling/CMakeLists.txt b/sampling/CMakeLists.txt
new file mode 100644
index 0000000..0ca0743
--- /dev/null
+++ b/sampling/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+add_library(sampling INTERFACE)
+
+add_library(${PROJECT_NAME}::SAMPLING ALIAS sampling)
+
+if (BUILD_TESTS)
+ add_subdirectory(test)
+endif()
+
+target_include_directories(sampling
+ INTERFACE
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
+ PRIVATE
+ ${COMMON_INCLUDE_DIR}
+)
+
+target_link_libraries(sampling INTERFACE common)
+target_compile_features(sampling INTERFACE cxx_std_11)
+
+set(sampling_HEADERS "include/var_opt_sketch.hpp;include/var_opt_sketch_impl.hpp")
+
+install(TARGETS sampling
+ EXPORT ${PROJECT_NAME}
+)
+
+install(FILES ${sampling_HEADERS}
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
+
+target_sources(sampling
+ INTERFACE
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch_impl.hpp
+)
diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
new file mode 100644
index 0000000..83f89c7
--- /dev/null
+++ b/sampling/include/var_opt_sketch.hpp
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef _VAR_OPT_SKETCH_HPP_
+#define _VAR_OPT_SKETCH_HPP_
+
+#include "serde.hpp"
+
+#include <vector>
+#include <iterator>
+
+namespace datasketches {
+
+template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
+
+int num_allocs = 0;
+
+/**
+ * author Kevin Lang
+ * author Jon Malkin
+ */
+template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
+class var_opt_sketch {
+
+ public:
+ enum resize_factor { X1 = 0, X2, X4, X8 };
+ static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
+
+ var_opt_sketch(uint32_t k, resize_factor rf = DEFAULT_RESIZE_FACTOR);
+ static var_opt_sketch<T,S,A> deserialize(std::istream& is);
+ static var_opt_sketch<T,S,A> deserialize(const void* bytes, size_t size);
+
+ virtual ~var_opt_sketch();
+
+ void update(const T& item, double weight=1.0);
+ //void update(T&& item, double weight=1.0);
+
+ uint32_t get_k() const;
+ uint64_t get_n() const;
+ uint32_t get_num_samples() const;
+
+ bool is_empty() const;
+ void reset();
+
+ // version for fixed-size arithmetic types (integer, floating point)
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
+ size_t get_serialized_size_bytes() const;
+
+ // version for all other types
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
+ size_t get_serialized_size_bytes() const;
+
+ std::vector<uint8_t, AllocU8<A>> serialize(unsigned header_size_bytes = 0) const;
+ void serialize(std::ostream& os) const;
+
+ std::ostream& to_stream(std::ostream& os) const;
+ std::string to_string() const;
+
+ //estimate_subset_sum()
+
+ private:
+ typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
+ typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
+
+ static const uint32_t MIN_LG_ARR_ITEMS = 4;
+
+ static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
+ static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
+ static const uint8_t PREAMBLE_LONGS_FULL = 4;
+ static const uint8_t SER_VER = 2;
+ static const uint8_t FAMILY = 12;
+ static const uint8_t EMPTY_FLAG_MASK = 4;
+ static const uint8_t GADGET_FLAG_MASK = 128;
+
+ // TODO: should probably rearrange a bit to minimize gaps once aligned
+ uint32_t k_; // max size of sketch, in items
+
+ uint32_t h_; // number of items in heap
+ uint32_t m_; // number of items in middle region
+ uint32_t r_; // number of items in reservoir-like region
+
+ uint64_t n_; // total number of items processed by sketch
+ double total_wt_r_; // total weight of items in reservoir-like area
+
+ const resize_factor rf_; // resize factor
+
+ uint32_t curr_items_alloc_; // currently allocated array size
+ bool filled_data_; // true if we've explciitly set all entries in data_
+
+ T* data_; // stored sampled items
+ double* weights_; // weights for sampled items
+
+ // The next two fields are hidden from the user because they are part of the state of the
+ // unioning algorithm, NOT part of a varopt sketch, or even of a varopt "gadget" (our name for
+ // the potentially invalid sketch that is maintained by the unioning algorithm). It would make
+ // more sense logically for these fields to be declared in the unioning object (whose entire
+ // purpose is storing the state of the unioning algorithm) but for reasons of programming
+ // convenience we are currently declaring them here. However, that could change in the future.
+
+ // Following int is:
+ // 1. Zero (for a varopt sketch)
+ // 2. Count of marked items in H region, if part of a unioning algo's gadget
+ uint32_t num_marks_in_h_;
+
+ // The following array is absent in a varopt sketch, and notionally present in a gadget
+ // (although it really belongs in the unioning object). If the array were to be made explicit,
+ // some additional coding would need to be done to ensure that all of the necessary data motion
+ // occurs and is properly tracked.
+ bool* marks_;
+
+ var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget);
+ var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, uint8_t preamble_longs, std::istream& is);
+ var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, uint8_t preamble_longs, const void* bytes, size_t size);
+
+ // internal-use-only updates
+ void update(const T& item, double weight, bool mark);
+ void update_warmup_phase(const T& item, double weight, bool mark);
+ void update_light(const T& item, double weight, bool mark);
+ void update_heavy_r_eq1(const T& item, double weight, bool mark);
+ void update_heavy_general(const T& item, double weight, bool mark);
+
+ double get_tau() const;
+ double peek_min() const;
+ bool is_marked(int idx) const;
+
+ int pick_random_slot_in_r() const;
+ int choose_delete_slot(double wt_cand, int num_cand) const;
+ int choose_weighted_delete_slot(double wt_cand, int num_cand) const;
+
+ void transition_from_warmup();
+ void convert_to_heap();
+ void restore_towards_leaves(int slot_in);
+ void restore_towards_root(int slot_in);
+ void push(const T& item, double wt, bool mark);
+ void pop_min_to_m_region();
+ void grow_candidate_set(double wt_cands, int num_cands);
+ void decrease_k_by_1();
+ void force_set_k(int k); // used to resolve union gadget into sketch
+ void downsample_candidate_set(double wt_cands, int num_cands);
+ void swap_values(int src, int dst);
+ void grow_data_arrays();
+ void allocate_data_arrays(uint32_t tgt_size, bool use_marks);
+
+ // validation
+ static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
+ static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
+
+ // things to move to common utils and share among sketches
+ static int get_adjusted_size(int max_size, int resize_target);
+ static int starting_sub_multiple(int lg_target, int lg_rf, int lg_min);
+ static bool is_power_of_2(uint32_t v);
+ static uint32_t to_log_2(uint32_t v);
+ static uint32_t count_trailing_zeros(uint32_t v);
+ static uint32_t ceiling_power_of_2(uint32_t n);
+ static int next_int(int max_value);
+ static double next_double_exclude_zero();
+
+ class const_iterator;
+ const_iterator begin() const;
+ const_iterator end() const;
+};
+
+template<typename T, typename S, typename A>
+class var_opt_sketch<T, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
+public:
+ friend class var_opt_sketch<T, S, A>;
+ const_iterator(const const_iterator& other);
+ const_iterator& operator++();
+ const_iterator& operator++(int);
+ bool operator==(const const_iterator& other) const;
+ bool operator!=(const const_iterator& other) const;
+ const std::pair<const T&, const double> operator*() const;
+private:
+ const T* items;
+ const double* weights;
+ const uint32_t h_count;
+ const uint32_t r_count;
+ const double r_item_wt;
+ uint32_t index;
+ const_iterator(const T* items, const double* weights, const uint32_t h_count, const uint32_t r_count,
+ const double total_wt_r, bool use_end=false);
+};
+
+} // namespace datasketches
+
+#include "var_opt_sketch_impl.hpp"
+
+#endif // _VAR_OPT_SKETCH_HPP_
\ No newline at end of file
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
new file mode 100644
index 0000000..9fac697
--- /dev/null
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -0,0 +1,1252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef _VAR_OPT_SKETCH_IMPL_HPP_
+#define _VAR_OPT_SKETCH_IMPL_HPP_
+
+#include "var_opt_sketch.hpp"
+#include "serde.hpp"
+
+#include <memory>
+#include <iostream>
+#include <cmath>
+#include <random>
+#include <algorithm>
+
+namespace datasketches {
+
+/**
+ * Implementation code for the VarOpt sketch.
+ *
+ * author Kevin Lang
+ * author Jon Malkin
+ */
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf) :
+ var_opt_sketch<T,S,A>(k, rf, false) {}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget) :
+ k_(k), h_(0), m_(0), r_(0), n_(0), total_wt_r_(0.0), rf_(rf) {
+ if (k < 1) {
+ throw std::invalid_argument("k must be at least 1");
+ }
+
+ uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
+ int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
+ ++curr_items_alloc_;
+ }
+
+ allocate_data_arrays(curr_items_alloc_, is_gadget);
+ num_marks_in_h_ = 0;
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::~var_opt_sketch() {
+ if (filled_data_) {
+ // destroy everything
+ for (size_t i = 0; i < curr_items_alloc_; ++i) {
+ A().destroy(data_ + i);
+ }
+ } else {
+ // skip gap or anything unused at the end
+ for (size_t i = 0; i < h_; ++i) {
+ A().destroy(data_+ i);
+ }
+
+ for (size_t i = h_ + 1; i < curr_items_alloc_; ++i) {
+ A().destroy(data_ + i);
+ }
+ }
+ A().deallocate(data_, curr_items_alloc_);
+ AllocDouble().deallocate(weights_, curr_items_alloc_);
+
+ if (marks_ != nullptr) {
+ AllocBool().deallocate(marks_, curr_items_alloc_);
+ }
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, uint8_t preamble_longs, std::istream& is) :
+ k_(k), m_(0), rf_(rf) {
+
+ // second and third prelongs
+ is.read((char*)&n_, sizeof(uint64_t));
+ is.read((char*)&h_, sizeof(uint32_t));
+ is.read((char*)&r_, sizeof(uint32_t));
+
+ // initial data validation
+ // read fourth prelong, if needed, and determine vector sizes
+ if (n_ <= k_) {
+ if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
+ throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
+ "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+ }
+ if (n_ != h_) {
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
+ "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
+ }
+ if (r_ > 0) {
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
+ "Found r = " + std::to_string(r_));
+ }
+
+ uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
+ uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
+ int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
+ ++curr_items_alloc_;
+ }
+ } else { // n_ > k_
+ if (preamble_longs != PREAMBLE_LONGS_FULL) {
+ throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
+ "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+ }
+ if (h_ + r_ != k_) {
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
+ "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
+ }
+
+ is.read((char*)&total_wt_r_, sizeof(total_wt_r_));
+ if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
+ "Found r = " + std::to_string(r_) + ", R regionw weight = " + std::to_string(total_wt_r_));
+ }
+
+ curr_items_alloc_ = k_ + 1;
+ }
+
+ allocate_data_arrays(curr_items_alloc_, is_gadget);
+
+ // read the first h_ weights
+ is.read((char*)weights_, h_ * sizeof(double));
+ std::fill(&weights_[h_], &weights_[curr_items_alloc_], -1.0);
+
+ // read the first h_ marks as packed bytes iff we have a gadget
+ if (is_gadget) {
+ uint32_t num_bytes = (h_ >> 3) + ((h_ & 0x7) > 0 ? 1 : 0);
+
+ uint8_t val = 0;
+ for (int i = 0; i < num_bytes; ++i) {
+ if ((i & 0x7) == 0x0) { // should trigger on first iteration
+ is.read((char*)&val, sizeof(val));
+ }
+ marks_[i] = ((val >> (i & 0x7)) & 0x1) == 1;
+ }
+ }
+
+ // read the sample items, skipping the gap. Either h_ or r_ may be 0
+ S().deserialize(is, data_, h_); // aka &data_[0]
+ S().deserialize(is, &data_[h_ + 1], r_);
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, uint8_t preamble_longs,
+ const void* bytes, size_t size) : k_(k), m_(0), rf_(rf) {
+ // private constructor so we assume not called if sketch is empty
+ const char* ptr = static_cast<const char*>(bytes) + sizeof(uint64_t);
+
+ // second and third prelongs
+ ptr += copy_from_mem(ptr, &n_, sizeof(n_));
+ ptr += copy_from_mem(ptr, &h_, sizeof(h_));
+ ptr += copy_from_mem(ptr, &r_, sizeof(r_));
+
+ // initial data validation
+ // read fourth prelong, if needed, and determine vector sizes
+ if (n_ <= k_) {
+ if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
+ throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
+ "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+ }
+ if (n_ != h_) {
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
+ "Found n = " + std::to_string(n_) + ", h = " + std::to_string(h_));
+ }
+ if (r_ > 0) {
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
+ "Found r = " + std::to_string(r_));
+ }
+
+ uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
+ uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h_));
+ int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, min_lg_size);
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
+ ++curr_items_alloc_;
+ }
+ } else { // n_ > k_
+ if (preamble_longs != PREAMBLE_LONGS_FULL) {
+ throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
+ "Found n = " + std::to_string(n_) + ", k = " + std::to_string(k_));
+ }
+ if (h_ + r_ != k_) {
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
+ "Found h = " + std::to_string(h_) + ", r = " + std::to_string(r_) + ", n = " + std::to_string(n_));
+ }
+
+ ptr += copy_from_mem(ptr, &total_wt_r_, sizeof(total_wt_r_));
+ if (isnan(total_wt_r_) || r_ == 0 || total_wt_r_ <= 0.0) {
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
+ "Found r = " + std::to_string(r_) + ", R regionw weight = " + std::to_string(total_wt_r_));
+ }
+
+ curr_items_alloc_ = k_ + 1;
+ }
+
+ allocate_data_arrays(curr_items_alloc_, is_gadget);
+
+ // read the first h_ weights, fill in rest of array with -1.0
+ ptr += copy_from_mem(ptr, weights_, h_ * sizeof(double));
+ std::fill(&weights_[h_], &weights_[curr_items_alloc_], -1.0);
+
+ // read the first h_ marks as packed bytes iff we have a gadget
+ if (is_gadget) {
+ uint32_t num_bytes = (h_ >> 3) + ((h_ & 0x7) > 0 ? 1 : 0);
+
+ uint8_t val = 0;
+ for (int i = 0; i < num_bytes; ++i) {
+ if ((i & 0x7) == 0x0) { // should trigger on first iteration
+ ptr += copy_from_mem(ptr, &val, sizeof(val));
+ }
+ marks_[i] = ((val >> (i & 0x7)) & 0x1) == 1;
+ }
+ }
+
+ // read the sample items, skipping the gap. Either h_ or r_ may be 0
+ ptr += S().deserialize(ptr, data_, h_); // ala data_[0]
+ ptr += S().deserialize(ptr, &data_[h_ + 1], r_);
+}
+
+/*
+ * An empty sketch requires 8 bytes.
+ *
+ * <pre>
+ * Long || Start Byte Adr:
+ * Adr:
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
+ * </pre>
+ *
+ * A non-empty sketch requires 24 bytes of preamble for an under-full sample; once there are
+ * at least k items the sketch uses 32 bytes of preamble.
+ *
+ * The count of items seen is limited to 48 bits (~256 trillion) even though there are adjacent
+ * unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
+ * limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
+ * ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
+ * use slightly fewer bits.
+ *
+ * Following the header are weights for the heavy items, then marks in the event this is a gadget.
+ * The serialized items come last.
+ *
+ * <pre>
+ * Long || Start Byte Adr:
+ * Adr:
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
+ *
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
+ * 1 ||---------------------------Items Seen Count (N)--------------------------------|
+ *
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
+ * 2 ||-------------Item Count in H---------------|-------Item Count in R-------------|
+ *
+ * || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
+ * 3 ||-------------------------------Total Weight in R-------------------------------|
+ * </pre>
+ */
+
+// implementation for fixed-size arithmetic types (integral and floating point)
+template<typename T, typename S, typename A>
+template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
+size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
+ std::cerr << "fixed-size version!";
+ if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
+ size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
+ num_bytes += h_ * sizeof(double); // weights
+ if (marks_ != nullptr) { // marks
+ num_bytes += (h_ / 8) + (h_ % 8 > 0);
+ }
+ num_bytes += (h_ + r_) * sizeof(TT); // the actual items
+ return num_bytes;
+}
+
+// implementation for all other types
+template<typename T, typename S, typename A>
+template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
+size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
+ std::cerr << "other version!";
+ if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
+ size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
+ num_bytes += h_ * sizeof(double); // weights
+ if (marks_ != nullptr) { // marks
+ num_bytes += (h_ / 8) + (h_ % 8 > 0);
+ }
+ // must iterate over the items
+ for (auto& it: *this)
+ num_bytes += S().size_of_item(it.first);
+ return num_bytes;
+}
+
+template<typename T, typename S, typename A>
+std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes) const {
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
+ std::vector<uint8_t, AllocU8<A>> bytes(size);
+ uint8_t* ptr = bytes.data() + header_size_bytes;
+
+ bool empty = is_empty();
+ uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
+ : (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL));
+ uint8_t first_byte = (preLongs & 0x3F) | ((static_cast<uint8_t>(rf_)) << 6);
+ uint8_t flags = (marks_ != nullptr ? GADGET_FLAG_MASK : 0);
+
+ if (empty) {
+ flags |= EMPTY_FLAG_MASK;
+ }
+
+ // first prelong
+ uint8_t ser_ver(SER_VER);
+ uint8_t family(FAMILY);
+ ptr += copy_to_mem(&first_byte, ptr, sizeof(uint8_t));
+ ptr += copy_to_mem(&ser_ver, ptr, sizeof(uint8_t));
+ ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
+ ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
+ ptr += copy_to_mem(&k_, ptr, sizeof(uint32_t));
+
+ if (!empty) {
+ // second and third prelongs
+ ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
+ ptr += copy_to_mem(&h_, ptr, sizeof(uint32_t));
+ ptr += copy_to_mem(&r_, ptr, sizeof(uint32_t));
+
+ // fourth prelong, if needed
+ if (r_ > 0) {
+ ptr += copy_to_mem(&total_wt_r_, ptr, sizeof(double));
+ }
+
+ // first h_ weights
+ ptr += copy_to_mem(weights_, ptr, h_ * sizeof(double));
+
+ // first h_ marks as packed bytes iff we have a gadget
+ if (marks_ != nullptr) {
+ uint8_t val = 0;
+ for (int i = 0; i < h_; ++i) {
+ if (marks_[i]) {
+ val |= 0x1 << (i & 0x7);
+ }
+
+ if ((i & 0x7) == 0x7) {
+ ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
+ val = 0;
+ }
+ }
+
+ // write out any remaining values
+ if ((h_ & 0x7) > 0) {
+ ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
+ }
+ }
+
+ // write the sample items, skipping the gap. Either h_ or r_ may be 0
+ ptr += S().serialize(ptr, data_, h_);
+ ptr += S().serialize(ptr, &data_[h_ + 1], r_);
+ }
+
+ size_t bytes_written = ptr - bytes.data();
+ if (bytes_written != size) {
+ throw std::logic_error("serialized size mismatch: " + std::to_string(bytes_written) + " != " + std::to_string(size));
+ }
+
+ return bytes;
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
+ bool empty = (h_ == 0) && (r_ == 0);
+
+ uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
+ : (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL));
+ uint8_t first_byte = (preLongs & 0x3F) | ((static_cast<uint8_t>(rf_)) << 6);
+ uint8_t flags = (marks_ != nullptr ? GADGET_FLAG_MASK : 0);
+
+ if (empty) {
+ flags |= EMPTY_FLAG_MASK;
+ }
+
+ // first prelong
+ uint8_t ser_ver(SER_VER);
+ uint8_t family(FAMILY);
+ os.write((char*)&first_byte, sizeof(uint8_t));
+ os.write((char*)&ser_ver, sizeof(uint8_t));
+ os.write((char*)&family, sizeof(uint8_t));
+ os.write((char*)&flags, sizeof(uint8_t));
+ os.write((char*)&k_, sizeof(uint32_t));
+
+ if (!empty) {
+ // second and third prelongs
+ os.write((char*)&n_, sizeof(uint64_t));
+ os.write((char*)&h_, sizeof(uint32_t));
+ os.write((char*)&r_, sizeof(uint32_t));
+
+ // fourth prelong, if needed
+ if (r_ > 0) {
+ os.write((char*)&total_wt_r_, sizeof(double));
+ }
+
+ // write the first h_ weights
+ os.write((char*)weights_, h_ * sizeof(double));
+ //for (int i = 0; i < h_; ++i) {
+ // os.write((char*)&weights_[i], sizeof(double));
+ //}
+
+ // write the first h_ marks as packed bytes iff we have a gadget
+ if (marks_ != nullptr) {
+ uint8_t val = 0;
+ for (int i = 0; i < h_; ++i) {
+ if (marks_[i]) {
+ val |= 0x1 << (i & 0x7);
+ }
+
+ if ((i & 0x7) == 0x7) {
+ os.write((char*)&val, sizeof(uint8_t));
+ val = 0;
+ }
+ }
+
+ // write out any remaining values
+ if ((h_ & 0x7) > 0) {
+ os.write((char*)&val, sizeof(uint8_t));
+ }
+ }
+
+ // write the sample items, skipping the gap. Either h_ or r_ may be 0
+ S().serialize(os, data_, h_);
+ S().serialize(os, &data_[h_ + 1], r_);
+ }
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size) {
+ const char* ptr = static_cast<const char*>(bytes);
+ uint8_t first_byte;
+ ptr += copy_from_mem(ptr, &first_byte, sizeof(first_byte));
+ uint8_t preamble_longs = first_byte & 0x3f;
+ resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
+ uint8_t serial_version;
+ ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
+ uint8_t family_id;
+ ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
+ uint8_t flags;
+ ptr += copy_from_mem(ptr, &flags, sizeof(flags));
+ uint32_t k;
+ ptr += copy_from_mem(ptr, &k, sizeof(k));
+
+ check_preamble_longs(preamble_longs, flags);
+ check_family_and_serialization_version(family_id, serial_version);
+
+ bool is_empty = flags & EMPTY_FLAG_MASK;
+ bool is_gadget = flags & GADGET_FLAG_MASK;
+
+ return is_empty ? var_opt_sketch<T,S,A>(k, rf, is_gadget) : var_opt_sketch<T,S,A>(k, rf, is_gadget, preamble_longs, bytes, size);
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is) {
+ uint8_t first_byte;
+ is.read((char*)&first_byte, sizeof(first_byte));
+ uint8_t preamble_longs = first_byte & 0x3f;
+ resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
+ uint8_t serial_version;
+ is.read((char*)&serial_version, sizeof(serial_version));
+ uint8_t family_id;
+ is.read((char*)&family_id, sizeof(family_id));
+ uint8_t flags;
+ is.read((char*)&flags, sizeof(flags));
+ uint32_t k;
+ is.read((char*)&k, sizeof(k));
+
+ check_preamble_longs(preamble_longs, flags);
+ check_family_and_serialization_version(family_id, serial_version);
+
+ bool is_empty = flags & EMPTY_FLAG_MASK;
+ bool is_gadget = flags & GADGET_FLAG_MASK;
+
+ return is_empty ? var_opt_sketch<T,S,A>(k, rf, is_gadget) : var_opt_sketch<T,S,A>(k, rf, is_gadget, preamble_longs, is);
+}
+
+template<typename T, typename S, typename A>
+bool var_opt_sketch<T,S,A>::is_empty() const {
+ return (h_ == 0 && r_ == 0);
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::reset() {
+ n_ = 0;
+ h_ = 0;
+ m_ = 0;
+ r_ = 0;
+ num_marks_in_h_ = 0;
+ total_wt_r_ = 0.0;
+
+ uint32_t prev_alloc = curr_items_alloc_;
+
+ uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
+ int initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
+ ++curr_items_alloc_;
+ }
+
+ if (curr_items_alloc_ < prev_alloc) {
+ bool is_gadget = (marks_ != nullptr);
+
+ if (filled_data_) {
+ // destroy everything
+ for (size_t i = 0; i < curr_items_alloc_; ++i)
+ A().destroy(data_ + i);
+ } else {
+ // skip gap or anything unused at the end
+ for (size_t i = 0; i < h_; ++i)
+ A().destroy(data_+ i);
+
+ for (size_t i = h_ + 1; i < curr_items_alloc_; ++i)
+ A().destroy(data_ + i);
+ }
+ A().deallocate(data_, curr_items_alloc_);
+ AllocDouble().deallocate(weights_, curr_items_alloc_);
+
+ if (marks_ != nullptr)
+ AllocBool().deallocate(marks_, curr_items_alloc_);
+
+ allocate_data_ararys(curr_items_alloc_, is_gadget);
+ } else {
+ filled_data_ = false;
+ }
+}
+
+template<typename T, typename S, typename A>
+uint64_t var_opt_sketch<T,S,A>::get_n() const {
+ return n_;
+}
+
+template<typename T, typename S, typename A>
+uint32_t var_opt_sketch<T,S,A>::get_k() const {
+ return k_;
+}
+
+template<typename T, typename S, typename A>
+uint32_t var_opt_sketch<T,S,A>::get_num_samples() const {
+ uint32_t num_in_sketch = h_ + r_;
+ return (num_in_sketch < k_ ? num_in_sketch : k_);
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::update(const T& item, double weight) {
+ update(item, weight, false);
+}
+
+/*
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::update(T&& item, double weight) {
+}
+*/
+
+template<typename T, typename S, typename A>
+std::ostream& var_opt_sketch<T,S,A>::to_stream(std::ostream& os) const {
+ os << "### VarOpt SUMMARY: " << std::endl;
+ os << " k : " << k_ << std::endl;
+ os << " h : " << h_ << std::endl;
+ os << " r : " << r_ << std::endl;
+ os << " weight_r : " << total_wt_r_ << std::endl;
+ os << " Current size : " << curr_items_alloc_ << std::endl;
+ os << " Resize factor: " << (1 << rf_) << std::endl;
+ os << "### Sketch Items" << std::endl;
+
+ uint32_t print_length = (n_ < k_ ? n_ : k_ + 1);
+ for (int i = 0; i < print_length; ++i) {
+ if (i == h_) {
+ os << i << ": GAP" << std::endl;
+ } else {
+ os << i << ": " << data_[i] << "\twt = " << weights_[i] << std::endl;
+ }
+ }
+
+ os << "### END SKETCH SUMMARY" << std::endl;
+
+ return os;
+}
+
+template <typename T, typename S, typename A>
+std::string var_opt_sketch<T,S,A>::to_string() const {
+ std::ostringstream ss;
+ to_stream(ss);
+ return ss.str();
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::update(const T& item, double weight, bool mark) {
+ if (weight <= 0.0) {
+ throw std::invalid_argument("Items weights must be strictly positive. Found: "
+ + std::to_string(weight));
+ }
+ ++n_;
+
+ if (r_ == 0) {
+ // exact mode
+ update_warmup_phase(item, weight, mark);
+ } else {
+ // sketch is in estimation mode so we can make the following check
+ assert(h_ == 0 || (peek_min() >= get_tau()));
+
+ // what tau would be if deletion candidates turn out to be R plus the new item
+ // note: (r_ + 1) - 1 is intentional
+ double hypothetical_tau = (weight + total_wt_r_) / ((r_ + 1) - 1);
+
+ // is new item's turn to be considered for reservoir?
+ double condition1 = (h_ == 0) || (weight <= peek_min());
+
+ // is new item light enough for reservoir?
+ double condition2 = weight < hypothetical_tau;
+
+ if (condition1 && condition2) {
+ update_light(item, weight, mark);
+ } else if (r_ == 1) {
+ update_heavy_r_eq1(item, weight, mark);
+ } else {
+ update_heavy_general(item, weight, mark);
+ }
+ }
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::update_warmup_phase(const T& item, double weight, bool mark) {
+ assert(r_ == 0);
+ assert(m_ == 0);
+ assert(h_ <= k_);
+
+ if (h_ >= curr_items_alloc_) {
+ grow_data_arrays();
+ }
+
+ // store items as they come in until full
+ std::cerr << "h_=" << h_ << ":\t" << item << "\t(alloc: " << curr_items_alloc_ << ")\n";
+ A().construct(&data_[h_], T(item));
+ weights_[h_] = weight;
+ if (marks_ != nullptr) {
+ marks_[h_] = mark;
+ }
+ ++h_;
+ num_marks_in_h_ += mark ? 1 : 0;
+
+ // check if need to heapify
+ if (h_ > k_) {
+ transition_from_warmup();
+ }
+}
+
+/* In the "light" case the new item has weight <= old_tau, so
+ would appear to the right of the R items in a hypothetical reverse-sorted
+ list. It is easy to prove that it is light enough to be part of this
+ round's downsampling */
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::update_light(const T& item, double weight, bool mark) {
+ assert(r_ >= 1);
+ assert((r_ + h_) == k_);
+
+ int m_slot = h_; // index of the gap, which becomes the M region
+ if (filled_data_) {
+ data_[m_slot] = item;
+ } else {
+ A().construct(&data_[m_slot], T(item));
+ filled_data_ = true;
+ }
+ weights_[m_slot] = weight;
+ if (marks_ != nullptr) { marks_[m_slot] = mark; }
+ ++m_;
+
+ grow_candidate_set(total_wt_r_ + weight, r_ + 1);
+}
+
+/* In the "heavy" case the new item has weight > old_tau, so would
+ appear to the left of items in R in a hypothetical reverse-sorted list and
+ might or might not be light enough be part of this round's downsampling.
+ [After first splitting off the R=1 case] we greatly simplify the code by
+ putting the new item into the H heap whether it needs to be there or not.
+ In other words, it might go into the heap and then come right back out,
+ but that should be okay because pseudo_heavy items cannot predominate
+ in long streams unless (max wt) / (min wt) > o(exp(N)) */
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::update_heavy_general(const T& item, double weight, bool mark) {
+ assert(m_ == 0);
+ assert(r_ >= 2);
+ assert((r_ + h_) == k_);
+
+ // put into H, although may come back out momentarily
+ push(item, weight, mark);
+
+ grow_candidate_set(total_wt_r_, r_);
+}
+
+/* The analysis of this case is similar to that of the general heavy case.
+ The one small technical difference is that since R < 2, we must grab an M item
+ to have a valid starting point for continue_by_growing_candidate_set () */
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::update_heavy_r_eq1(const T& item, double weight, bool mark) {
+ assert(m_ == 0);
+ assert(r_ == 1);
+ assert((r_ + h_) == k_);
+
+ push(item, weight, mark); // new item into H
+ pop_min_to_m_region(); // pop lightest back into M
+
+ // Any set of two items is downsample-able to one item,
+ // so the two lightest items are a valid starting point for the following
+ int m_slot = k_ - 1; // array is k+1, 1 in R, so slot before is M
+ grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::allocate_data_arrays(uint32_t tgt_size, bool use_marks) {
+ std::cerr << "allocate_data_arrays(" << tgt_size << ", " << use_marks << ")\n";
+ filled_data_ = false;
+
+ data_ = A().allocate(tgt_size);
+ weights_ = AllocDouble().allocate(tgt_size);
+
+ if (use_marks) {
+ marks_ = AllocBool().allocate(tgt_size);
+ } else {
+ marks_ = nullptr;
+ }
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::grow_data_arrays() {
+ uint32_t prev_size = curr_items_alloc_;
+ curr_items_alloc_ = get_adjusted_size(k_, curr_items_alloc_ << rf_);
+ if (curr_items_alloc_ == k_) {
+ ++curr_items_alloc_;
+ }
+
+ if (prev_size < curr_items_alloc_) {
+ filled_data_ = false;
+
+ T* tmp_data = A().allocate(curr_items_alloc_);
+ double* tmp_weights = AllocDouble().allocate(curr_items_alloc_);
+
+ for (int i = 0; i < prev_size; ++i) {
+ tmp_data[i] = std::move(data_[i]);
+ A().destroy(data_ + i);
+ tmp_weights[i] = std::move(weights_[i]); // primitive double, but for consistency
+ }
+
+ A().deallocate(data_, prev_size);
+ AllocDouble().deallocate(weights_, prev_size);
+
+ data_ = std::move(tmp_data);
+ weights_ = std::move(tmp_weights);
+
+ if (marks_ != nullptr) {
+ bool* tmp_marks = AllocBool().allocate(curr_items_alloc_);
+ for (int i = 0; i < prev_size; ++i) {
+ tmp_marks[i] = std::move(marks_ + i); // primitive bool, again for consisntency
+ }
+ AllocBool().deallocate(marks_, prev_size);
+ marks_ = std::move(tmp_marks);
+ }
+ }
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::transition_from_warmup() {
+ // Move the 2 lightest items from H to M
+ // But the lighter really belongs in R, so update counts to reflect that
+ convert_to_heap();
+ pop_min_to_m_region();
+ pop_min_to_m_region();
+ --m_;
+ ++r_;
+
+ assert(h_ == (k_ - 1));
+ assert(m_ == 1);
+ assert(r_ == 1);
+
+ // Update total weight in R and then, having grabbed the value, overwrite
+ // in weight_ array to help make bugs more obvious
+ total_wt_r_ = weights_[k_]; // only one item, known location
+ weights_[k_] = -1.0;
+
+ // The two lightest items are ncessarily downsample-able to one item,
+ // and are therefore a valid initial candidate set
+ grow_candidate_set(weights_[k_ - 1] + total_wt_r_, 2);
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::convert_to_heap() {
+ if (h_ < 2) {
+ return; // nothing to do
+ }
+
+ int last_slot = h_ - 1;
+ int last_non_leaf = ((last_slot + 1) / 2) - 1;
+
+ for (int j = last_non_leaf; j >= 0; --j) {
+ restore_towards_leaves(j);
+ }
+
+ // TODO: Remove this block
+ // validates heap, used for initial debugging
+ for (int j = h_ - 1; j >= 1; --j) {
+ int p = ((j + 1) / 2) - 1;
+ assert(weights_[p] <= weights_[j]);
+ }
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::restore_towards_leaves(int slot_in) {
+ assert(h_ > 0);
+ const int last_slot = h_ - 1;
+ assert(slot_in <= last_slot);
+
+ int slot = slot_in;
+ int child = (2 * slot_in) + 1; // might be invalid, need to check
+
+ while (child <= last_slot) {
+ int child2 = child + 1; // might also be invalid
+ if ((child2 <= last_slot) && (weights_[child2] < weights_[child])) {
+ // siwtch to otehr vhild if it's both valid and smaller
+ child = child2;
+ }
+
+ if (weights_[slot] <= weights_[child]) {
+ // invariant holds so we're done
+ break;
+ }
+
+ // swap and continue
+ swap_values(slot, child);
+
+ slot = child;
+ child = (2 * slot) + 1; // might be invalid, checked on next loop
+ }
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::restore_towards_root(int slot_in) {
+ int slot = slot_in;
+ int p = (((slot + 1) / 2) - 1); // valid if slot >= 1
+ while ((slot > 0) && (weights_[slot] < weights_[p])) {
+ swap_values(slot, p);
+ slot = p;
+ p = (((slot + 1) / 2) - 1); // valid if slot >= 1
+ }
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::push(const T& item, double wt, bool mark) {
+ if (filled_data_) {
+ data_[h_] = item;
+ } else {
+ A().construct(&data_[h_], T(item));
+ filled_data_ = true;
+ }
+ weights_[h_] = wt;
+ if (marks_ != nullptr) {
+ marks_[h_] = mark;
+ num_marks_in_h_ += (mark ? 1 : 0);
+ }
+ ++h_;
+
+ restore_towards_root(h_ - 1); // need use old h_, but want accurate h_
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::pop_min_to_m_region() {
+ assert(h_ > 0);
+ assert(h_ + m_ + r_ == k_ + 1);
+
+ if (h_ == 1) {
+ // just update bookkeeping
+ ++m_;
+ --h_;
+ } else {
+ // main case
+ int tgt = h_ - 1; // last slot, will swap with root
+ swap_values(0, tgt);
+ ++m_;
+ --h_;
+
+ restore_towards_leaves(0);
+ }
+
+ if (is_marked(h_)) {
+ --num_marks_in_h_;
+ }
+}
+
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::swap_values(int src, int dst) {
+ T item = std::move(data_[src]);
+ data_[src] = std::move(data_[dst]);
+ data_[dst] = std::move(item);
+
+ double wt = weights_[src];
+ weights_[src] = weights_[dst];
+ weights_[dst] = wt;
+
+ if (marks_ != nullptr) {
+ bool mark = marks_[src];
+ marks_[src] = marks_[dst];
+ marks_[dst] = mark;
+ }
+}
+
+/* When entering here we should be in a well-characterized state where the
+ new item has been placed in either h or m and we have a valid but not necessarily
+ maximal sampling plan figured out. The array is completely full at this point.
+ Everyone in h and m has an explicit weight. The candidates are right-justified
+ and are either just the r set or the r set + exactly one m item. The number
+ of cands is at least 2. We will now grow the candidate set as much as possible
+ by pulling sufficiently light items from h to m.
+*/
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::grow_candidate_set(double wt_cands, int num_cands) {
+ assert(h_ + m_ + r_ == k_ + 1);
+ assert(num_cands >= 2);
+ assert(num_cands == m_ + r_);
+ assert(m_ == 0 || m_ == 1);
+
+ while (h_ > 0) {
+ double next_wt = peek_min();
+ double next_tot_wt = wt_cands + next_wt;
+
+ // test for strict lightness of next prospect (denominator multiplied through)
+ // ideally: (next_wt * (next_num_cands-1) < next_tot_wt)
+ // but can use num_cands directly
+ if ((next_wt * num_cands) < next_tot_wt) {
+ wt_cands = next_tot_wt;
+ ++num_cands;
+ pop_min_to_m_region(); // adjusts h_ and m_
+ } else {
+ break;
+ }
+ }
+
+ downsample_candidate_set(wt_cands, num_cands);
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::downsample_candidate_set(double wt_cands, int num_cands) {
+ assert(num_cands >= 2);
+ assert(h_ + num_cands == k_ + 1);
+
+ // need this before overwriting anything
+ int delete_slot = choose_delete_slot(wt_cands, num_cands);
+ int leftmost_cand_slot = h_;
+ assert(delete_slot >= leftmost_cand_slot);
+ assert(delete_slot <= k_);
+
+ // Overwrite weights for items from M moving into R,
+ // to make bugs more obvious. Also needed so anyone reading the
+ // weight knows if it's invalid without checking h_ and m_
+ int stop_idx = leftmost_cand_slot + m_;
+ for (int j = leftmost_cand_slot; j < stop_idx; ++j) {
+ weights_[j] = -1.0;
+ }
+
+ // The next two lines work even when delete_slot == leftmost_cand_slot
+ data_[delete_slot] = std::move(data_[leftmost_cand_slot]);
+ // cannot set data_[leftmost_cand_slot] to null since not uisng T*
+
+ m_ = 0;
+ r_ = num_cands - 1;
+ total_wt_r_ = wt_cands;
+}
+
+template<typename T, typename S, typename A>
+int var_opt_sketch<T,S,A>::choose_delete_slot(double wt_cands, int num_cands) const {
+ assert(r_ > 0);
+
+ if (m_ == 0) {
+ // this happens if we insert a really have item
+ return pick_random_slot_in_r();
+ } else if (m_ == 1) {
+ // check if we keep th item in M or pick oen from R
+ // p(keep) = (num_cand - 1) * wt_M / wt_cand
+ double wt_m_cand = weights_[h_]; // slot of item in M is h_
+ if ((wt_cands * next_double_exclude_zero()) < ((num_cands - 1) * wt_m_cand)) {
+ return pick_random_slot_in_r(); // keep item in M
+ } else {
+ return h_; // indext of item in M
+ }
+ } else {
+ // general case
+ int delete_slot = choose_weighted_delete_slot(wt_cands, num_cands);
+ int first_r_slot = h_ + m_;
+ if (delete_slot == first_r_slot) {
+ return pick_random_slot_in_r();
+ } else {
+ return delete_slot;
+ }
+ }
+}
+
+template<typename T, typename S, typename A>
+int var_opt_sketch<T,S,A>::choose_weighted_delete_slot(double wt_cands, int num_cands) const {
+ assert(m_ >= 1);
+
+ int offset = h_;
+ int final_m = (offset + m_) - 1;
+ int num_to_keep = num_cands - 1;
+
+ double left_subtotal = 0.0;
+ double right_subtotal = -1.0 * wt_cands * next_double_exclude_zero();
+
+ for (int i = offset; i <= final_m; ++i) {
+ left_subtotal += num_to_keep * weights_[i];
+ right_subtotal += wt_cands;
+
+ if (left_subtotal < right_subtotal) {
+ return i;
+ }
+ }
+
+ // this slot tells caller that we need to delete out of R
+ return final_m + 1;
+}
+
+template<typename T, typename S, typename A>
+int var_opt_sketch<T,S,A>::pick_random_slot_in_r() const {
+ assert(r_ > 0);
+ int offset = h_ + m_;
+ if (r_ == 1) {
+ return offset;
+ } else {
+ return offset + next_int(r_);
+ }
+}
+
+template<typename T, typename S, typename A>
+double var_opt_sketch<T,S,A>::peek_min() const {
+ assert(h_ > 0);
+ return weights_[0];
+}
+
+template<typename T, typename S, typename A>
+bool var_opt_sketch<T,S,A>::is_marked(int idx) const {
+ return marks_ == nullptr ? false : marks_[idx];
+}
+
+template<typename T, typename S, typename A>
+double var_opt_sketch<T,S,A>::get_tau() const {
+ return r_ == 0 ? std::nan("1") : (total_wt_r_ / r_);
+}
+
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
+ bool is_empty(flags & EMPTY_FLAG_MASK);
+
+ if (is_empty) {
+ if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
+ throw new std::invalid_argument("Possible corruption: Preamble longs must be "
+ + std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
+ + std::to_string(preamble_longs));
+ }
+ } else {
+ if (preamble_longs != PREAMBLE_LONGS_WARMUP
+ && preamble_longs != PREAMBLE_LONGS_FULL) {
+ throw new std::invalid_argument("Possible corruption: Preamble longs must be "
+ + std::to_string(PREAMBLE_LONGS_WARMUP) + " or "
+ + std::to_string(PREAMBLE_LONGS_FULL)
+ + "for a non-empty sketch. Found: " + std::to_string(preamble_longs));
+ }
+ }
+}
+
+template<typename T, typename S, typename A>
+void var_opt_sketch<T,S,A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
+ if (family_id == FAMILY) {
+ if (ser_ver != SER_VER) {
+ throw std::invalid_argument("Possible corruption: VarOpt serialization version must be "
+ + std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
+ }
+ return;
+ }
+ // TODO: extend to handle reservoir sampling
+
+ throw std::invalid_argument("Possible corruption: VarOpt family id must be "
+ + std::to_string(FAMILY) + ". Found: " + std::to_string(family_id));
+}
+
+
+template<typename T, typename S, typename A>
+typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::begin() const {
+ return var_opt_sketch<T, S, A>::const_iterator(data_, weights_, h_, r_, total_wt_r_, false);
+}
+
+template<typename T, typename S, typename A>
+typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::end() const {
+ return var_opt_sketch<T, S, A>::const_iterator(data_, weights_, h_, r_, total_wt_r_, true);
+}
+
+// -------- var_opt_sketch::const_iterator implementation ---------
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T,S,A>::const_iterator::const_iterator(const T* items, const double* weights,
+ const uint32_t h_count, const uint32_t r_count,
+ const double total_wt_r, bool use_end) :
+items(items), weights(weights), h_count(h_count), r_count(r_count), r_item_wt(r_count > 0 ? total_wt_r/r_count : -1.0) {
+ // index logic easier to read if not inline
+ if (use_end) {
+ index = (r_count > 0 ? h_count + r_count + 1 : h_count);
+ } else {
+ index = (h_count == 0 && r_count > 0 ? 1 : 0); // skip the gap if at the start
+ }
+}
+
+template<typename T, typename S, typename A>
+var_opt_sketch<T, S, A>::const_iterator::const_iterator(const const_iterator& other):
+items(other.items), weights(other.weights), h_count(other.h_count), r_count(other.r_count), r_item_wt(other.r_item_wt), index(other.index)
+{}
+
+template<typename T, typename S, typename A>
+typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++() {
+ ++index;
+ // check for the gap
+ if (index == h_count && r_count > 0) {
+ ++index;
+ }
+ return *this;
+}
+
+template<typename T, typename S, typename A>
+typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++(int) {
+ const_iterator tmp(*this);
+ operator++();
+ return tmp;
+}
+
+template<typename T, typename S, typename A>
+bool var_opt_sketch<T, S, A>::const_iterator::operator==(const const_iterator& other) const {
+ if (items != other.items || h_count != other.h_count || r_count != other.r_count) return false;
+ return index == other.index;
+}
+
+template<typename T, typename S, typename A>
+bool var_opt_sketch<T, S, A>::const_iterator::operator!=(const const_iterator& other) const {
+ return !operator==(other);
+}
+
+template<typename T, typename S, typename A>
+const std::pair<const T&, const double> var_opt_sketch<T, S, A>::const_iterator::operator*() const {
+ return std::pair<const T&, const double>(items[index], weights[index]);
+}
+
+
+// ******************** MOVE TO COMMON UTILS AREA EVENTUALLY *********************
+
+namespace random_utils {
+ static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
+ static std::mt19937_64 rand(rd());
+ static std::uniform_real_distribution<> next_double(0.0, 1.0);
+}
+
+/**
+ * Checks if target sampling allocation is more than 50% of max sampling size.
+ * If so, returns max sampling size, otherwise passes through target size.
+ */
+template<typename T, typename S, typename A>
+int var_opt_sketch<T,S,A>::get_adjusted_size(int max_size, int resize_target) {
+ if (max_size - (resize_target << 1) < 0L) {
+ return max_size;
+ }
+ return resize_target;
+}
+
+template<typename T, typename S, typename A>
+int var_opt_sketch<T,S,A>::starting_sub_multiple(int lg_target, int lg_rf, int lg_min) {
+ return (lg_target <= lg_min)
+ ? lg_min : (lg_rf == 0) ? lg_target
+ : (lg_target - lg_min) % lg_rf + lg_min;
+}
+
+template<typename T, typename S, typename A>
+bool var_opt_sketch<T,S,A>::is_power_of_2(uint32_t v) {
+ return v && !(v & (v - 1));
+}
+
+template<typename T, typename S, typename A>
+uint32_t var_opt_sketch<T,S,A>::to_log_2(uint32_t v) {
+ if (is_power_of_2(v)) {
+ return count_trailing_zeros(v);
+ } else {
+ throw std::invalid_argument("Attempt to compute integer log2 of non-positive or non-power of 2");
+ }
+}
+
+// Returns an integer in the range [0, max_value) -- excludes max_value
+template<typename T, typename S, typename A>
+int var_opt_sketch<T,S,A>::next_int(int max_value) {
+ std::uniform_int_distribution<> dist(0, max_value - 1);
+ return dist(random_utils::rand);
+}
+
+template<typename T, typename S, typename A>
+double var_opt_sketch<T,S,A>::next_double_exclude_zero() {
+ double r = random_utils::next_double(random_utils::rand);
+ while (r == 0.0) {
+ r = random_utils::next_double(random_utils::rand);
+ }
+ return r;
+}
+
+template<typename T, typename S, typename A>
+uint32_t var_opt_sketch<T,S,A>::count_trailing_zeros(uint32_t v) {
+ static const uint8_t ctz_byte_count[256] = {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1, [...]
+
+ uint32_t tmp =v;
+ for (int j = 0; j < 4; ++j) {
+ const int byte = (tmp & 0xFFUL);
+ if (byte != 0) return (j << 3) + ctz_byte_count[byte];
+ tmp >>= 8;
+ }
+ return 64;
+}
+
+template<typename T, typename S, typename A>
+uint32_t var_opt_sketch<T,S,A>::ceiling_power_of_2(uint32_t n) {
+ --n;
+ n |= n >> 1;
+ n |= n >> 2;
+ n |= n >> 4;
+ n |= n >> 8;
+ n |= n >> 16;
+ return ++n;
+}
+
+}
+
+// namespace datasketches
+
+#endif // _VAR_OPT_SKETCH_IMPL_HPP_
diff --git a/sampling/test/CMakeLists.txt b/sampling/test/CMakeLists.txt
new file mode 100644
index 0000000..e291b25
--- /dev/null
+++ b/sampling/test/CMakeLists.txt
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+add_executable(sampling_test)
+
+target_link_libraries(sampling_test sampling common_test)
+
+set_target_properties(sampling_test PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+)
+
+file(TO_NATIVE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/" SAMPLING_TEST_BINARY_PATH)
+target_compile_definitions(sampling_test
+ PRIVATE
+ TEST_BINARY_INPUT_PATH="${SAMPLING_TEST_BINARY_PATH}"
+)
+
+add_test(
+ NAME sampling_test
+ COMMAND sampling_test
+)
+
+target_sources(sampling_test
+ PRIVATE
+ var_opt_sketch_test.cpp
+)
+
+target_include_directories(sampling_test
+ PRIVATE
+ ../../common/test
+)
diff --git a/sampling/test/counting_allocator.hpp b/sampling/test/counting_allocator.hpp
new file mode 100644
index 0000000..e4c546f
--- /dev/null
+++ b/sampling/test/counting_allocator.hpp
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef COUNTING_ALLOCATOR_H
+#define COUNTING_ALLOCATOR_H
+
+#include <new>
+#include <utility>
+
+namespace datasketches {
+
+// this relies on global variables to track allocated memory
+extern long long int total_allocated_memory;
+extern long long int total_objects_constructed;
+
+template <class T> class counting_allocator {
+public:
+ typedef T value_type;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ template <class U>
+ struct rebind { typedef counting_allocator<U> other; };
+
+ counting_allocator() {}
+ counting_allocator(const counting_allocator&) {}
+ template <class U>
+ counting_allocator(const counting_allocator<U>&) {}
+ ~counting_allocator() {}
+
+ pointer address(reference x) const { return &x; }
+ const_pointer address(const_reference x) const {
+ return &x;
+ }
+
+ pointer allocate(size_type n, const_pointer = 0) {
+ void* p = malloc(n * sizeof(T));
+ if (!p) throw std::bad_alloc();
+ total_allocated_memory += n * sizeof(T);
+ return static_cast<pointer>(p);
+ }
+
+ void deallocate(pointer p, size_type n) {
+ if (p) free(p);
+ total_allocated_memory -= n * sizeof(T);
+ }
+
+ size_type max_size() const {
+ return static_cast<size_type>(-1) / sizeof(T);
+ }
+
+ template<typename... Args>
+ void construct(pointer p, Args&&... args) {
+ new(p) value_type(std::forward<Args>(args)...);
+ ++total_objects_constructed;
+ }
+ void destroy(pointer p) {
+ p->~value_type();
+ --total_objects_constructed;
+ }
+
+private:
+ void operator=(const counting_allocator&);
+};
+
+template<> class counting_allocator<void> {
+public:
+ typedef void value_type;
+ typedef void* pointer;
+ typedef const void* const_pointer;
+
+ template <class U>
+ struct rebind { typedef counting_allocator<U> other; };
+};
+
+
+template <class T>
+inline bool operator==(const counting_allocator<T>&, const counting_allocator<T>&) {
+ return true;
+}
+
+template <class T>
+inline bool operator!=(const counting_allocator<T>&, const counting_allocator<T>&) {
+ return false;
+}
+
+}
+
+#endif
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
new file mode 100644
index 0000000..ba75ba3
--- /dev/null
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <cppunit/TestFixture.h>
+#include <cppunit/extensions/HelperMacros.h>
+
+#include "counting_allocator.hpp"
+
+#include <var_opt_sketch.hpp>
+
+#include <string>
+#include <sstream>
+#include <iostream>
+
+#ifdef TEST_BINARY_INPUT_PATH
+static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
+#else
+static std::string testBinaryInputPath = "test/";
+#endif
+
+namespace datasketches {
+
+long long int total_allocated_memory;
+long long int total_objects_constructed;
+
+class var_opt_sketch_test: public CppUnit::TestFixture {
+
+ CPPUNIT_TEST_SUITE(var_opt_sketch_test);
+ CPPUNIT_TEST(empty);
+ CPPUNIT_TEST(mem_test);
+ CPPUNIT_TEST_SUITE_END();
+
+ void empty() {
+ int k = 10;
+
+ {
+ var_opt_sketch<int> sketch(k);
+
+ for (int i = 0; i < 2*k; ++i)
+ sketch.update(i);
+ sketch.update(1000, 100000.0);
+
+ std::cout << sketch.to_string();
+
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+ sketch.serialize(ss);
+ std::cout << "sketch.serialize() done\n";
+ var_opt_sketch<int> sk2 = var_opt_sketch<int>::deserialize(ss);
+ std::cout << sk2.to_string() << std::endl;;
+ }
+ std::cerr << "num allocs: " << num_allocs << "\n";
+ {
+ var_opt_sketch<std::string> sk(k);
+ std::cout << "Expected size: " << sk.get_serialized_size_bytes() << std::endl;
+ std::string x[26];
+ x[0] = std::string("a");
+ x[1] = std::string("b");
+ x[2] = std::string("c");
+ x[3] = std::string("d");
+ x[4] = std::string("e");
+ x[5] = std::string("f");
+ x[6] = std::string("g");
+ x[7] = std::string("h");
+ x[8] = std::string("i");
+ x[9] = std::string("j");
+ x[10] = std::string("k");
+ x[11] = std::string("l");
+ x[12] = std::string("m");
+ x[13] = std::string("n");
+ x[14] = std::string("o");
+ x[15] = std::string("p");
+ x[16] = std::string("q");
+ x[17] = std::string("r");
+ x[18] = std::string("s");
+ x[19] = std::string("t");
+ x[20] = std::string("u");
+ x[21] = std::string("v");
+ x[22] = std::string("w");
+ x[23] = std::string("x");
+ x[24] = std::string("y");
+ x[25] = std::string("z");
+
+ for (int i=0; i <11; ++i)
+ sk.update(x[i]);
+ sk.update(x[11], 10000);
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+ sk.serialize(ss);
+ std::cout << "ss size: " << ss.str().length() << std::endl;
+ auto vec = sk.serialize();
+ std::cout << "Vector size: " << vec.size() << std::endl;
+
+ var_opt_sketch<std::string> sk2 = var_opt_sketch<std::string>::deserialize(ss);
+ std::cout << sk2.to_string() << std::endl;
+ const std::string str("much longer string with luck won't fit nicely in existing structure location");
+ sk2.update(str, 1000000);
+ }
+ }
+
+ void print_mem_stats() {
+ std::cout << "construct(): " << total_objects_constructed << std::endl;
+ std::cout << "memory used: " << total_allocated_memory << std::endl;
+ }
+
+ void mem_test() {
+ int k = 10;
+
+ total_allocated_memory = 0;
+ total_objects_constructed = 0;
+
+ typedef var_opt_sketch<std::string, serde<std::string>, counting_allocator<std::string>> var_opt_sketch_a;
+ var_opt_sketch_a* s = new (counting_allocator<var_opt_sketch_a>().allocate(1)) var_opt_sketch_a(k);
+
+ std::string x[26];
+ x[0] = std::string("a");
+ x[1] = std::string("b");
+ x[2] = std::string("c");
+ x[3] = std::string("d");
+ x[4] = std::string("e");
+ x[5] = std::string("f");
+ x[6] = std::string("g");
+ x[7] = std::string("h");
+ x[8] = std::string("i");
+ x[9] = std::string("j");
+ x[10] = std::string("k");
+ x[11] = std::string("l");
+ x[12] = std::string("m");
+ x[13] = std::string("n");
+ x[14] = std::string("o");
+ x[15] = std::string("p");
+ x[16] = std::string("q");
+ x[17] = std::string("r");
+ x[18] = std::string("s");
+ x[19] = std::string("t");
+ x[20] = std::string("u");
+ x[21] = std::string("v");
+ x[22] = std::string("w");
+ x[23] = std::string("x");
+ x[24] = std::string("y");
+ x[25] = std::string("z");
+
+ for (int i=0; i <5; ++i)
+ s->update(x[i]);
+ print_mem_stats();
+
+ for (int i=5; i <11; ++i)
+ s->update(x[i]);
+ print_mem_stats();
+
+ s->update(x[11], 10000);
+ print_mem_stats();
+
+ for (int i=12; i <26; ++i)
+ s->update(x[i]);
+ print_mem_stats();
+
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+ s->serialize(ss);
+ std::cout << s->to_string() << std::endl;
+ print_mem_stats();
+
+ counting_allocator<var_opt_sketch_a>().destroy(s);
+ counting_allocator<var_opt_sketch_a>().deallocate(s, 1);
+ print_mem_stats();
+
+ {
+ auto sk = var_opt_sketch_a::deserialize(ss);
+ print_mem_stats();
+
+ sk.update("qrs");
+ print_mem_stats();
+
+ sk.update("zyx");
+ print_mem_stats();
+
+ std::cout << sk.to_string() << std::endl;
+
+ auto vec = sk.serialize();
+ var_opt_sketch_a sk2 = var_opt_sketch_a::deserialize(vec.data(), vec.size());
+ std::cout << sk2.to_string() << std::endl;
+ }
+ print_mem_stats();
+ }
+
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(var_opt_sketch_test);
+
+} /* namespace datasketches */
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org