You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/06/03 00:51:57 UTC

[datasketches-cpp] 01/01: converting constructor

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch kll_converting_constructor
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit a92a26f7f78453afec6f90b02566dc014b45e3f7
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Jun 2 17:51:26 2022 -0700

    converting constructor
---
 kll/include/kll_sketch.hpp      | 40 ++++++++++++++++-----
 kll/include/kll_sketch_impl.hpp | 73 +++++++++++++++++++++++++++++++++++---
 kll/test/kll_sketch_test.cpp    | 77 +++++++++++++++++++++++++++++++++++++----
 3 files changed, 172 insertions(+), 18 deletions(-)

diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index fcf8ebb..8616623 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp
@@ -182,6 +182,14 @@ class kll_sketch {
     kll_sketch& operator=(const kll_sketch& other);
     kll_sketch& operator=(kll_sketch&& other);
 
+    /*
+     * Type converting constructor.
+     * @param other sketch of a different type
+     * @param allocator instance of an Allocator
+     */
+    template<typename TT, typename CC, typename AA>
+    explicit kll_sketch(const kll_sketch<TT, CC, AA> & other, const A& allocator = A());
+
     /**
      * Updates this sketch with the given data item.
      * @param value an item from a stream of items
@@ -208,6 +216,13 @@ class kll_sketch {
      */
     uint16_t get_k() const;
 
+    /**
+     * Returns min_k, which is an internal parameter for error estimation
+     * after merging sketches with different k.
+     * @return parameter min_k
+     */
+    uint16_t get_min_k() const;
+
     /**
      * Returns the length of the input stream.
      * @return stream length
@@ -220,6 +235,13 @@ class kll_sketch {
      */
     uint32_t get_num_retained() const;
 
+    /**
+     * Returns the current capacity in items allocated by the sketch.
+     * For internal use.
+     * @return the capacity
+     */
+    uint32_t get_capacity() const;
+
     /**
      * Returns true if this sketch is in estimation mode.
      * @return estimation mode flag
@@ -390,7 +412,7 @@ class kll_sketch {
     /**
      * Computes size needed to serialize the current state of the sketch.
      * This version is for fixed-size arithmetic types (integral and floating point).
-     * @param instance of a SerDe
+     * @param serde instance of a SerDe
      * @return size in bytes needed to serialize this sketch
      */
     template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
@@ -399,7 +421,7 @@ class kll_sketch {
     /**
      * Computes size needed to serialize the current state of the sketch.
      * This version is for all other types and can be expensive since every item needs to be looked at.
-     * @param instance of a SerDe
+     * @param serde instance of a SerDe
      * @return size in bytes needed to serialize this sketch
      */
     template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
@@ -459,7 +481,7 @@ class kll_sketch {
     /**
      * This method deserializes a sketch from a given stream.
      * @param is input stream
-     * @param instance of an Allocator
+     * @param allocator instance of an Allocator
      * @return an instance of a sketch
      *
      * Deprecated, to be removed in the next major version
@@ -469,8 +491,8 @@ class kll_sketch {
     /**
      * This method deserializes a sketch from a given stream.
      * @param is input stream
-     * @param instance of a SerDe
-     * @param instance of an Allocator
+     * @param serge instance of a SerDe
+     * @param allocator instance of an Allocator
      * @return an instance of a sketch
      */
     template<typename SerDe = S>
@@ -480,7 +502,7 @@ class kll_sketch {
      * This method deserializes a sketch from a given array of bytes.
      * @param bytes pointer to the array of bytes
      * @param size the size of the array
-     * @param instance of an Allocator
+     * @param allocator instance of an Allocator
      * @return an instance of a sketch
      *
      * Deprecated, to be removed in the next major version
@@ -491,8 +513,8 @@ class kll_sketch {
      * This method deserializes a sketch from a given array of bytes.
      * @param bytes pointer to the array of bytes
      * @param size the size of the array
-     * @param instance of a SerDe
-     * @param instance of an Allocator
+     * @param serde instance of a SerDe
+     * @param allocator instance of an Allocator
      * @return an instance of a sketch
      */
     template<typename SerDe = S>
@@ -606,6 +628,8 @@ class kll_sketch {
     static void check_serial_version(uint8_t serial_version);
     static void check_family_id(uint8_t family_id);
 
+    void check_sorting() const;
+
     // implementations for floating point types
     template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
     static const TT& get_invalid_value() {
diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp
index 1302ac9..211a926 100644
--- a/kll/include/kll_sketch_impl.hpp
+++ b/kll/include/kll_sketch_impl.hpp
@@ -26,6 +26,7 @@
 #include <stdexcept>
 
 #include "conditional_forward.hpp"
+#include "count_zeros.hpp"
 #include "memory_operations.hpp"
 #include "kll_helper.hpp"
 
@@ -69,7 +70,7 @@ max_value_(nullptr),
 is_level_zero_sorted_(other.is_level_zero_sorted_)
 {
   items_ = allocator_.allocate(items_size_);
-  std::copy(other.items_ + levels_[0], other.items_ + levels_[num_levels_], items_ + levels_[0]);
+  for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
   if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
   if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
 }
@@ -147,6 +148,50 @@ kll_sketch<T, C, S, A>::~kll_sketch() {
   }
 }
 
+template<typename T, typename C, typename S, typename A>
+template<typename TT, typename CC, typename AA>
+kll_sketch<T, C, S, A>::kll_sketch(const kll_sketch<TT, CC, AA>& other, const A& allocator):
+allocator_(allocator),
+k_(other.get_k()),
+m_(DEFAULT_M),
+min_k_(other.get_min_k()),
+n_(other.get_n()),
+num_levels_(1),
+levels_(2, 0, allocator),
+items_(nullptr),
+items_size_(other.get_capacity()),
+min_value_(nullptr),
+max_value_(nullptr),
+is_level_zero_sorted_(false)
+{
+  static_assert(
+    std::is_constructible<T, TT>::value,
+    "Type converting constructor requires new type to be constructible from existing type"
+  );
+  items_ = allocator_.allocate(items_size_);
+  levels_[0] = items_size_ - other.get_num_retained();
+  levels_[1] = items_size_;
+
+  if (!other.is_empty()) {
+    min_value_ = new (allocator_.allocate(1)) T(other.get_min_value());
+    max_value_ = new (allocator_.allocate(1)) T(other.get_max_value());
+    size_t index = levels_[0];
+    for (auto pair: other) {
+      new (&items_[index]) T(pair.first);
+      const uint8_t level = count_trailing_zeros_in_u64(pair.second);
+      if (level == num_levels_) {
+        ++num_levels_;
+        levels_.resize(num_levels_ + 1);
+        levels_[level] = index;
+        levels_[num_levels_] = items_size_;
+      }
+      ++index;
+    }
+  }
+  check_sorting();
+  assert_correct_total_weight();
+}
+
 template<typename T, typename C, typename S, typename A>
 template<typename FwdT>
 void kll_sketch<T, C, S, A>::update(FwdT&& value) {
@@ -210,6 +255,11 @@ uint16_t kll_sketch<T, C, S, A>::get_k() const {
   return k_;
 }
 
+template<typename T, typename C, typename S, typename A>
+uint16_t kll_sketch<T, C, S, A>::get_min_k() const {
+  return min_k_;
+}
+
 template<typename T, typename C, typename S, typename A>
 uint64_t kll_sketch<T, C, S, A>::get_n() const {
   return n_;
@@ -220,6 +270,11 @@ uint32_t kll_sketch<T, C, S, A>::get_num_retained() const {
   return levels_[num_levels_] - levels_[0];
 }
 
+template<typename T, typename C, typename S, typename A>
+uint32_t kll_sketch<T, C, S, A>::get_capacity() const {
+  return items_size_;
+}
+
 template<typename T, typename C, typename S, typename A>
 bool kll_sketch<T, C, S, A>::is_estimation_mode() const {
   return num_levels_ > 1;
@@ -780,17 +835,27 @@ void kll_sketch<T, C, S, A>::sort_level_zero() {
   }
 }
 
+template<typename T, typename C, typename S, typename A>
+void kll_sketch<T, C, S, A>::check_sorting() const {
+  // not checking level 0
+  for (uint8_t level = 1; level < num_levels_; ++level) {
+    const auto from = items_ + levels_[level];
+    const auto to = items_ + levels_[level + 1];
+    if (!std::is_sorted(from, to, C())) {
+      throw std::logic_error("levels must be sorted");
+    }
+  }
+}
+
 template<typename T, typename C, typename S, typename A>
 template<bool inclusive>
 quantile_sketch_sorted_view<T, C, A> kll_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
   const_cast<kll_sketch*>(this)->sort_level_zero(); // allow this side effect
   quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
-  uint8_t level = 0;
-  while (level < num_levels_) {
+  for (uint8_t level = 0; level < num_levels_; ++level) {
     const auto from = items_ + levels_[level];
     const auto to = items_ + levels_[level + 1]; // exclusive
     view.add(from, to, 1 << level);
-    ++level;
   }
   if (cumulative) view.template convert_to_cummulative<inclusive>();
   return view;
diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp
index a8bd040..acf69df 100644
--- a/kll/test/kll_sketch_test.cpp
+++ b/kll/test/kll_sketch_test.cpp
@@ -39,9 +39,9 @@ static std::string testBinaryInputPath = "test/";
 #endif
 
 // typical usage would be just kll_sketch<float> or kll_sketch<std::string>, but here we use test_allocator
-typedef kll_sketch<float, std::less<float>, serde<float>, test_allocator<float>> kll_float_sketch;
+using kll_float_sketch = kll_sketch<float, std::less<float>, serde<float>, test_allocator<float>>;
 // let std::string use the default allocator for simplicity, otherwise we need to define "less" and "serde"
-typedef kll_sketch<std::string, std::less<std::string>, serde<std::string>, test_allocator<std::string>> kll_string_sketch;
+using kll_string_sketch = kll_sketch<std::string, std::less<std::string>, serde<std::string>, test_allocator<std::string>>;
 
 TEST_CASE("kll sketch", "[kll_sketch]") {
 
@@ -75,7 +75,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
       (void) it; // to suppress "unused" warning
       FAIL("should be no iterations over an empty sketch");
     }
-}
+  }
 
   SECTION("get bad quantile") {
     kll_float_sketch sketch(200, 0);
@@ -835,10 +835,75 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
       REQUIRE((*it).second == 3);
     }
   }
-  // cleanup
-  if (test_allocator_total_bytes != 0) {
-    REQUIRE(test_allocator_total_bytes == 0);
+
+  SECTION("type conversion: empty") {
+    kll_sketch<double> kll_double;
+    kll_sketch<float> kll_float(kll_double);
+    REQUIRE(kll_float.is_empty());
+    REQUIRE(kll_float.get_k() == kll_double.get_k());
+    REQUIRE(kll_float.get_n() == 0);
+    REQUIRE(kll_float.get_num_retained() == 0);
+  }
+
+  SECTION("type conversion: over k") {
+    kll_sketch<double> kll_double;
+    for (int i = 0; i < 1000; ++i) kll_double.update(static_cast<double>(i));
+    kll_sketch<float> kll_float(kll_double);
+    REQUIRE(!kll_float.is_empty());
+    REQUIRE(kll_float.get_k() == kll_double.get_k());
+    REQUIRE(kll_float.get_n() == kll_double.get_n());
+    REQUIRE(kll_float.get_num_retained() == kll_double.get_num_retained());
+
+    auto sv_float = kll_float.get_sorted_view(false);
+    auto sv_double = kll_double.get_sorted_view(false);
+    auto sv_float_it = sv_float.begin();
+    auto sv_double_it = sv_double.begin();
+    while (sv_float_it != sv_float.end()) {
+      REQUIRE(sv_double_it != sv_double.end());
+      auto float_pair = *sv_float_it;
+      auto double_pair = *sv_double_it;
+      REQUIRE(float_pair.first == Approx(double_pair.first).margin(0.01));
+      REQUIRE(float_pair.second == double_pair.second);
+      ++sv_float_it;
+      ++sv_double_it;
+    }
+    REQUIRE(sv_double_it == sv_double.end());
+  }
+
+  class A {
+    int val;
+  public:
+    A(int val): val(val) {}
+    int get_val() const { return val; }
+  };
+
+  struct less_A {
+    bool operator()(const A& a1, const A& a2) const { return a1.get_val() < a2.get_val(); }
+  };
+
+  class B {
+    int val;
+  public:
+    explicit B(const A& a): val(a.get_val()) {}
+    int get_val() const { return val; }
+  };
+
+  struct less_B {
+    bool operator()(const B& b1, const B& b2) const { return b1.get_val() < b2.get_val(); }
+  };
+
+  SECTION("type conversion: custom types") {
+    kll_sketch<A, less_A> sa;
+    sa.update(1);
+    sa.update(2);
+    sa.update(3);
+
+    kll_sketch<B, less_B> sb(sa);
+    REQUIRE(sb.get_n() == 3);
   }
+
+  // cleanup
+  REQUIRE(test_allocator_total_bytes == 0);
 }
 
 } /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org