You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/06/23 23:16:09 UTC

[incubator-datasketches-cpp] branch tuple_sketch updated: more methods and tests

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/tuple_sketch by this push:
     new 4424074  more methods and tests
4424074 is described below

commit 4424074c4877eab20eceb767de1cac58b6e03f04
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Jun 23 16:15:59 2020 -0700

    more methods and tests
---
 tuple/include/theta_sketch_experimental.hpp      |   4 +-
 tuple/include/theta_sketch_experimental_impl.hpp |   5 +
 tuple/include/theta_union_experimental.hpp       |  17 ++-
 tuple/include/theta_update_sketch_base.hpp       |  20 +++-
 tuple/include/theta_update_sketch_base_impl.hpp  |   7 +-
 tuple/include/tuple_sketch.hpp                   | 136 +++++++++++------------
 tuple/include/tuple_sketch_impl.hpp              |  66 +++++++++++
 tuple/test/tuple_sketch_allocation_test.cpp      |   3 +
 tuple/test/tuple_sketch_test.cpp                 |  39 +++++++
 9 files changed, 219 insertions(+), 78 deletions(-)

diff --git a/tuple/include/theta_sketch_experimental.hpp b/tuple/include/theta_sketch_experimental.hpp
index 9c8bd3d..a920929 100644
--- a/tuple/include/theta_sketch_experimental.hpp
+++ b/tuple/include/theta_sketch_experimental.hpp
@@ -50,6 +50,8 @@ public:
   void update(uint64_t key);
   void update(const void* key, size_t length);
 
+  void trim();
+
   string<A> to_string(bool detail = false) const;
 
   vector_bytes serialize(unsigned header_size_bytes = 0) const;
@@ -62,7 +64,7 @@ public:
 
 private:
   enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
-  typedef theta_update_sketch_base<uint64_t, trivial_extract_key<uint64_t>, A> theta_table;
+  using theta_table = theta_update_sketch_base<uint64_t, trivial_extract_key<uint64_t>, A>;
   theta_table table_;
 
   theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
diff --git a/tuple/include/theta_sketch_experimental_impl.hpp b/tuple/include/theta_sketch_experimental_impl.hpp
index 347022f..81db200 100644
--- a/tuple/include/theta_sketch_experimental_impl.hpp
+++ b/tuple/include/theta_sketch_experimental_impl.hpp
@@ -44,6 +44,11 @@ void theta_sketch_experimental<A>::update(const void* key, size_t length) {
 }
 
 template<typename A>
+void theta_sketch_experimental<A>::trim() {
+  table_.trim();
+}
+
+template<typename A>
 string<A> theta_sketch_experimental<A>::to_string(bool detail) const {
   std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
   auto type = typeid(*this).name();
diff --git a/tuple/include/theta_union_experimental.hpp b/tuple/include/theta_union_experimental.hpp
index 585c5fe..87b6a64 100644
--- a/tuple/include/theta_union_experimental.hpp
+++ b/tuple/include/theta_union_experimental.hpp
@@ -29,6 +29,17 @@ namespace datasketches {
 
 // experimental theta union derived from the same base as tuple union
 
+// utility function to hide unused compiler warning
+// usually has no additional cost
+template<typename T> void unused(T&&...) {}
+
+struct pass_through_policy {
+  uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
+    unused(incoming_entry);
+    return internal_entry;
+  }
+};
+
 template<typename Allocator = std::allocator<uint64_t>>
 class theta_union_experimental {
 public:
@@ -38,12 +49,6 @@ public:
   using CompactSketch = compact_theta_sketch_experimental<Allocator>;
   using resize_factor = theta_constants::resize_factor;
 
-  struct pass_through_policy {
-    uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
-      return internal_entry;
-    }
-  };
-
   using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
 
   // No constructor here. Use builder instead.
diff --git a/tuple/include/theta_update_sketch_base.hpp b/tuple/include/theta_update_sketch_base.hpp
index e90091f..0046adc 100644
--- a/tuple/include/theta_update_sketch_base.hpp
+++ b/tuple/include/theta_update_sketch_base.hpp
@@ -45,7 +45,7 @@ struct theta_update_sketch_base {
   // TODO: copy and move
   ~theta_update_sketch_base();
 
-  typedef Entry* iterator;
+  using iterator = Entry*;
 
   std::pair<iterator, bool> find(uint64_t key) const;
 
@@ -77,6 +77,7 @@ struct theta_update_sketch_base {
 
   void resize();
   void rebuild();
+  void trim();
 
   static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
   static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
@@ -220,6 +221,23 @@ private:
   uint32_t index_;
 };
 
+// double value canonicalization for compatibility with Java
+static inline int64_t canonical_double(double value) {
+  union {
+    int64_t long_value;
+    double double_value;
+  } long_double_union;
+
+  if (value == 0.0) {
+    long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
+  } else if (std::isnan(value)) {
+    long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
+  } else {
+    long_double_union.double_value = value;
+  }
+  return long_double_union.long_value;
+}
+
 } /* namespace datasketches */
 
 #include "theta_update_sketch_base_impl.hpp"
diff --git a/tuple/include/theta_update_sketch_base_impl.hpp b/tuple/include/theta_update_sketch_base_impl.hpp
index ff4492d..3ba239c 100644
--- a/tuple/include/theta_update_sketch_base_impl.hpp
+++ b/tuple/include/theta_update_sketch_base_impl.hpp
@@ -140,7 +140,6 @@ void theta_update_sketch_base<EN, EK, A>::resize() {
   const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
   lg_cur_size_ += factor;
   const size_t new_size = 1 << lg_cur_size_;
-//  std::cout << "resizing from " << old_size << " to " << new_size << std::endl;
   EN* old_entries = entries_;
   entries_ = A().allocate(new_size);
   for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
@@ -157,7 +156,6 @@ void theta_update_sketch_base<EN, EK, A>::resize() {
 
 template<typename EN, typename EK, typename A>
 void theta_update_sketch_base<EN, EK, A>::rebuild() {
-//  std::cout << "rebuilding" << std::endl;
   const size_t size = 1 << lg_cur_size_;
   const uint32_t pivot = (1 << lg_nom_size_) + size - num_entries_;
   std::nth_element(&entries_[0], &entries_[pivot], &entries_[size], comparator());
@@ -176,6 +174,11 @@ void theta_update_sketch_base<EN, EK, A>::rebuild() {
   A().deallocate(old_entries, size);
 }
 
+template<typename EN, typename EK, typename A>
+void theta_update_sketch_base<EN, EK, A>::trim() {
+  if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
+}
+
 // builder
 
 template<bool dummy>
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 55dda48..b520e22 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -232,8 +232,8 @@ public:
    * Update this sketch with a given string.
    * @param value string to update the sketch with
    */
-//  template<typename FwdUpdate>
-//  void update(const std::string& key, FwdUpdate&& value);
+  template<typename FwdUpdate>
+  void update(const std::string& key, FwdUpdate&& value);
 
   /**
    * Update this sketch with a given unsigned 64-bit integer.
@@ -246,72 +246,72 @@ public:
    * Update this sketch with a given signed 64-bit integer.
    * @param value int64_t to update the sketch with
    */
-//  template<typename FwdUpdate>
-//  void update(int64_t key, FwdUpdate&& value);
-//
-//  /**
-//   * Update this sketch with a given unsigned 32-bit integer.
-//   * For compatibility with Java implementation.
-//   * @param value uint32_t to update the sketch with
-//   */
-//  template<typename FwdUpdate>
-//  void update(uint32_t key, FwdUpdate&& value);
-//
-//  /**
-//   * Update this sketch with a given signed 32-bit integer.
-//   * For compatibility with Java implementation.
-//   * @param value int32_t to update the sketch with
-//   */
-//  template<typename FwdUpdate>
-//  void update(int32_t key, FwdUpdate&& value);
-//
-//  /**
-//   * Update this sketch with a given unsigned 16-bit integer.
-//   * For compatibility with Java implementation.
-//   * @param value uint16_t to update the sketch with
-//   */
-//  template<typename FwdUpdate>
-//  void update(uint16_t key, FwdUpdate&& value);
-//
-//  /**
-//   * Update this sketch with a given signed 16-bit integer.
-//   * For compatibility with Java implementation.
-//   * @param value int16_t to update the sketch with
-//   */
-//  template<typename FwdUpdate>
-//  void update(int16_t key, FwdUpdate&& value);
-//
-//  /**
-//   * Update this sketch with a given unsigned 8-bit integer.
-//   * For compatibility with Java implementation.
-//   * @param value uint8_t to update the sketch with
-//   */
-//  template<typename FwdUpdate>
-//  void update(uint8_t key, FwdUpdate&& value);
-//
-//  /**
-//   * Update this sketch with a given signed 8-bit integer.
-//   * For compatibility with Java implementation.
-//   * @param value int8_t to update the sketch with
-//   */
-//  template<typename FwdUpdate>
-//  void update(int8_t key, FwdUpdate&& value);
-//
-//  /**
-//   * Update this sketch with a given double-precision floating point value.
-//   * For compatibility with Java implementation.
-//   * @param value double to update the sketch with
-//   */
-//  template<typename FwdUpdate>
-//  void update(double key, FwdUpdate&& value);
-//
-//  /**
-//   * Update this sketch with a given floating point value.
-//   * For compatibility with Java implementation.
-//   * @param value float to update the sketch with
-//   */
-//  template<typename FwdUpdate>
-//  void update(float key, FwdUpdate&& value);
+  template<typename FwdUpdate>
+  void update(int64_t key, FwdUpdate&& value);
+
+  /**
+   * Update this sketch with a given unsigned 32-bit integer.
+   * For compatibility with Java implementation.
+   * @param value uint32_t to update the sketch with
+   */
+  template<typename FwdUpdate>
+  void update(uint32_t key, FwdUpdate&& value);
+
+  /**
+   * Update this sketch with a given signed 32-bit integer.
+   * For compatibility with Java implementation.
+   * @param value int32_t to update the sketch with
+   */
+  template<typename FwdUpdate>
+  void update(int32_t key, FwdUpdate&& value);
+
+  /**
+   * Update this sketch with a given unsigned 16-bit integer.
+   * For compatibility with Java implementation.
+   * @param value uint16_t to update the sketch with
+   */
+  template<typename FwdUpdate>
+  void update(uint16_t key, FwdUpdate&& value);
+
+  /**
+   * Update this sketch with a given signed 16-bit integer.
+   * For compatibility with Java implementation.
+   * @param value int16_t to update the sketch with
+   */
+  template<typename FwdUpdate>
+  void update(int16_t key, FwdUpdate&& value);
+
+  /**
+   * Update this sketch with a given unsigned 8-bit integer.
+   * For compatibility with Java implementation.
+   * @param value uint8_t to update the sketch with
+   */
+  template<typename FwdUpdate>
+  void update(uint8_t key, FwdUpdate&& value);
+
+  /**
+   * Update this sketch with a given signed 8-bit integer.
+   * For compatibility with Java implementation.
+   * @param value int8_t to update the sketch with
+   */
+  template<typename FwdUpdate>
+  void update(int8_t key, FwdUpdate&& value);
+
+  /**
+   * Update this sketch with a given double-precision floating point value.
+   * For compatibility with Java implementation.
+   * @param value double to update the sketch with
+   */
+  template<typename FwdUpdate>
+  void update(double key, FwdUpdate&& value);
+
+  /**
+   * Update this sketch with a given floating point value.
+   * For compatibility with Java implementation.
+   * @param value float to update the sketch with
+   */
+  template<typename FwdUpdate>
+  void update(float key, FwdUpdate&& value);
 
   /**
    * Update this sketch with given data of any type.
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index 14dfa5c..e2e2e41 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -95,12 +95,73 @@ auto update_tuple_sketch<S, U, P, SD, A>::get_rf() const -> resize_factor {
 
 template<typename S, typename U, typename P, typename SD, typename A>
 template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(const std::string& key, UU&& value) {
+  if (key.empty()) return;
+  update(key.c_str(), key.length(), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
 void update_tuple_sketch<S, U, P, SD, A>::update(uint64_t key, UU&& value) {
   update(&key, sizeof(key), std::forward<UU>(value));
 }
 
 template<typename S, typename U, typename P, typename SD, typename A>
 template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(int64_t key, UU&& value) {
+  update(&key, sizeof(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(uint32_t key, UU&& value) {
+  update(static_cast<int32_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(int32_t key, UU&& value) {
+  update(static_cast<int64_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(uint16_t key, UU&& value) {
+  update(static_cast<int16_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(int16_t key, UU&& value) {
+  update(static_cast<int64_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(uint8_t key, UU&& value) {
+  update(static_cast<int8_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(double key, UU&& value) {
+  update(canonical_double(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(float key, UU&& value) {
+  update(static_cast<double>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(int8_t key, UU&& value) {
+  update(static_cast<int64_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
 void update_tuple_sketch<S, U, P, SD, A>::update(const void* key, size_t length, UU&& value) {
   const uint64_t hash = compute_hash(key, length, map_.seed_);
   if (hash >= map_.theta_ || hash == 0) return; // hash == 0 is reserved to mark empty slots in the table
@@ -115,6 +176,11 @@ void update_tuple_sketch<S, U, P, SD, A>::update(const void* key, size_t length,
 }
 
 template<typename S, typename U, typename P, typename SD, typename A>
+void update_tuple_sketch<S, U, P, SD, A>::trim() {
+  map_.trim();
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
 string<A> update_tuple_sketch<S, U, P, SD, A>::to_string(bool detail) const {
   std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
   auto type = typeid(*this).name();
diff --git a/tuple/test/tuple_sketch_allocation_test.cpp b/tuple/test/tuple_sketch_allocation_test.cpp
index b9f969d..227c9bd 100644
--- a/tuple/test/tuple_sketch_allocation_test.cpp
+++ b/tuple/test/tuple_sketch_allocation_test.cpp
@@ -44,6 +44,9 @@ TEST_CASE("tuple sketch with test allocator: exact mode", "[tuple_sketch]") {
     }
     REQUIRE(count == update_sketch.get_num_retained());
 
+    update_sketch.trim();
+    REQUIRE(update_sketch.get_num_retained() == (1 << update_sketch.get_lg_k()));
+
     auto compact_sketch = update_sketch.compact();
     REQUIRE(!compact_sketch.is_empty());
     REQUIRE(compact_sketch.is_estimation_mode());
diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp
index dedd016..2beeeaf 100644
--- a/tuple/test/tuple_sketch_test.cpp
+++ b/tuple/test/tuple_sketch_test.cpp
@@ -191,4 +191,43 @@ TEST_CASE("tuple sketch: array of doubles", "[tuple_sketch]") {
   std::cout << compact_sketch.to_string(true);
 }
 
+TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sketch]") {
+  std::cout << "update with different values begin" << std::endl;
+  auto sketch = update_tuple_sketch<float>::builder().build();
+  sketch.update(static_cast<uint64_t>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 1);
+
+  sketch.update(static_cast<int64_t>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 1);
+
+  sketch.update(static_cast<uint32_t>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 1);
+
+  sketch.update(static_cast<int32_t>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 1);
+
+  sketch.update(static_cast<uint16_t>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 1);
+
+  sketch.update(static_cast<int16_t>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 1);
+
+  sketch.update(static_cast<uint8_t>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 1);
+
+  sketch.update(static_cast<int8_t>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 1);
+
+  sketch.update(1.0, 1);
+  REQUIRE(sketch.get_num_retained() == 2);
+
+  sketch.update(static_cast<float>(1), 1);
+  REQUIRE(sketch.get_num_retained() == 2);
+
+  sketch.update("a", 1);
+  REQUIRE(sketch.get_num_retained() == 3);
+
+  std::cout << "update with different values end" << std::endl;
+}
+
 } /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org