You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/07/10 00:46:13 UTC

[incubator-datasketches-cpp] branch tuple_sketch updated (0d0bdc3 -> f79257a)

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a change to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git.


    from 0d0bdc3  avoid requiring default constructor for user type
     new 117c33d  removed redundant check
     new f79257a  tuple union moving update

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 tuple/include/theta_union_base.hpp              |  2 ++
 tuple/include/theta_union_base_impl.hpp         | 24 ++++++++++++++-
 tuple/include/theta_update_sketch_base.hpp      | 18 +++++++++++-
 tuple/include/theta_update_sketch_base_impl.hpp | 39 ++++++++++++++++++++++++-
 tuple/include/tuple_sketch.hpp                  | 22 +++++++++++++-
 tuple/include/tuple_sketch_impl.hpp             | 20 +++++++++++++
 tuple/include/tuple_union.hpp                   |  9 ++++--
 tuple/include/tuple_union_impl.hpp              |  5 ++--
 8 files changed, 130 insertions(+), 9 deletions(-)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 01/02: removed redundant check

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 117c33d7e917d51f16940a3c736be59f33f8e9c3
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Jul 9 14:37:56 2020 -0700

    removed redundant check
---
 tuple/include/theta_update_sketch_base_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tuple/include/theta_update_sketch_base_impl.hpp b/tuple/include/theta_update_sketch_base_impl.hpp
index 62a1e0f..60d4084 100644
--- a/tuple/include/theta_update_sketch_base_impl.hpp
+++ b/tuple/include/theta_update_sketch_base_impl.hpp
@@ -75,7 +75,7 @@ template<typename EN, typename EK, typename A>
 uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data, size_t length) {
   is_empty_ = false;
   const uint64_t hash = compute_hash(data, length, seed_);
-  if (hash >= theta_ || hash == 0) return 0; // hash == 0 is reserved to mark empty slots in the table
+  if (hash >= theta_) return 0; // hash == 0 is reserved to mark empty slots in the table
   return hash;
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 02/02: tuple union moving update

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit f79257afc1705cf74a439ac7b0fa329057e999ac
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Jul 9 17:45:58 2020 -0700

    tuple union moving update
---
 tuple/include/theta_union_base.hpp              |  2 ++
 tuple/include/theta_union_base_impl.hpp         | 24 +++++++++++++++-
 tuple/include/theta_update_sketch_base.hpp      | 18 +++++++++++-
 tuple/include/theta_update_sketch_base_impl.hpp | 37 +++++++++++++++++++++++++
 tuple/include/tuple_sketch.hpp                  | 22 ++++++++++++++-
 tuple/include/tuple_sketch_impl.hpp             | 20 +++++++++++++
 tuple/include/tuple_union.hpp                   |  9 ++++--
 tuple/include/tuple_union_impl.hpp              |  5 ++--
 8 files changed, 129 insertions(+), 8 deletions(-)

diff --git a/tuple/include/theta_union_base.hpp b/tuple/include/theta_union_base.hpp
index 0b9a1b8..fbe4cf7 100644
--- a/tuple/include/theta_union_base.hpp
+++ b/tuple/include/theta_union_base.hpp
@@ -40,7 +40,9 @@ public:
 
   theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Policy& policy);
 
+  // TODO: try to avoid duplication
   void update(const Sketch& sketch);
+  void update(Sketch&& sketch);
 
   CompactSketch get_result(bool ordered = true) const;
 
diff --git a/tuple/include/theta_union_base_impl.hpp b/tuple/include/theta_union_base_impl.hpp
index 10e4478..63c0f0f 100644
--- a/tuple/include/theta_union_base_impl.hpp
+++ b/tuple/include/theta_union_base_impl.hpp
@@ -41,7 +41,29 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(const S& sketch) {
       if (!result.second) {
         table_.insert(result.first, entry);
       } else {
-        *result.first = policy_(*result.first, entry);
+        policy_(*result.first, entry);
+      }
+    } else {
+      if (sketch.is_ordered()) break; // early stop
+    }
+  }
+  if (table_.theta_ < union_theta_) union_theta_ = table_.theta_;
+}
+
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+void theta_union_base<EN, EK, P, S, CS, A>::update(S&& sketch) {
+  if (sketch.is_empty()) return;
+  if (sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
+  table_.is_empty_ = false;
+  if (sketch.get_theta64() < union_theta_) union_theta_ = sketch.get_theta64();
+  for (auto& entry: sketch) {
+    const uint64_t hash = EK()(entry);
+    if (hash < union_theta_) {
+      auto result = table_.find(hash);
+      if (!result.second) {
+        table_.insert(result.first, std::move(entry));
+      } else {
+        policy_(*result.first, std::move(entry));
       }
     } else {
       if (sketch.is_ordered()) break; // early stop
diff --git a/tuple/include/theta_update_sketch_base.hpp b/tuple/include/theta_update_sketch_base.hpp
index 6c1e441..1356c21 100644
--- a/tuple/include/theta_update_sketch_base.hpp
+++ b/tuple/include/theta_update_sketch_base.hpp
@@ -200,7 +200,23 @@ static inline uint16_t compute_seed_hash(uint64_t seed) {
   return hashes.h1;
 }
 
-// iterator
+// iterators
+
+template<typename Entry, typename ExtractKey>
+class theta_iterator: public std::iterator<std::input_iterator_tag, Entry> {
+public:
+  theta_iterator(Entry* entries, uint32_t size, uint32_t index);
+  theta_iterator& operator++();
+  theta_iterator operator++(int);
+  bool operator==(const theta_iterator& other) const;
+  bool operator!=(const theta_iterator& other) const;
+  Entry& operator*() const;
+
+private:
+  Entry* entries_;
+  uint32_t size_;
+  uint32_t index_;
+};
 
 template<typename Entry, typename ExtractKey>
 class theta_const_iterator: public std::iterator<std::input_iterator_tag, Entry> {
diff --git a/tuple/include/theta_update_sketch_base_impl.hpp b/tuple/include/theta_update_sketch_base_impl.hpp
index 60d4084..78b4cc9 100644
--- a/tuple/include/theta_update_sketch_base_impl.hpp
+++ b/tuple/include/theta_update_sketch_base_impl.hpp
@@ -228,6 +228,43 @@ uint8_t theta_base_builder<Derived>::starting_sub_multiple(uint8_t lg_tgt, uint8
 // iterator
 
 template<typename Entry, typename ExtractKey>
+theta_iterator<Entry, ExtractKey>::theta_iterator(Entry* entries, uint32_t size, uint32_t index):
+entries_(entries), size_(size), index_(index) {
+  while (index_ < size_ && ExtractKey()(entries_[index_]) == 0) ++index_;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_iterator<Entry, ExtractKey>::operator++() -> theta_iterator& {
+  ++index_;
+  while (index_ < size_ && ExtractKey()(entries_[index_]) == 0) ++index_;
+  return *this;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_iterator<Entry, ExtractKey>::operator++(int) -> theta_iterator {
+  theta_iterator tmp(*this);
+  operator++();
+  return tmp;
+}
+
+template<typename Entry, typename ExtractKey>
+bool theta_iterator<Entry, ExtractKey>::operator!=(const theta_iterator& other) const {
+  return index_ != other.index_;
+}
+
+template<typename Entry, typename ExtractKey>
+bool theta_iterator<Entry, ExtractKey>::operator==(const theta_iterator& other) const {
+  return index_ == other.index_;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_iterator<Entry, ExtractKey>::operator*() const -> Entry& {
+  return entries_[index_];
+}
+
+// const iterator
+
+template<typename Entry, typename ExtractKey>
 theta_const_iterator<Entry, ExtractKey>::theta_const_iterator(const Entry* entries, uint32_t size, uint32_t index):
 entries_(entries), size_(size), index_(index) {
   while (index_ < size_ && ExtractKey()(entries_[index_]) == 0) ++index_;
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 5ecb044..3409557 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -40,6 +40,7 @@ class tuple_sketch {
 public:
   using Entry = std::pair<uint64_t, Summary>;
   using ExtractKey = pair_extract_key<uint64_t, Summary>;
+  using iterator = theta_iterator<Entry, ExtractKey>;
   using const_iterator = theta_const_iterator<Entry, ExtractKey>;
 
   virtual ~tuple_sketch() = default;
@@ -109,13 +110,26 @@ public:
    * Iterator over entries in this sketch.
    * @return begin iterator
    */
-  virtual const_iterator begin() const = 0;
+  virtual iterator begin() = 0;
 
   /**
    * Iterator pointing past the valid range.
    * Not to be incremented or dereferenced.
    * @return end iterator
    */
+  virtual iterator end() = 0;
+
+  /**
+   * Const iterator over entries in this sketch.
+   * @return begin const iterator
+   */
+  virtual const_iterator begin() const = 0;
+
+  /**
+   * Const iterator pointing past the valid range.
+   * Not to be incremented or dereferenced.
+   * @return end const iterator
+   */
   virtual const_iterator end() const = 0;
 
 protected:
@@ -152,6 +166,7 @@ public:
   using Base = tuple_sketch<Summary, Allocator>;
   using Entry = typename Base::Entry;
   using ExtractKey = typename Base::ExtractKey;
+  using iterator = typename Base::iterator;
   using const_iterator = typename Base::const_iterator;
   using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
   using tuple_map = theta_update_sketch_base<Entry, ExtractKey, AllocEntry>;
@@ -294,6 +309,8 @@ public:
    */
   compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
 
+  virtual iterator begin();
+  virtual iterator end();
   virtual const_iterator begin() const;
   virtual const_iterator end() const;
 
@@ -316,6 +333,7 @@ public:
   using Base = tuple_sketch<Summary, Allocator>;
   using Entry = typename Base::Entry;
   using ExtractKey = typename Base::ExtractKey;
+  using iterator = typename Base::iterator;
   using const_iterator = typename Base::const_iterator;
   using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
   using AllocU64 = typename std::allocator_traits<Allocator>::template rebind_alloc<uint64_t>;
@@ -347,6 +365,8 @@ public:
   template<typename SerDe = serde<Summary>>
   vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
 
+  virtual iterator begin();
+  virtual iterator end();
   virtual const_iterator begin() const;
   virtual const_iterator end() const;
 
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index 2ae9f0b..ae09cfe 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -199,6 +199,16 @@ string<A> update_tuple_sketch<S, U, P, A>::to_string(bool detail) const {
 }
 
 template<typename S, typename U, typename P, typename A>
+auto update_tuple_sketch<S, U, P, A>::begin() -> iterator {
+  return iterator(map_.entries_, 1 << map_.lg_cur_size_, 0);
+}
+
+template<typename S, typename U, typename P, typename A>
+auto update_tuple_sketch<S, U, P, A>::end() -> iterator {
+  return iterator(nullptr, 0, 1 << map_.lg_cur_size_);
+}
+
+template<typename S, typename U, typename P, typename A>
 auto update_tuple_sketch<S, U, P, A>::begin() const -> const_iterator {
   return const_iterator(map_.entries_, 1 << map_.lg_cur_size_, 0);
 }
@@ -426,6 +436,16 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
 }
 
 template<typename S, typename A>
+auto compact_tuple_sketch<S, A>::begin() -> iterator {
+  return iterator(entries_.data(), entries_.size(), 0);
+}
+
+template<typename S, typename A>
+auto compact_tuple_sketch<S, A>::end() -> iterator {
+  return iterator(nullptr, 0, entries_.size());
+}
+
+template<typename S, typename A>
 auto compact_tuple_sketch<S, A>::begin() const -> const_iterator {
   return const_iterator(entries_.data(), entries_.size(), 0);
 }
diff --git a/tuple/include/tuple_union.hpp b/tuple/include/tuple_union.hpp
index 1e7fbb3..0ee776c 100644
--- a/tuple/include/tuple_union.hpp
+++ b/tuple/include/tuple_union.hpp
@@ -51,9 +51,11 @@ public:
   // in terms of operations on Entry
   struct internal_policy {
     internal_policy(const Policy& policy): policy_(policy) {}
-    Entry& operator()(Entry& internal_entry, const Entry& incoming_entry) const {
+    void operator()(Entry& internal_entry, const Entry& incoming_entry) const {
       policy_(internal_entry.second, incoming_entry.second);
-      return internal_entry;
+    }
+    void operator()(Entry& internal_entry, Entry&& incoming_entry) const {
+      policy_(internal_entry.second, std::move(incoming_entry.second));
     }
     Policy policy_;
   };
@@ -67,7 +69,8 @@ public:
    * This method is to update the union with a given sketch
    * @param sketch to update the union with
    */
-  void update(const Sketch& sketch);
+  template<typename FwdSketch>
+  void update(FwdSketch&& sketch);
 
   /**
    * This method produces a copy of the current state of the union as a compact sketch.
diff --git a/tuple/include/tuple_union_impl.hpp b/tuple/include/tuple_union_impl.hpp
index aa45a31..9f471ac 100644
--- a/tuple/include/tuple_union_impl.hpp
+++ b/tuple/include/tuple_union_impl.hpp
@@ -25,8 +25,9 @@ state_(lg_cur_size, lg_nom_size, rf, p, seed, internal_policy(policy))
 {}
 
 template<typename S, typename P, typename A>
-void tuple_union<S, P, A>::update(const Sketch& sketch) {
-  state_.update(sketch);
+template<typename SS>
+void tuple_union<S, P, A>::update(SS&& sketch) {
+  state_.update(std::forward<SS>(sketch));
 }
 
 template<typename S, typename P, typename A>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org