You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/07/07 23:52:41 UTC

[incubator-datasketches-cpp] branch tuple_sketch updated: a-not-b

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/tuple_sketch by this push:
     new 1d4fcb1  a-not-b
1d4fcb1 is described below

commit 1d4fcb1bcb78be9589a31f561fcde0a55dafdaba
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Jul 7 16:52:27 2020 -0700

    a-not-b
---
 .../include/conditional_back_inserter.hpp          |  0
 tuple/CMakeLists.txt                               |  9 +++
 ...{theta_union_base.hpp => theta_comparators.hpp} | 51 ++++++++-------
 tuple/include/theta_intersection_base_impl.hpp     |  3 -
 ...nion_base.hpp => theta_set_difference_base.hpp} | 25 +++-----
 tuple/include/theta_set_difference_base_impl.hpp   | 75 ++++++++++++++++++++++
 tuple/include/theta_union_base.hpp                 |  2 +-
 tuple/include/theta_union_base_impl.hpp            |  2 -
 tuple/include/theta_update_sketch_base.hpp         | 30 +--------
 tuple/include/tuple_a_not_b.hpp                    | 57 ++++++++++++++++
 ...theta_union_base.hpp => tuple_a_not_b_impl.hpp} | 41 +++---------
 tuple/include/tuple_intersection.hpp               |  1 -
 tuple/include/tuple_union.hpp                      |  1 -
 tuple/test/CMakeLists.txt                          |  1 +
 14 files changed, 187 insertions(+), 111 deletions(-)

diff --git a/theta/include/conditional_back_inserter.hpp b/common/include/conditional_back_inserter.hpp
similarity index 100%
rename from theta/include/conditional_back_inserter.hpp
rename to common/include/conditional_back_inserter.hpp
diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt
index b9e5757..3b0e891 100644
--- a/tuple/CMakeLists.txt
+++ b/tuple/CMakeLists.txt
@@ -35,9 +35,12 @@ target_compile_features(tuple INTERFACE cxx_std_11)
 set(tuple_HEADERS "")
 list(APPEND tuple_HEADERS "include/tuple_sketch.hpp;include/tuple_sketch_impl.hpp")
 list(APPEND tuple_HEADERS "include/tuple_union.hpp;include/tuple_union_impl.hpp")
+list(APPEND tuple_HEADERS "include/tuple_intersection.hpp;include/tuple_intersection_impl.hpp")
+list(APPEND tuple_HEADERS "include/tuple_a_not_b.hpp;include/tuple_a_not_b_impl.hpp")
 list(APPEND tuple_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
 list(APPEND tuple_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
 list(APPEND tuple_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
+list(APPEND tuple_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
 list(APPEND tuple_HEADERS "include/theta_sketch_experimental.hpp;include/theta_sketch_experimental_impl.hpp")
 list(APPEND tuple_HEADERS "include/theta_union_experimental.hpp;include/theta_union_experimental_impl.hpp")
 
@@ -54,12 +57,18 @@ target_sources(tuple
     ${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_sketch_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_union.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_union_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_intersection.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_intersection_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_a_not_b.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_a_not_b_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_experimental.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_experimental_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_experimental.hpp
diff --git a/tuple/include/theta_union_base.hpp b/tuple/include/theta_comparators.hpp
similarity index 52%
copy from tuple/include/theta_union_base.hpp
copy to tuple/include/theta_comparators.hpp
index a41751e..f461b5d 100644
--- a/tuple/include/theta_union_base.hpp
+++ b/tuple/include/theta_comparators.hpp
@@ -17,41 +17,40 @@
  * under the License.
  */
 
-#ifndef THETA_UNION_BASE_HPP_
-#define THETA_UNION_BASE_HPP_
-
-#include "theta_update_sketch_base.hpp"
+#ifndef THETA_COMPARATORS_HPP_
+#define THETA_COMPARATORS_HPP_
 
 namespace datasketches {
 
-template<
-  typename Entry,
-  typename ExtractKey,
-  typename Policy,
-  typename Sketch,
-  typename CompactSketch,
-  typename Allocator = std::allocator<Entry>
->
-class theta_union_base {
-public:
-  using resize_factor = theta_constants::resize_factor;
-  using hash_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
-  using comparator = compare_by_key<Entry, ExtractKey>;
-
-  theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Policy& policy);
+template<typename Entry, typename ExtractKey>
+struct compare_by_key {
+  bool operator()(const Entry& a, const Entry& b) const {
+    return ExtractKey()(a) < ExtractKey()(b);
+  }
+};
 
-  void update(const Sketch& sketch);
+// less than
 
-  CompactSketch get_result(bool ordered = true) const;
+template<typename T>
+class less_than {
+public:
+  explicit less_than(const T& value): value(value) {}
+  bool operator()(const T& value) const { return value < this->value; }
+private:
+  T value;
+};
 
+template<typename Key, typename Entry, typename ExtractKey>
+class key_less_than {
+public:
+  explicit key_less_than(const Key& key): key(key) {}
+  bool operator()(const Entry& entry) const {
+    return ExtractKey()(entry) < this->key;
+  }
 private:
-  Policy policy_;
-  hash_table table_;
-  uint64_t union_theta_;
+  Key key;
 };
 
 } /* namespace datasketches */
 
-#include "theta_union_base_impl.hpp"
-
 #endif
diff --git a/tuple/include/theta_intersection_base_impl.hpp b/tuple/include/theta_intersection_base_impl.hpp
index 5f776c4..f0c3a98 100644
--- a/tuple/include/theta_intersection_base_impl.hpp
+++ b/tuple/include/theta_intersection_base_impl.hpp
@@ -104,7 +104,6 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(const S& sketch) {
       }
       ++count;
     }
-    std::cout << "intersection: matching done" << std::endl;
     if (count > sketch.get_num_retained()) {
       throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
     } else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
@@ -118,11 +117,9 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(const S& sketch) {
       num_entries_ = 0;
       if (theta_ == theta_constants::MAX_THETA) is_empty_ = true;
     } else {
-      std::cout << "intersection: converting to hash map" << std::endl;
       const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
       const size_t size = 1 << lg_size;
       if (lg_size != lg_size_) {
-        std::cout << "intersection: resizing from " << (1<<lg_size_) << " to " << size << std::endl;
         A().deallocate(entries_, 1 << lg_size_);
         entries_ = nullptr;
         lg_size_ = lg_size;
diff --git a/tuple/include/theta_union_base.hpp b/tuple/include/theta_set_difference_base.hpp
similarity index 69%
copy from tuple/include/theta_union_base.hpp
copy to tuple/include/theta_set_difference_base.hpp
index a41751e..03fe6bb 100644
--- a/tuple/include/theta_union_base.hpp
+++ b/tuple/include/theta_set_difference_base.hpp
@@ -17,9 +17,10 @@
  * under the License.
  */
 
-#ifndef THETA_UNION_BASE_HPP_
-#define THETA_UNION_BASE_HPP_
+#ifndef THETA_SET_DIFFERENCE_BASE_HPP_
+#define THETA_SET_DIFFERENCE_BASE_HPP_
 
+#include "theta_comparators.hpp"
 #include "theta_update_sketch_base.hpp"
 
 namespace datasketches {
@@ -27,31 +28,25 @@ namespace datasketches {
 template<
   typename Entry,
   typename ExtractKey,
-  typename Policy,
   typename Sketch,
   typename CompactSketch,
-  typename Allocator = std::allocator<Entry>
+  typename Allocator
 >
-class theta_union_base {
+class theta_set_difference_base {
 public:
-  using resize_factor = theta_constants::resize_factor;
-  using hash_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
   using comparator = compare_by_key<Entry, ExtractKey>;
+  using hash_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
 
-  theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Policy& policy);
-
-  void update(const Sketch& sketch);
+  theta_set_difference_base(uint64_t seed);
 
-  CompactSketch get_result(bool ordered = true) const;
+  CompactSketch compute(const Sketch& a, const Sketch& b, bool ordered) const;
 
 private:
-  Policy policy_;
-  hash_table table_;
-  uint64_t union_theta_;
+  uint16_t seed_hash_;
 };
 
 } /* namespace datasketches */
 
-#include "theta_union_base_impl.hpp"
+#include "theta_set_difference_base_impl.hpp"
 
 #endif
diff --git a/tuple/include/theta_set_difference_base_impl.hpp b/tuple/include/theta_set_difference_base_impl.hpp
new file mode 100644
index 0000000..7273c9f
--- /dev/null
+++ b/tuple/include/theta_set_difference_base_impl.hpp
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <algorithm>
+
+#include "conditional_back_inserter.hpp"
+
+namespace datasketches {
+
+template<typename EN, typename EK, typename S, typename CS, typename A>
+theta_set_difference_base<EN, EK, S, CS, A>::theta_set_difference_base(uint64_t seed):
+seed_hash_(compute_seed_hash(seed))
+{}
+
+template<typename EN, typename EK, typename S, typename CS, typename A>
+CS theta_set_difference_base<EN, EK, S, CS, A>::compute(const S& a, const S& b, bool ordered) const {
+  if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return CS(a, ordered);
+  if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
+  if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
+
+  const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
+  std::vector<EN, A> entries;
+  bool is_empty = a.is_empty();
+
+  if (b.get_num_retained() == 0) {
+    std::copy_if(a.begin(), a.end(), std::back_inserter(entries), key_less_than<uint64_t, EN, EK>(theta));
+  } else {
+    if (a.is_ordered() && b.is_ordered()) { // sort-based
+      std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)));
+    } else { // hash-based
+      const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
+      hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0); // seed is not used here
+      for (const auto& entry: b) {
+        const uint64_t hash = EK()(entry);
+        if (hash < theta) {
+          table.insert(table.find(hash).first, entry);
+        } else if (b.is_ordered()) {
+          break; // early stop
+        }
+      }
+
+      // scan A lookup B
+      for (const auto& entry: a) {
+        const uint64_t hash = EK()(entry);
+        if (hash < theta) {
+          auto result = table.find(hash);
+          if (!result.second) entries.push_back(entry);
+        } else if (a.is_ordered()) {
+          break; // early stop
+        }
+      }
+    }
+  }
+  if (entries.empty() && theta == theta_constants::MAX_THETA) is_empty = true;
+  if (ordered && !a.is_ordered()) std::sort(entries.begin(), entries.end(), comparator());
+  return CS(is_empty, a.is_ordered() || ordered, seed_hash_, theta, std::move(entries));
+}
+
+} /* namespace datasketches */
diff --git a/tuple/include/theta_union_base.hpp b/tuple/include/theta_union_base.hpp
index a41751e..0cb4005 100644
--- a/tuple/include/theta_union_base.hpp
+++ b/tuple/include/theta_union_base.hpp
@@ -34,8 +34,8 @@ template<
 >
 class theta_union_base {
 public:
-  using resize_factor = theta_constants::resize_factor;
   using hash_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
+  using resize_factor = typename hash_table::resize_factor;
   using comparator = compare_by_key<Entry, ExtractKey>;
 
   theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Policy& policy);
diff --git a/tuple/include/theta_union_base_impl.hpp b/tuple/include/theta_union_base_impl.hpp
index f6f3613..ae56859 100644
--- a/tuple/include/theta_union_base_impl.hpp
+++ b/tuple/include/theta_union_base_impl.hpp
@@ -17,8 +17,6 @@
  * under the License.
  */
 
-#include <iostream>
-#include <sstream>
 #include <algorithm>
 
 namespace datasketches {
diff --git a/tuple/include/theta_update_sketch_base.hpp b/tuple/include/theta_update_sketch_base.hpp
index a2da738..f261389 100644
--- a/tuple/include/theta_update_sketch_base.hpp
+++ b/tuple/include/theta_update_sketch_base.hpp
@@ -25,6 +25,7 @@
 
 #include "common_defs.hpp"
 #include "MurmurHash3.h"
+#include "theta_comparators.hpp"
 
 namespace datasketches {
 
@@ -33,13 +34,6 @@ namespace theta_constants {
   static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
 }
 
-template<typename Entry, typename ExtractKey>
-struct compare_by_key {
-  bool operator()(const Entry& a, const Entry& b) const {
-    return ExtractKey()(a) < ExtractKey()(b);
-  }
-};
-
 template<
   typename Entry,
   typename ExtractKey,
@@ -174,28 +168,6 @@ struct pair_extract_key {
   }
 };
 
-// less than
-
-template<typename T>
-class less_than {
-public:
-  explicit less_than(const T& value): value(value) {}
-  bool operator()(const T& value) const { return value < this->value; }
-private:
-  T value;
-};
-
-template<typename Key, typename Entry, typename ExtractKey>
-class key_less_than {
-public:
-  explicit key_less_than(const Key& key): key(key) {}
-  bool operator()(const Entry& entry) const {
-    return ExtractKey()(entry) < this->key;
-  }
-private:
-  Key key;
-};
-
 // not zero
 
 template<typename Entry, typename ExtractKey>
diff --git a/tuple/include/tuple_a_not_b.hpp b/tuple/include/tuple_a_not_b.hpp
new file mode 100644
index 0000000..5883d73
--- /dev/null
+++ b/tuple/include/tuple_a_not_b.hpp
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TUPLE_A_NOT_B_HPP_
+#define TUPLE_A_NOT_B_HPP_
+
+#include "tuple_sketch.hpp"
+#include "theta_set_difference_base.hpp"
+
+namespace datasketches {
+
+template<
+  typename Summary,
+  typename Allocator = std::allocator<Summary>
+>
+class tuple_a_not_b {
+public:
+  using Entry = std::pair<uint64_t, Summary>;
+  using ExtractKey = pair_extract_key<uint64_t, Summary>;
+  using Sketch = tuple_sketch<Summary, Allocator>;
+  using CompactSketch = compact_tuple_sketch<Summary, Allocator>;
+  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
+  using State = theta_set_difference_base<Entry, ExtractKey, Sketch, CompactSketch, AllocEntry>;
+
+  explicit tuple_a_not_b(uint64_t seed = DEFAULT_SEED);
+
+  /**
+   * Computes the a-not-b set operation given two sketches.
+   * @return the result of a-not-b
+   */
+  CompactSketch compute(const Sketch& a, const Sketch& b, bool ordered = true) const;
+
+private:
+  State state_;
+};
+
+} /* namespace datasketches */
+
+#include "tuple_a_not_b_impl.hpp"
+
+#endif
diff --git a/tuple/include/theta_union_base.hpp b/tuple/include/tuple_a_not_b_impl.hpp
similarity index 51%
copy from tuple/include/theta_union_base.hpp
copy to tuple/include/tuple_a_not_b_impl.hpp
index a41751e..ccc081d 100644
--- a/tuple/include/theta_union_base.hpp
+++ b/tuple/include/tuple_a_not_b_impl.hpp
@@ -17,41 +17,16 @@
  * under the License.
  */
 
-#ifndef THETA_UNION_BASE_HPP_
-#define THETA_UNION_BASE_HPP_
-
-#include "theta_update_sketch_base.hpp"
-
 namespace datasketches {
 
-template<
-  typename Entry,
-  typename ExtractKey,
-  typename Policy,
-  typename Sketch,
-  typename CompactSketch,
-  typename Allocator = std::allocator<Entry>
->
-class theta_union_base {
-public:
-  using resize_factor = theta_constants::resize_factor;
-  using hash_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
-  using comparator = compare_by_key<Entry, ExtractKey>;
-
-  theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed, const Policy& policy);
-
-  void update(const Sketch& sketch);
+template<typename S, typename A>
+tuple_a_not_b<S, A>::tuple_a_not_b(uint64_t seed):
+state_(seed)
+{}
 
-  CompactSketch get_result(bool ordered = true) const;
-
-private:
-  Policy policy_;
-  hash_table table_;
-  uint64_t union_theta_;
-};
+template<typename S, typename A>
+auto tuple_a_not_b<S, A>::compute(const Sketch& a, const Sketch& b, bool ordered) const -> CompactSketch {
+  return state_.compute(a, b, ordered);
+}
 
 } /* namespace datasketches */
-
-#include "theta_union_base_impl.hpp"
-
-#endif
diff --git a/tuple/include/tuple_intersection.hpp b/tuple/include/tuple_intersection.hpp
index 79d8798..ac582c3 100644
--- a/tuple/include/tuple_intersection.hpp
+++ b/tuple/include/tuple_intersection.hpp
@@ -20,7 +20,6 @@
 #ifndef TUPLE_INTERSECTION_HPP_
 #define TUPLE_INTERSECTION_HPP_
 
-#include "serde.hpp"
 #include "tuple_sketch.hpp"
 #include "theta_intersection_base.hpp"
 
diff --git a/tuple/include/tuple_union.hpp b/tuple/include/tuple_union.hpp
index 3a262c3..1e7fbb3 100644
--- a/tuple/include/tuple_union.hpp
+++ b/tuple/include/tuple_union.hpp
@@ -20,7 +20,6 @@
 #ifndef TUPLE_UNION_HPP_
 #define TUPLE_UNION_HPP_
 
-#include "serde.hpp"
 #include "tuple_sketch.hpp"
 #include "theta_union_base.hpp"
 
diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt
index 668dd50..f604b70 100644
--- a/tuple/test/CMakeLists.txt
+++ b/tuple/test/CMakeLists.txt
@@ -42,6 +42,7 @@ target_sources(tuple_test
     tuple_sketch_allocation_test.cpp
     tuple_union_test.cpp
     tuple_intersection_test.cpp
+    tuple_a_not_b_test.cpp
     theta_sketch_experimental_test.cpp
     theta_union_experimental_test.cpp
 )


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org