You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/06/02 21:51:03 UTC
[incubator-datasketches-cpp] 02/09: fixed ordered a-not-b
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit baef5b8947489d6f02d4499a1c8a617f2d69e16b
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri May 29 16:26:06 2020 -0700
fixed ordered a-not-b
---
theta/include/theta_a_not_b_impl.hpp | 65 ++++++++++++++----------------------
theta/test/theta_a_not_b_test.cpp | 24 +++++++++++++
2 files changed, 49 insertions(+), 40 deletions(-)
diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index f080903..8951a8f 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -22,6 +22,8 @@
#include <algorithm>
+#include "conditional_back_inserter.hpp"
+
namespace datasketches {
/*
@@ -43,54 +45,37 @@ compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch
const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
vector_u64<A> keys;
- uint32_t keys_size = 0;
- uint32_t count = 0;
bool is_empty = a.is_empty();
+ auto less_than_theta = [theta](uint64_t key) { return key < theta; };
if (b.get_num_retained() == 0) {
- for (auto key: a) if (key < theta) ++count;
- keys.resize(count);
- std::copy_if(a.begin(), a.end(), &keys[0], [theta](uint64_t key) { return key < theta; });
- if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
- if (count == 0 && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
- }
-
- keys_size = a.get_num_retained();
- keys.resize(keys_size);
-
- if (a.is_ordered() && b.is_ordered()) { // sort-based
- const auto end = std::set_difference(a.begin(), a.end(), b.begin(), b.end(), keys.begin());
- count = end - keys.begin();
- } else { // hash-based
- const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
- vector_u64<A> b_hash_table(1 << lg_size, 0);
- for (auto key: b) {
- if (key < theta) {
- update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
- } else if (b.is_ordered()) {
- break; // early stop
+ std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than_theta);
+ } else {
+ if (a.is_ordered() && b.is_ordered()) { // sort-based
+ std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than_theta));
+ } else { // hash-based
+ const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
+ vector_u64<A> b_hash_table(1 << lg_size, 0);
+ for (auto key: b) {
+ if (key < theta) {
+ update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
+ } else if (b.is_ordered()) {
+ break; // early stop
+ }
}
- }
- // scan A lookup B
- for (auto key: a) {
- if (key < theta) {
- if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys[count++] = key;
- } else if (a.is_ordered()) {
- break; // early stop
+ // scan A lookup B
+ for (auto key: a) {
+ if (key < theta) {
+ if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
+ } else if (a.is_ordered()) {
+ break; // early stop
+ }
}
}
}
-
- if (count == 0) {
- keys.resize(0);
- if (theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
- } else if (count < keys_size) {
- keys.resize(count);
- if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
- }
-
+ if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
+ if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
}
diff --git a/theta/test/theta_a_not_b_test.cpp b/theta/test/theta_a_not_b_test.cpp
index 54bd3fa..1ef5255 100644
--- a/theta/test/theta_a_not_b_test.cpp
+++ b/theta/test/theta_a_not_b_test.cpp
@@ -217,4 +217,28 @@ TEST_CASE("theta a-not-b: seed mismatch", "[theta_a_not_b]") {
REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
}
+TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") {
+ update_theta_sketch a = update_theta_sketch::builder().build();
+ int value = 0;
+ for (int i = 0; i < 10000; i++) a.update(value++);
+
+ update_theta_sketch b = update_theta_sketch::builder().build();
+ value = 5000;
+ for (int i = 0; i < 25000; i++) b.update(value++);
+
+ theta_a_not_b a_not_b;
+
+ // unordered inputs
+ compact_theta_sketch result = a_not_b.compute(a, b);
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE(result.is_estimation_mode());
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+
+ // ordered inputs
+ result = a_not_b.compute(a.compact(), b.compact());
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE(result.is_estimation_mode());
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+}
+
} /* namespace datasketches */
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org