You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/06/02 21:51:03 UTC

[incubator-datasketches-cpp] 02/09: fixed ordered a-not-b

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit baef5b8947489d6f02d4499a1c8a617f2d69e16b
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri May 29 16:26:06 2020 -0700

    fixed ordered a-not-b
---
 theta/include/theta_a_not_b_impl.hpp | 65 ++++++++++++++----------------------
 theta/test/theta_a_not_b_test.cpp    | 24 +++++++++++++
 2 files changed, 49 insertions(+), 40 deletions(-)

diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index f080903..8951a8f 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -22,6 +22,8 @@
 
 #include <algorithm>
 
+#include "conditional_back_inserter.hpp"
+
 namespace datasketches {
 
 /*
@@ -43,54 +45,37 @@ compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch
 
   const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
   vector_u64<A> keys;
-  uint32_t keys_size = 0;
-  uint32_t count = 0;
   bool is_empty = a.is_empty();
+  auto less_than_theta = [theta](uint64_t key) { return key < theta; };
 
   if (b.get_num_retained() == 0) {
-    for (auto key: a) if (key < theta) ++count;
-    keys.resize(count);
-    std::copy_if(a.begin(), a.end(), &keys[0], [theta](uint64_t key) { return key < theta; });
-    if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
-    if (count == 0 && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
-    return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
-  }
-
-  keys_size = a.get_num_retained();
-  keys.resize(keys_size);
-
-  if (a.is_ordered() && b.is_ordered()) { // sort-based
-    const auto end = std::set_difference(a.begin(), a.end(), b.begin(), b.end(), keys.begin());
-    count = end - keys.begin();
-  } else { // hash-based
-    const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
-    vector_u64<A> b_hash_table(1 << lg_size, 0);
-    for (auto key: b) {
-      if (key < theta) {
-        update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
-      } else if (b.is_ordered()) {
-        break; // early stop
+    std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than_theta);
+  } else {
+    if (a.is_ordered() && b.is_ordered()) { // sort-based
+      std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than_theta));
+    } else { // hash-based
+      const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
+      vector_u64<A> b_hash_table(1 << lg_size, 0);
+      for (auto key: b) {
+        if (key < theta) {
+          update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
+        } else if (b.is_ordered()) {
+          break; // early stop
+        }
       }
-    }
 
-    // scan A lookup B
-    for (auto key: a) {
-      if (key < theta) {
-        if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys[count++] = key;
-      } else if (a.is_ordered()) {
-        break; // early stop
+      // scan A lookup B
+      for (auto key: a) {
+        if (key < theta) {
+          if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
+        } else if (a.is_ordered()) {
+          break; // early stop
+        }
       }
     }
   }
-
-  if (count == 0) {
-    keys.resize(0);
-    if (theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
-  } else if (count < keys_size) {
-    keys.resize(count);
-    if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
-  }
-
+  if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
+  if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
   return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
 }
 
diff --git a/theta/test/theta_a_not_b_test.cpp b/theta/test/theta_a_not_b_test.cpp
index 54bd3fa..1ef5255 100644
--- a/theta/test/theta_a_not_b_test.cpp
+++ b/theta/test/theta_a_not_b_test.cpp
@@ -217,4 +217,28 @@ TEST_CASE("theta a-not-b: seed mismatch", "[theta_a_not_b]") {
   REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
 }
 
+TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") {
+  update_theta_sketch a = update_theta_sketch::builder().build();
+  int value = 0;
+  for (int i = 0; i < 10000; i++) a.update(value++);
+
+  update_theta_sketch b = update_theta_sketch::builder().build();
+  value = 5000;
+  for (int i = 0; i < 25000; i++) b.update(value++);
+
+  theta_a_not_b a_not_b;
+
+  // unordered inputs
+  compact_theta_sketch result = a_not_b.compute(a, b);
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE(result.is_estimation_mode());
+  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+
+  // ordered inputs
+  result = a_not_b.compute(a.compact(), b.compact());
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE(result.is_estimation_mode());
+  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+}
+
 } /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org