You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/06/02 21:51:01 UTC

[incubator-datasketches-cpp] branch patch_for_rc4 created (now 6f04d3f)

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a change to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git.


      at 6f04d3f  added new header

This branch includes the following new commits:

     new c60448e  cherry-pick #151: no checking of seed hash for empty compact sketches
     new baef5b8  fixed ordered a-not-b
     new ef43c5b  conditional back inserter
     new 9b1b34b  reference member is problematic
     new 97dcd5b  lambda as rvalue
     new 4baf1db  MSVC compatibility
     new 98497af  one more attempt at MSVC compatibility
     new 680ddc1  another attempt at MSVC compatibility
     new 6f04d3f  added new header

The 9 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 08/09: another attempt at MSVC compatibility

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 680ddc103443a0c5ada12e1ae9fa68496a3a34a3
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Mon Jun 1 10:29:31 2020 -0700

    another attempt at MSVC compatibility
---
 theta/include/conditional_back_inserter.hpp | 9 +++++++++
 theta/include/theta_a_not_b.hpp             | 7 +++++++
 theta/include/theta_a_not_b_impl.hpp        | 4 ++--
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/theta/include/conditional_back_inserter.hpp b/theta/include/conditional_back_inserter.hpp
index e9b0590..9b33833 100644
--- a/theta/include/conditional_back_inserter.hpp
+++ b/theta/include/conditional_back_inserter.hpp
@@ -31,6 +31,15 @@ public:
   template<typename P>
   conditional_back_insert_iterator(Container& c, P&& p): std::back_insert_iterator<Container>(c), p(std::forward<P>(p)) {}
 
+  // MSVC seems to insist on having copy constructor and assignment
+  conditional_back_insert_iterator(const conditional_back_insert_iterator& other):
+    std::back_insert_iterator<Container>(other), p(other.p) {}
+  conditional_back_insert_iterator& operator=(const conditional_back_insert_iterator& other) {
+    std::back_insert_iterator<Container>::operator=(other);
+    p = other.p;
+    return *this;
+  }
+
   conditional_back_insert_iterator& operator=(typename Container::const_reference value) {
     if (p(value)) std::back_insert_iterator<Container>::operator=(value);
     return *this;
diff --git a/theta/include/theta_a_not_b.hpp b/theta/include/theta_a_not_b.hpp
index 5287887..db66ac7 100644
--- a/theta/include/theta_a_not_b.hpp
+++ b/theta/include/theta_a_not_b.hpp
@@ -54,6 +54,13 @@ private:
   typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
   uint16_t seed_hash_;
 
+  class less_than {
+  public:
+    explicit less_than(uint64_t value): value(value) {}
+    bool operator()(uint64_t value) const { return value < this->value; }
+  private:
+    uint64_t value;
+  };
 };
 
 // alias with default allocator for convenience
diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index 2ffca28..4343ee3 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -48,10 +48,10 @@ compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch
   bool is_empty = a.is_empty();
 
   if (b.get_num_retained() == 0) {
-    std::copy_if(a.begin(), a.end(), std::back_inserter(keys), [theta](uint64_t key) { return key < theta; });
+    std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
   } else {
     if (a.is_ordered() && b.is_ordered()) { // sort-based
-      std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, [theta](uint64_t key) { return key < theta; }));
+      std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
     } else { // hash-based
       const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
       vector_u64<A> b_hash_table(1 << lg_size, 0);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 04/09: reference member is problematic

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 9b1b34b7ba7f833d6e2278321535ea860383586e
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri May 29 16:59:36 2020 -0700

    reference member is problematic
---
 theta/include/conditional_back_inserter.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/theta/include/conditional_back_inserter.hpp b/theta/include/conditional_back_inserter.hpp
index f128a3b..2e40e9f 100644
--- a/theta/include/conditional_back_inserter.hpp
+++ b/theta/include/conditional_back_inserter.hpp
@@ -28,7 +28,7 @@ namespace datasketches {
 template <class Container, class Predicate>
 class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
 public:
-  conditional_back_insert_iterator(Container& c, Predicate& p): std::back_insert_iterator<Container>(c), p(p) {}
+  conditional_back_insert_iterator(Container& c, Predicate&& p): std::back_insert_iterator<Container>(c), p(std::forward<Predicate>(p)) {}
 
   conditional_back_insert_iterator& operator=(typename Container::const_reference value) {
     if (p(value)) std::back_insert_iterator<Container>::operator=(value);
@@ -40,12 +40,12 @@ public:
   conditional_back_insert_iterator& operator++(int) { return *this; }
 
 private:
-  Predicate& p;
+  Predicate p;
 };
 
 template< class Container, class Predicate>
-conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate& p) {
-  return conditional_back_insert_iterator<Container, Predicate>(c, p);
+conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate&& p) {
+  return conditional_back_insert_iterator<Container, Predicate>(c, std::forward<Predicate>(p));
 }
 
 } /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 09/09: added new header

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 6f04d3f16938a3a7dc62a0c3c912b92bcb5d289f
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Mon Jun 1 14:28:43 2020 -0700

    added new header
---
 theta/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/theta/CMakeLists.txt b/theta/CMakeLists.txt
index ae02e79..69af9db 100644
--- a/theta/CMakeLists.txt
+++ b/theta/CMakeLists.txt
@@ -36,6 +36,7 @@ set(theta_HEADERS "")
 list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
 list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/binomial_bounds.hpp;include/theta_sketch_impl.hpp")
 list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
+list(APPEND theta_HEADERS "include/conditional_back_inserter.hpp")
 
 install(TARGETS theta
   EXPORT ${PROJECT_NAME}
@@ -55,4 +56,5 @@ target_sources(theta
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
 )


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 02/09: fixed ordered a-not-b

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit baef5b8947489d6f02d4499a1c8a617f2d69e16b
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri May 29 16:26:06 2020 -0700

    fixed ordered a-not-b
---
 theta/include/theta_a_not_b_impl.hpp | 65 ++++++++++++++----------------------
 theta/test/theta_a_not_b_test.cpp    | 24 +++++++++++++
 2 files changed, 49 insertions(+), 40 deletions(-)

diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index f080903..8951a8f 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -22,6 +22,8 @@
 
 #include <algorithm>
 
+#include "conditional_back_inserter.hpp"
+
 namespace datasketches {
 
 /*
@@ -43,54 +45,37 @@ compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch
 
   const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
   vector_u64<A> keys;
-  uint32_t keys_size = 0;
-  uint32_t count = 0;
   bool is_empty = a.is_empty();
+  auto less_than_theta = [theta](uint64_t key) { return key < theta; };
 
   if (b.get_num_retained() == 0) {
-    for (auto key: a) if (key < theta) ++count;
-    keys.resize(count);
-    std::copy_if(a.begin(), a.end(), &keys[0], [theta](uint64_t key) { return key < theta; });
-    if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
-    if (count == 0 && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
-    return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
-  }
-
-  keys_size = a.get_num_retained();
-  keys.resize(keys_size);
-
-  if (a.is_ordered() && b.is_ordered()) { // sort-based
-    const auto end = std::set_difference(a.begin(), a.end(), b.begin(), b.end(), keys.begin());
-    count = end - keys.begin();
-  } else { // hash-based
-    const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
-    vector_u64<A> b_hash_table(1 << lg_size, 0);
-    for (auto key: b) {
-      if (key < theta) {
-        update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
-      } else if (b.is_ordered()) {
-        break; // early stop
+    std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than_theta);
+  } else {
+    if (a.is_ordered() && b.is_ordered()) { // sort-based
+      std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than_theta));
+    } else { // hash-based
+      const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
+      vector_u64<A> b_hash_table(1 << lg_size, 0);
+      for (auto key: b) {
+        if (key < theta) {
+          update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
+        } else if (b.is_ordered()) {
+          break; // early stop
+        }
       }
-    }
 
-    // scan A lookup B
-    for (auto key: a) {
-      if (key < theta) {
-        if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys[count++] = key;
-      } else if (a.is_ordered()) {
-        break; // early stop
+      // scan A lookup B
+      for (auto key: a) {
+        if (key < theta) {
+          if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
+        } else if (a.is_ordered()) {
+          break; // early stop
+        }
       }
     }
   }
-
-  if (count == 0) {
-    keys.resize(0);
-    if (theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
-  } else if (count < keys_size) {
-    keys.resize(count);
-    if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
-  }
-
+  if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
+  if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
   return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
 }
 
diff --git a/theta/test/theta_a_not_b_test.cpp b/theta/test/theta_a_not_b_test.cpp
index 54bd3fa..1ef5255 100644
--- a/theta/test/theta_a_not_b_test.cpp
+++ b/theta/test/theta_a_not_b_test.cpp
@@ -217,4 +217,28 @@ TEST_CASE("theta a-not-b: seed mismatch", "[theta_a_not_b]") {
   REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
 }
 
+TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") {
+  update_theta_sketch a = update_theta_sketch::builder().build();
+  int value = 0;
+  for (int i = 0; i < 10000; i++) a.update(value++);
+
+  update_theta_sketch b = update_theta_sketch::builder().build();
+  value = 5000;
+  for (int i = 0; i < 25000; i++) b.update(value++);
+
+  theta_a_not_b a_not_b;
+
+  // unordered inputs
+  compact_theta_sketch result = a_not_b.compute(a, b);
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE(result.is_estimation_mode());
+  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+
+  // ordered inputs
+  result = a_not_b.compute(a.compact(), b.compact());
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE(result.is_estimation_mode());
+  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+}
+
 } /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 06/09: MSVC compatibility

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 4baf1db31b3f4bdba1c43830935a476a7ef7c361
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri May 29 21:42:38 2020 -0700

    MSVC compatibility
---
 theta/include/conditional_back_inserter.hpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/theta/include/conditional_back_inserter.hpp b/theta/include/conditional_back_inserter.hpp
index 2e40e9f..5e7f2ef 100644
--- a/theta/include/conditional_back_inserter.hpp
+++ b/theta/include/conditional_back_inserter.hpp
@@ -25,9 +25,10 @@
 
 namespace datasketches {
 
-template <class Container, class Predicate>
+template <typename Container, typename Predicate>
 class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
 public:
+  conditional_back_insert_iterator(Container& c, Predicate& p): std::back_insert_iterator<Container>(c), p(p) {}
   conditional_back_insert_iterator(Container& c, Predicate&& p): std::back_insert_iterator<Container>(c), p(std::forward<Predicate>(p)) {}
 
   conditional_back_insert_iterator& operator=(typename Container::const_reference value) {
@@ -43,7 +44,12 @@ private:
   Predicate p;
 };
 
-template< class Container, class Predicate>
+template<typename Container, typename Predicate>
+conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate& p) {
+  return conditional_back_insert_iterator<Container, Predicate>(c, p);
+}
+
+template<typename Container, typename Predicate>
 conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate&& p) {
   return conditional_back_insert_iterator<Container, Predicate>(c, std::forward<Predicate>(p));
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 03/09: conditional back inserter

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit ef43c5ba50b9a4e901f06159946bb55535834e8e
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri May 29 16:28:09 2020 -0700

    conditional back inserter
---
 theta/include/conditional_back_inserter.hpp | 53 +++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/theta/include/conditional_back_inserter.hpp b/theta/include/conditional_back_inserter.hpp
new file mode 100644
index 0000000..f128a3b
--- /dev/null
+++ b/theta/include/conditional_back_inserter.hpp
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef CONDITIONAL_BACK_INSERTER_HPP_
+#define CONDITIONAL_BACK_INSERTER_HPP_
+
+#include <iterator>
+#include <functional>
+
+namespace datasketches {
+
+template <class Container, class Predicate>
+class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
+public:
+  conditional_back_insert_iterator(Container& c, Predicate& p): std::back_insert_iterator<Container>(c), p(p) {}
+
+  conditional_back_insert_iterator& operator=(typename Container::const_reference value) {
+    if (p(value)) std::back_insert_iterator<Container>::operator=(value);
+    return *this;
+  }
+
+  conditional_back_insert_iterator& operator*() { return *this; }
+  conditional_back_insert_iterator& operator++() { return *this; }
+  conditional_back_insert_iterator& operator++(int) { return *this; }
+
+private:
+  Predicate& p;
+};
+
+template< class Container, class Predicate>
+conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate& p) {
+  return conditional_back_insert_iterator<Container, Predicate>(c, p);
+}
+
+} /* namespace datasketches */
+
+#endif


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 05/09: lambda as rvalue

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 97dcd5b28e55f630d087c80e46a8c7a097f87e4b
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri May 29 17:57:25 2020 -0700

    lambda as rvalue
---
 theta/include/theta_a_not_b_impl.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index 8951a8f..2ffca28 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -46,13 +46,12 @@ compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch
   const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
   vector_u64<A> keys;
   bool is_empty = a.is_empty();
-  auto less_than_theta = [theta](uint64_t key) { return key < theta; };
 
   if (b.get_num_retained() == 0) {
-    std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than_theta);
+    std::copy_if(a.begin(), a.end(), std::back_inserter(keys), [theta](uint64_t key) { return key < theta; });
   } else {
     if (a.is_ordered() && b.is_ordered()) { // sort-based
-      std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than_theta));
+      std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, [theta](uint64_t key) { return key < theta; }));
     } else { // hash-based
       const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
       vector_u64<A> b_hash_table(1 << lg_size, 0);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 01/09: cherry-pick #151: no checking of seed hash for empty compact sketches

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit c60448e0443f1bb40f84f378750f17ca65bc5541
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Tue Jun 2 13:54:50 2020 -0700

    cherry-pick #151: no checking of seed hash for empty compact sketches
---
 theta/include/theta_a_not_b_impl.hpp        |   3 +--
 theta/include/theta_intersection_impl.hpp   |   2 +-
 theta/include/theta_sketch_impl.hpp         |  14 ++++++++++----
 theta/test/theta_compact_empty_from_java.sk | Bin 8 -> 8 bytes
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index cc171ce..f080903 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -37,10 +37,9 @@ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
 
 template<typename A>
 compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
-  if (a.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
+  if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
   if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
   if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
-  if (a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
 
   const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
   vector_u64<A> keys;
diff --git a/theta/include/theta_intersection_impl.hpp b/theta/include/theta_intersection_impl.hpp
index 79fea4e..6be6757 100644
--- a/theta/include/theta_intersection_impl.hpp
+++ b/theta/include/theta_intersection_impl.hpp
@@ -44,7 +44,7 @@ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
 template<typename A>
 void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
   if (is_empty_) return;
-  if (sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
+  if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
   is_empty_ |= sketch.is_empty();
   theta_ = std::min(theta_, sketch.get_theta64());
   if (is_valid_ && num_keys_ == 0) return;
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 417dfa1..0514884 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -101,9 +101,9 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(st
   is.read((char*)&seed_hash, sizeof(seed_hash));
 
   check_serial_version(serial_version, SERIAL_VERSION);
-  check_seed_hash(seed_hash, get_seed_hash(seed));
 
   if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    check_seed_hash(seed_hash, get_seed_hash(seed));
     typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
     typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
     return unique_ptr(
@@ -114,6 +114,8 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(st
       }
     );
   } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+    if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
     typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
     return unique_ptr(
       static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
@@ -146,9 +148,9 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(co
   ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
 
   check_serial_version(serial_version, SERIAL_VERSION);
-  check_seed_hash(seed_hash, get_seed_hash(seed));
 
   if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    check_seed_hash(seed_hash, get_seed_hash(seed));
     typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
     typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
     return unique_ptr(
@@ -161,6 +163,8 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(co
       }
     );
   } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+    if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
     typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
     return unique_ptr(
       static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
@@ -753,7 +757,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
   is.read((char*)&seed_hash, sizeof(seed_hash));
   theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
   theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
-  theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+  const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+  if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
   return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
 }
 
@@ -801,7 +806,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
   ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
   theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
   theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
-  theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+  const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+  if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
   return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
 }
 
diff --git a/theta/test/theta_compact_empty_from_java.sk b/theta/test/theta_compact_empty_from_java.sk
index 44730d3..f6c647f 100644
Binary files a/theta/test/theta_compact_empty_from_java.sk and b/theta/test/theta_compact_empty_from_java.sk differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 07/09: one more attempt at MSVC compatibility

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 98497af00743aaa183c4df7f9e5381ca47780232
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri May 29 22:02:22 2020 -0700

    one more attempt at MSVC compatibility
---
 theta/include/conditional_back_inserter.hpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/theta/include/conditional_back_inserter.hpp b/theta/include/conditional_back_inserter.hpp
index 5e7f2ef..e9b0590 100644
--- a/theta/include/conditional_back_inserter.hpp
+++ b/theta/include/conditional_back_inserter.hpp
@@ -28,8 +28,8 @@ namespace datasketches {
 template <typename Container, typename Predicate>
 class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
 public:
-  conditional_back_insert_iterator(Container& c, Predicate& p): std::back_insert_iterator<Container>(c), p(p) {}
-  conditional_back_insert_iterator(Container& c, Predicate&& p): std::back_insert_iterator<Container>(c), p(std::forward<Predicate>(p)) {}
+  template<typename P>
+  conditional_back_insert_iterator(Container& c, P&& p): std::back_insert_iterator<Container>(c), p(std::forward<P>(p)) {}
 
   conditional_back_insert_iterator& operator=(typename Container::const_reference value) {
     if (p(value)) std::back_insert_iterator<Container>::operator=(value);
@@ -45,11 +45,6 @@ private:
 };
 
 template<typename Container, typename Predicate>
-conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate& p) {
-  return conditional_back_insert_iterator<Container, Predicate>(c, p);
-}
-
-template<typename Container, typename Predicate>
 conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate&& p) {
   return conditional_back_insert_iterator<Container, Predicate>(c, std::forward<Predicate>(p));
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org