You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/06/02 21:51:02 UTC

[incubator-datasketches-cpp] 01/09: cherry-pick #151: no checking of seed hash for empty compact sketches

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit c60448e0443f1bb40f84f378750f17ca65bc5541
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Tue Jun 2 13:54:50 2020 -0700

    cherry-pick #151: no checking of seed hash for empty compact sketches
---
 theta/include/theta_a_not_b_impl.hpp        |   3 +--
 theta/include/theta_intersection_impl.hpp   |   2 +-
 theta/include/theta_sketch_impl.hpp         |  14 ++++++++++----
 theta/test/theta_compact_empty_from_java.sk | Bin 8 -> 8 bytes
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index cc171ce..f080903 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -37,10 +37,9 @@ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
 
 template<typename A>
 compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
-  if (a.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
+  if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
   if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
   if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
-  if (a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
 
   const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
   vector_u64<A> keys;
diff --git a/theta/include/theta_intersection_impl.hpp b/theta/include/theta_intersection_impl.hpp
index 79fea4e..6be6757 100644
--- a/theta/include/theta_intersection_impl.hpp
+++ b/theta/include/theta_intersection_impl.hpp
@@ -44,7 +44,7 @@ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
 template<typename A>
 void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
   if (is_empty_) return;
-  if (sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
+  if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
   is_empty_ |= sketch.is_empty();
   theta_ = std::min(theta_, sketch.get_theta64());
   if (is_valid_ && num_keys_ == 0) return;
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 417dfa1..0514884 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -101,9 +101,9 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(st
   is.read((char*)&seed_hash, sizeof(seed_hash));
 
   check_serial_version(serial_version, SERIAL_VERSION);
-  check_seed_hash(seed_hash, get_seed_hash(seed));
 
   if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    check_seed_hash(seed_hash, get_seed_hash(seed));
     typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
     typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
     return unique_ptr(
@@ -114,6 +114,8 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(st
       }
     );
   } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+    if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
     typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
     return unique_ptr(
       static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
@@ -146,9 +148,9 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(co
   ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
 
   check_serial_version(serial_version, SERIAL_VERSION);
-  check_seed_hash(seed_hash, get_seed_hash(seed));
 
   if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    check_seed_hash(seed_hash, get_seed_hash(seed));
     typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
     typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
     return unique_ptr(
@@ -161,6 +163,8 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(co
       }
     );
   } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+    if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
     typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
     return unique_ptr(
       static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
@@ -753,7 +757,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
   is.read((char*)&seed_hash, sizeof(seed_hash));
   theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
   theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
-  theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+  const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+  if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
   return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
 }
 
@@ -801,7 +806,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
   ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
   theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
   theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
-  theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+  const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+  if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
   return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
 }
 
diff --git a/theta/test/theta_compact_empty_from_java.sk b/theta/test/theta_compact_empty_from_java.sk
index 44730d3..f6c647f 100644
Binary files a/theta/test/theta_compact_empty_from_java.sk and b/theta/test/theta_compact_empty_from_java.sk differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org