You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/06/02 21:51:02 UTC
[incubator-datasketches-cpp] 01/09: cherry-pick #151: no checking of seed hash for empty compact sketches
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch patch_for_rc4
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit c60448e0443f1bb40f84f378750f17ca65bc5541
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Tue Jun 2 13:54:50 2020 -0700
cherry-pick #151: no checking of seed hash for empty compact sketches
---
theta/include/theta_a_not_b_impl.hpp | 3 +--
theta/include/theta_intersection_impl.hpp | 2 +-
theta/include/theta_sketch_impl.hpp | 14 ++++++++++----
theta/test/theta_compact_empty_from_java.sk | Bin 8 -> 8 bytes
4 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index cc171ce..f080903 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -37,10 +37,9 @@ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
template<typename A>
compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
- if (a.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
+ if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
- if (a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
vector_u64<A> keys;
diff --git a/theta/include/theta_intersection_impl.hpp b/theta/include/theta_intersection_impl.hpp
index 79fea4e..6be6757 100644
--- a/theta/include/theta_intersection_impl.hpp
+++ b/theta/include/theta_intersection_impl.hpp
@@ -44,7 +44,7 @@ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
template<typename A>
void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
if (is_empty_) return;
- if (sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
+ if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
is_empty_ |= sketch.is_empty();
theta_ = std::min(theta_, sketch.get_theta64());
if (is_valid_ && num_keys_ == 0) return;
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 417dfa1..0514884 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -101,9 +101,9 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(st
is.read((char*)&seed_hash, sizeof(seed_hash));
check_serial_version(serial_version, SERIAL_VERSION);
- check_seed_hash(seed_hash, get_seed_hash(seed));
if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+ check_seed_hash(seed_hash, get_seed_hash(seed));
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
return unique_ptr(
@@ -114,6 +114,8 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(st
}
);
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+ const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
return unique_ptr(
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
@@ -146,9 +148,9 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(co
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
check_serial_version(serial_version, SERIAL_VERSION);
- check_seed_hash(seed_hash, get_seed_hash(seed));
if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+ check_seed_hash(seed_hash, get_seed_hash(seed));
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
return unique_ptr(
@@ -161,6 +163,8 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(co
}
);
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+ const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
return unique_ptr(
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
@@ -753,7 +757,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
is.read((char*)&seed_hash, sizeof(seed_hash));
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+ const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
}
@@ -801,7 +806,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+ const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
}
diff --git a/theta/test/theta_compact_empty_from_java.sk b/theta/test/theta_compact_empty_from_java.sk
index 44730d3..f6c647f 100644
Binary files a/theta/test/theta_compact_empty_from_java.sk and b/theta/test/theta_compact_empty_from_java.sk differ
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org