You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/06/23 23:16:09 UTC
[incubator-datasketches-cpp] branch tuple_sketch updated: more
methods and tests
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
The following commit(s) were added to refs/heads/tuple_sketch by this push:
new 4424074 more methods and tests
4424074 is described below
commit 4424074c4877eab20eceb767de1cac58b6e03f04
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Jun 23 16:15:59 2020 -0700
more methods and tests
---
tuple/include/theta_sketch_experimental.hpp | 4 +-
tuple/include/theta_sketch_experimental_impl.hpp | 5 +
tuple/include/theta_union_experimental.hpp | 17 ++-
tuple/include/theta_update_sketch_base.hpp | 20 +++-
tuple/include/theta_update_sketch_base_impl.hpp | 7 +-
tuple/include/tuple_sketch.hpp | 136 +++++++++++------------
tuple/include/tuple_sketch_impl.hpp | 66 +++++++++++
tuple/test/tuple_sketch_allocation_test.cpp | 3 +
tuple/test/tuple_sketch_test.cpp | 39 +++++++
9 files changed, 219 insertions(+), 78 deletions(-)
diff --git a/tuple/include/theta_sketch_experimental.hpp b/tuple/include/theta_sketch_experimental.hpp
index 9c8bd3d..a920929 100644
--- a/tuple/include/theta_sketch_experimental.hpp
+++ b/tuple/include/theta_sketch_experimental.hpp
@@ -50,6 +50,8 @@ public:
void update(uint64_t key);
void update(const void* key, size_t length);
+ void trim();
+
string<A> to_string(bool detail = false) const;
vector_bytes serialize(unsigned header_size_bytes = 0) const;
@@ -62,7 +64,7 @@ public:
private:
enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
- typedef theta_update_sketch_base<uint64_t, trivial_extract_key<uint64_t>, A> theta_table;
+ using theta_table = theta_update_sketch_base<uint64_t, trivial_extract_key<uint64_t>, A>;
theta_table table_;
theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
diff --git a/tuple/include/theta_sketch_experimental_impl.hpp b/tuple/include/theta_sketch_experimental_impl.hpp
index 347022f..81db200 100644
--- a/tuple/include/theta_sketch_experimental_impl.hpp
+++ b/tuple/include/theta_sketch_experimental_impl.hpp
@@ -44,6 +44,11 @@ void theta_sketch_experimental<A>::update(const void* key, size_t length) {
}
template<typename A>
+void theta_sketch_experimental<A>::trim() {
+ table_.trim();
+}
+
+template<typename A>
string<A> theta_sketch_experimental<A>::to_string(bool detail) const {
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
auto type = typeid(*this).name();
diff --git a/tuple/include/theta_union_experimental.hpp b/tuple/include/theta_union_experimental.hpp
index 585c5fe..87b6a64 100644
--- a/tuple/include/theta_union_experimental.hpp
+++ b/tuple/include/theta_union_experimental.hpp
@@ -29,6 +29,17 @@ namespace datasketches {
// experimental theta union derived from the same base as tuple union
+// utility function to hide unused compiler warning
+// usually has no additional cost
+template<typename T> void unused(T&&...) {}
+
+struct pass_through_policy {
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
+ unused(incoming_entry);
+ return internal_entry;
+ }
+};
+
template<typename Allocator = std::allocator<uint64_t>>
class theta_union_experimental {
public:
@@ -38,12 +49,6 @@ public:
using CompactSketch = compact_theta_sketch_experimental<Allocator>;
using resize_factor = theta_constants::resize_factor;
- struct pass_through_policy {
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
- return internal_entry;
- }
- };
-
using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
// No constructor here. Use builder instead.
diff --git a/tuple/include/theta_update_sketch_base.hpp b/tuple/include/theta_update_sketch_base.hpp
index e90091f..0046adc 100644
--- a/tuple/include/theta_update_sketch_base.hpp
+++ b/tuple/include/theta_update_sketch_base.hpp
@@ -45,7 +45,7 @@ struct theta_update_sketch_base {
// TODO: copy and move
~theta_update_sketch_base();
- typedef Entry* iterator;
+ using iterator = Entry*;
std::pair<iterator, bool> find(uint64_t key) const;
@@ -77,6 +77,7 @@ struct theta_update_sketch_base {
void resize();
void rebuild();
+ void trim();
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
@@ -220,6 +221,23 @@ private:
uint32_t index_;
};
+// double value canonicalization for compatibility with Java
+static inline int64_t canonical_double(double value) {
+ union {
+ int64_t long_value;
+ double double_value;
+ } long_double_union;
+
+ if (value == 0.0) {
+ long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
+ } else if (std::isnan(value)) {
+ long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
+ } else {
+ long_double_union.double_value = value;
+ }
+ return long_double_union.long_value;
+}
+
} /* namespace datasketches */
#include "theta_update_sketch_base_impl.hpp"
diff --git a/tuple/include/theta_update_sketch_base_impl.hpp b/tuple/include/theta_update_sketch_base_impl.hpp
index ff4492d..3ba239c 100644
--- a/tuple/include/theta_update_sketch_base_impl.hpp
+++ b/tuple/include/theta_update_sketch_base_impl.hpp
@@ -140,7 +140,6 @@ void theta_update_sketch_base<EN, EK, A>::resize() {
const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
lg_cur_size_ += factor;
const size_t new_size = 1 << lg_cur_size_;
-// std::cout << "resizing from " << old_size << " to " << new_size << std::endl;
EN* old_entries = entries_;
entries_ = A().allocate(new_size);
for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
@@ -157,7 +156,6 @@ void theta_update_sketch_base<EN, EK, A>::resize() {
template<typename EN, typename EK, typename A>
void theta_update_sketch_base<EN, EK, A>::rebuild() {
-// std::cout << "rebuilding" << std::endl;
const size_t size = 1 << lg_cur_size_;
const uint32_t pivot = (1 << lg_nom_size_) + size - num_entries_;
std::nth_element(&entries_[0], &entries_[pivot], &entries_[size], comparator());
@@ -176,6 +174,11 @@ void theta_update_sketch_base<EN, EK, A>::rebuild() {
A().deallocate(old_entries, size);
}
+template<typename EN, typename EK, typename A>
+void theta_update_sketch_base<EN, EK, A>::trim() {
+ if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
+}
+
// builder
template<bool dummy>
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 55dda48..b520e22 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -232,8 +232,8 @@ public:
* Update this sketch with a given string.
* @param value string to update the sketch with
*/
-// template<typename FwdUpdate>
-// void update(const std::string& key, FwdUpdate&& value);
+ template<typename FwdUpdate>
+ void update(const std::string& key, FwdUpdate&& value);
/**
* Update this sketch with a given unsigned 64-bit integer.
@@ -246,72 +246,72 @@ public:
* Update this sketch with a given signed 64-bit integer.
* @param value int64_t to update the sketch with
*/
-// template<typename FwdUpdate>
-// void update(int64_t key, FwdUpdate&& value);
-//
-// /**
-// * Update this sketch with a given unsigned 32-bit integer.
-// * For compatibility with Java implementation.
-// * @param value uint32_t to update the sketch with
-// */
-// template<typename FwdUpdate>
-// void update(uint32_t key, FwdUpdate&& value);
-//
-// /**
-// * Update this sketch with a given signed 32-bit integer.
-// * For compatibility with Java implementation.
-// * @param value int32_t to update the sketch with
-// */
-// template<typename FwdUpdate>
-// void update(int32_t key, FwdUpdate&& value);
-//
-// /**
-// * Update this sketch with a given unsigned 16-bit integer.
-// * For compatibility with Java implementation.
-// * @param value uint16_t to update the sketch with
-// */
-// template<typename FwdUpdate>
-// void update(uint16_t key, FwdUpdate&& value);
-//
-// /**
-// * Update this sketch with a given signed 16-bit integer.
-// * For compatibility with Java implementation.
-// * @param value int16_t to update the sketch with
-// */
-// template<typename FwdUpdate>
-// void update(int16_t key, FwdUpdate&& value);
-//
-// /**
-// * Update this sketch with a given unsigned 8-bit integer.
-// * For compatibility with Java implementation.
-// * @param value uint8_t to update the sketch with
-// */
-// template<typename FwdUpdate>
-// void update(uint8_t key, FwdUpdate&& value);
-//
-// /**
-// * Update this sketch with a given signed 8-bit integer.
-// * For compatibility with Java implementation.
-// * @param value int8_t to update the sketch with
-// */
-// template<typename FwdUpdate>
-// void update(int8_t key, FwdUpdate&& value);
-//
-// /**
-// * Update this sketch with a given double-precision floating point value.
-// * For compatibility with Java implementation.
-// * @param value double to update the sketch with
-// */
-// template<typename FwdUpdate>
-// void update(double key, FwdUpdate&& value);
-//
-// /**
-// * Update this sketch with a given floating point value.
-// * For compatibility with Java implementation.
-// * @param value float to update the sketch with
-// */
-// template<typename FwdUpdate>
-// void update(float key, FwdUpdate&& value);
+ template<typename FwdUpdate>
+ void update(int64_t key, FwdUpdate&& value);
+
+ /**
+ * Update this sketch with a given unsigned 32-bit integer.
+ * For compatibility with Java implementation.
+ * @param value uint32_t to update the sketch with
+ */
+ template<typename FwdUpdate>
+ void update(uint32_t key, FwdUpdate&& value);
+
+ /**
+ * Update this sketch with a given signed 32-bit integer.
+ * For compatibility with Java implementation.
+ * @param value int32_t to update the sketch with
+ */
+ template<typename FwdUpdate>
+ void update(int32_t key, FwdUpdate&& value);
+
+ /**
+ * Update this sketch with a given unsigned 16-bit integer.
+ * For compatibility with Java implementation.
+ * @param value uint16_t to update the sketch with
+ */
+ template<typename FwdUpdate>
+ void update(uint16_t key, FwdUpdate&& value);
+
+ /**
+ * Update this sketch with a given signed 16-bit integer.
+ * For compatibility with Java implementation.
+ * @param value int16_t to update the sketch with
+ */
+ template<typename FwdUpdate>
+ void update(int16_t key, FwdUpdate&& value);
+
+ /**
+ * Update this sketch with a given unsigned 8-bit integer.
+ * For compatibility with Java implementation.
+ * @param value uint8_t to update the sketch with
+ */
+ template<typename FwdUpdate>
+ void update(uint8_t key, FwdUpdate&& value);
+
+ /**
+ * Update this sketch with a given signed 8-bit integer.
+ * For compatibility with Java implementation.
+ * @param value int8_t to update the sketch with
+ */
+ template<typename FwdUpdate>
+ void update(int8_t key, FwdUpdate&& value);
+
+ /**
+ * Update this sketch with a given double-precision floating point value.
+ * For compatibility with Java implementation.
+ * @param value double to update the sketch with
+ */
+ template<typename FwdUpdate>
+ void update(double key, FwdUpdate&& value);
+
+ /**
+ * Update this sketch with a given floating point value.
+ * For compatibility with Java implementation.
+ * @param value float to update the sketch with
+ */
+ template<typename FwdUpdate>
+ void update(float key, FwdUpdate&& value);
/**
* Update this sketch with given data of any type.
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index 14dfa5c..e2e2e41 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -95,12 +95,73 @@ auto update_tuple_sketch<S, U, P, SD, A>::get_rf() const -> resize_factor {
template<typename S, typename U, typename P, typename SD, typename A>
template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(const std::string& key, UU&& value) {
+ if (key.empty()) return;
+ update(key.c_str(), key.length(), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
void update_tuple_sketch<S, U, P, SD, A>::update(uint64_t key, UU&& value) {
update(&key, sizeof(key), std::forward<UU>(value));
}
template<typename S, typename U, typename P, typename SD, typename A>
template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(int64_t key, UU&& value) {
+ update(&key, sizeof(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(uint32_t key, UU&& value) {
+ update(static_cast<int32_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(int32_t key, UU&& value) {
+ update(static_cast<int64_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(uint16_t key, UU&& value) {
+ update(static_cast<int16_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(int16_t key, UU&& value) {
+ update(static_cast<int64_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(uint8_t key, UU&& value) {
+ update(static_cast<int8_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(double key, UU&& value) {
+ update(canonical_double(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(float key, UU&& value) {
+ update(static_cast<double>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
+void update_tuple_sketch<S, U, P, SD, A>::update(int8_t key, UU&& value) {
+ update(static_cast<int64_t>(key), std::forward<UU>(value));
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
+template<typename UU>
void update_tuple_sketch<S, U, P, SD, A>::update(const void* key, size_t length, UU&& value) {
const uint64_t hash = compute_hash(key, length, map_.seed_);
if (hash >= map_.theta_ || hash == 0) return; // hash == 0 is reserved to mark empty slots in the table
@@ -115,6 +176,11 @@ void update_tuple_sketch<S, U, P, SD, A>::update(const void* key, size_t length,
}
template<typename S, typename U, typename P, typename SD, typename A>
+void update_tuple_sketch<S, U, P, SD, A>::trim() {
+ map_.trim();
+}
+
+template<typename S, typename U, typename P, typename SD, typename A>
string<A> update_tuple_sketch<S, U, P, SD, A>::to_string(bool detail) const {
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
auto type = typeid(*this).name();
diff --git a/tuple/test/tuple_sketch_allocation_test.cpp b/tuple/test/tuple_sketch_allocation_test.cpp
index b9f969d..227c9bd 100644
--- a/tuple/test/tuple_sketch_allocation_test.cpp
+++ b/tuple/test/tuple_sketch_allocation_test.cpp
@@ -44,6 +44,9 @@ TEST_CASE("tuple sketch with test allocator: exact mode", "[tuple_sketch]") {
}
REQUIRE(count == update_sketch.get_num_retained());
+ update_sketch.trim();
+ REQUIRE(update_sketch.get_num_retained() == (1 << update_sketch.get_lg_k()));
+
auto compact_sketch = update_sketch.compact();
REQUIRE(!compact_sketch.is_empty());
REQUIRE(compact_sketch.is_estimation_mode());
diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp
index dedd016..2beeeaf 100644
--- a/tuple/test/tuple_sketch_test.cpp
+++ b/tuple/test/tuple_sketch_test.cpp
@@ -191,4 +191,43 @@ TEST_CASE("tuple sketch: array of doubles", "[tuple_sketch]") {
std::cout << compact_sketch.to_string(true);
}
+TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sketch]") {
+ std::cout << "update with different values begin" << std::endl;
+ auto sketch = update_tuple_sketch<float>::builder().build();
+ sketch.update(static_cast<uint64_t>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 1);
+
+ sketch.update(static_cast<int64_t>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 1);
+
+ sketch.update(static_cast<uint32_t>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 1);
+
+ sketch.update(static_cast<int32_t>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 1);
+
+ sketch.update(static_cast<uint16_t>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 1);
+
+ sketch.update(static_cast<int16_t>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 1);
+
+ sketch.update(static_cast<uint8_t>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 1);
+
+ sketch.update(static_cast<int8_t>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 1);
+
+ sketch.update(1.0, 1);
+ REQUIRE(sketch.get_num_retained() == 2);
+
+ sketch.update(static_cast<float>(1), 1);
+ REQUIRE(sketch.get_num_retained() == 2);
+
+ sketch.update("a", 1);
+ REQUIRE(sketch.get_num_retained() == 3);
+
+ std::cout << "update with different values end" << std::endl;
+}
+
} /* namespace datasketches */
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org