You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/08/04 23:42:16 UTC
[incubator-datasketches-cpp] branch tuple_sketch updated: stream serialization and java compatibility
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
The following commit(s) were added to refs/heads/tuple_sketch by this push:
new 3d3fdbd stream serialization and java compatibility
3d3fdbd is described below
commit 3d3fdbd36694f3cffe3c0dc1e280339bdbbc5c9c
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Aug 4 16:42:03 2020 -0700
stream serialization and java compatibility
---
tuple/include/array_of_doubles_sketch.hpp | 4 +-
tuple/include/array_of_doubles_sketch_impl.hpp | 47 +++++++++++--
tuple/include/theta_helpers.hpp | 30 ++++-----
tuple/include/tuple_sketch.hpp | 2 +-
tuple/test/aod_1_compact_empty_from_java.sk | 1 +
...aod_1_compact_non_empty_no_entries_from_java.sk | Bin 0 -> 16 bytes
tuple/test/array_of_doubles_sketch_test.cpp | 73 +++++++++++++++++++--
7 files changed, 128 insertions(+), 29 deletions(-)
diff --git a/tuple/include/array_of_doubles_sketch.hpp b/tuple/include/array_of_doubles_sketch.hpp
index 1ca217d..b869470 100644
--- a/tuple/include/array_of_doubles_sketch.hpp
+++ b/tuple/include/array_of_doubles_sketch.hpp
@@ -51,14 +51,16 @@ public:
using Entry = typename Base::Entry;
using AllocEntry = typename Base::AllocEntry;
using AllocU64 = typename Base::AllocU64;
- using flags = typename Base::flags;
static const uint8_t SERIAL_VERSION = 1;
static const uint8_t SKETCH_FAMILY = 9;
static const uint8_t SKETCH_TYPE = 3;
+ enum flags { UNUSED1, UNUSED2, IS_EMPTY, HAS_ENTRIES, IS_ORDERED };
compact_array_of_doubles_sketch(const Base& other, bool ordered = true);
+ void serialize(std::ostream& os) const;
+
static compact_array_of_doubles_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
// for internal use
diff --git a/tuple/include/array_of_doubles_sketch_impl.hpp b/tuple/include/array_of_doubles_sketch_impl.hpp
index e208ed1..db0cac3 100644
--- a/tuple/include/array_of_doubles_sketch_impl.hpp
+++ b/tuple/include/array_of_doubles_sketch_impl.hpp
@@ -28,6 +28,41 @@ compact_array_of_doubles_sketch<num, A>::compact_array_of_doubles_sketch(bool is
Base(is_empty, is_ordered, seed_hash, theta, std::move(entries)) {}
template<int num, typename A>
+void compact_array_of_doubles_sketch<num, A>::serialize(std::ostream& os) const { // Writes this sketch to os as raw in-memory bytes: an 8-byte header, theta, then (only when entries are retained) a count, all keys and all summaries. The stream state is not checked here.
+ const uint8_t preamble_longs = 1; // always written as 1 (presumably the preamble size in 8-byte words; TODO confirm against the Java format)
+ os.write((char*)&preamble_longs, sizeof(preamble_longs));
+ const uint8_t serial_version = SERIAL_VERSION;
+ os.write((char*)&serial_version, sizeof(serial_version));
+ const uint8_t family = SKETCH_FAMILY;
+ os.write((char*)&family, sizeof(family));
+ const uint8_t type = SKETCH_TYPE;
+ os.write((char*)&type, sizeof(type));
+ const uint8_t flags_byte( // one bit per state; each bit position is the value of the matching flags enumerator
+ (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+ (this->get_num_retained() > 0 ? 1 << flags::HAS_ENTRIES : 0) | // set exactly when the entries section below is written
+ (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
+ );
+ os.write((char*)&flags_byte, sizeof(flags_byte));
+ const uint8_t num_values = num; // template parameter num stored in a single byte; deserialize() compares it with its own num
+ os.write(reinterpret_cast<const char*>(&num_values), sizeof(num_values));
+ const uint16_t seed_hash = this->get_seed_hash();
+ os.write((char*)&seed_hash, sizeof(seed_hash)); // last header field: 1+1+1+1+1+1+2 = 8 bytes so far
+ os.write((char*)&(this->theta_), sizeof(uint64_t)); // theta is always written, even when there are no entries
+ if (this->get_num_retained() > 0) {
+ const uint32_t num_entries = this->entries_.size(); // entry count narrowed to 32 bits
+ os.write((char*)&num_entries, sizeof(num_entries));
+ const uint32_t unused32 = 0; // four zero bytes; deserialize() reads them into a variable it never uses
+ os.write((char*)&unused32, sizeof(unused32));
+ for (const auto& it: this->entries_) { // first pass: every key, 8 bytes each, so the keys form one contiguous array
+ os.write((char*)&it.first, sizeof(uint64_t));
+ }
+ for (const auto& it: this->entries_) { // second pass: every summary, in the same entry order as the keys
+ os.write((char*)&it.second, sizeof(Summary)); // raw object bytes of the summary
+ }
+ }
+}
+
+template<int num, typename A>
compact_array_of_doubles_sketch<num, A> compact_array_of_doubles_sketch<num, A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
uint8_t preamble_longs;
is.read((char*)&preamble_longs, sizeof(preamble_longs));
@@ -46,19 +81,18 @@ compact_array_of_doubles_sketch<num, A> compact_array_of_doubles_sketch<num, A>:
checker<true>::check_sketch_family(family, SKETCH_FAMILY);
checker<true>::check_sketch_type(type, SKETCH_TYPE);
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+ check_value(num_values, static_cast<uint8_t>(num), "number of values");
+ const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
+ if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
uint64_t theta;
is.read((char*)&theta, sizeof(theta));
+ std::vector<Entry, AllocEntry> entries(allocator);
uint32_t num_entries = 0;
- if (!is_empty) {
+ if (has_entries) {
is.read((char*)&num_entries, sizeof(num_entries));
uint32_t unused32;
is.read((char*)&unused32, sizeof(unused32));
- }
- std::vector<Entry, AllocEntry> entries(allocator);
- if (!is_empty) {
entries.reserve(num_entries);
std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
is.read((char*)keys.data(), num_entries * sizeof(uint64_t));
@@ -69,6 +103,7 @@ compact_array_of_doubles_sketch<num, A> compact_array_of_doubles_sketch<num, A>:
}
}
if (!is.good()) throw std::runtime_error("error reading from std::istream");
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
return compact_array_of_doubles_sketch(is_empty, is_ordered, seed_hash, theta, std::move(entries));
}
diff --git a/tuple/include/theta_helpers.hpp b/tuple/include/theta_helpers.hpp
index ca3c655..6852590 100644
--- a/tuple/include/theta_helpers.hpp
+++ b/tuple/include/theta_helpers.hpp
@@ -25,31 +25,27 @@
namespace datasketches {
+template<typename T>
+static void check_value(T actual, T expected, const char* description) { // Throws std::invalid_argument when actual != expected; description names the checked field at the start of the message.
+ if (actual != expected) {
+ throw std::invalid_argument(std::string(description) + " mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual)); // T must be accepted by std::to_string; uint8_t arguments are promoted to int there, so they print as numbers
+ }
+}
+
template<bool dummy>
class checker {
public:
+ static void check_serial_version(uint8_t actual, uint8_t expected) {
+ check_value(actual, expected, "serial version");
+ }
static void check_sketch_family(uint8_t actual, uint8_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch family mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
- }
+ check_value(actual, expected, "sketch family");
}
-
static void check_sketch_type(uint8_t actual, uint8_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch type mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
- }
+ check_value(actual, expected, "sketch type");
}
-
- static void check_serial_version(uint8_t actual, uint8_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch serial version mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
- }
- }
-
static void check_seed_hash(uint16_t actual, uint16_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch seed hash mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual));
- }
+ check_value(actual, expected, "seed hash");
}
};
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 9b960d0..915f170 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -414,7 +414,7 @@ public:
// for internal use
compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
-private:
+protected:
bool is_empty_;
bool is_ordered_;
uint16_t seed_hash_;
diff --git a/tuple/test/aod_1_compact_empty_from_java.sk b/tuple/test/aod_1_compact_empty_from_java.sk
new file mode 100644
index 0000000..8d2583d
--- /dev/null
+++ b/tuple/test/aod_1_compact_empty_from_java.sk
@@ -0,0 +1 @@
+ ̓�������
\ No newline at end of file
diff --git a/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk b/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk
new file mode 100644
index 0000000..f67106d
Binary files /dev/null and b/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk differ
diff --git a/tuple/test/array_of_doubles_sketch_test.cpp b/tuple/test/array_of_doubles_sketch_test.cpp
index fa7bfe2..f72861b 100644
--- a/tuple/test/array_of_doubles_sketch_test.cpp
+++ b/tuple/test/array_of_doubles_sketch_test.cpp
@@ -19,6 +19,7 @@
#include <iostream>
#include <fstream>
+#include <sstream>
#include <array>
#include <catch.hpp>
@@ -32,7 +33,45 @@ const std::string inputPath = TEST_BINARY_INPUT_PATH;
const std::string inputPath = "test/";
#endif
-TEST_CASE("tuple sketch: array of doubles serialization compatibility with java", "[tuple_sketch]") {
+TEST_CASE("aod sketch: serialization compatibility with java - empty", "[tuple_sketch]") { // an empty sketch compacted here must agree with the empty sketch stored in the Java-produced file
+ auto update_sketch = update_array_of_doubles_sketch<1>::builder().build();
+ REQUIRE(update_sketch.is_empty());
+ REQUIRE(update_sketch.get_num_retained() == 0);
+ auto compact_sketch = update_sketch.compact(); // local reference sketch for the comparisons below
+
+ // read binary sketch from Java
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make a failed open or read throw instead of leaving a silently bad stream
+ is.open(inputPath + "aod_1_compact_empty_from_java.sk", std::ios::binary);
+ auto compact_sketch_from_java = compact_array_of_doubles_sketch<1>::deserialize(is);
+ REQUIRE(compact_sketch.get_num_retained() == compact_sketch_from_java.get_num_retained());
+ REQUIRE(compact_sketch.get_theta() == Approx(compact_sketch_from_java.get_theta()).margin(1e-10));
+ REQUIRE(compact_sketch.get_estimate() == Approx(compact_sketch_from_java.get_estimate()).margin(1e-10));
+ REQUIRE(compact_sketch.get_lower_bound(1) == Approx(compact_sketch_from_java.get_lower_bound(1)).margin(1e-10));
+ REQUIRE(compact_sketch.get_upper_bound(1) == Approx(compact_sketch_from_java.get_upper_bound(1)).margin(1e-10));
+}
+
+TEST_CASE("aod sketch: serialization compatibility with java - non-empty no entries", "[tuple_sketch]") { // covers a sketch that has seen input but retains nothing
+ auto update_sketch = update_array_of_doubles_sketch<1>::builder().set_p(0.01).build(); // with p = 0.01 the single update below is expected to leave nothing retained (asserted two lines down)
+ std::array<double, 1> a = {1};
+ update_sketch.update(1, a);
+ REQUIRE_FALSE(update_sketch.is_empty());
+ REQUIRE(update_sketch.get_num_retained() == 0);
+ auto compact_sketch = update_sketch.compact(); // local reference sketch for the comparisons below
+
+ // read binary sketch from Java
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make a failed open or read throw instead of leaving a silently bad stream
+ is.open(inputPath + "aod_1_compact_non_empty_no_entries_from_java.sk", std::ios::binary);
+ auto compact_sketch_from_java = compact_array_of_doubles_sketch<1>::deserialize(is);
+ REQUIRE(compact_sketch.get_num_retained() == compact_sketch_from_java.get_num_retained());
+ REQUIRE(compact_sketch.get_theta() == Approx(compact_sketch_from_java.get_theta()).margin(1e-10));
+ REQUIRE(compact_sketch.get_estimate() == Approx(compact_sketch_from_java.get_estimate()).margin(1e-10));
+ REQUIRE(compact_sketch.get_lower_bound(1) == Approx(compact_sketch_from_java.get_lower_bound(1)).margin(1e-10));
+ REQUIRE(compact_sketch.get_upper_bound(1) == Approx(compact_sketch_from_java.get_upper_bound(1)).margin(1e-10));
+}
+
+TEST_CASE("aod sketch: serialization compatibility with java - estimation mode", "[tuple_sketch]") {
auto update_sketch = update_array_of_doubles_sketch<1>::builder().build();
std::array<double, 1> a = {1};
for (int i = 0; i < 8192; ++i) update_sketch.update(i, a);
@@ -51,15 +90,41 @@ TEST_CASE("tuple sketch: array of doubles serialization compatibility with java"
REQUIRE(compact_sketch.get_lower_bound(2) == Approx(compact_sketch_from_java.get_lower_bound(2)).margin(1e-10));
REQUIRE(compact_sketch.get_upper_bound(2) == Approx(compact_sketch_from_java.get_upper_bound(2)).margin(1e-10));
REQUIRE(compact_sketch.get_lower_bound(3) == Approx(compact_sketch_from_java.get_lower_bound(3)).margin(1e-10));
- REQUIRE(compact_sketch.get_upper_bound(3) == Approx(compact_sketch.get_upper_bound(3)).margin(1e-10));
+ REQUIRE(compact_sketch.get_upper_bound(3) == Approx(compact_sketch_from_java.get_upper_bound(3)).margin(1e-10));
// sketch from Java is not ordered
// transform it to ordered so that iteration sequence would match exactly
compact_array_of_doubles_sketch<1> ordered_sketch_from_java(compact_sketch_from_java, true);
auto it = ordered_sketch_from_java.begin();
for (const auto& entry: compact_sketch) {
- REQUIRE(entry.first == (*it).first);
- REQUIRE(entry.second == (*it).second);
+ REQUIRE(entry == *it);
+ ++it;
+ }
+}
+
+TEST_CASE("aod sketch: serialize deserialize - estimation mode", "[tuple_sketch]") { // round trip through an in-memory stream; no Java file is involved
+ auto update_sketch = update_array_of_doubles_sketch<2>::builder().build();
+ std::array<double, 2> a = {1, 2};
+ for (int i = 0; i < 8192; ++i) update_sketch.update(i, a); // 8192 distinct keys, each carrying the same two values
+ compact_array_of_doubles_sketch<2> compact_sketch = update_sketch.compact();
+
+ std::stringstream ss;
+ ss.exceptions(std::ios::failbit | std::ios::badbit); // serialize() does not check the stream itself, so let stream errors throw
+ compact_sketch.serialize(ss);
+ auto deserialized_sketch = compact_array_of_doubles_sketch<2>::deserialize(ss); // reads back from the same stream that was just written
+ REQUIRE(compact_sketch.get_num_retained() == deserialized_sketch.get_num_retained());
+ REQUIRE(compact_sketch.get_theta() == Approx(deserialized_sketch.get_theta()).margin(1e-10));
+ REQUIRE(compact_sketch.get_estimate() == Approx(deserialized_sketch.get_estimate()).margin(1e-10));
+ REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized_sketch.get_lower_bound(1)).margin(1e-10));
+ REQUIRE(compact_sketch.get_upper_bound(1) == Approx(deserialized_sketch.get_upper_bound(1)).margin(1e-10));
+ REQUIRE(compact_sketch.get_lower_bound(2) == Approx(deserialized_sketch.get_lower_bound(2)).margin(1e-10));
+ REQUIRE(compact_sketch.get_upper_bound(2) == Approx(deserialized_sketch.get_upper_bound(2)).margin(1e-10));
+ REQUIRE(compact_sketch.get_lower_bound(3) == Approx(deserialized_sketch.get_lower_bound(3)).margin(1e-10));
+ REQUIRE(compact_sketch.get_upper_bound(3) == Approx(deserialized_sketch.get_upper_bound(3)).margin(1e-10));
+ // sketches must be ordered and the iteration sequence must match exactly
+ auto it = deserialized_sketch.begin(); // advanced in lockstep with the loop below; equal retained counts were asserted above
+ for (const auto& entry: compact_sketch) {
+ REQUIRE(entry == *it);
++it;
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org