You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/08/04 23:42:16 UTC

[incubator-datasketches-cpp] branch tuple_sketch updated: stream serialization and java compatibility

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/tuple_sketch by this push:
     new 3d3fdbd  stream serialization and java compatibility
3d3fdbd is described below

commit 3d3fdbd36694f3cffe3c0dc1e280339bdbbc5c9c
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Aug 4 16:42:03 2020 -0700

    stream serialization and java compatibility
---
 tuple/include/array_of_doubles_sketch.hpp          |   4 +-
 tuple/include/array_of_doubles_sketch_impl.hpp     |  47 +++++++++++--
 tuple/include/theta_helpers.hpp                    |  30 ++++-----
 tuple/include/tuple_sketch.hpp                     |   2 +-
 tuple/test/aod_1_compact_empty_from_java.sk        |   1 +
 ...aod_1_compact_non_empty_no_entries_from_java.sk | Bin 0 -> 16 bytes
 tuple/test/array_of_doubles_sketch_test.cpp        |  73 +++++++++++++++++++--
 7 files changed, 128 insertions(+), 29 deletions(-)

diff --git a/tuple/include/array_of_doubles_sketch.hpp b/tuple/include/array_of_doubles_sketch.hpp
index 1ca217d..b869470 100644
--- a/tuple/include/array_of_doubles_sketch.hpp
+++ b/tuple/include/array_of_doubles_sketch.hpp
@@ -51,14 +51,16 @@ public:
   using Entry = typename Base::Entry;
   using AllocEntry = typename Base::AllocEntry;
   using AllocU64 = typename Base::AllocU64;
-  using flags = typename Base::flags;
 
   static const uint8_t SERIAL_VERSION = 1;
   static const uint8_t SKETCH_FAMILY = 9;
   static const uint8_t SKETCH_TYPE = 3;
+  enum flags { UNUSED1, UNUSED2, IS_EMPTY, HAS_ENTRIES, IS_ORDERED };
 
   compact_array_of_doubles_sketch(const Base& other, bool ordered = true);
 
+  void serialize(std::ostream& os) const;
+
   static compact_array_of_doubles_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
 
   // for internal use
diff --git a/tuple/include/array_of_doubles_sketch_impl.hpp b/tuple/include/array_of_doubles_sketch_impl.hpp
index e208ed1..db0cac3 100644
--- a/tuple/include/array_of_doubles_sketch_impl.hpp
+++ b/tuple/include/array_of_doubles_sketch_impl.hpp
@@ -28,6 +28,41 @@ compact_array_of_doubles_sketch<num, A>::compact_array_of_doubles_sketch(bool is
 Base(is_empty, is_ordered, seed_hash, theta, std::move(entries)) {}
 
 template<int num, typename A>
+void compact_array_of_doubles_sketch<num, A>::serialize(std::ostream& os) const {  // Writes this sketch to os as raw bytes: 8 header bytes, theta, then (only when entries are retained) the entry section. Values are copied straight from memory; no byte-order conversion is performed.
+  const uint8_t preamble_longs = 1;  // presumably the header length in 8-byte units (the header fields below add up to 8 bytes) - confirm
+  os.write((char*)&preamble_longs, sizeof(preamble_longs));  // header byte 0
+  const uint8_t serial_version = SERIAL_VERSION;
+  os.write((char*)&serial_version, sizeof(serial_version));  // header byte 1
+  const uint8_t family = SKETCH_FAMILY;
+  os.write((char*)&family, sizeof(family));  // header byte 2
+  const uint8_t type = SKETCH_TYPE;
+  os.write((char*)&type, sizeof(type));  // header byte 3
+  const uint8_t flags_byte(  // bit positions are the enumerator values of the class's flags enum
+    (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+    (this->get_num_retained() > 0 ? 1 << flags::HAS_ENTRIES : 0) |  // set under the same condition that guards the entry section below
+    (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
+  );
+  os.write((char*)&flags_byte, sizeof(flags_byte));  // header byte 4
+  const uint8_t num_values = num;  // template parameter num stored in a single byte
+  os.write(reinterpret_cast<const char*>(&num_values), sizeof(num_values));  // header byte 5
+  const uint16_t seed_hash = this->get_seed_hash();
+  os.write((char*)&seed_hash, sizeof(seed_hash));  // header bytes 6-7
+  os.write((char*)&(this->theta_), sizeof(uint64_t));  // theta is always written, even when no entries follow
+  if (this->get_num_retained() > 0) {
+    const uint32_t num_entries = this->entries_.size();  // NOTE(review): size() is stored in 32 bits; presumably the count always fits - confirm
+    os.write((char*)&num_entries, sizeof(num_entries));
+    const uint32_t unused32 = 0;  // four zero bytes written after the count
+    os.write((char*)&unused32, sizeof(unused32));
+    for (const auto& it: this->entries_) {  // first pass: every key, in container order
+      os.write((char*)&it.first, sizeof(uint64_t));
+    }
+    for (const auto& it: this->entries_) {  // second pass: every summary, in the same order as the keys
+      os.write((char*)&it.second, sizeof(Summary));  // summary bytes are written as-is; assumes Summary is trivially copyable - TODO confirm
+    }
+  }
+}  // NOTE(review): the stream state is never checked here; write failures are only visible to callers that inspect os or enable stream exceptions
+
+template<int num, typename A>
 compact_array_of_doubles_sketch<num, A> compact_array_of_doubles_sketch<num, A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
   uint8_t preamble_longs;
   is.read((char*)&preamble_longs, sizeof(preamble_longs));
@@ -46,19 +81,18 @@ compact_array_of_doubles_sketch<num, A> compact_array_of_doubles_sketch<num, A>:
   checker<true>::check_sketch_family(family, SKETCH_FAMILY);
   checker<true>::check_sketch_type(type, SKETCH_TYPE);
   checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
-  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
-  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+  check_value(num_values, static_cast<uint8_t>(num), "number of values");
+  const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
+  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
 
   uint64_t theta;
   is.read((char*)&theta, sizeof(theta));
+  std::vector<Entry, AllocEntry> entries(allocator);
   uint32_t num_entries = 0;
-  if (!is_empty) {
+  if (has_entries) {
     is.read((char*)&num_entries, sizeof(num_entries));
     uint32_t unused32;
     is.read((char*)&unused32, sizeof(unused32));
-  }
-  std::vector<Entry, AllocEntry> entries(allocator);
-  if (!is_empty) {
     entries.reserve(num_entries);
     std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
     is.read((char*)keys.data(), num_entries * sizeof(uint64_t));
@@ -69,6 +103,7 @@ compact_array_of_doubles_sketch<num, A> compact_array_of_doubles_sketch<num, A>:
     }
   }
   if (!is.good()) throw std::runtime_error("error reading from std::istream");
+  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
   const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
   return compact_array_of_doubles_sketch(is_empty, is_ordered, seed_hash, theta, std::move(entries));
 }
diff --git a/tuple/include/theta_helpers.hpp b/tuple/include/theta_helpers.hpp
index ca3c655..6852590 100644
--- a/tuple/include/theta_helpers.hpp
+++ b/tuple/include/theta_helpers.hpp
@@ -25,31 +25,27 @@
 
 namespace datasketches {
 
+template<typename T>  // Generic equality check: does nothing when the values match, throws otherwise.
+static void check_value(T actual, T expected, const char* description) {  // description names the field being checked and becomes the start of the error message
+  if (actual != expected) {
+    throw std::invalid_argument(std::string(description) + " mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual));  // T must be a type std::to_string accepts
+  }
+}
+
 template<bool dummy>
 class checker {
 public:
+  static void check_serial_version(uint8_t actual, uint8_t expected) {  // like every check below, forwards to check_value, which throws std::invalid_argument on mismatch
+    check_value(actual, expected, "serial version");
+  }
   static void check_sketch_family(uint8_t actual, uint8_t expected) {
-      if (actual != expected) {
-        throw std::invalid_argument("Sketch family mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
-      }
+    check_value(actual, expected, "sketch family");  // message now starts with this lower-case field name
   }
-
   static void check_sketch_type(uint8_t actual, uint8_t expected) {
-      if (actual != expected) {
-        throw std::invalid_argument("Sketch type mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
-      }
+    check_value(actual, expected, "sketch type");
   }
-
-  static void check_serial_version(uint8_t actual, uint8_t expected) {
-    if (actual != expected) {
-      throw std::invalid_argument("Sketch serial version mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
-    }
-  }
-
   static void check_seed_hash(uint16_t actual, uint16_t expected) {
-    if (actual != expected) {
-      throw std::invalid_argument("Sketch seed hash mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual));
-    }
+    check_value(actual, expected, "seed hash");  // the only 16-bit comparison; the other checks compare uint8_t values
   }
 };
 
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 9b960d0..915f170 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -414,7 +414,7 @@ public:
   // for internal use
   compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
 
-private:
+protected:
   bool is_empty_;
   bool is_ordered_;
   uint16_t seed_hash_;
diff --git a/tuple/test/aod_1_compact_empty_from_java.sk b/tuple/test/aod_1_compact_empty_from_java.sk
new file mode 100644
index 0000000..8d2583d
--- /dev/null
+++ b/tuple/test/aod_1_compact_empty_from_java.sk
@@ -0,0 +1 @@
+	̓�������
\ No newline at end of file
diff --git a/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk b/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk
new file mode 100644
index 0000000..f67106d
Binary files /dev/null and b/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk differ
diff --git a/tuple/test/array_of_doubles_sketch_test.cpp b/tuple/test/array_of_doubles_sketch_test.cpp
index fa7bfe2..f72861b 100644
--- a/tuple/test/array_of_doubles_sketch_test.cpp
+++ b/tuple/test/array_of_doubles_sketch_test.cpp
@@ -19,6 +19,7 @@
 
 #include <iostream>
 #include <fstream>
+#include <sstream>
 #include <array>
 
 #include <catch.hpp>
@@ -32,7 +33,45 @@ const std::string inputPath = TEST_BINARY_INPUT_PATH;
 const std::string inputPath = "test/";
 #endif
 
-TEST_CASE("tuple sketch: array of doubles serialization compatibility with java", "[tuple_sketch]") {
+TEST_CASE("aod sketch: serialization compatibility with java - empty", "[tuple_sketch]") {  // A locally built empty sketch must report the same values as the sketch read from the Java-produced file.
+  auto update_sketch = update_array_of_doubles_sketch<1>::builder().build();  // never updated, so it stays empty
+  REQUIRE(update_sketch.is_empty());
+  REQUIRE(update_sketch.get_num_retained() == 0);
+  auto compact_sketch = update_sketch.compact();  // local reference the Java sketch is compared against
+
+  // read binary sketch from Java
+  std::ifstream is;
+  is.exceptions(std::ios::failbit | std::ios::badbit);  // open and read failures throw instead of passing silently
+  is.open(inputPath + "aod_1_compact_empty_from_java.sk", std::ios::binary);
+  auto compact_sketch_from_java = compact_array_of_doubles_sketch<1>::deserialize(is);  // seed and allocator left at their defaults
+  REQUIRE(compact_sketch.get_num_retained() == compact_sketch_from_java.get_num_retained());
+  REQUIRE(compact_sketch.get_theta() == Approx(compact_sketch_from_java.get_theta()).margin(1e-10));
+  REQUIRE(compact_sketch.get_estimate() == Approx(compact_sketch_from_java.get_estimate()).margin(1e-10));
+  REQUIRE(compact_sketch.get_lower_bound(1) == Approx(compact_sketch_from_java.get_lower_bound(1)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(1) == Approx(compact_sketch_from_java.get_upper_bound(1)).margin(1e-10));
+}
+
+TEST_CASE("aod sketch: serialization compatibility with java - non-empty no entries", "[tuple_sketch]") {  // Covers a sketch that has seen an update but retains nothing, compared against the matching Java-produced file.
+  auto update_sketch = update_array_of_doubles_sketch<1>::builder().set_p(0.01).build();  // p = 0.01; the REQUIREs below rely on the single update not being retained
+  std::array<double, 1> a = {1};
+  update_sketch.update(1, a);
+  REQUIRE_FALSE(update_sketch.is_empty());  // the update was seen...
+  REQUIRE(update_sketch.get_num_retained() == 0);  // ...but no entry was kept
+  auto compact_sketch = update_sketch.compact();  // local reference the Java sketch is compared against
+
+  // read binary sketch from Java
+  std::ifstream is;
+  is.exceptions(std::ios::failbit | std::ios::badbit);  // open and read failures throw instead of passing silently
+  is.open(inputPath + "aod_1_compact_non_empty_no_entries_from_java.sk", std::ios::binary);
+  auto compact_sketch_from_java = compact_array_of_doubles_sketch<1>::deserialize(is);  // seed and allocator left at their defaults
+  REQUIRE(compact_sketch.get_num_retained() == compact_sketch_from_java.get_num_retained());
+  REQUIRE(compact_sketch.get_theta() == Approx(compact_sketch_from_java.get_theta()).margin(1e-10));
+  REQUIRE(compact_sketch.get_estimate() == Approx(compact_sketch_from_java.get_estimate()).margin(1e-10));
+  REQUIRE(compact_sketch.get_lower_bound(1) == Approx(compact_sketch_from_java.get_lower_bound(1)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(1) == Approx(compact_sketch_from_java.get_upper_bound(1)).margin(1e-10));
+}
+
+TEST_CASE("aod sketch: serialization compatibility with java - estimation mode", "[tuple_sketch]") {
   auto update_sketch = update_array_of_doubles_sketch<1>::builder().build();
   std::array<double, 1> a = {1};
   for (int i = 0; i < 8192; ++i) update_sketch.update(i, a);
@@ -51,15 +90,41 @@ TEST_CASE("tuple sketch: array of doubles serialization compatibility with java"
   REQUIRE(compact_sketch.get_lower_bound(2) == Approx(compact_sketch_from_java.get_lower_bound(2)).margin(1e-10));
   REQUIRE(compact_sketch.get_upper_bound(2) == Approx(compact_sketch_from_java.get_upper_bound(2)).margin(1e-10));
   REQUIRE(compact_sketch.get_lower_bound(3) == Approx(compact_sketch_from_java.get_lower_bound(3)).margin(1e-10));
-  REQUIRE(compact_sketch.get_upper_bound(3) == Approx(compact_sketch.get_upper_bound(3)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(3) == Approx(compact_sketch_from_java.get_upper_bound(3)).margin(1e-10));
 
   // sketch from Java is not ordered
   // transform it to ordered so that iteration sequence would match exactly
   compact_array_of_doubles_sketch<1> ordered_sketch_from_java(compact_sketch_from_java, true);
   auto it = ordered_sketch_from_java.begin();
   for (const auto& entry: compact_sketch) {
-    REQUIRE(entry.first == (*it).first);
-    REQUIRE(entry.second == (*it).second);
+    REQUIRE(entry == *it);
+    ++it;
+  }
+}
+
+TEST_CASE("aod sketch: serialize deserialize - estimation mode", "[tuple_sketch]") {  // Round trip: a compact sketch written to a stream and read back must match the original.
+  auto update_sketch = update_array_of_doubles_sketch<2>::builder().build();
+  std::array<double, 2> a = {1, 2};  // two values per key, matching the <2> template argument
+  for (int i = 0; i < 8192; ++i) update_sketch.update(i, a);  // 8192 distinct keys, all with the same values
+  compact_array_of_doubles_sketch<2> compact_sketch = update_sketch.compact();
+
+  std::stringstream ss;  // in-memory stream used for both the write and the read
+  ss.exceptions(std::ios::failbit | std::ios::badbit);  // stream errors throw instead of passing silently
+  compact_sketch.serialize(ss);
+  auto deserialized_sketch = compact_array_of_doubles_sketch<2>::deserialize(ss);  // seed and allocator left at their defaults
+  REQUIRE(compact_sketch.get_num_retained() == deserialized_sketch.get_num_retained());
+  REQUIRE(compact_sketch.get_theta() == Approx(deserialized_sketch.get_theta()).margin(1e-10));
+  REQUIRE(compact_sketch.get_estimate() == Approx(deserialized_sketch.get_estimate()).margin(1e-10));
+  REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized_sketch.get_lower_bound(1)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(1) == Approx(deserialized_sketch.get_upper_bound(1)).margin(1e-10));
+  REQUIRE(compact_sketch.get_lower_bound(2) == Approx(deserialized_sketch.get_lower_bound(2)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(2) == Approx(deserialized_sketch.get_upper_bound(2)).margin(1e-10));
+  REQUIRE(compact_sketch.get_lower_bound(3) == Approx(deserialized_sketch.get_lower_bound(3)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(3) == Approx(deserialized_sketch.get_upper_bound(3)).margin(1e-10));
+  // sketches must be ordered and the iteration sequence must match exactly
+  auto it = deserialized_sketch.begin();
+  for (const auto& entry: compact_sketch) {  // walks both sketches in lockstep
+    REQUIRE(entry == *it);  // compares each entry as a whole
     ++it;
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org