You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2022/04/05 08:11:55 UTC

[datasketches-cpp] branch quantiles updated: (de)serializatio fixes and compatibility with java, including older serialization versions

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch quantiles
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


The following commit(s) were added to refs/heads/quantiles by this push:
     new b37e23f  (de)serializatio fixes and compatibility with java, including older serialization versions
b37e23f is described below

commit b37e23f774d1ccfb95b5ce047157a9c030bbc654
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Tue Apr 5 01:11:39 2022 -0700

    (de)serializatio fixes and compatibility with java, including older serialization versions
---
 quantiles/include/quantiles_sketch.hpp          |   2 +-
 quantiles/include/quantiles_sketch_impl.hpp     |  17 ++-
 quantiles/test/CMakeLists.txt                   |   1 +
 quantiles/test/Qk128_n1000_v0.3.0.sk            | Bin 0 -> 4136 bytes
 quantiles/test/Qk128_n1000_v0.6.0.sk            | Bin 0 -> 3936 bytes
 quantiles/test/Qk128_n1000_v0.8.0.sk            | Bin 0 -> 3936 bytes
 quantiles/test/Qk128_n1000_v0.8.3.sk            | Bin 0 -> 3936 bytes
 quantiles/test/Qk128_n50_v0.3.0.sk              | Bin 0 -> 552 bytes
 quantiles/test/Qk128_n50_v0.6.0.sk              | Bin 0 -> 432 bytes
 quantiles/test/Qk128_n50_v0.8.0.sk              | Bin 0 -> 432 bytes
 quantiles/test/Qk128_n50_v0.8.3.sk              | Bin 0 -> 432 bytes
 quantiles/test/quantiles_compatibility_test.cpp | 131 ++++++++++++++++++++++++
 12 files changed, 146 insertions(+), 5 deletions(-)

diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp
index ade5fa4..075dd51 100644
--- a/quantiles/include/quantiles_sketch.hpp
+++ b/quantiles/include/quantiles_sketch.hpp
@@ -450,7 +450,7 @@ private:
   static const uint8_t SERIAL_VERSION_1 = 1;
   static const uint8_t SERIAL_VERSION_2 = 2;
   static const uint8_t SERIAL_VERSION = 3;
-  static const uint8_t FAMILY = 15;
+  static const uint8_t FAMILY = 8;
 
   enum flags { RESERVED0, RESERVED1, IS_EMPTY, IS_COMPACT, IS_SORTED };
 
diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp
index 34040a1..7cfb919 100644
--- a/quantiles/include/quantiles_sketch_impl.hpp
+++ b/quantiles/include/quantiles_sketch_impl.hpp
@@ -272,7 +272,7 @@ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde
 
   const auto items_seen = read<uint64_t>(is);
 
-  const bool is_compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
+  const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
   const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
 
   A alloc(allocator);
@@ -289,6 +289,10 @@ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde
   // serde call did not throw, repackage with destrtuctor
   max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
 
+  if (serial_version == 1) {
+    read<uint64_t>(is); // no longer used
+  }
+
   // allocate buffers as needed
   const uint8_t levels_needed = compute_levels_needed(k, items_seen);
   const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
@@ -300,7 +304,7 @@ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde
 
   // load base buffer
   const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
-  uint32_t items_to_read = is_compact ? bb_items : 2 * k;
+  uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
   Level base_buffer = deserialize_array(is, items_to_read, 2 * k, serde, allocator);
   
   // populate vector of Levels directly
@@ -377,7 +381,7 @@ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, cons
   uint64_t items_seen;
   ptr += copy_from_mem(ptr, items_seen);
 
-  const bool is_compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
+  const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
   const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
 
   A alloc(allocator);
@@ -394,6 +398,11 @@ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, cons
   // serde call did not throw, repackage with destrtuctor
   max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
 
+  if (serial_version == 1) {
+    uint64_t unused_long;
+    ptr += copy_from_mem(ptr, unused_long); // no longer used
+  }
+
   // allocate buffers as needed
   const uint8_t levels_needed = compute_levels_needed(k, items_seen);
   const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
@@ -405,7 +414,7 @@ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, cons
 
   // load base buffer
   const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
-  uint32_t items_to_read = is_compact ? bb_items : 2 * k;
+  uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
   auto base_buffer_pair = deserialize_array(ptr, end_ptr - ptr, items_to_read, 2 * k, serde, allocator);
   ptr += base_buffer_pair.second;
   
diff --git a/quantiles/test/CMakeLists.txt b/quantiles/test/CMakeLists.txt
index c63a5a1..9b65a8e 100644
--- a/quantiles/test/CMakeLists.txt
+++ b/quantiles/test/CMakeLists.txt
@@ -39,6 +39,7 @@ add_test(
 target_sources(quantiles_test
   PRIVATE
     quantiles_sketch_test.cpp
+    quantiles_compatibility_test.cpp
     #quantiles_sketch_custom_type_test.cpp
     #quantiles_sketch_validation.cpp
     #kolmogorov_smirnov_test.cpp
diff --git a/quantiles/test/Qk128_n1000_v0.3.0.sk b/quantiles/test/Qk128_n1000_v0.3.0.sk
new file mode 100644
index 0000000..72db104
Binary files /dev/null and b/quantiles/test/Qk128_n1000_v0.3.0.sk differ
diff --git a/quantiles/test/Qk128_n1000_v0.6.0.sk b/quantiles/test/Qk128_n1000_v0.6.0.sk
new file mode 100644
index 0000000..dd6e72d
Binary files /dev/null and b/quantiles/test/Qk128_n1000_v0.6.0.sk differ
diff --git a/quantiles/test/Qk128_n1000_v0.8.0.sk b/quantiles/test/Qk128_n1000_v0.8.0.sk
new file mode 100644
index 0000000..f9166bb
Binary files /dev/null and b/quantiles/test/Qk128_n1000_v0.8.0.sk differ
diff --git a/quantiles/test/Qk128_n1000_v0.8.3.sk b/quantiles/test/Qk128_n1000_v0.8.3.sk
new file mode 100644
index 0000000..6cadb2d
Binary files /dev/null and b/quantiles/test/Qk128_n1000_v0.8.3.sk differ
diff --git a/quantiles/test/Qk128_n50_v0.3.0.sk b/quantiles/test/Qk128_n50_v0.3.0.sk
new file mode 100644
index 0000000..4667596
Binary files /dev/null and b/quantiles/test/Qk128_n50_v0.3.0.sk differ
diff --git a/quantiles/test/Qk128_n50_v0.6.0.sk b/quantiles/test/Qk128_n50_v0.6.0.sk
new file mode 100644
index 0000000..a2334e4
Binary files /dev/null and b/quantiles/test/Qk128_n50_v0.6.0.sk differ
diff --git a/quantiles/test/Qk128_n50_v0.8.0.sk b/quantiles/test/Qk128_n50_v0.8.0.sk
new file mode 100644
index 0000000..83cc21d
Binary files /dev/null and b/quantiles/test/Qk128_n50_v0.8.0.sk differ
diff --git a/quantiles/test/Qk128_n50_v0.8.3.sk b/quantiles/test/Qk128_n50_v0.8.3.sk
new file mode 100644
index 0000000..597cd60
Binary files /dev/null and b/quantiles/test/Qk128_n50_v0.8.3.sk differ
diff --git a/quantiles/test/quantiles_compatibility_test.cpp b/quantiles/test/quantiles_compatibility_test.cpp
new file mode 100644
index 0000000..a9836d2
--- /dev/null
+++ b/quantiles/test/quantiles_compatibility_test.cpp
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <catch.hpp>
+#include <cmath>
+#include <cstring>
+#include <sstream>
+#include <fstream>
+
+#include <quantiles_sketch.hpp>
+#include <serde.hpp>
+#include <test_allocator.hpp>
+
+namespace datasketches {
+
+#ifdef TEST_BINARY_INPUT_PATH
+static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
+#else
+static std::string testBinaryInputPath = "test/";
+#endif
+
+// these tests are for compatibility with old versions of Java's
+// Quantiles sketch, which is only for doubles.
+// 
+// typical usage would be just quantiles_sketch<double>, but here we use test_allocator
+using quantiles_double_sketch = quantiles_sketch<double, std::less<double>, test_allocator<double>>;
+
+static void quantiles_decode_and_check(uint16_t k, uint64_t n, std::string version,
+                                       double expected_median) {
+  const double median_rank = 0.5;
+
+  std::ostringstream filestr;
+  filestr << "Qk" << k << "_n" << n << "_v" << version << ".sk";
+  // as stream
+  std::ifstream is;
+  is.exceptions(std::ios::failbit | std::ios::badbit);
+  std::string filename = testBinaryInputPath + filestr.str();
+
+  is.open(filename, std::ios::binary);
+  auto sketch_stream = quantiles_double_sketch::deserialize(is, serde<double>(), 0);
+  is.close();
+  REQUIRE(sketch_stream.get_quantile(median_rank) == expected_median);
+
+  // as bytes
+  std::ifstream infile(filename, std::ios::binary);
+  std::vector<char> bytes(
+    (std::istreambuf_iterator<char>(infile)),
+    (std::istreambuf_iterator<char>()));
+  infile.close();
+
+  auto sketch_bytes = quantiles_double_sketch::deserialize(bytes.data(), bytes.size(), serde<double>(), 0);
+  REQUIRE(sketch_bytes.get_quantile(median_rank) == expected_median);
+}
+
+TEST_CASE("quantiles compatibility", "[quantiles_compatibility]") {
+
+  // setup
+  test_allocator_total_bytes = 0;
+
+  SECTION("Qk128_n50_v0.3.0.sk") {
+    // file: Qk128_n50_v0.3.0.sk
+    // median: 26.0
+    quantiles_decode_and_check(128, 50, "0.3.0", 26.0);
+  }
+
+  SECTION("Qk128_n1000_v0.3.0.sk") {
+    // file: Qk128_n1000_v0.3.0.sk
+    // median: 501.0
+    quantiles_decode_and_check(128, 1000, "0.3.0", 501.0);
+  }
+
+  SECTION("Qk128_n50_v0.6.0.sk") {
+    // file: Qk128_n50_v0.6.0.sk
+    // median: 26.0
+    quantiles_decode_and_check(128, 50, "0.6.0", 26.0);
+  }
+
+  SECTION("Qk128_n1000_v0.6.0.sk") {
+    // file: Qk128_n1000_v0.6.0.sk
+    // median: 501.0
+    quantiles_decode_and_check(128, 1000, "0.6.0", 501.0);
+  }
+
+  SECTION("Qk128_n50_v0.8.0.sk") {
+    // file: Qk128_n50_v0.8.0.sk
+    // median: 26.0
+    quantiles_decode_and_check(128, 50, "0.8.0", 26.0);
+  }
+
+  SECTION("Qk128_n1000_v0.8.0.sk") {
+    // file: Qk128_n1000_v0.8.0.sk
+    // median: 501.0
+    quantiles_decode_and_check(128, 1000, "0.8.0", 501.0);
+  }
+
+  SECTION("Qk128_n50_v0.8.3.sk") {
+    // file: Qk128_n50_v0.8.3.sk
+    // median: 26.0
+    quantiles_decode_and_check(128, 50, "0.8.0", 26.0);
+  }
+
+  SECTION("Qk128_n1000_v0.8.3.sk") {
+    // file: Qk128_n1000_v0.8.3.sk
+    // median: 501.0
+    quantiles_decode_and_check(128, 1000, "0.8.3", 501.0);
+  }
+
+  // cleanup
+  if (test_allocator_total_bytes != 0) {
+    REQUIRE(test_allocator_total_bytes == 0);
+  }
+
+}
+
+} /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org