You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2022/04/05 08:11:55 UTC
[datasketches-cpp] branch quantiles updated: (de)serializatio fixes and compatibility with java, including older serialization versions
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch quantiles
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
The following commit(s) were added to refs/heads/quantiles by this push:
new b37e23f (de)serializatio fixes and compatibility with java, including older serialization versions
b37e23f is described below
commit b37e23f774d1ccfb95b5ce047157a9c030bbc654
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Tue Apr 5 01:11:39 2022 -0700
(de)serializatio fixes and compatibility with java, including older serialization versions
---
quantiles/include/quantiles_sketch.hpp | 2 +-
quantiles/include/quantiles_sketch_impl.hpp | 17 ++-
quantiles/test/CMakeLists.txt | 1 +
quantiles/test/Qk128_n1000_v0.3.0.sk | Bin 0 -> 4136 bytes
quantiles/test/Qk128_n1000_v0.6.0.sk | Bin 0 -> 3936 bytes
quantiles/test/Qk128_n1000_v0.8.0.sk | Bin 0 -> 3936 bytes
quantiles/test/Qk128_n1000_v0.8.3.sk | Bin 0 -> 3936 bytes
quantiles/test/Qk128_n50_v0.3.0.sk | Bin 0 -> 552 bytes
quantiles/test/Qk128_n50_v0.6.0.sk | Bin 0 -> 432 bytes
quantiles/test/Qk128_n50_v0.8.0.sk | Bin 0 -> 432 bytes
quantiles/test/Qk128_n50_v0.8.3.sk | Bin 0 -> 432 bytes
quantiles/test/quantiles_compatibility_test.cpp | 131 ++++++++++++++++++++++++
12 files changed, 146 insertions(+), 5 deletions(-)
diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp
index ade5fa4..075dd51 100644
--- a/quantiles/include/quantiles_sketch.hpp
+++ b/quantiles/include/quantiles_sketch.hpp
@@ -450,7 +450,7 @@ private:
static const uint8_t SERIAL_VERSION_1 = 1;
static const uint8_t SERIAL_VERSION_2 = 2;
static const uint8_t SERIAL_VERSION = 3;
- static const uint8_t FAMILY = 15;
+ static const uint8_t FAMILY = 8;
enum flags { RESERVED0, RESERVED1, IS_EMPTY, IS_COMPACT, IS_SORTED };
diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp
index 34040a1..7cfb919 100644
--- a/quantiles/include/quantiles_sketch_impl.hpp
+++ b/quantiles/include/quantiles_sketch_impl.hpp
@@ -272,7 +272,7 @@ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde
const auto items_seen = read<uint64_t>(is);
- const bool is_compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
+ const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
A alloc(allocator);
@@ -289,6 +289,10 @@ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde
// serde call did not throw, repackage with destrtuctor
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
+ if (serial_version == 1) {
+ read<uint64_t>(is); // no longer used
+ }
+
// allocate buffers as needed
const uint8_t levels_needed = compute_levels_needed(k, items_seen);
const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
@@ -300,7 +304,7 @@ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde
// load base buffer
const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
- uint32_t items_to_read = is_compact ? bb_items : 2 * k;
+ uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
Level base_buffer = deserialize_array(is, items_to_read, 2 * k, serde, allocator);
// populate vector of Levels directly
@@ -377,7 +381,7 @@ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, cons
uint64_t items_seen;
ptr += copy_from_mem(ptr, items_seen);
- const bool is_compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
+ const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
A alloc(allocator);
@@ -394,6 +398,11 @@ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, cons
// serde call did not throw, repackage with destrtuctor
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
+ if (serial_version == 1) {
+ uint64_t unused_long;
+ ptr += copy_from_mem(ptr, unused_long); // no longer used
+ }
+
// allocate buffers as needed
const uint8_t levels_needed = compute_levels_needed(k, items_seen);
const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
@@ -405,7 +414,7 @@ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, cons
// load base buffer
const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
- uint32_t items_to_read = is_compact ? bb_items : 2 * k;
+ uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
auto base_buffer_pair = deserialize_array(ptr, end_ptr - ptr, items_to_read, 2 * k, serde, allocator);
ptr += base_buffer_pair.second;
diff --git a/quantiles/test/CMakeLists.txt b/quantiles/test/CMakeLists.txt
index c63a5a1..9b65a8e 100644
--- a/quantiles/test/CMakeLists.txt
+++ b/quantiles/test/CMakeLists.txt
@@ -39,6 +39,7 @@ add_test(
target_sources(quantiles_test
PRIVATE
quantiles_sketch_test.cpp
+ quantiles_compatibility_test.cpp
#quantiles_sketch_custom_type_test.cpp
#quantiles_sketch_validation.cpp
#kolmogorov_smirnov_test.cpp
diff --git a/quantiles/test/Qk128_n1000_v0.3.0.sk b/quantiles/test/Qk128_n1000_v0.3.0.sk
new file mode 100644
index 0000000..72db104
Binary files /dev/null and b/quantiles/test/Qk128_n1000_v0.3.0.sk differ
diff --git a/quantiles/test/Qk128_n1000_v0.6.0.sk b/quantiles/test/Qk128_n1000_v0.6.0.sk
new file mode 100644
index 0000000..dd6e72d
Binary files /dev/null and b/quantiles/test/Qk128_n1000_v0.6.0.sk differ
diff --git a/quantiles/test/Qk128_n1000_v0.8.0.sk b/quantiles/test/Qk128_n1000_v0.8.0.sk
new file mode 100644
index 0000000..f9166bb
Binary files /dev/null and b/quantiles/test/Qk128_n1000_v0.8.0.sk differ
diff --git a/quantiles/test/Qk128_n1000_v0.8.3.sk b/quantiles/test/Qk128_n1000_v0.8.3.sk
new file mode 100644
index 0000000..6cadb2d
Binary files /dev/null and b/quantiles/test/Qk128_n1000_v0.8.3.sk differ
diff --git a/quantiles/test/Qk128_n50_v0.3.0.sk b/quantiles/test/Qk128_n50_v0.3.0.sk
new file mode 100644
index 0000000..4667596
Binary files /dev/null and b/quantiles/test/Qk128_n50_v0.3.0.sk differ
diff --git a/quantiles/test/Qk128_n50_v0.6.0.sk b/quantiles/test/Qk128_n50_v0.6.0.sk
new file mode 100644
index 0000000..a2334e4
Binary files /dev/null and b/quantiles/test/Qk128_n50_v0.6.0.sk differ
diff --git a/quantiles/test/Qk128_n50_v0.8.0.sk b/quantiles/test/Qk128_n50_v0.8.0.sk
new file mode 100644
index 0000000..83cc21d
Binary files /dev/null and b/quantiles/test/Qk128_n50_v0.8.0.sk differ
diff --git a/quantiles/test/Qk128_n50_v0.8.3.sk b/quantiles/test/Qk128_n50_v0.8.3.sk
new file mode 100644
index 0000000..597cd60
Binary files /dev/null and b/quantiles/test/Qk128_n50_v0.8.3.sk differ
diff --git a/quantiles/test/quantiles_compatibility_test.cpp b/quantiles/test/quantiles_compatibility_test.cpp
new file mode 100644
index 0000000..a9836d2
--- /dev/null
+++ b/quantiles/test/quantiles_compatibility_test.cpp
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <catch.hpp>
+#include <cmath>
+#include <cstring>
+#include <sstream>
+#include <fstream>
+
+#include <quantiles_sketch.hpp>
+#include <serde.hpp>
+#include <test_allocator.hpp>
+
+namespace datasketches {
+
+#ifdef TEST_BINARY_INPUT_PATH
+static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
+#else
+static std::string testBinaryInputPath = "test/";
+#endif
+
+// these tests are for compatibility with old versions of Java's
+// Quantiles sketch, which is only for doubles.
+//
+// typical usage would be just quantiles_sketch<double>, but here we use test_allocator
+using quantiles_double_sketch = quantiles_sketch<double, std::less<double>, test_allocator<double>>;
+
+static void quantiles_decode_and_check(uint16_t k, uint64_t n, std::string version,
+ double expected_median) {
+ const double median_rank = 0.5;
+
+ std::ostringstream filestr;
+ filestr << "Qk" << k << "_n" << n << "_v" << version << ".sk";
+ // as stream
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ std::string filename = testBinaryInputPath + filestr.str();
+
+ is.open(filename, std::ios::binary);
+ auto sketch_stream = quantiles_double_sketch::deserialize(is, serde<double>(), 0);
+ is.close();
+ REQUIRE(sketch_stream.get_quantile(median_rank) == expected_median);
+
+ // as bytes
+ std::ifstream infile(filename, std::ios::binary);
+ std::vector<char> bytes(
+ (std::istreambuf_iterator<char>(infile)),
+ (std::istreambuf_iterator<char>()));
+ infile.close();
+
+ auto sketch_bytes = quantiles_double_sketch::deserialize(bytes.data(), bytes.size(), serde<double>(), 0);
+ REQUIRE(sketch_bytes.get_quantile(median_rank) == expected_median);
+}
+
+TEST_CASE("quantiles compatibility", "[quantiles_compatibility]") {
+
+ // setup
+ test_allocator_total_bytes = 0;
+
+ SECTION("Qk128_n50_v0.3.0.sk") {
+ // file: Qk128_n50_v0.3.0.sk
+ // median: 26.0
+ quantiles_decode_and_check(128, 50, "0.3.0", 26.0);
+ }
+
+ SECTION("Qk128_n1000_v0.3.0.sk") {
+ // file: Qk128_n1000_v0.3.0.sk
+ // median: 501.0
+ quantiles_decode_and_check(128, 1000, "0.3.0", 501.0);
+ }
+
+ SECTION("Qk128_n50_v0.6.0.sk") {
+ // file: Qk128_n50_v0.6.0.sk
+ // median: 26.0
+ quantiles_decode_and_check(128, 50, "0.6.0", 26.0);
+ }
+
+ SECTION("Qk128_n1000_v0.6.0.sk") {
+ // file: Qk128_n1000_v0.6.0.sk
+ // median: 501.0
+ quantiles_decode_and_check(128, 1000, "0.6.0", 501.0);
+ }
+
+ SECTION("Qk128_n50_v0.8.0.sk") {
+ // file: Qk128_n50_v0.8.0.sk
+ // median: 26.0
+ quantiles_decode_and_check(128, 50, "0.8.0", 26.0);
+ }
+
+ SECTION("Qk128_n1000_v0.8.0.sk") {
+ // file: Qk128_n1000_v0.8.0.sk
+ // median: 501.0
+ quantiles_decode_and_check(128, 1000, "0.8.0", 501.0);
+ }
+
+ SECTION("Qk128_n50_v0.8.3.sk") {
+ // file: Qk128_n50_v0.8.3.sk
+ // median: 26.0
+ quantiles_decode_and_check(128, 50, "0.8.0", 26.0);
+ }
+
+ SECTION("Qk128_n1000_v0.8.3.sk") {
+ // file: Qk128_n1000_v0.8.3.sk
+ // median: 501.0
+ quantiles_decode_and_check(128, 1000, "0.8.3", 501.0);
+ }
+
+ // cleanup
+ if (test_allocator_total_bytes != 0) {
+ REQUIRE(test_allocator_total_bytes == 0);
+ }
+
+}
+
+} /* namespace datasketches */
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org