You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2023/01/12 04:56:39 UTC
[datasketches-cpp] branch theta_compression updated: pack num_entries
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch theta_compression
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
The following commit(s) were added to refs/heads/theta_compression by this push:
new ba9c089 pack num_entries
ba9c089 is described below
commit ba9c089cab5d427bfe085dc200c9846e019ad9d2
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Wed Jan 11 20:56:33 2023 -0800
pack num_entries
---
theta/include/compact_theta_sketch_parser.hpp | 7 +++----
theta/include/compact_theta_sketch_parser_impl.hpp | 23 ++++++++++++++++------
theta/include/theta_sketch_impl.hpp | 23 ++++++++++------------
3 files changed, 30 insertions(+), 23 deletions(-)
diff --git a/theta/include/compact_theta_sketch_parser.hpp b/theta/include/compact_theta_sketch_parser.hpp
index 8a88324..29bd860 100644
--- a/theta/include/compact_theta_sketch_parser.hpp
+++ b/theta/include/compact_theta_sketch_parser.hpp
@@ -52,11 +52,10 @@ private:
static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3; // ver 1-3
static const size_t COMPACT_SKETCH_THETA_U64 = 2; // ver 1-3
static const size_t COMPACT_SKETCH_V4_MIN_ENTRY_ZEROS_BYTE = 3;
+ static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE = 4;
static const size_t COMPACT_SKETCH_V4_THETA_U64 = 1;
- static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_EXACT_U32 = 2;
- static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_ESTIMATION_U32 = 4;
- static const size_t COMPACT_SKETCH_V4_PACKED_DATA_EXACT_U8 = 12;
- static const size_t COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_U8 = 20;
+ static const size_t COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE = 8;
+ static const size_t COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE = 16;
static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
diff --git a/theta/include/compact_theta_sketch_parser_impl.hpp b/theta/include/compact_theta_sketch_parser_impl.hpp
index 115cdda..576017e 100644
--- a/theta/include/compact_theta_sketch_parser_impl.hpp
+++ b/theta/include/compact_theta_sketch_parser_impl.hpp
@@ -26,6 +26,12 @@
namespace datasketches {
+template<typename T>
+T whole_bytes_to_hold_bits(T bits) {
+ static_assert(std::is_integral<T>::value, "integral type expected");
+ return (bits >> 3) + ((bits & 7) > 0);
+}
+
template<bool dummy>
auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
check_memory_size(ptr, size, 8, dump_on_error);
@@ -42,16 +48,21 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
check_memory_size(ptr, size, 16, dump_on_error);
theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_V4_THETA_U64];
}
- const size_t num_entries_index = has_theta ? COMPACT_SKETCH_V4_NUM_ENTRIES_ESTIMATION_U32 : COMPACT_SKETCH_V4_NUM_ENTRIES_EXACT_U32;
- check_memory_size(ptr, size, (num_entries_index + 1) * sizeof(uint32_t), dump_on_error);
- const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[num_entries_index];
- const size_t entries_offset_bytes = has_theta ? COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_U8 : COMPACT_SKETCH_V4_PACKED_DATA_EXACT_U8;
+ const uint8_t num_entries_bytes = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE];
+ size_t data_offset_bytes = has_theta ? COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE : COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE;
+ check_memory_size(ptr, size, data_offset_bytes + num_entries_bytes, dump_on_error);
+ uint32_t num_entries = 0;
+ const uint8_t* num_entries_ptr = reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes;
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
+ num_entries |= (*num_entries_ptr++) << (i << 3);
+ }
+ data_offset_bytes += num_entries_bytes;
const uint8_t min_entry_zeros = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_MIN_ENTRY_ZEROS_BYTE];
const size_t expected_bits = (64 - min_entry_zeros) * num_entries;
- const size_t expected_size_bytes = entries_offset_bytes + std::ceil(expected_bits / 8.0);
+ const size_t expected_size_bytes = data_offset_bytes + whole_bytes_to_hold_bits(expected_bits);
check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
return {false, true, seed_hash, num_entries, theta,
- reinterpret_cast<const uint8_t*>(ptr) + entries_offset_bytes, static_cast<uint8_t>(64 - min_entry_zeros)};
+ reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes, static_cast<uint8_t>(64 - min_entry_zeros)};
}
case 3: {
uint64_t theta = theta_constants::MAX_THETA;
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index a49cb31..ed50bba 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -425,11 +425,11 @@ auto compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned header_size
uint8_t min_entry_zeros = count_leading_zeros_in_u64(ored);
size_t compressed_bits = (64 - min_entry_zeros) * entries_.size();
-// const uint8_t num_entries_zeros = count_leading_zeros_in_u32(entries_.size());
-// const uint8_t num_entries_bits = 31 - num_entries_zeros; // first 1 is understood
-// compressed_bits += num_entries_bits * (entries_.size() > 1); // no bits for 0 or 1 entry
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
+ const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + std::ceil(compressed_bits / 8.0) + sizeof(uint32_t);
+ const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
+ + whole_bytes_to_hold_bits(compressed_bits);
vector_bytes bytes(size, 0, entries_.get_allocator());
uint8_t* ptr = bytes.data() + header_size_bytes;
@@ -438,7 +438,7 @@ auto compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned header_size
ptr += copy_to_mem<uint8_t>(serial_version, ptr);
ptr += copy_to_mem(SKETCH_TYPE, ptr);
ptr += copy_to_mem(min_entry_zeros, ptr);
- ptr += sizeof(uint8_t); // unused
+ ptr += copy_to_mem(num_entries_bytes, ptr);
const uint8_t flags_byte(
(1 << flags::IS_COMPACT) |
(1 << flags::IS_READ_ONLY) |
@@ -450,14 +450,11 @@ auto compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned header_size
if (this->is_estimation_mode()) {
ptr += copy_to_mem(theta_, ptr);
}
-// uint8_t offset_bits = 0;
-// if (entries_.size() > 1) {
-// offset_bits = put_bits64(entries_.size(), num_entries_bits, ptr, offset_bits);
-// std::cout << "writing " << std::to_string(num_entries_bits) << " bits\n";
-// }
-
- // uncompressed num_entries for now
- ptr += copy_to_mem<uint32_t>(entries_.size(), ptr);
+ uint32_t num_entries = entries_.size();
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
+ *ptr++ = num_entries & 0xff;
+ num_entries >>= 8;
+ }
const uint8_t entry_bits = 64 - min_entry_zeros;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org