You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2023/01/12 04:56:39 UTC

[datasketches-cpp] branch theta_compression updated: pack num_entries

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch theta_compression
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


The following commit(s) were added to refs/heads/theta_compression by this push:
     new ba9c089  pack num_entries
ba9c089 is described below

commit ba9c089cab5d427bfe085dc200c9846e019ad9d2
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Wed Jan 11 20:56:33 2023 -0800

    pack num_entries
---
 theta/include/compact_theta_sketch_parser.hpp      |  7 +++----
 theta/include/compact_theta_sketch_parser_impl.hpp | 23 ++++++++++++++++------
 theta/include/theta_sketch_impl.hpp                | 23 ++++++++++------------
 3 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/theta/include/compact_theta_sketch_parser.hpp b/theta/include/compact_theta_sketch_parser.hpp
index 8a88324..29bd860 100644
--- a/theta/include/compact_theta_sketch_parser.hpp
+++ b/theta/include/compact_theta_sketch_parser.hpp
@@ -52,11 +52,10 @@ private:
   static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3; // ver 1-3
   static const size_t COMPACT_SKETCH_THETA_U64 = 2; // ver 1-3
   static const size_t COMPACT_SKETCH_V4_MIN_ENTRY_ZEROS_BYTE = 3;
+  static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE = 4;
   static const size_t COMPACT_SKETCH_V4_THETA_U64 = 1;
-  static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_EXACT_U32 = 2;
-  static const size_t COMPACT_SKETCH_V4_NUM_ENTRIES_ESTIMATION_U32 = 4;
-  static const size_t COMPACT_SKETCH_V4_PACKED_DATA_EXACT_U8 = 12;
-  static const size_t COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_U8 = 20;
+  static const size_t COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE = 8;
+  static const size_t COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE = 16;
 
   static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
   static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
diff --git a/theta/include/compact_theta_sketch_parser_impl.hpp b/theta/include/compact_theta_sketch_parser_impl.hpp
index 115cdda..576017e 100644
--- a/theta/include/compact_theta_sketch_parser_impl.hpp
+++ b/theta/include/compact_theta_sketch_parser_impl.hpp
@@ -26,6 +26,12 @@
 
 namespace datasketches {
 
+template<typename T>
+T whole_bytes_to_hold_bits(T bits) {
+  static_assert(std::is_integral<T>::value, "integral type expected");
+  return (bits >> 3) + ((bits & 7) > 0);
+}
+
 template<bool dummy>
 auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
   check_memory_size(ptr, size, 8, dump_on_error);
@@ -42,16 +48,21 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
       check_memory_size(ptr, size, 16, dump_on_error);
       theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_V4_THETA_U64];
     }
-    const size_t num_entries_index = has_theta ? COMPACT_SKETCH_V4_NUM_ENTRIES_ESTIMATION_U32 : COMPACT_SKETCH_V4_NUM_ENTRIES_EXACT_U32;
-    check_memory_size(ptr, size, (num_entries_index + 1) * sizeof(uint32_t), dump_on_error);
-    const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[num_entries_index];
-    const size_t entries_offset_bytes = has_theta ? COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_U8 : COMPACT_SKETCH_V4_PACKED_DATA_EXACT_U8;
+    const uint8_t num_entries_bytes = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE];
+    size_t data_offset_bytes = has_theta ? COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE : COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE;
+    check_memory_size(ptr, size, data_offset_bytes + num_entries_bytes, dump_on_error);
+    uint32_t num_entries = 0;
+    const uint8_t* num_entries_ptr = reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes;
+    for (unsigned i = 0; i < num_entries_bytes; ++i) {
+      num_entries |= (*num_entries_ptr++) << (i << 3);
+    }
+    data_offset_bytes += num_entries_bytes;
     const uint8_t min_entry_zeros = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_MIN_ENTRY_ZEROS_BYTE];
     const size_t expected_bits = (64 - min_entry_zeros) * num_entries;
-    const size_t expected_size_bytes = entries_offset_bytes + std::ceil(expected_bits / 8.0);
+    const size_t expected_size_bytes = data_offset_bytes + whole_bytes_to_hold_bits(expected_bits);
     check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
     return {false, true, seed_hash, num_entries, theta,
-      reinterpret_cast<const uint8_t*>(ptr) + entries_offset_bytes, static_cast<uint8_t>(64 - min_entry_zeros)};
+      reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes, static_cast<uint8_t>(64 - min_entry_zeros)};
   }
   case 3: {
       uint64_t theta = theta_constants::MAX_THETA;
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index a49cb31..ed50bba 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -425,11 +425,11 @@ auto compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned header_size
   uint8_t min_entry_zeros = count_leading_zeros_in_u64(ored);
   size_t compressed_bits = (64 - min_entry_zeros) * entries_.size();
 
-//  const uint8_t num_entries_zeros = count_leading_zeros_in_u32(entries_.size());
-//  const uint8_t num_entries_bits = 31 - num_entries_zeros; // first 1 is understood
-//  compressed_bits += num_entries_bits * (entries_.size() > 1); // no bits for 0 or 1 entry
+  // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
+  const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
 
-  const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + std::ceil(compressed_bits / 8.0) + sizeof(uint32_t);
+  const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
+      + whole_bytes_to_hold_bits(compressed_bits);
   vector_bytes bytes(size, 0, entries_.get_allocator());
   uint8_t* ptr = bytes.data() + header_size_bytes;
 
@@ -438,7 +438,7 @@ auto compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned header_size
   ptr += copy_to_mem<uint8_t>(serial_version, ptr);
   ptr += copy_to_mem(SKETCH_TYPE, ptr);
   ptr += copy_to_mem(min_entry_zeros, ptr);
-  ptr += sizeof(uint8_t); // unused
+  ptr += copy_to_mem(num_entries_bytes, ptr);
   const uint8_t flags_byte(
     (1 << flags::IS_COMPACT) |
     (1 << flags::IS_READ_ONLY) |
@@ -450,14 +450,11 @@ auto compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned header_size
   if (this->is_estimation_mode()) {
     ptr += copy_to_mem(theta_, ptr);
   }
-//  uint8_t offset_bits = 0;
-//  if (entries_.size() > 1) {
-//    offset_bits = put_bits64(entries_.size(), num_entries_bits, ptr, offset_bits);
-//    std::cout << "writing " << std::to_string(num_entries_bits) << " bits\n";
-//  }
-
-  // uncompressed num_entries for now
-  ptr += copy_to_mem<uint32_t>(entries_.size(), ptr);
+  uint32_t num_entries = entries_.size();
+  for (unsigned i = 0; i < num_entries_bytes; ++i) {
+    *ptr++ = num_entries & 0xff;
+    num_entries >>= 8;
+  }
 
   const uint8_t entry_bits = 64 - min_entry_zeros;
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org