You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2019/08/19 23:47:29 UTC

[incubator-datasketches-cpp] branch kll_minor_cleanup created (now c868803)

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a change to branch kll_minor_cleanup
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git.


      at c868803  more tests for kll sketch of strings

This branch includes the following new commits:

     new c2b25d3  minor improvements
     new c868803  more tests for kll sketch of strings

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 02/02: more tests for kll sketch of strings

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch kll_minor_cleanup
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit c868803ef6181ac3783905d968e6ff90aa3e82c5
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Mon Aug 19 16:47:13 2019 -0700

    more tests for kll sketch of strings
---
 kll/test/kll_sketch_test.cpp | 96 +++++++++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 27 deletions(-)

diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp
index 8e12791..c14bb03 100644
--- a/kll/test/kll_sketch_test.cpp
+++ b/kll/test/kll_sketch_test.cpp
@@ -67,7 +67,9 @@ class kll_sketch_test: public CppUnit::TestFixture {
   CPPUNIT_TEST(merge_min_value_from_other);
   CPPUNIT_TEST(merge_min_and_max_from_other);
   CPPUNIT_TEST(sketch_of_ints);
-  CPPUNIT_TEST(sketch_of_strings);
+  CPPUNIT_TEST(sketch_of_strings_stream);
+  CPPUNIT_TEST(sketch_of_strings_bytes);
+  CPPUNIT_TEST(sketch_of_strings_single_item_bytes);
   CPPUNIT_TEST(copy);
   CPPUNIT_TEST_SUITE_END();
 
@@ -502,43 +504,83 @@ public:
     CPPUNIT_ASSERT_EQUAL(sketch.get_rank(n), sketch2.get_rank(n));
   }
 
-  void sketch_of_strings() {
-    kll_string_sketch sketch;
-    CPPUNIT_ASSERT_THROW(sketch.get_quantile(0), std::runtime_error);
-    CPPUNIT_ASSERT_THROW(sketch.get_min_value(), std::runtime_error);
-    CPPUNIT_ASSERT_THROW(sketch.get_max_value(), std::runtime_error);
-    CPPUNIT_ASSERT_EQUAL(8u, sketch.get_serialized_size_bytes());
+  void sketch_of_strings_stream() {
+    kll_string_sketch sketch1;
+    CPPUNIT_ASSERT_THROW(sketch1.get_quantile(0), std::runtime_error);
+    CPPUNIT_ASSERT_THROW(sketch1.get_min_value(), std::runtime_error);
+    CPPUNIT_ASSERT_THROW(sketch1.get_max_value(), std::runtime_error);
+    CPPUNIT_ASSERT_EQUAL(8u, sketch1.get_serialized_size_bytes());
 
-    const int n(1000);
-    for (int i = 0; i < n; i++) sketch.update(std::to_string(i));
+    const int n = 1000;
+    for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
 
-    CPPUNIT_ASSERT_EQUAL(std::string("0"), sketch.get_min_value());
-    CPPUNIT_ASSERT_EQUAL(std::string("999"), sketch.get_max_value());
+    CPPUNIT_ASSERT_EQUAL(std::string("0"), sketch1.get_min_value());
+    CPPUNIT_ASSERT_EQUAL(std::string("999"), sketch1.get_max_value());
 
     std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
-    sketch.serialize(s);
-    CPPUNIT_ASSERT_EQUAL(sketch.get_serialized_size_bytes(), (uint32_t) s.tellp());
+    sketch1.serialize(s);
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_serialized_size_bytes(), (uint32_t) s.tellp());
     auto sketch2 = kll_string_sketch::deserialize(s);
     CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) s.tellg());
     CPPUNIT_ASSERT_EQUAL(s.tellp(), s.tellg());
-    CPPUNIT_ASSERT_EQUAL(sketch.is_empty(), sketch2.is_empty());
-    CPPUNIT_ASSERT_EQUAL(sketch.is_estimation_mode(), sketch2.is_estimation_mode());
-    CPPUNIT_ASSERT_EQUAL(sketch.get_n(), sketch2.get_n());
-    CPPUNIT_ASSERT_EQUAL(sketch.get_num_retained(), sketch2.get_num_retained());
-    CPPUNIT_ASSERT_EQUAL(sketch.get_min_value(), sketch2.get_min_value());
-    CPPUNIT_ASSERT_EQUAL(sketch.get_max_value(), sketch2.get_max_value());
-    CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
-    CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
-    CPPUNIT_ASSERT_EQUAL(sketch.get_quantile(0.5), sketch2.get_quantile(0.5));
-    CPPUNIT_ASSERT_EQUAL(sketch.get_rank(std::to_string(0)), sketch2.get_rank(std::to_string(0)));
-    CPPUNIT_ASSERT_EQUAL(sketch.get_rank(std::to_string(n)), sketch2.get_rank(std::to_string(n)));
+    CPPUNIT_ASSERT_EQUAL(sketch1.is_empty(), sketch2.is_empty());
+    CPPUNIT_ASSERT_EQUAL(sketch1.is_estimation_mode(), sketch2.is_estimation_mode());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_n(), sketch2.get_n());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_num_retained(), sketch2.get_num_retained());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_min_value(), sketch2.get_min_value());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_max_value(), sketch2.get_max_value());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_quantile(0.5), sketch2.get_quantile(0.5));
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(std::to_string(0)), sketch2.get_rank(std::to_string(0)));
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(std::to_string(n)), sketch2.get_rank(std::to_string(n)));
 
     // to take a look using hexdump
-    std::ofstream os("kll-string.bin");
-    sketch.serialize(os);
+    //std::ofstream os("kll-string.bin");
+    //sketch1.serialize(os);
 
     // debug print
-    //sketch.to_stream(std::cout);
+    //sketch1.to_stream(std::cout);
+  }
+
+  void sketch_of_strings_bytes() {
+    kll_string_sketch sketch1;
+    CPPUNIT_ASSERT_THROW(sketch1.get_quantile(0), std::runtime_error);
+    CPPUNIT_ASSERT_THROW(sketch1.get_min_value(), std::runtime_error);
+    CPPUNIT_ASSERT_THROW(sketch1.get_max_value(), std::runtime_error);
+    CPPUNIT_ASSERT_EQUAL(8u, sketch1.get_serialized_size_bytes());
+
+    const int n = 1000;
+    for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
+
+    CPPUNIT_ASSERT_EQUAL(std::string("0"), sketch1.get_min_value());
+    CPPUNIT_ASSERT_EQUAL(std::string("999"), sketch1.get_max_value());
+
+    auto data = sketch1.serialize();
+    CPPUNIT_ASSERT_EQUAL((size_t) sketch1.get_serialized_size_bytes(), data.second);
+    auto sketch2 = kll_string_sketch::deserialize(data.first.get(), data.second);
+    CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) data.second);
+    CPPUNIT_ASSERT_EQUAL(sketch1.is_empty(), sketch2.is_empty());
+    CPPUNIT_ASSERT_EQUAL(sketch1.is_estimation_mode(), sketch2.is_estimation_mode());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_n(), sketch2.get_n());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_num_retained(), sketch2.get_num_retained());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_min_value(), sketch2.get_min_value());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_max_value(), sketch2.get_max_value());
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_quantile(0.5), sketch2.get_quantile(0.5));
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(std::to_string(0)), sketch2.get_rank(std::to_string(0)));
+    CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(std::to_string(n)), sketch2.get_rank(std::to_string(n)));
+  }
+
+
+  void sketch_of_strings_single_item_bytes() {
+    kll_string_sketch sketch1;
+    sketch1.update("a");
+    auto data = sketch1.serialize();
+    CPPUNIT_ASSERT_EQUAL((size_t) sketch1.get_serialized_size_bytes(), data.second);
+    auto sketch2 = kll_string_sketch::deserialize(data.first.get(), data.second);
+    CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) data.second);
   }
 
   void copy() {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-cpp] 01/02: minor improvements

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch kll_minor_cleanup
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit c2b25d3d1a83563763c56cd1a40f91c2140154a1
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Mon Aug 19 16:46:21 2019 -0700

    minor improvements
---
 kll/include/kll_sketch.hpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index 4663af5..c7ff7a6 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp
@@ -281,7 +281,7 @@ class kll_sketch {
     kll_sketch(uint16_t k, uint8_t flags_byte, const void* bytes, size_t size);
 
     // common update code
-    uint32_t internal_update(const T& value);
+    inline uint32_t internal_update(const T& value);
 
     // The following code is only valid in the special case of exactly reaching capacity while updating.
     // It cannot be used while merging, while reducing k, or anything else.
@@ -416,14 +416,14 @@ kll_sketch<T, C, S, A>::~kll_sketch() {
 
 template<typename T, typename C, typename S, typename A>
 void kll_sketch<T, C, S, A>::update(const T& value) {
-  const uint32_t next_pos = internal_update(value);
-  new (&items_[next_pos]) T(value);
+  const uint32_t index = internal_update(value);
+  new (&items_[index]) T(value);
 }
 
 template<typename T, typename C, typename S, typename A>
 void kll_sketch<T, C, S, A>::update(T&& value) {
-  const uint32_t next_pos = internal_update(value);
-  new (&items_[next_pos]) T(std::move(value));
+  const uint32_t index = internal_update(value);
+  new (&items_[index]) T(std::move(value));
 }
 
 template<typename T, typename C, typename S, typename A>
@@ -438,9 +438,7 @@ uint32_t kll_sketch<T, C, S, A>::internal_update(const T& value) {
   if (levels_[0] == 0) compress_while_updating();
   n_++;
   is_level_zero_sorted_ = false;
-  const uint32_t next_pos(levels_[0] - 1);
-  levels_[0] = next_pos;
-  return next_pos;
+  return --levels_[0];
 }
 
 template<typename T, typename C, typename S, typename A>
@@ -647,7 +645,8 @@ std::pair<void_ptr_with_deleter, const size_t> kll_sketch<T, C, S, A>::serialize
     }
     ptr += S().serialize(ptr, &items_[levels_[0]], get_num_retained());
   }
-  if (ptr != static_cast<char*>(data_ptr.get()) + size) throw std::logic_error("serialized size mismatch");
+  const size_t delta = ptr - static_cast<const char*>(data_ptr.get());
+  if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
   return std::make_pair(std::move(data_ptr), size);
 }
 
@@ -804,7 +803,8 @@ kll_sketch<T, C, S, A>::kll_sketch(uint16_t k, uint8_t flags_byte, const void* b
     new (max_value_) T(items_[levels_[0]]);
   }
   is_level_zero_sorted_ = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
-  if (ptr != static_cast<const char*>(bytes) + size) throw std::logic_error("deserialized size mismatch");
+  const size_t delta = ptr - static_cast<const char*>(bytes);
+  if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
 }
 
 // The following code is only valid in the special case of exactly reaching capacity while updating.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org