You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/11 07:25:50 UTC

[incubator-datasketches-cpp] branch sampling updated: [WIP] add binary compatibility tests to var opt sketch

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/sampling by this push:
     new 180094d  [WIP] add binary compatibility tests to var opt sketch
180094d is described below

commit 180094d223c2139e648cde21d9f026ddbe6321aa
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Mon Feb 10 23:25:33 2020 -0800

    [WIP] add binary compatibility tests to var opt sketch
---
 sampling/include/var_opt_sketch.hpp    |   2 +-
 sampling/test/var_opt_sketch_test.cpp  | 201 ++++++---------------------------
 sampling/test/varopt_long_sampling.bin | Bin 0 -> 8248 bytes
 sampling/test/varopt_string_exact.bin  | Bin 0 -> 2916 bytes
 4 files changed, 36 insertions(+), 167 deletions(-)

diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index 9a35ee8..79cfd48 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -106,7 +106,7 @@ class var_opt_sketch {
     static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
     static const uint8_t PREAMBLE_LONGS_FULL   = 4;
     static const uint8_t SER_VER = 2;
-    static const uint8_t FAMILY  = 12;
+    static const uint8_t FAMILY  = 13;
     static const uint8_t EMPTY_FLAG_MASK  = 4;
     static const uint8_t GADGET_FLAG_MASK = 128;
 
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index 78bcb27..524454e 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -59,14 +59,10 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
   CPPUNIT_TEST(string_serialization);
   CPPUNIT_TEST(pseudo_light_update);
   CPPUNIT_TEST(pseudo_heavy_update);
-  // CPPUNIT_TEST(decrease_k_with_under_full_sketch);
-  // CPPUNIT_TEST(decrease_k_with_full_sketch);
   CPPUNIT_TEST(reset);
   CPPUNIT_TEST(estimate_subset_sum);
-  // CPPUNIT_TEST(binary_compatibility);
-  
-  // CPPUNIT_TEST(empty);
-  // CPPUNIT_TEST(vo_union);
+  CPPUNIT_TEST(deserialize_exact_from_java);
+  CPPUNIT_TEST(deserialize_sampling_from_java);
   CPPUNIT_TEST_SUITE_END();
 
   var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
@@ -107,8 +103,6 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
   }
 
   void invalid_k() {
-    std::cerr << "start invalid_k()" << std::endl;
-    {
     CPPUNIT_ASSERT_THROW_MESSAGE("constructor failed to catch invalid k = 0",
       var_opt_sketch<int> sk(0),
       std::invalid_argument);
@@ -116,13 +110,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_THROW_MESSAGE("constructor failed to catch invalid k < 0 (aka >= 2^31)",
       var_opt_sketch<int> sk(1<<31),
       std::invalid_argument);
-    }
-    std::cerr << "end invalid_k()" << std::endl;
   }
 
   void bad_ser_ver() {
-    std::cerr << "start bad_ser_ver()" << std::endl;
-    {
     var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
     std::vector<uint8_t> bytes = sk.serialize();
     bytes[1] = 0; // corrupt the serialization version byte
@@ -138,13 +128,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(stream) failed to catch bad serialization version",
       var_opt_sketch<int>::deserialize(ss),
       std::invalid_argument);
-    }
-    std::cerr << "end bad_ser_ver()" << std::endl;
   }
 
   void bad_family() {
-    std::cerr << "start bad_family()" << std::endl;
-    {
     var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
     std::vector<uint8_t> bytes = sk.serialize();
     bytes[2] = 0; // corrupt the family byte
@@ -159,13 +145,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(stream) failed to catch bad family id",
       var_opt_sketch<int>::deserialize(ss),
       std::invalid_argument);
-    }
-    std::cerr << "end bad_family()" << std::endl;
   }
 
   void bad_prelongs() {
-    std::cerr << "start bad_prelongs()" << std::endl;
-    {
     // The number of preamble longs shares bits with resize_factor, but the latter
     // has no invalid values as it gets 2 bits for 4 enum values.
     var_opt_sketch<int> sk = create_unweighted_sketch(32, 33);
@@ -185,13 +167,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch bad preamble longs",
       var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
       std::invalid_argument);
-    }
-    std::cerr << "end bad_prelongs()" << std::endl;
   }
 
   void malformed_preamble() {
-    std::cerr << "start malformed_preamble()" << std::endl;
-    {
     uint32_t k = 50;
     var_opt_sketch<int> sk = create_unweighted_sketch(k, k);
     const std::vector<uint8_t> src_bytes = sk.serialize();
@@ -225,13 +203,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_THROW_MESSAGE("deserialize(bytes) failed to catch invalid R count",
       var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()),
       std::invalid_argument);
-    }
-    std::cerr << "end malformed_preamble()" << std::endl;
   }
 
   void empty_sketch() {
-    std::cerr << "start empty_sketch()" << std::endl;
-    {
     var_opt_sketch<std::string> sk(5);
     CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
     CPPUNIT_ASSERT_EQUAL((uint32_t) 0, sk.get_num_samples());
@@ -242,13 +216,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     var_opt_sketch<std::string> loaded_sk = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
     CPPUNIT_ASSERT_EQUAL((uint64_t) 0, loaded_sk.get_n());
     CPPUNIT_ASSERT_EQUAL((uint32_t) 0, loaded_sk.get_num_samples());
-    }
-    std::cerr << "end empty_sketch()" << std::endl;
   }
 
   void non_empty_degenerate_sketch() {
-    std::cerr << "start non_empty_degenerate_sketch()" << std::endl;
-    {
     // Make an empty serialized sketch, then extend it to a
     // PREAMBLE_LONGS_WARMUP-sized byte array, with no items.
     // Then clear the empty flag so it will try to load the rest.
@@ -264,24 +234,16 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch non-empty sketch with no items",
       var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()),
       std::invalid_argument);
-    }
-    std::cerr << "end non_empty_degenerate_sketch()" << std::endl;
   }
 
   void invalid_weight() {
-    std::cerr << "start invalid_weights()" << std::endl;
-    {
     var_opt_sketch<std::string> sk(100, resize_factor::X2);
     CPPUNIT_ASSERT_THROW_MESSAGE("update() accepted a negative weight",
       sk.update("invalid_weight", -1.0),
       std::invalid_argument);
-    }
-    std::cerr << "end invalid_weights()" << std::endl;
   }
 
   void corrupt_serialized_weight() {
-    std::cerr << "start corrupt_serialized_weight()" << std::endl;
-    {
     var_opt_sketch<int> sk = create_unweighted_sketch(100, 20);
     auto bytes = sk.to_string();
     
@@ -298,13 +260,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_THROW_MESSAGE("deserialize() failed to catch negative item weight",
       var_opt_sketch<std::string>::deserialize(ss),
       std::invalid_argument);
-    }
-    std::cerr << "end corrupt_serialized_weight()" << std::endl;
   }
 
   void cumulative_weight() {
-    std::cerr << "start cumulative_weight()" << std::endl;
-    {
     uint32_t k = 256;
     uint64_t n = 10 * k;
     var_opt_sketch<int> sk(k);
@@ -329,13 +287,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     
     double weight_ratio = output_sum / input_sum;
     CPPUNIT_ASSERT(std::abs(weight_ratio - 1.0) < EPS);
-    }
-    std::cerr << "end cumulative_weight()" << std::endl;
   }
 
   void under_full_sketch_serialization() {
-    std::cerr << "start under_full_sketch_serialization()" << std::endl;
-    {
     var_opt_sketch<int> sk = create_unweighted_sketch(100, 10); // need n < k
 
     auto bytes = sk.serialize();
@@ -346,13 +300,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     sk.serialize(ss);
     var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
     check_if_equal(sk, sk_from_stream);
-    }
-    std::cerr << "end under_full_sketch_serialization()" << std::endl;
   }
 
   void end_of_warmup_sketch_serialization() {
-    std::cerr << "start end_of_warmup_sketch_serialization()" << std::endl;
-    {
     var_opt_sketch<int> sk = create_unweighted_sketch(2843, 2843); // need n == k
     auto bytes = sk.serialize();
 
@@ -367,13 +317,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     sk.serialize(ss);
     var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
     check_if_equal(sk, sk_from_stream);
-    }
-    std::cerr << "end end_of_warmup_sketch_serialization()" << std::endl;
   }
 
   void full_sketch_serialization() {
-    std::cerr << "start full_sketch_serialization()" << std::endl;
-    {
     var_opt_sketch<int> sk = create_unweighted_sketch(32, 32);
     sk.update(100, 100.0);
     sk.update(101, 101.0);
@@ -400,13 +346,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     sk.serialize(ss);
     var_opt_sketch<int> sk_from_stream = var_opt_sketch<int>::deserialize(ss);
     check_if_equal(sk, sk_from_stream);
-    }
-    std::cerr << "end full_sketch_serialization()" << std::endl;
   }
 
   void string_serialization() {
-    std::cerr << "start string_serialization()" << std::endl;
-    {
     var_opt_sketch<std::string> sk(5);
     sk.update("a", 1.0);
     sk.update("bc", 1.0);
@@ -423,14 +365,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     sk.serialize(ss);
     var_opt_sketch<std::string> sk_from_stream = var_opt_sketch<std::string>::deserialize(ss);
     check_if_equal(sk, sk_from_stream);
-
-    }
-    std::cerr << "end string_serialization()" << std::endl;
   }
 
   void pseudo_light_update() {
-    std::cerr << "start pseudo_light_update()" << std::endl;
-    {
     uint32_t k = 1024;
     var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
     sk.update(0, 1.0); // k+2nd update
@@ -442,13 +379,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     double wt = (*it).second;
     CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("weight corruption in pseudo_light_update()",
       ((1.0 * (k + 2)) / k), wt, EPS);
-    }
-    std::cerr << "end pseudo_light_update()" << std::endl;
   }
 
   void pseudo_heavy_update() {
-    std::cerr << "start pseudo_heavy_update()" << std::endl;
-    {
     uint32_t k = 1024;
     double wt_scale = 10.0 * k;
     var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
@@ -476,13 +409,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     }
     CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("weight corruption in pseudo_light_update()",
       1.0 + wt_scale + (2 * k), wt, EPS);
-    }
-    std::cerr << "end pseudo_heavy_update()" << std::endl;
   }
 
   void reset() {
-    std::cerr << "start reset()" << std::endl;
-    {
     uint32_t k = 1024;
     uint64_t n1 = 20;
     uint64_t n2 = 2 * k;
@@ -508,13 +437,9 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     sk.reset();
     CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sk.get_n());
     CPPUNIT_ASSERT_EQUAL(k, sk.get_k());
-    }
-    std::cerr << "end reset()" << std::endl;
   }
 
   void estimate_subset_sum() {
-    std::cerr << "start estimate_subset_sum()" << std::endl;
-    {
     uint32_t k = 10;
     var_opt_sketch<int> sk(k);
 
@@ -563,7 +488,7 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
       total_weight += 1.0 * i;
     }
 
-    summary= sk.estimate_subset_sum([](int x) { return x < 0; });
+    summary = sk.estimate_subset_sum([](int x) { return x < 0; });
     CPPUNIT_ASSERT_GREATEREQUAL(summary.lower_bound, summary.estimate); // estimate >= lower_bound)
     CPPUNIT_ASSERT_LESSEQUAL(summary.upper_bound, summary.estimate); // estimate <= upper_bound)
 
@@ -584,98 +509,42 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_DOUBLES_EQUAL(summary.lower_bound, summary.estimate, 0.0);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(summary.upper_bound, summary.estimate, 0.0);
     CPPUNIT_ASSERT_LESS(total_weight, summary.estimate); // exact mode, so know it must be strictly less
-    }
-    std::cerr << "end estimate_subset_sum()" << std::endl;
   }
 
-  /**********************************************************************/
-
-  void vo_union() {
-    int k = 10;
-    var_opt_sketch<int> sk(k), sk2(k+3);
-
-    for (int i = 0; i < 10*k; ++i) {
-      sk.update(i);
-      sk2.update(i);
-    }
-    sk.update(-1, 10000.0);
-    sk2.update(-2, 4000.0);
-    std::cerr << sk.to_string() << std::endl;
-
-    var_opt_union<int> vou(k+3);
-    std::cerr << vou.to_string() << std::endl;
-    vou.update(sk);
-    vou.update(sk2);
-    std::cerr << vou.to_string() << std::endl;
-
-    var_opt_sketch<int> r = vou.get_result();
-    std::cerr << "-----------------------" << std::endl << r.to_string() << std::endl;
+  void deserialize_exact_from_java() {
+    std::ifstream is;
+    is.exceptions(std::ios::failbit | std::ios::badbit);
+    is.open(testBinaryInputPath + "varopt_string_exact.bin", std::ios::binary);
+    var_opt_sketch<std::string> sketch = var_opt_sketch<std::string>::deserialize(is);
+    CPPUNIT_ASSERT(!sketch.is_empty());
+    CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 200, sketch.get_n());
+    CPPUNIT_ASSERT_EQUAL((uint32_t) 200, sketch.get_num_samples());
+    subset_summary ss = sketch.estimate_subset_sum([](std::string x){ return true; });
+
+    double tgt_wt = 0.0;
+    for (int i = 1; i <= 200; ++i) { tgt_wt += 1000.0 / i; }
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(tgt_wt, ss.total_sketch_weight, EPS);
   }
 
-  void empty() {
-    int k = 10;
-
-    {
-    var_opt_sketch<int> sketch(k);
-
-    for (int i = 0; i < 2*k; ++i)
-      sketch.update(i);
-    sketch.update(1000, 100000.0);
-
-    std::cout << sketch.to_string();
-
-    std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
-    sketch.serialize(ss);
-    std::cout << "sketch.serialize() done\n";
-    var_opt_sketch<int> sk2 = var_opt_sketch<int>::deserialize(ss);
-    std::cout << sk2.to_string() << std::endl;;
-    }
-
-    {
-    var_opt_sketch<std::string> sk(k);
-    std::cout << "Expected size: " << sk.get_serialized_size_bytes() << std::endl;
-    std::string x[26];
-    x[0]  = std::string("a");
-    x[1]  = std::string("b");
-    x[2]  = std::string("c");
-    x[3]  = std::string("d");
-    x[4]  = std::string("e");
-    x[5]  = std::string("f");
-    x[6]  = std::string("g");
-    x[7]  = std::string("h");
-    x[8]  = std::string("i");
-    x[9]  = std::string("j");
-    x[10] = std::string("k");
-    x[11] = std::string("l");
-    x[12] = std::string("m");
-    x[13] = std::string("n");
-    x[14] = std::string("o");
-    x[15] = std::string("p");
-    x[16] = std::string("q");
-    x[17] = std::string("r");
-    x[18] = std::string("s");
-    x[19] = std::string("t");
-    x[20] = std::string("u");
-    x[21] = std::string("v");
-    x[22] = std::string("w");
-    x[23] = std::string("x");
-    x[24] = std::string("y");
-    x[25] = std::string("z");
-
-    for (int i=0; i <11; ++i)
-      sk.update(x[i]);
-    sk.update(x[11], 10000);
-    std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
-    sk.serialize(ss);
-    std::cout << "ss size: " << ss.str().length() << std::endl;
-    auto vec = sk.serialize();
-    std::cout << "Vector size: " << vec.size() << std::endl;
-    
-    var_opt_sketch<std::string> sk2 = var_opt_sketch<std::string>::deserialize(ss);
-    std::cout << sk2.to_string() << std::endl;
-    const std::string str("much longer string with luck won't fit nicely in existing structure location");
-    sk2.update(str, 1000000);
-    }
+  void deserialize_sampling_from_java() {
+    std::ifstream is;
+    is.exceptions(std::ios::failbit | std::ios::badbit);
+    is.open(testBinaryInputPath + "varopt_long_sampling.bin", std::ios::binary);
+    var_opt_sketch<int64_t> sketch = var_opt_sketch<int64_t>::deserialize(is);
+    CPPUNIT_ASSERT(!sketch.is_empty());
+    CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 2003, sketch.get_n());
+    CPPUNIT_ASSERT_EQUAL(sketch.get_k(), sketch.get_num_samples());
+    subset_summary ss = sketch.estimate_subset_sum([](int64_t x){ return true; });
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(332000.0, ss.estimate, EPS);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(332000.0, ss.total_sketch_weight, EPS);
+
+    ss = sketch.estimate_subset_sum([](int64_t x){ return x < 0; });
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(330000.0, ss.estimate, 0.0);
+
+    ss = sketch.estimate_subset_sum([](int64_t x){ return x >= 0; });
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(2000.0, ss.estimate, EPS);
   }
 
 };
diff --git a/sampling/test/varopt_long_sampling.bin b/sampling/test/varopt_long_sampling.bin
new file mode 100644
index 0000000..ae8c750
Binary files /dev/null and b/sampling/test/varopt_long_sampling.bin differ
diff --git a/sampling/test/varopt_string_exact.bin b/sampling/test/varopt_string_exact.bin
new file mode 100644
index 0000000..2da7e4e
Binary files /dev/null and b/sampling/test/varopt_string_exact.bin differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org