You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/15 09:04:22 UTC

[incubator-datasketches-cpp] 01/02: finish deserialization tests from java, describe code used to generate java binaries

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git

commit 7e3e8ab36d8978e2dfd95cf6468550509168172b
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Fri Feb 14 11:39:27 2020 -0800

    finish deserialization tests from java, describe code used to generate java binaries
---
 sampling/test/binaries_from_java.txt               |  67 +++++++++++++++++++++
 sampling/test/var_opt_sketch_test.cpp              |   4 +-
 sampling/test/var_opt_union_test.cpp               |  65 +++++---------------
 ...ampling.bin => varopt_sketch_long_sampling.bin} | Bin
 ...ng_exact.bin => varopt_sketch_string_exact.bin} | Bin
 sampling/test/varopt_union_double_sampling.bin     | Bin 0 -> 572 bytes
 6 files changed, 85 insertions(+), 51 deletions(-)

diff --git a/sampling/test/binaries_from_java.txt b/sampling/test/binaries_from_java.txt
new file mode 100644
index 0000000..eb3ea30
--- /dev/null
+++ b/sampling/test/binaries_from_java.txt
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+Code snippets used to generate the binary images from Java.
+Heavy items have negative values to allow a simple predicate to filter
+heavy vs light sketch entries.
+
+
+varopt_sketch_string_exact.bin:
+final VarOptItemsSketch<String> sk = VarOptItemsSketch.newInstance(1024);
+for (int i = 1; i <= 200; ++i) {
+    sk.update(Integer.toString(i), 1000.0 / i);
+}
+byte[] bytes = sk.toByteArray(new ArrayOfStringsSerDe());
+
+
+varopt_sketch_long_sampling.bin:
+final VarOptItemsSketch<Long> sk = VarOptItemsSketch.newInstance(1024);
+for (long i = 0; i < 2000; ++i) {
+  sk.update(i, 1.0);
+}
+sk.update(-1L, 100000.0);
+sk.update(-2L, 110000.0);
+sk.update(-3L, 120000.0);
+byte[] bytes = sk.toByteArray(new ArrayOfLongsSerDe());
+
+
+varopt_union_double_sampling.bin:
+// parallels small sampling sketch test
+final int kSmall = 16;
+final int n1 = 32;
+final int n2 = 64;
+final int kMax = 128;
+
+// small k sketch, but sampling
+VarOptItemsSketch<Double> sketch = VarOptItemsSketch.newInstance(kSmall);
+for (int i = 0; i < n1; ++i) {
+  sketch.update(1.0 * i, 1.0);
+}
+sketch.update(-1.0, n1 * n1); // add a heavy item
+
+final VarOptItemsUnion<Double> union = VarOptItemsUnion.newInstance(kMax);
+union.update(sketch);
+
+// another one, but different n to get a different per-item weight
+sketch = VarOptItemsSketch.newInstance(kSmall);
+for (int i = 0; i < n2; ++i) {
+  sketch.update(1.0 * i, 1.0);
+}
+union.update(sketch);
+byte[] bytes = union.toByteArray(new ArrayOfDoublesSerDe());
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index a2819f3..2472a3a 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -513,7 +513,7 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
   void deserialize_exact_from_java() {
     std::ifstream is;
     is.exceptions(std::ios::failbit | std::ios::badbit);
-    is.open(testBinaryInputPath + "varopt_string_exact.bin", std::ios::binary);
+    is.open(testBinaryInputPath + "varopt_sketch_string_exact.bin", std::ios::binary);
     var_opt_sketch<std::string> sketch = var_opt_sketch<std::string>::deserialize(is);
     CPPUNIT_ASSERT(!sketch.is_empty());
     CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
@@ -529,7 +529,7 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
   void deserialize_sampling_from_java() {
     std::ifstream is;
     is.exceptions(std::ios::failbit | std::ios::badbit);
-    is.open(testBinaryInputPath + "varopt_long_sampling.bin", std::ios::binary);
+    is.open(testBinaryInputPath + "varopt_sketch_long_sampling.bin", std::ios::binary);
     var_opt_sketch<int64_t> sketch = var_opt_sketch<int64_t>::deserialize(is);
     CPPUNIT_ASSERT(!sketch.is_empty());
     CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
diff --git a/sampling/test/var_opt_union_test.cpp b/sampling/test/var_opt_union_test.cpp
index 45f92ab..fd3460b 100644
--- a/sampling/test/var_opt_union_test.cpp
+++ b/sampling/test/var_opt_union_test.cpp
@@ -54,8 +54,7 @@ class var_opt_union_test: public CppUnit::TestFixture {
   CPPUNIT_TEST(serialize_empty);
   CPPUNIT_TEST(serialize_exact);
   CPPUNIT_TEST(serialize_sampling);
-  // CPPUNIT_TEST(deserialize_exact_from_java);
-  // CPPUNIT_TEST(deserialize_sampling_from_java);
+  CPPUNIT_TEST(deserialize_from_java);
   CPPUNIT_TEST_SUITE_END();
 
   var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
@@ -316,7 +315,7 @@ class var_opt_union_test: public CppUnit::TestFixture {
     CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt + (n1 * n1), ss.total_sketch_weight, EPS);
     CPPUNIT_ASSERT_LESS(k_max, result.get_k());
 
-    // check tha tmark information is preserved as expected
+    // check that mark information is preserved as expected
     compare_serialization_deserialization(u, false);
   }
 
@@ -344,54 +343,22 @@ class var_opt_union_test: public CppUnit::TestFixture {
     compare_serialization_deserialization(u);
   }
 
-/**********************************************************/
-
-
-  void test_union() {
-    var_opt_union<int> u(10);
-
-    var_opt_sketch<int> sk = create_unweighted_sketch(9, 100);
-    u.update(sk);
-    std::cout << u.to_string() << std::endl;
-
-    auto vec = u.serialize();
-    std::cout << vec.size() << "\t" << vec.capacity() << "\t" << vec.empty() << std::endl;
-  }
-
-  void deserialize_exact_from_java() {
+  void deserialize_from_java() {
     std::ifstream is;
     is.exceptions(std::ios::failbit | std::ios::badbit);
-    is.open(testBinaryInputPath + "varopt_string_exact.bin", std::ios::binary);
-    var_opt_sketch<std::string> sketch = var_opt_sketch<std::string>::deserialize(is);
-    CPPUNIT_ASSERT(!sketch.is_empty());
-    CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
-    CPPUNIT_ASSERT_EQUAL((uint64_t) 200, sketch.get_n());
-    CPPUNIT_ASSERT_EQUAL((uint32_t) 200, sketch.get_num_samples());
-    subset_summary ss = sketch.estimate_subset_sum([](std::string x){ return true; });
-
-    double tgt_wt = 0.0;
-    for (int i = 1; i <= 200; ++i) { tgt_wt += 1000.0 / i; }
-    CPPUNIT_ASSERT_DOUBLES_EQUAL(tgt_wt, ss.total_sketch_weight, EPS);
-  }
-
-  void deserialize_sampling_from_java() {
-    std::ifstream is;
-    is.exceptions(std::ios::failbit | std::ios::badbit);
-    is.open(testBinaryInputPath + "varopt_long_sampling.bin", std::ios::binary);
-    var_opt_sketch<int64_t> sketch = var_opt_sketch<int64_t>::deserialize(is);
-    CPPUNIT_ASSERT(!sketch.is_empty());
-    CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
-    CPPUNIT_ASSERT_EQUAL((uint64_t) 2003, sketch.get_n());
-    CPPUNIT_ASSERT_EQUAL(sketch.get_k(), sketch.get_num_samples());
-    subset_summary ss = sketch.estimate_subset_sum([](int64_t x){ return true; });
-    CPPUNIT_ASSERT_DOUBLES_EQUAL(332000.0, ss.estimate, EPS);
-    CPPUNIT_ASSERT_DOUBLES_EQUAL(332000.0, ss.total_sketch_weight, EPS);
-
-    ss = sketch.estimate_subset_sum([](int64_t x){ return x < 0; });
-    CPPUNIT_ASSERT_DOUBLES_EQUAL(330000.0, ss.estimate, 0.0);
-
-    ss = sketch.estimate_subset_sum([](int64_t x){ return x >= 0; });
-    CPPUNIT_ASSERT_DOUBLES_EQUAL(2000.0, ss.estimate, EPS);
+    is.open(testBinaryInputPath + "varopt_union_double_sampling.bin", std::ios::binary);
+    var_opt_union<double> u = var_opt_union<double>::deserialize(is);
+    
+    // must reduce k in the process, like in small_sampling_sketch()
+    var_opt_sketch<double> result = u.get_result();
+    CPPUNIT_ASSERT(!result.is_empty());
+    CPPUNIT_ASSERT_EQUAL((uint64_t) 97, result.get_n());
+  
+    double expected_wt = 96.0;// light items -- ignoring the heavy one
+    subset_summary ss = result.estimate_subset_sum([](float x){return x >= 0;});
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt, ss.estimate, EPS);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt + 1024.0, ss.total_sketch_weight, EPS);
+    CPPUNIT_ASSERT_LESS((uint32_t) 128, result.get_k());
   }
 
 };
diff --git a/sampling/test/varopt_long_sampling.bin b/sampling/test/varopt_sketch_long_sampling.bin
similarity index 100%
rename from sampling/test/varopt_long_sampling.bin
rename to sampling/test/varopt_sketch_long_sampling.bin
diff --git a/sampling/test/varopt_string_exact.bin b/sampling/test/varopt_sketch_string_exact.bin
similarity index 100%
rename from sampling/test/varopt_string_exact.bin
rename to sampling/test/varopt_sketch_string_exact.bin
diff --git a/sampling/test/varopt_union_double_sampling.bin b/sampling/test/varopt_union_double_sampling.bin
new file mode 100644
index 0000000..b3a229e
Binary files /dev/null and b/sampling/test/varopt_union_double_sampling.bin differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org