You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2020/02/15 09:04:22 UTC
[incubator-datasketches-cpp] 01/02: finish deserialization tests
from java, describe code used to generate java binaries
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch sampling
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit 7e3e8ab36d8978e2dfd95cf6468550509168172b
Author: Jon Malkin <jm...@users.noreply.github.com>
AuthorDate: Fri Feb 14 11:39:27 2020 -0800
finish deserialization tests from java, describe code used to generate java binaries
---
sampling/test/binaries_from_java.txt | 67 +++++++++++++++++++++
sampling/test/var_opt_sketch_test.cpp | 4 +-
sampling/test/var_opt_union_test.cpp | 65 +++++---------------
...ampling.bin => varopt_sketch_long_sampling.bin} | Bin
...ng_exact.bin => varopt_sketch_string_exact.bin} | Bin
sampling/test/varopt_union_double_sampling.bin | Bin 0 -> 572 bytes
6 files changed, 85 insertions(+), 51 deletions(-)
diff --git a/sampling/test/binaries_from_java.txt b/sampling/test/binaries_from_java.txt
new file mode 100644
index 0000000..eb3ea30
--- /dev/null
+++ b/sampling/test/binaries_from_java.txt
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+Code snippets used to generate the binary images from Java.
+Heavy items have negative item values to allow a simple predicate to filter
+heavy vs light sketch entries.
+
+
+varopt_sketch_string_exact.bin:
+final VarOptItemsSketch<String> sk = VarOptItemsSketch.newInstance(1024);
+for (int i = 1; i <= 200; ++i) {
+ sk.update(Integer.toString(i), 1000.0 / i);
+}
+byte[] bytes = sk.toByteArray(new ArrayOfStringsSerDe());
+
+
+varopt_sketch_long_sampling.bin:
+final VarOptItemsSketch<Long> sk = VarOptItemsSketch.newInstance(1024);
+for (long i = 0; i < 2000; ++i) {
+ sk.update(i, 1.0);
+}
+sk.update(-1L, 100000.0);
+sk.update(-2L, 110000.0);
+sk.update(-3L, 120000.0);
+byte[] bytes = sk.toByteArray(new ArrayOfLongsSerDe());
+
+
+varopt_union_double_sampling.bin:
+// parallels small sampling sketch test
+final int kSmall = 16;
+final int n1 = 32;
+final int n2 = 64;
+final int kMax = 128;
+
+// small k sketch, but sampling
+VarOptItemsSketch<Double> sketch = VarOptItemsSketch.newInstance(kSmall);
+for (int i = 0; i < n1; ++i) {
+ sketch.update(1.0 * i, 1.0);
+}
+sketch.update(-1.0, n1 * n1); // add a heavy item
+
+final VarOptItemsUnion<Double> union = VarOptItemsUnion.newInstance(kMax);
+union.update(sketch);
+
+// another one, but different n to get a different per-item weight
+sketch = VarOptItemsSketch.newInstance(kSmall);
+for (int i = 0; i < n2; ++i) {
+ sketch.update(1.0 * i, 1.0);
+}
+union.update(sketch);
+byte[] bytes = union.toByteArray(new ArrayOfDoublesSerDe());
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index a2819f3..2472a3a 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -513,7 +513,7 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
void deserialize_exact_from_java() {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(testBinaryInputPath + "varopt_string_exact.bin", std::ios::binary);
+ is.open(testBinaryInputPath + "varopt_sketch_string_exact.bin", std::ios::binary);
var_opt_sketch<std::string> sketch = var_opt_sketch<std::string>::deserialize(is);
CPPUNIT_ASSERT(!sketch.is_empty());
CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
@@ -529,7 +529,7 @@ class var_opt_sketch_test: public CppUnit::TestFixture {
void deserialize_sampling_from_java() {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(testBinaryInputPath + "varopt_long_sampling.bin", std::ios::binary);
+ is.open(testBinaryInputPath + "varopt_sketch_long_sampling.bin", std::ios::binary);
var_opt_sketch<int64_t> sketch = var_opt_sketch<int64_t>::deserialize(is);
CPPUNIT_ASSERT(!sketch.is_empty());
CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
diff --git a/sampling/test/var_opt_union_test.cpp b/sampling/test/var_opt_union_test.cpp
index 45f92ab..fd3460b 100644
--- a/sampling/test/var_opt_union_test.cpp
+++ b/sampling/test/var_opt_union_test.cpp
@@ -54,8 +54,7 @@ class var_opt_union_test: public CppUnit::TestFixture {
CPPUNIT_TEST(serialize_empty);
CPPUNIT_TEST(serialize_exact);
CPPUNIT_TEST(serialize_sampling);
- // CPPUNIT_TEST(deserialize_exact_from_java);
- // CPPUNIT_TEST(deserialize_sampling_from_java);
+ CPPUNIT_TEST(deserialize_from_java);
CPPUNIT_TEST_SUITE_END();
var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
@@ -316,7 +315,7 @@ class var_opt_union_test: public CppUnit::TestFixture {
CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt + (n1 * n1), ss.total_sketch_weight, EPS);
CPPUNIT_ASSERT_LESS(k_max, result.get_k());
- // check tha tmark information is preserved as expected
+ // check that mark information is preserved as expected
compare_serialization_deserialization(u, false);
}
@@ -344,54 +343,22 @@ class var_opt_union_test: public CppUnit::TestFixture {
compare_serialization_deserialization(u);
}
-/**********************************************************/
-
-
- void test_union() {
- var_opt_union<int> u(10);
-
- var_opt_sketch<int> sk = create_unweighted_sketch(9, 100);
- u.update(sk);
- std::cout << u.to_string() << std::endl;
-
- auto vec = u.serialize();
- std::cout << vec.size() << "\t" << vec.capacity() << "\t" << vec.empty() << std::endl;
- }
-
- void deserialize_exact_from_java() {
+ void deserialize_from_java() {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(testBinaryInputPath + "varopt_string_exact.bin", std::ios::binary);
- var_opt_sketch<std::string> sketch = var_opt_sketch<std::string>::deserialize(is);
- CPPUNIT_ASSERT(!sketch.is_empty());
- CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
- CPPUNIT_ASSERT_EQUAL((uint64_t) 200, sketch.get_n());
- CPPUNIT_ASSERT_EQUAL((uint32_t) 200, sketch.get_num_samples());
- subset_summary ss = sketch.estimate_subset_sum([](std::string x){ return true; });
-
- double tgt_wt = 0.0;
- for (int i = 1; i <= 200; ++i) { tgt_wt += 1000.0 / i; }
- CPPUNIT_ASSERT_DOUBLES_EQUAL(tgt_wt, ss.total_sketch_weight, EPS);
- }
-
- void deserialize_sampling_from_java() {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(testBinaryInputPath + "varopt_long_sampling.bin", std::ios::binary);
- var_opt_sketch<int64_t> sketch = var_opt_sketch<int64_t>::deserialize(is);
- CPPUNIT_ASSERT(!sketch.is_empty());
- CPPUNIT_ASSERT_EQUAL((uint32_t) 1024, sketch.get_k());
- CPPUNIT_ASSERT_EQUAL((uint64_t) 2003, sketch.get_n());
- CPPUNIT_ASSERT_EQUAL(sketch.get_k(), sketch.get_num_samples());
- subset_summary ss = sketch.estimate_subset_sum([](int64_t x){ return true; });
- CPPUNIT_ASSERT_DOUBLES_EQUAL(332000.0, ss.estimate, EPS);
- CPPUNIT_ASSERT_DOUBLES_EQUAL(332000.0, ss.total_sketch_weight, EPS);
-
- ss = sketch.estimate_subset_sum([](int64_t x){ return x < 0; });
- CPPUNIT_ASSERT_DOUBLES_EQUAL(330000.0, ss.estimate, 0.0);
-
- ss = sketch.estimate_subset_sum([](int64_t x){ return x >= 0; });
- CPPUNIT_ASSERT_DOUBLES_EQUAL(2000.0, ss.estimate, EPS);
+ is.open(testBinaryInputPath + "varopt_union_double_sampling.bin", std::ios::binary);
+ var_opt_union<double> u = var_opt_union<double>::deserialize(is);
+
+ // must reduce k in the process, like in small_sampling_sketch()
+ var_opt_sketch<double> result = u.get_result();
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT_EQUAL((uint64_t) 97, result.get_n());
+
+ double expected_wt = 96.0;// light items -- ignoring the heavy one
+ subset_summary ss = result.estimate_subset_sum([](float x){return x >= 0;});
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt, ss.estimate, EPS);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(expected_wt + 1024.0, ss.total_sketch_weight, EPS);
+ CPPUNIT_ASSERT_LESS((uint32_t) 128, result.get_k());
}
};
diff --git a/sampling/test/varopt_long_sampling.bin b/sampling/test/varopt_sketch_long_sampling.bin
similarity index 100%
rename from sampling/test/varopt_long_sampling.bin
rename to sampling/test/varopt_sketch_long_sampling.bin
diff --git a/sampling/test/varopt_string_exact.bin b/sampling/test/varopt_sketch_string_exact.bin
similarity index 100%
rename from sampling/test/varopt_string_exact.bin
rename to sampling/test/varopt_sketch_string_exact.bin
diff --git a/sampling/test/varopt_union_double_sampling.bin b/sampling/test/varopt_union_double_sampling.bin
new file mode 100644
index 0000000..b3a229e
Binary files /dev/null and b/sampling/test/varopt_union_double_sampling.bin differ
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org