You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/03/18 20:24:26 UTC
[incubator-datasketches-website] branch more_examples updated: KLL
C++ example
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch more_examples
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-website.git
The following commit(s) were added to refs/heads/more_examples by this push:
new 2df17d8 KLL C++ example
2df17d8 is described below
commit 2df17d8d9aae50b0fa668c6a7aa68b084d81e0b2
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Wed Mar 18 13:23:54 2020 -0700
KLL C++ example
---
_includes/toc.html | 2 +
docs/Quantiles/QuantilesCppExample.md | 116 ++++++++++++++++++++++++++++++++++
src/main/resources/docgen/toc.json | 1 +
3 files changed, 119 insertions(+)
diff --git a/_includes/toc.html b/_includes/toc.html
index 7a833e4..3ac6539 100644
--- a/_includes/toc.html
+++ b/_includes/toc.html
@@ -208,6 +208,7 @@
</p>
<div class="collapse" id="collapse_most_frequent_examples">
<li><a href="{{site.docs_dir}}/Frequency/FrequentItemsJavaExample.html">Frequent Items Java Example</a></li>
+ <li><a href="{{site.docs_dir}}/Frequency/FrequentItemsCppExample.html">Frequent Items C++ Example</a></li>
<li><a href="{{site.docs_dir}}/Frequency/FrequentItemsPigUDFs.html">Frequent Items Pig UDFs</a></li>
<li><a href="{{site.docs_dir}}/Frequency/FrequentItemsHiveUDFs.html">Frequent Items Hive UDFs</a></li>
<li><a href="{{site.docs_dir}}/DruidIntegration.html">Using Sketches in Druid</a></li>
@@ -234,6 +235,7 @@
</p>
<div class="collapse" id="collapse_quantiles_examples">
<li><a href="{{site.docs_dir}}/Quantiles/QuantilesJavaExample.html">Quantiles Sketch Java Example</a></li>
+ <li><a href="{{site.docs_dir}}/Quantiles/QuantilesCppExample.html">Quantiles Sketch (KLL) C++ Example</a></li>
<li><a href="{{site.docs_dir}}/Quantiles/QuantilesPigUDFs.html">Quantiles Sketch Pig UDFs</a></li>
<li><a href="{{site.docs_dir}}/Quantiles/QuantilesHiveUDFs.html">Quantiles Sketch Hive UDFs</a></li>
<li><a href="{{site.docs_dir}}/DruidIntegration.html">Using Sketches in Druid</a></li>
diff --git a/docs/Quantiles/QuantilesCppExample.md b/docs/Quantiles/QuantilesCppExample.md
new file mode 100644
index 0000000..ec8b577
--- /dev/null
+++ b/docs/Quantiles/QuantilesCppExample.md
@@ -0,0 +1,116 @@
+---
+layout: doc_page
+---
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+# KLL Sketch C++ Example
+
+ #include <iostream>
+ #include <fstream>
+ #include <random>
+ #include <chrono>
+
+ #include <kll_sketch.hpp>
+
+ //simplified file operations and no error handling for clarity
+ // Example driver: builds two KLL sketches from normally distributed random floats, writes them
+ // to files, reads them back, merges them and prints quantiles, a PMF and a frequency histogram.
+ int main(int argc, char **argv) {
+   // this section generates two sketches from random data and serializes them into files
+   {
+     std::default_random_engine generator(std::chrono::system_clock::now().time_since_epoch().count());
+     std::normal_distribution<float> nd(0, 1); // mean=0, stddev=1
+
+     datasketches::kll_sketch<float> sketch1; // default k=200
+     for (int i = 0; i < 10000; i++) {
+       sketch1.update(nd(generator)); // mean=0, stddev=1
+     }
+     std::ofstream os1("kll_sketch_float1.bin", std::ios::binary); // binary mode: the serialized image is raw bytes, not text
+     sketch1.serialize(os1);
+
+     datasketches::kll_sketch<float> sketch2; // default k=200
+     for (int i = 0; i < 10000; i++) {
+       sketch2.update(nd(generator) + 1); // shift the mean for the second sketch
+     }
+     std::ofstream os2("kll_sketch_float2.bin", std::ios::binary); // binary mode, same reason as above
+     sketch2.serialize(os2);
+   } // os1 and os2 are destroyed here, which flushes and closes the files before they are read back
+
+   // this section deserializes the sketches, produces a union and prints some results
+   {
+     std::ifstream is1("kll_sketch_float1.bin", std::ios::binary); // must match the mode used for writing
+     auto sketch1 = datasketches::kll_sketch<float>::deserialize(is1);
+
+     std::ifstream is2("kll_sketch_float2.bin", std::ios::binary);
+     auto sketch2 = datasketches::kll_sketch<float>::deserialize(is2);
+
+     // we could merge sketch2 into sketch1 (or vice versa); a new sketch is used as the union to keep the originals intact
+     datasketches::kll_sketch<float> u; // default k=200
+     u.merge(sketch1);
+     u.merge(sketch2);
+
+     u.to_stream(std::cout); // debug output: prints the sketch summary
+
+     std::cout << "Min, Median, Max values" << std::endl;
+     const double fractions[3] {0, 0.5, 1};
+     auto quantiles = u.get_quantiles(fractions, 3);
+     std::cout << quantiles[0] << ", " << quantiles[1] << ", " << quantiles[2] << std::endl;
+
+     std::cout << "Probability Histogram: estimated probability mass in 4 bins: (-inf, -2), [-2, 0), [0, 2), [2, +inf)" << std::endl;
+     const float split_points[] {-2, 0, 2};
+     const int num_split_points = 3;
+     auto pmf = u.get_PMF(split_points, num_split_points);
+     std::cout << pmf[0] << ", " << pmf[1] << ", " << pmf[2] << ", " << pmf[3] << std::endl;
+
+     std::cout << "Frequency Histogram: estimated number of original values in the same bins" << std::endl;
+     const int num_bins = num_split_points + 1; // 3 split points delimit 4 bins
+     int histogram[num_bins];
+     for (int i = 0; i < num_bins; i++) {
+       histogram[i] = static_cast<int>(pmf[i] * u.get_n()); // scale the fractions by the total count of values; the cast truncates
+     }
+     std::cout << histogram[0] << ", " << histogram[1] << ", " << histogram[2] << ", " << histogram[3] << std::endl;
+   }
+
+   return 0;
+ }
+
+ Output (will be slightly different every time due to random input):
+ ### KLL sketch summary:
+ K : 200
+ min K : 200
+ M : 8
+ N : 20000
+ Epsilon : 1.33%
+ Epsilon PMF : 1.65%
+ Empty : false
+ Estimation mode: true
+ Levels : 7
+ Sorted : false
+ Capacity items : 565
+ Retained items : 394
+ Storage bytes : 1632
+ Min value : -3.49
+ Max value : 4.52
+ ### End sketch summary
+ Min, Median, Max values
+ -3.49, 0.51, 4.52
+ Probability Histogram: estimated probability mass in 4 bins: (-inf, -2), [-2, 0), [0, 2), [2, +inf)
+ 0.0146, 0.313, 0.582, 0.0901
+ Frequency Histogram: estimated number of original values in the same bins
+ 293, 6267, 11639, 1801
diff --git a/src/main/resources/docgen/toc.json b/src/main/resources/docgen/toc.json
index 304e58e..624c4b4 100644
--- a/src/main/resources/docgen/toc.json
+++ b/src/main/resources/docgen/toc.json
@@ -197,6 +197,7 @@
{ "class":"Dropdown", "desc" : "Quantiles Examples", "array":
[
{"class":"Doc", "desc" : "Quantiles Sketch Java Example", "dir" : "Quantiles", "file": "QuantilesJavaExample" },
+ {"class":"Doc", "desc" : "KLL Quantiles Sketch C++ Example", "dir" : "Quantiles", "file": "QuantilesCppExample" },
{"class":"Doc", "desc" : "Quantiles Sketch Pig UDFs", "dir" : "Quantiles", "file": "QuantilesPigUDFs" },
{"class":"Doc", "desc" : "Quantiles Sketch Hive UDFs", "dir" : "Quantiles", "file": "QuantilesHiveUDFs" },
{"class":"Doc", "desc" : "Using Sketches in Druid", "dir" : "", "file": "DruidIntegration" },
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org