You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2019/06/14 23:16:00 UTC

[incubator-datasketches-postgresql] branch master updated: updated doc and package

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-postgresql.git


The following commit(s) were added to refs/heads/master by this push:
     new d2436ef  updated doc and package
d2436ef is described below

commit d2436efa99476b49ae1e5107243b494c6d93ba33
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Jun 14 16:15:39 2019 -0700

    updated doc and package
---
 META.json            |   8 ++--
 README.md            | 126 +++++++++++++++++++++++++++++++++++++++++++++++++--
 datasketches.control |   2 +-
 3 files changed, 126 insertions(+), 10 deletions(-)

diff --git a/META.json b/META.json
index 51852f5..98a2747 100644
--- a/META.json
+++ b/META.json
@@ -1,7 +1,7 @@
 {
    "name": "datasketches",
    "abstract": "approximate algorithms for big data analysis",
-   "version": "1.1.0",
+   "version": "1.2.0",
    "maintainer": [
      "Alexander Saydakov <al...@apache.org>",
      "Sketches User List <sk...@googlegroups.com>"
@@ -31,11 +31,11 @@
    },
    "resources": {
       "bugtracker": {
-         "web": "https://github.com/DataSketches/sketches-postgres/issues"
+         "web": "https://github.com/apache/incubator-datasketches-postgresql/issues"
       },
       "repository": {
-        "url":  "https://github.com/DataSketches/sketches-postgres.git",
-        "web":  "https://github.com/DataSketches/sketches-postgres",
+        "url":  "https://github.com/apache/incubator-datasketches-postgresql.git",
+        "web":  "https://github.com/apache/incubator-datasketches-postgresql",
         "type": "git"
       }
    },
diff --git a/README.md b/README.md
index cedf49d..ed9bf43 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,12 @@
-Module for PostgreSQL to support approximate algorithms based on the Datasketches core library sketches-core-cpp.
-See https://datasketches.github.io/ for details.
+Module for PostgreSQL to support approximate algorithms based on the Datasketches core library datasketches-cpp.
+See [Datasketches documentation](https://datasketches.github.io/) for details.
 
-This module currently supports two sketches:
+This module currently supports the following sketches:
 
 - CPC (Compressed Probabilistic Counting) sketch - very compact (when serialized) distinct-counting sketch
+- Theta sketch - distinct counting with set operations (intersection, a-not-b)
 - KLL float quantiles sketch - for estimating distributions: quantile, rank, PMF (histogram), CDF
+- Frequent strings sketch - capture the heaviest items (strings) by count or by some other weight
 
 <h1>Examples</h1>
 
@@ -14,7 +16,7 @@ Suppose 100 million random integer values uniformly distributed in the range fro
 
 Exact count distinct:
 
-	$ time psql test -c "select count(distinct id) from random_ints_100m;"
+	$ time psql test -c "select count(distinct id) from random_ints_100m"
 	  count
 	----------
 	 63208457
@@ -24,7 +26,7 @@ Exact count distinct:
 
 Approximate count distinct:
 
-	$ time psql test -c "select cpc_sketch_distinct(id) from random_ints_100m;"
+	$ time psql test -c "select cpc_sketch_distinct(id) from random_ints_100m"
 	 cpc_sketch_distinct 
 	---------------------
 	    63423695.9451363
@@ -45,6 +47,60 @@ Merging sketches:
 	-------------------------
 	        3.00024414612919
 
+<h2>Distinct counting with Theta sketch</h2>
+
+See above for the exact distinct count of 100 million random integers
+
+Approximate distinct count:
+
+	$ time psql test -c "select theta_sketch_distinct(id) from random_ints_100m"
+	 theta_sketch_distinct 
+	-----------------------
+	      64593262.4373193
+	(1 row)
+
+	real	0m19.701s
+
+Note that the above one-off distinct count is just to show the basic usage. Most importantly, the sketch can be used as an "additive" distinct count metric in a data cube.
+
+Aggregate union:
+
+	create table theta_sketch_test(sketch theta_sketch);
+	insert into theta_sketch_test select theta_sketch_build(1);
+	insert into theta_sketch_test select theta_sketch_build(2);
+	insert into theta_sketch_test select theta_sketch_build(3);
+	select theta_sketch_get_estimate(theta_sketch_union(sketch)) from theta_sketch_test;
+	 theta_sketch_get_estimate 
+	---------------------------
+	                         3
+
+Non-aggregate set operations:
+
+	create table theta_set_op_test(sketch1 theta_sketch, sketch2 theta_sketch);
+	insert into theta_set_op_test select theta_sketch_build(1), theta_sketch_build(1);
+	insert into theta_set_op_test select theta_sketch_build(1), theta_sketch_build(2);
+
+	select theta_sketch_get_estimate(theta_sketch_union(sketch1, sketch2)) from theta_set_op_test;
+	 theta_sketch_get_estimate 
+	---------------------------
+	                         1
+	                         2
+	(2 rows)
+
+	select theta_sketch_get_estimate(theta_sketch_intersection(sketch1, sketch2)) from theta_set_op_test;
+	 theta_sketch_get_estimate 
+	---------------------------
+	                         1
+	                         0
+	(2 rows)
+
+	select theta_sketch_get_estimate(theta_sketch_a_not_b(sketch1, sketch2)) from theta_set_op_test;
+	 theta_sketch_get_estimate 
+	---------------------------
+	                         0
+	                         1
+	(2 rows)
+
 <h2>Estimating quanitles, ranks and histograms with KLL sketch</h2>
 
 Table "normal" has 1 million values from the normal distribution with mean=0 and stddev=1.
@@ -91,3 +147,63 @@ The ARRAY[-2, -1, 0, 1, 2] of 5 split points defines 6 intervals (bins): (-inf,-
 In this simple example we know the value of N since we constructed this sketch, but in a general case sketches are merged across dimensions of data hypercube, so the vale of N is not known in advance.
 
 Note that the normal distribution was used just to show the basic usage. The sketch does not make any assumptions about the distribution.
+
+<h2>Frequent strings</h2>
+
+Consider a numeric Zipfian distribution with parameter alpha=1.1 (high skew)
+and range of 2<sup>13</sup>, so that the number 1 has the highest frequency,
+the number 2 appears substantially less frequently and so on.
+Suppose zipf\_1p1\_8k\_100m table has 100 million random values drawn from such
+a distribution, and the values are converted to strings.
+
+Suppose the goal is to get the most frequent strings from this table. In
+terms of the frequent items sketch we have to chose a threshold. Let's try
+to capture values that repeat more than 1 million times, or more than 1% of
+the 100 million entries in the table. According to the [error table](https://datasketches.github.io/docs/Frequency/FrequentItemsErrorTable.html),
+frequent items sketch of size 2<sup>9</sup> must capture all values more
+frequent then about 0.7% of the input.
+
+The following query is to build a sketch with lg_k=9 and get results with
+estimated weight above 1 million using "no false negatives" policy.
+The output format is: value, estimate, lower bound, upper bound.
+
+	$ time psql test -c "select frequent_strings_sketch_result_no_false_negatives(frequent_strings_sketch_build(9, value), 1000000) from zipf_1p1_8k_100m"
+	 frequent_strings_sketch_result_no_false_negatives 
+	---------------------------------------------------
+	 (1,15328953,15209002,15328953)
+	 (2,7156065,7036114,7156065)
+	 (3,4578361,4458410,4578361)
+	 (4,3334808,3214857,3334808)
+	 (5,2608563,2488612,2608563)
+	 (6,2135715,2015764,2135715)
+	 (7,1801961,1682010,1801961)
+	 (8,1557433,1437482,1557433)
+	 (9,1368446,1248495,1368446)
+	 (10,1216532,1096581,1216532)
+	 (11,1098304,978353,1098304)
+	(11 rows)
+	
+	real	0m38.178s
+
+Here is an equivalent exact computation:
+
+	$ time psql test -c "select value, weight from (select value, count(*) as weight from zipf_1p1_8k_100m group by value) t where weight > 1000000 order by weight desc"
+	 value |  weight  
+	-------+----------
+	 1     | 15328953
+	 2     |  7156065
+	 3     |  4578361
+	 4     |  3334808
+	 5     |  2608563
+	 6     |  2135715
+	 7     |  1801961
+	 8     |  1557433
+	 9     |  1368446
+	 10    |  1216532
+	 11    |  1098304
+	(11 rows)
+	
+	real	0m18.362s
+
+In this particular case the exact computation happens to be faster. This is
+just to show the basic usage. Most importantly, the sketch can be used as an "additive" metric in a data cube, and can be easily merged across dimensions.
\ No newline at end of file
diff --git a/datasketches.control b/datasketches.control
index 72634af..6fdc30f 100644
--- a/datasketches.control
+++ b/datasketches.control
@@ -1,6 +1,6 @@
 # Datasketches module
 comment = 'Aggregation functions and data types for approximate algorithms.'
-default_version = '1.0.0'
+default_version = '1.2.0'
 relocatable = true
 
 module_pathname = '$libdir/datasketches'


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org