You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/12/27 00:14:12 UTC
madlib git commit: Summary: Add additional stats + rename output field
Repository: madlib
Updated Branches:
refs/heads/master 44f89c9a1 -> d025bb460
Summary: Add additional stats + rename output field
JIRA: MADLIB-1167
Changes:
- Add the following statistics to madlib.summary():
positive values
negative values
zero values
95% confidence intervals for the 'mean'
- Rename 'row_count' to 'num_col_summarized' in summary() result type
Previously, when we run `SELECT * FROM madlib.summary(valid_inputs)`, it
returns a composite type containing a field named `row_count`, which
refers to the number of rows in the output table. When we run
`SELECT * FROM user_defined_summary_table;`, it also contains a column
named `row_count`, which refers to number of rows for the target column.
To eliminate the confusion, we rename the first `row_count` to
`num_col_summarized`, modify explanation and also update user doc.
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/d025bb46
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/d025bb46
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/d025bb46
Branch: refs/heads/master
Commit: d025bb4609baeb7c7a1d136590780a8fafdee208
Parents: 44f89c9
Author: Jingyi Mei <jm...@pivotal.io>
Authored: Tue Dec 26 16:09:54 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Tue Dec 26 16:09:54 2017 -0800
----------------------------------------------------------------------
.../postgres/modules/summary/Summarizer.py_in | 46 ++++++-
.../postgres/modules/summary/summary.py_in | 8 +-
.../postgres/modules/summary/summary.sql_in | 135 ++++++++++++-------
.../modules/summary/test/summary.sql_in | 3 +
4 files changed, 137 insertions(+), 55 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/d025bb46/src/ports/postgres/modules/summary/Summarizer.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/Summarizer.py_in b/src/ports/postgres/modules/summary/Summarizer.py_in
index fe4d51a..277fa7f 100644
--- a/src/ports/postgres/modules/summary/Summarizer.py_in
+++ b/src/ports/postgres/modules/summary/Summarizer.py_in
@@ -28,6 +28,7 @@ class Summarizer:
self._columns = None
self._column_names = None
self._delimiter = '_.*.&.!.!.&.*_'
+ self._z_score = 1.96
def _populate_columns(self):
if self._target_cols:
@@ -99,11 +100,11 @@ class Summarizer:
self._validate_ntile_array()
_assert(self._how_many_mfv is not None and self._how_many_mfv > 0,
"Summary - Invalid parameter: Number of most frequent values"
- "required should be positive")
+ " required should be positive")
self._how_many_mfv = int(self._how_many_mfv)
_assert(self._n_cols_per_run is not None and self._n_cols_per_run > 0,
"Summary - Invalid parameter: Number of columns per run"
- "should be positive")
+ " should be positive")
self._n_cols_per_run = int(self._n_cols_per_run)
# ----- End of argument validation functions -----------------------------
@@ -199,6 +200,20 @@ class Summarizer:
args['max_columns'] = ','.join([minmax_type('max', c) for c in cols])
args['ntile_columns'] = "array_to_string(array[NULL], ',')"
+
+ pos_str = "sum(case when {0} > 0 then 1 else 0 end)"
+ args['positive_columns'] = ','.join([pos_str.format(c['attname'])
+ if c['typname'] in numeric_types
+ else 'NULL' for c in cols])
+ neg_str = "sum(case when {0} < 0 then 1 else 0 end)"
+ args["negative_columns"] = ','.join([neg_str.format(c['attname'])
+ if c['typname'] in numeric_types
+ else 'NULL' for c in cols])
+ zero_str = "sum(case when {0} = 0 then 1 else 0 end)"
+ args["zero_columns"] = ','.join([zero_str.format(c['attname'])
+ if c['typname'] in numeric_types
+ else 'NULL' for c in cols])
+
if self._ntile_array:
args['ntile_columns'] = ",".join([
"array_to_string(array[" +
@@ -214,17 +229,20 @@ class Summarizer:
array[{column_types}]::text[] as datatype,
array[{column_number}]::integer[] as colnum,
count(*)::bigint as rowcount,
+ array[{positive_columns}]::bigint[] as positive_values,
+ array[{negative_columns}]::bigint[] as negative_values,
+ array[{zero_columns}]::bigint[] as zero_values,
array[{mean_columns}]::float8[] as mean,
array[{var_columns}]::float8[] as variance,
array[{distinct_columns}]::bigint[] as distinct_values,
array[{missing_columns}]::bigint[] as missing_values,
array[{blank_columns}]::bigint[] as blank_values,
array[{min_columns}]::float8[] as min,
+ array[{max_columns}]::float8[] as max,
array[{q1_columns}]::float8[] as first_quartile,
array[{q2_columns}]::float8[] as median,
array[{q3_columns}]::float8[] as third_quartile,
array[{ntile_columns}]::text[] as ntiles,
- array[{max_columns}]::float8[] as max,
array[{mfv_value}]::text[] as mfv_value,
array[{mfv_count}]::text[] as mfv_count
FROM {source_table}{group_expr}
@@ -247,14 +265,19 @@ class Summarizer:
distinct_values::float8 / rowcount as fraction_distinct_values,
missing_values::float8 / rowcount as fraction_missing_values,
blank_values::float8 / rowcount as fraction_blank_values,
+ positive_values,
+ negative_values,
+ zero_values,
mean,
variance,
+ (mean - {z_score} * sqrt(variance / rowcount))::float8 as ci_lower_bound,
+ (mean + {z_score} * sqrt(variance / rowcount))::float8 as ci_upper_bound,
min,
+ max,
first_quartile,
median,
third_quartile,
ntiles,
- max,
string_to_array(mfv_value, '{delimiter}') as mfv_value,
string_to_array(mfv_count, '{delimiter}') as mfv_count
FROM
@@ -269,19 +292,23 @@ class Summarizer:
unnest(distinct_values) as distinct_values,
unnest(missing_values) as missing_values,
unnest(blank_values) as blank_values,
+ unnest(positive_values) as positive_values,
+ unnest(negative_values) as negative_values,
+ unnest(zero_values) as zero_values,
unnest(mean) as mean,
unnest(variance) as variance,
unnest(min) as min,
+ unnest(max) as max,
unnest(first_quartile) as first_quartile,
unnest(median) as median,
unnest(third_quartile) as third_quartile,
string_to_array(unnest(ntiles), ',') as ntiles,
- unnest(max) as max,
unnest(mfv_value) as mfv_value,
unnest(mfv_count) as mfv_count
FROM ({subquery}) q1
) q2
- """.format(schema_madlib=self._schema_madlib, subquery=subquery, delimiter=self._delimiter)
+ """.format(schema_madlib=self._schema_madlib, subquery=subquery,
+ delimiter=self._delimiter, z_score=self._z_score)
return query
def _build_query(self, group_val, cols, create_table):
@@ -322,8 +349,15 @@ class Summarizer:
blank_values as blank_values,
(missing_values::float8 / rowcount) as fraction_missing,
(blank_values::float8 / rowcount) as fraction_blank,
+ positive_values,
+ negative_values,
+ zero_values,
mean,
variance,
+ CASE WHEN ci_lower_bound is NULL OR ci_upper_bound is NULL
+ THEN null
+ ELSE ARRAY[ci_lower_bound, ci_upper_bound]
+ END as confidence_interval,
min,
max,
{first_quartile}
http://git-wip-us.apache.org/repos/asf/madlib/blob/d025bb46/src/ports/postgres/modules/summary/summary.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/summary.py_in b/src/ports/postgres/modules/summary/summary.py_in
index ff813f3..1dd6c61 100644
--- a/src/ports/postgres/modules/summary/summary.py_in
+++ b/src/ports/postgres/modules/summary/summary.py_in
@@ -68,11 +68,11 @@ def summary(schema_madlib, source_table, output_table, target_cols, grouping_col
summarizer.run()
end = time()
- row_count = plpy.execute(
+ num_col_summarized = plpy.execute(
"SELECT count(*) FROM {output_table}".format(
output_table=output_table))[0]['count']
- return (output_table, row_count, end - start)
+ return (output_table, num_col_summarized, end - start)
# -----------------------------------------------------------------------
@@ -144,8 +144,12 @@ def summary_help_message(schema_madlib, message, **kwargs):
- blank_values : Number of blank values (blanks are defined by the regular expression '^\w*$')
- fraction_missing : Percentage of total rows that are missing. Will be expressed as a decimal (e.g. 0.3)
- fraction_blank : Percentage of total rows that are blank. Will be expressed as a decimal (e.g. 0.3)
+ - positive_values : Number of positive values in the target column (if target is numeric, else NULL)
+ - negative_values : Number of negative values in the target column (if target is numeric, else NULL)
+ - zero_values : Number of zero values in the target column (if target is numeric, else NULL)
- mean : Mean value of target column (if target is numeric, else NULL)
- variance : Variance of target columns (if target is numeric, else NULL for strings)
+ - confidence_interval : Confidence interval (95% using z-score) of the mean value for the target column (if target is numeric, else NULL)
- min : Min value of target column (for strings this is the length of the shortest string)
- max : Max value of target column (for strings this is the length of the longest string)
- first_quartile : First quartile (25th percentile, valid only for numeric columns)
http://git-wip-us.apache.org/repos/asf/madlib/blob/d025bb46/src/ports/postgres/modules/summary/summary.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in
index 148191f..1ceaa29 100644
--- a/src/ports/postgres/modules/summary/summary.sql_in
+++ b/src/ports/postgres/modules/summary/summary.sql_in
@@ -53,8 +53,8 @@ The \b summary() function returns a composite type containing three fields:
<td>TEXT. The name of the output table.</td>
</tr>
<tr>
- <th>row_count</th>
- <td>INTEGER. The number of rows in the output table.</td>
+ <th>num_col_summarized</th>
+ <td>INTEGER. The number of columns from the source table that have been summarized.</td>
</tr>
<tr>
<th>duration</th>
@@ -119,6 +119,18 @@ This table contains the following columns:
<td>Percentage of total rows that are blank, as a decimal value, e.g. 0.3.</td>
</tr>
<tr>
+ <th>positive_values</th>
+ <td>Number of positive values in the target column if target is numeric, otherwise NULL.</td>
+ </tr>
+ <tr>
+ <th>negative_values</th>
+ <td>Number of negative values in the target column if target is numeric, otherwise NULL.</td>
+ </tr>
+ <tr>
+ <th>zero_values</th>
+ <td>Number of zero values in the target column if target is numeric, otherwise NULL.</td>
+ </tr>
+ <tr>
<th>mean</th>
<td>Mean value of target column if target is numeric, otherwise NULL.</td>
</tr>
@@ -127,6 +139,11 @@ This table contains the following columns:
<td>Variance of target column if target is numeric, otherwise NULL.</td>
</tr>
<tr>
+ <th>confidence_interval</th>
+ <td>Confidence interval (95% using z-score) of the mean value for the target column
+ if target is numeric, otherwise NULL.</td>
+ </tr>
+ <tr>
<th>min</th>
<td>Minimum value of target column. For strings this is the length of the shortest string.</td>
</tr>
@@ -284,9 +301,9 @@ SELECT * FROM madlib.summary( 'iris', -- Source table
</pre>
Result:
<pre class="result">
- output_table | row_count | duration
---------------+-----------+---------------------
- iris_summary | 6 | 0.00712704658508301
+ output_table | num_col_summarized | duration
+--------------+--------------------+-------------------
+ iris_summary | 6 | 0.574938058853149
(1 row)
</pre>
View the summary data.
@@ -298,31 +315,35 @@ SELECT * FROM iris_summary;
Result (partial):
<pre class="result">
...
- -[ RECORD 2 ]-------+-----------------------------------
-group_by |
-group_by_value |
+ -[ RECORD 2 ]--------+---------------------------------------------
+group_by |
+group_by_value |
target_column | sepal_length
column_number | 2
data_type | float8
row_count | 30
distinct_values | 22
missing_values | 0
-blank_values |
+blank_values |
fraction_missing | 0
-fraction_blank |
+fraction_blank |
+positive_values | 30
+negative_values | 0
+zero_values | 0
mean | 5.84333333333333
-variance | 0.9294367816092
+variance | 0.929436781609188
+confidence_interval | {5.49834423494374,6.18832243172292}
min | 4.4
max | 7.6
first_quartile | 4.925
median | 5.75
third_quartile | 6.575
-most_frequent_values | {4.9,6.3,6.5,4.6,5,6.9,5.4,4.4,7,6.4}
-mfv_frequencies | {4,3,2,2,2,1,1,1,1,1}
+most_frequent_values | {4.9,6.3,5,6.5,4.6,7.2,5.5,5.7,7.3,6.7}
+mfv_frequencies | {4,3,2,2,2,1,1,1,1,1}
...
- -[ RECORD 6 ]-------+-----------------------------------
-group_by |
-group_by_value |
+ -[ RECORD 6 ]--------+---------------------------------------------
+group_by |
+group_by_value |
target_column | class_name
column_number | 6
data_type | text
@@ -332,13 +353,17 @@ missing_values | 0
blank_values | 0
fraction_missing | 0
fraction_blank | 0
-mean |
-variance |
+positive_values |
+negative_values |
+zero_values |
+mean |
+variance |
+confidence_interval |
min | 11
max | 15
-first_quartile |
-median |
-third_quartile |
+first_quartile |
+median |
+third_quartile |
most_frequent_values | {Iris-setosa,Iris-versicolor,Iris-virginica}
mfv_frequencies | {10,10,10}
</pre>
@@ -358,7 +383,7 @@ SELECT * FROM iris_summary;
</pre>
Result (partial):
<pre class="result">
- -[ RECORD 1 ]-------+-----------------------------------
+ -[ RECORD 1 ]--------+----------------------------------------
group_by | class_name
group_by_value | Iris-setosa
target_column | sepal_length
@@ -367,20 +392,24 @@ data_type | float8
row_count | 10
distinct_values | 7
missing_values | 0
-blank_values |
+blank_values |
fraction_missing | 0
-fraction_blank |
+fraction_blank |
+positive_values | 10
+negative_values | 0
+zero_values | 0
mean | 4.86
-variance | 0.0848888888888976
+variance | 0.0848888888888875
+confidence_interval | {4.67941507384182,5.04058492615818}
min | 4.4
max | 5.4
first_quartile | 4.625
median | 4.9
third_quartile | 5
-most_frequent_values | {4.6,4.9,5,5.1,4.4,5.4,4.7}
+most_frequent_values | {4.9,5,4.6,5.1,4.7,5.4,4.4}
mfv_frequencies | {2,2,2,1,1,1,1}
...
- -[ RECORD 3 ]-------+-----------------------------------
+ -[ RECORD 3 ]--------+----------------------------------------
group_by | class_name
group_by_value | Iris-versicolor
target_column | sepal_length
@@ -389,17 +418,21 @@ data_type | float8
row_count | 10
distinct_values | 10
missing_values | 0
-blank_values |
+blank_values |
fraction_missing | 0
-fraction_blank |
+fraction_blank |
+positive_values | 10
+negative_values | 0
+zero_values | 0
mean | 6.1
variance | 0.528888888888893
+confidence_interval | {5.64924734548141,6.55075265451859}
min | 4.9
max | 7
first_quartile | 5.55
median | 6.35
third_quartile | 6.575
-most_frequent_values | {7,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2}
+most_frequent_values | {6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2,7,6.4}
mfv_frequencies | {1,1,1,1,1,1,1,1,1,1}
...
</pre>
@@ -421,43 +454,51 @@ SELECT * FROM iris_summary;
</pre>
Result:
<pre class="result">
- -[ RECORD 1 ]-------+-----------------------------------
-group_by |
-group_by_value |
+ -[ RECORD 1 ]--------+------------------------------------
+group_by |
+group_by_value |
target_column | sepal_length
column_number | 2
data_type | float8
row_count | 30
distinct_values | 22
missing_values | 0
-blank_values |
+blank_values |
fraction_missing | 0
-fraction_blank |
+fraction_blank |
+positive_values | 30
+negative_values | 0
+zero_values | 0
mean | 5.84333333333333
-variance | 0.9294367816092
+variance | 0.929436781609175
+confidence_interval | {5.49834423494375,6.18832243172292}
min | 4.4
max | 7.6
quantile_array | {5.057,6.414}
-most_frequent_values | {4.9,6.3,5}
+most_frequent_values | {4.9,6.3,6.5}
mfv_frequencies | {4,3,2}
- -[ RECORD 2 ]-------+-----------------------------------
-group_by |
-group_by_value |
+ -[ RECORD 2 ]--------+------------------------------------
+group_by |
+group_by_value |
target_column | sepal_width
column_number | 3
data_type | float8
row_count | 30
distinct_values | 14
missing_values | 0
-blank_values |
+blank_values |
fraction_missing | 0
-fraction_blank |
+fraction_blank |
+positive_values | 30
+negative_values | 0
+zero_values | 0
mean | 3.04
variance | 0.13903448275862
+confidence_interval | {2.90656901047539,3.17343098952461}
min | 2.3
max | 3.9
quantile_array | {2.9,3.2}
-most_frequent_values | {3,2.9,3.2}
+most_frequent_values | {2.9,3,3.2}
mfv_frequencies | {4,4,3}
</pre>
@@ -474,7 +515,7 @@ count and most frequent values:
- If <em>get_estimates</em> is TRUE then the distinct value computation is
estimated using Flajolet-Martin. MFV is computed using a
fast method that does parallel aggregation in Greenplum Database at the expense
- of missing some of the most frequent values.
+ of missing or duplicating some of the most frequent values.
- If <em>get_estimates</em> is FALSE then the distinct values are computed
in a slower but exact method using PostgreSQL COUNT DISTINCT. MFV is computed using a
faithful implementation that preserves the approximation guarantees of
@@ -494,9 +535,9 @@ File summary.sql_in documenting the \b summary() function
DROP TYPE IF EXISTS MADLIB_SCHEMA.summary_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.summary_result AS
(
- output_table TEXT,
- row_count INT4,
- duration FLOAT8
+ output_table TEXT,
+ num_col_summarized INT4,
+ duration FLOAT8
);
http://git-wip-us.apache.org/repos/asf/madlib/blob/d025bb46/src/ports/postgres/modules/summary/test/summary.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/test/summary.sql_in b/src/ports/postgres/modules/summary/test/summary.sql_in
index bf7c4f4..879b4f9 100644
--- a/src/ports/postgres/modules/summary/test/summary.sql_in
+++ b/src/ports/postgres/modules/summary/test/summary.sql_in
@@ -38,6 +38,9 @@ sunny, 75, 70, true, Play
overcast, 72, 90, true, Play
overcast, 81, 75, false, Play
rain, 71, 80, true, Don't Play
+snow, 0, 85, false, Don't Play
+snow, -9, 72, true, Play
+sunny, -2, 80, true, Don't play
\.
DROP TABLE IF EXISTS example_data_summary;