You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/12/27 00:14:12 UTC
madlib git commit: Summary: Add additional stats + rename output field

Repository: madlib
Updated Branches:
  refs/heads/master 44f89c9a1 -> d025bb460


Summary: Add additional stats + rename output field

JIRA: MADLIB-1167

Changes:

- Add the following statistics to madlib.summary():

    positive values
    negative values
    zero values
    95% confidence intervals for the 'mean'

- Rename 'row_count' to 'num_col_summarized' in summary() result type

Previously, when we run `SELECT * FROM madlib.summary(valid_inputs)`, it
returns a composite type containing a field named `row_count`, which
refers to the number of rows in the output table. When we run
`SELECT * FROM user_defined_summary_table;`, it also contains a column
named `row_count`, which refers to number of rows for the target column.

To eliminate the confusion, we rename the first `row_count` to
`num_col_summarized`, modify explanation and also update user doc.


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/d025bb46
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/d025bb46
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/d025bb46

Branch: refs/heads/master
Commit: d025bb4609baeb7c7a1d136590780a8fafdee208
Parents: 44f89c9
Author: Jingyi Mei <jm...@pivotal.io>
Authored: Tue Dec 26 16:09:54 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Tue Dec 26 16:09:54 2017 -0800

----------------------------------------------------------------------
 .../postgres/modules/summary/Summarizer.py_in   |  46 ++++++-
 .../postgres/modules/summary/summary.py_in      |   8 +-
 .../postgres/modules/summary/summary.sql_in     | 135 ++++++++++++-------
 .../modules/summary/test/summary.sql_in         |   3 +
 4 files changed, 137 insertions(+), 55 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/d025bb46/src/ports/postgres/modules/summary/Summarizer.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/Summarizer.py_in b/src/ports/postgres/modules/summary/Summarizer.py_in
index fe4d51a..277fa7f 100644
--- a/src/ports/postgres/modules/summary/Summarizer.py_in
+++ b/src/ports/postgres/modules/summary/Summarizer.py_in
@@ -28,6 +28,7 @@ class Summarizer:
         self._columns = None
         self._column_names = None
         self._delimiter = '_.*.&.!.!.&.*_'
+        self._z_score = 1.96
 
     def _populate_columns(self):
         if self._target_cols:
@@ -99,11 +100,11 @@ class Summarizer:
         self._validate_ntile_array()
         _assert(self._how_many_mfv is not None and self._how_many_mfv > 0,
                 "Summary - Invalid parameter: Number of most frequent values"
-                "required should be positive")
+                " required should be positive")
         self._how_many_mfv = int(self._how_many_mfv)
         _assert(self._n_cols_per_run is not None and self._n_cols_per_run > 0,
                 "Summary - Invalid parameter: Number of columns per run"
-                "should be positive")
+                " should be positive")
         self._n_cols_per_run = int(self._n_cols_per_run)
 
 # ----- End of argument validation functions -----------------------------
@@ -199,6 +200,20 @@ class Summarizer:
         args['max_columns'] = ','.join([minmax_type('max', c) for c in cols])
 
         args['ntile_columns'] = "array_to_string(array[NULL], ',')"
+
+        pos_str = "sum(case when {0} > 0 then 1 else 0 end)"
+        args['positive_columns'] = ','.join([pos_str.format(c['attname'])
+                                             if c['typname'] in numeric_types
+                                             else 'NULL' for c in cols])
+        neg_str = "sum(case when {0} < 0 then 1 else 0 end)"
+        args["negative_columns"] = ','.join([neg_str.format(c['attname'])
+                                             if c['typname'] in numeric_types
+                                             else 'NULL' for c in cols])
+        zero_str = "sum(case when {0} = 0 then 1 else 0 end)"
+        args["zero_columns"] = ','.join([zero_str.format(c['attname'])
+                                         if c['typname'] in numeric_types
+                                         else 'NULL' for c in cols])
+
         if self._ntile_array:
             args['ntile_columns'] = ",".join([
                 "array_to_string(array[" +
@@ -214,17 +229,20 @@ class Summarizer:
                     array[{column_types}]::text[] as datatype,
                     array[{column_number}]::integer[] as colnum,
                     count(*)::bigint as rowcount,
+                    array[{positive_columns}]::bigint[] as positive_values,
+                    array[{negative_columns}]::bigint[] as negative_values,
+                    array[{zero_columns}]::bigint[] as zero_values,
                     array[{mean_columns}]::float8[] as mean,
                     array[{var_columns}]::float8[] as variance,
                     array[{distinct_columns}]::bigint[] as distinct_values,
                     array[{missing_columns}]::bigint[] as missing_values,
                     array[{blank_columns}]::bigint[] as blank_values,
                     array[{min_columns}]::float8[] as min,
+                    array[{max_columns}]::float8[] as max,
                     array[{q1_columns}]::float8[] as first_quartile,
                     array[{q2_columns}]::float8[] as median,
                     array[{q3_columns}]::float8[] as third_quartile,
                     array[{ntile_columns}]::text[] as ntiles,
-                    array[{max_columns}]::float8[] as max,
                     array[{mfv_value}]::text[] as mfv_value,
                     array[{mfv_count}]::text[] as mfv_count
                 FROM {source_table}{group_expr}
@@ -247,14 +265,19 @@ class Summarizer:
                     distinct_values::float8 / rowcount as fraction_distinct_values,
                     missing_values::float8 / rowcount as fraction_missing_values,
                     blank_values::float8 / rowcount as fraction_blank_values,
+                    positive_values,
+                    negative_values,
+                    zero_values,
                     mean,
                     variance,
+                    (mean - {z_score} * sqrt(variance / rowcount))::float8 as ci_lower_bound,
+                    (mean + {z_score} * sqrt(variance / rowcount))::float8 as ci_upper_bound,
                     min,
+                    max,
                     first_quartile,
                     median,
                     third_quartile,
                     ntiles,
-                    max,
                     string_to_array(mfv_value, '{delimiter}') as mfv_value,
                     string_to_array(mfv_count, '{delimiter}') as mfv_count
                 FROM
@@ -269,19 +292,23 @@ class Summarizer:
                         unnest(distinct_values) as distinct_values,
                         unnest(missing_values) as missing_values,
                         unnest(blank_values) as blank_values,
+                        unnest(positive_values) as positive_values,
+                        unnest(negative_values) as negative_values,
+                        unnest(zero_values) as zero_values,
                         unnest(mean) as mean,
                         unnest(variance) as variance,
                         unnest(min) as min,
+                        unnest(max) as max,
                         unnest(first_quartile) as first_quartile,
                         unnest(median) as median,
                         unnest(third_quartile) as third_quartile,
                         string_to_array(unnest(ntiles), ',') as ntiles,
-                        unnest(max) as max,
                         unnest(mfv_value) as mfv_value,
                         unnest(mfv_count) as mfv_count
                     FROM ({subquery}) q1
                 ) q2
-        """.format(schema_madlib=self._schema_madlib, subquery=subquery, delimiter=self._delimiter)
+        """.format(schema_madlib=self._schema_madlib, subquery=subquery,
+                   delimiter=self._delimiter, z_score=self._z_score)
         return query
 
     def _build_query(self, group_val, cols, create_table):
@@ -322,8 +349,15 @@ class Summarizer:
                 blank_values as blank_values,
                 (missing_values::float8 / rowcount) as fraction_missing,
                 (blank_values::float8 / rowcount) as fraction_blank,
+                positive_values,
+                negative_values,
+                zero_values,
                 mean,
                 variance,
+                CASE WHEN ci_lower_bound is NULL OR ci_upper_bound is NULL
+                     THEN null
+                     ELSE ARRAY[ci_lower_bound, ci_upper_bound]
+                END as confidence_interval,
                 min,
                 max,
                 {first_quartile}

http://git-wip-us.apache.org/repos/asf/madlib/blob/d025bb46/src/ports/postgres/modules/summary/summary.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/summary.py_in b/src/ports/postgres/modules/summary/summary.py_in
index ff813f3..1dd6c61 100644
--- a/src/ports/postgres/modules/summary/summary.py_in
+++ b/src/ports/postgres/modules/summary/summary.py_in
@@ -68,11 +68,11 @@ def summary(schema_madlib, source_table, output_table, target_cols, grouping_col
         summarizer.run()
         end = time()
 
-        row_count = plpy.execute(
+        num_col_summarized = plpy.execute(
             "SELECT count(*) FROM {output_table}".format(
                 output_table=output_table))[0]['count']
 
-    return (output_table, row_count, end - start)
+    return (output_table, num_col_summarized, end - start)
 
 
 # -----------------------------------------------------------------------
@@ -144,8 +144,12 @@ def summary_help_message(schema_madlib, message, **kwargs):
             - blank_values          : Number of blank values (blanks are defined by the regular expression '^\w*$')
             - fraction_missing      : Percentage of total rows that are missing. Will be expressed as a decimal (e.g. 0.3)
             - fraction_blank        : Percentage of total rows that are blank. Will be expressed as a decimal (e.g. 0.3)
+            - positive_values       : Number of positive values in the target column (if target is numeric, else NULL)
+            - negative_values       : Number of negative values in the target column (if target is numeric, else NULL)
+            - zero_values           : Number of zero values in the target column (if target is numeric, else NULL)
             - mean                  : Mean value of target column (if target is numeric, else NULL)
             - variance              : Variance of target columns (if target is numeric, else NULL for strings)
+            - confidence_interval   : Confidence interval (95% using z-score) of the mean value for the target column (if target is numeric, else NULL)
             - min                   : Min value of target column (for strings this is the length of the shortest string)
             - max                   : Max value of target column (for strings this is the length of the longest string)
             - first_quartile        : First quartile (25th percentile, valid only for numeric columns)

http://git-wip-us.apache.org/repos/asf/madlib/blob/d025bb46/src/ports/postgres/modules/summary/summary.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in
index 148191f..1ceaa29 100644
--- a/src/ports/postgres/modules/summary/summary.sql_in
+++ b/src/ports/postgres/modules/summary/summary.sql_in
@@ -53,8 +53,8 @@ The \b summary() function returns a composite type containing three fields:
         <td>TEXT. The name of the output table.</td>
     </tr>
     <tr>
-        <th>row_count</th>
-        <td>INTEGER. The number of rows in the output table.</td>
+        <th>num_col_summarized</th>
+        <td>INTEGER. The number of columns from the source table that have been summarized.</td>
     </tr>
     <tr>
         <th>duration</th>
@@ -119,6 +119,18 @@ This table contains the following columns:
         <td>Percentage of total rows that are blank, as a decimal value, e.g. 0.3.</td>
     </tr>
     <tr>
+        <th>positive_values</th>
+        <td>Number of positive values in the target column if target is numeric, otherwise NULL.</td>
+    </tr>
+    <tr>
+        <th>negative_values</th>
+        <td>Number of negative values in the target column if target is numeric, otherwise NULL.</td>
+    </tr>
+    <tr>
+        <th>zero_values</th>
+        <td>Number of zero values in the target column if target is numeric, otherwise NULL.</td>
+    </tr>
+    <tr>
         <th>mean</th>
         <td>Mean value of target column if target is numeric, otherwise NULL.</td>
     </tr>
@@ -127,6 +139,11 @@ This table contains the following columns:
         <td>Variance of target column if target is numeric, otherwise NULL.</td>
     </tr>
     <tr>
+        <th>confidence_interval</th>
+        <td>Confidence interval (95% using z-score) of the mean value for the target column
+        if target is numeric, otherwise NULL.</td>
+    </tr>
+    <tr>
         <th>min</th>
         <td>Minimum value of target column. For strings this is the length of the shortest string.</td>
     </tr>
@@ -284,9 +301,9 @@ SELECT * FROM madlib.summary( 'iris',            -- Source table
 </pre>
 Result:
 <pre class="result">
- output_table | row_count |      duration       
---------------+-----------+---------------------
- iris_summary |         6 | 0.00712704658508301
+ output_table | num_col_summarized |     duration
+--------------+--------------------+-------------------
+ iris_summary |                  6 | 0.574938058853149
 (1 row)
 </pre>
 View the summary data.
@@ -298,31 +315,35 @@ SELECT * FROM iris_summary;
 Result (partial):
 <pre class="result">
 ...
-&nbsp;-[ RECORD 2 ]-------+-----------------------------------
-group_by             | 
-group_by_value       | 
+&nbsp;-[ RECORD 2 ]--------+---------------------------------------------
+group_by             |
+group_by_value       |
 target_column        | sepal_length
 column_number        | 2
 data_type            | float8
 row_count            | 30
 distinct_values      | 22
 missing_values       | 0
-blank_values         | 
+blank_values         |
 fraction_missing     | 0
-fraction_blank       | 
+fraction_blank       |
+positive_values      | 30
+negative_values      | 0
+zero_values          | 0
 mean                 | 5.84333333333333
-variance             | 0.9294367816092
+variance             | 0.929436781609188
+confidence_interval  | {5.49834423494374,6.18832243172292}
 min                  | 4.4
 max                  | 7.6
 first_quartile       | 4.925
 median               | 5.75
 third_quartile       | 6.575
-most_frequent_values | {4.9,6.3,6.5,4.6,5,6.9,5.4,4.4,7,6.4}
-mfv_frequencies      | {4,3,2,2,2,1,1,1,1,1}  
+most_frequent_values | {4.9,6.3,5,6.5,4.6,7.2,5.5,5.7,7.3,6.7}
+mfv_frequencies      | {4,3,2,2,2,1,1,1,1,1}
 ...
-&nbsp;-[ RECORD 6 ]-------+-----------------------------------
-group_by             | 
-group_by_value       | 
+&nbsp;-[ RECORD 6 ]--------+---------------------------------------------
+group_by             |
+group_by_value       |
 target_column        | class_name
 column_number        | 6
 data_type            | text
@@ -332,13 +353,17 @@ missing_values       | 0
 blank_values         | 0
 fraction_missing     | 0
 fraction_blank       | 0
-mean                 | 
-variance             | 
+positive_values      |
+negative_values      |
+zero_values          |
+mean                 |
+variance             |
+confidence_interval  |
 min                  | 11
 max                  | 15
-first_quartile       | 
-median               | 
-third_quartile       | 
+first_quartile       |
+median               |
+third_quartile       |
 most_frequent_values | {Iris-setosa,Iris-versicolor,Iris-virginica}
 mfv_frequencies      | {10,10,10}
 </pre>
@@ -358,7 +383,7 @@ SELECT * FROM iris_summary;
 </pre>
 Result (partial):
 <pre class="result">
-&nbsp;-[ RECORD 1 ]-------+-----------------------------------
+&nbsp;-[ RECORD 1 ]--------+----------------------------------------
 group_by             | class_name
 group_by_value       | Iris-setosa
 target_column        | sepal_length
@@ -367,20 +392,24 @@ data_type            | float8
 row_count            | 10
 distinct_values      | 7
 missing_values       | 0
-blank_values         | 
+blank_values         |
 fraction_missing     | 0
-fraction_blank       | 
+fraction_blank       |
+positive_values      | 10
+negative_values      | 0
+zero_values          | 0
 mean                 | 4.86
-variance             | 0.0848888888888976
+variance             | 0.0848888888888875
+confidence_interval  | {4.67941507384182,5.04058492615818}
 min                  | 4.4
 max                  | 5.4
 first_quartile       | 4.625
 median               | 4.9
 third_quartile       | 5
-most_frequent_values | {4.6,4.9,5,5.1,4.4,5.4,4.7}
+most_frequent_values | {4.9,5,4.6,5.1,4.7,5.4,4.4}
 mfv_frequencies      | {2,2,2,1,1,1,1}
 ...
-&nbsp;-[ RECORD 3 ]-------+-----------------------------------
+&nbsp;-[ RECORD 3 ]--------+----------------------------------------
 group_by             | class_name
 group_by_value       | Iris-versicolor
 target_column        | sepal_length
@@ -389,17 +418,21 @@ data_type            | float8
 row_count            | 10
 distinct_values      | 10
 missing_values       | 0
-blank_values         | 
+blank_values         |
 fraction_missing     | 0
-fraction_blank       | 
+fraction_blank       |
+positive_values      | 10
+negative_values      | 0
+zero_values          | 0
 mean                 | 6.1
 variance             | 0.528888888888893
+confidence_interval  | {5.64924734548141,6.55075265451859}
 min                  | 4.9
 max                  | 7
 first_quartile       | 5.55
 median               | 6.35
 third_quartile       | 6.575
-most_frequent_values | {7,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2}
+most_frequent_values | {6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2,7,6.4}
 mfv_frequencies      | {1,1,1,1,1,1,1,1,1,1}
 ...
 </pre>
@@ -421,43 +454,51 @@ SELECT * FROM iris_summary;
 </pre>
 Result:
 <pre class="result">
-&nbsp;-[ RECORD 1 ]-------+-----------------------------------
-group_by             | 
-group_by_value       | 
+&nbsp;-[ RECORD 1 ]--------+------------------------------------
+group_by             |
+group_by_value       |
 target_column        | sepal_length
 column_number        | 2
 data_type            | float8
 row_count            | 30
 distinct_values      | 22
 missing_values       | 0
-blank_values         | 
+blank_values         |
 fraction_missing     | 0
-fraction_blank       | 
+fraction_blank       |
+positive_values      | 30
+negative_values      | 0
+zero_values          | 0
 mean                 | 5.84333333333333
-variance             | 0.9294367816092
+variance             | 0.929436781609175
+confidence_interval  | {5.49834423494375,6.18832243172292}
 min                  | 4.4
 max                  | 7.6
 quantile_array       | {5.057,6.414}
-most_frequent_values | {4.9,6.3,5}
+most_frequent_values | {4.9,6.3,6.5}
 mfv_frequencies      | {4,3,2}
-&nbsp;-[ RECORD 2 ]-------+-----------------------------------
-group_by             | 
-group_by_value       | 
+&nbsp;-[ RECORD 2 ]--------+------------------------------------
+group_by             |
+group_by_value       |
 target_column        | sepal_width
 column_number        | 3
 data_type            | float8
 row_count            | 30
 distinct_values      | 14
 missing_values       | 0
-blank_values         | 
+blank_values         |
 fraction_missing     | 0
-fraction_blank       | 
+fraction_blank       |
+positive_values      | 30
+negative_values      | 0
+zero_values          | 0
 mean                 | 3.04
 variance             | 0.13903448275862
+confidence_interval  | {2.90656901047539,3.17343098952461}
 min                  | 2.3
 max                  | 3.9
 quantile_array       | {2.9,3.2}
-most_frequent_values | {3,2.9,3.2}
+most_frequent_values | {2.9,3,3.2}
 mfv_frequencies      | {4,4,3}
 </pre>
 
@@ -474,7 +515,7 @@ count and most frequent values:
     -  If <em>get_estimates</em> is TRUE then the distinct value computation is
     estimated using Flajolet-Martin. MFV is computed using a
     fast method that does parallel aggregation in Greenplum Database at the expense
-    of missing some of the most frequent values.
+    of missing or duplicating some of the most frequent values.
     -  If <em>get_estimates</em> is FALSE then the distinct values are computed
     in a slower but exact method using PostgreSQL COUNT DISTINCT.  MFV is computed using a
     faithful implementation that preserves the approximation guarantees of
@@ -494,9 +535,9 @@ File summary.sql_in documenting the \b summary() function
 DROP TYPE IF EXISTS MADLIB_SCHEMA.summary_result CASCADE;
 CREATE TYPE MADLIB_SCHEMA.summary_result AS
 (
-    output_table    TEXT,
-    row_count       INT4,
-    duration        FLOAT8
+    output_table             TEXT,
+    num_col_summarized       INT4,
+    duration                 FLOAT8
 );
 
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/d025bb46/src/ports/postgres/modules/summary/test/summary.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/test/summary.sql_in b/src/ports/postgres/modules/summary/test/summary.sql_in
index bf7c4f4..879b4f9 100644
--- a/src/ports/postgres/modules/summary/test/summary.sql_in
+++ b/src/ports/postgres/modules/summary/test/summary.sql_in
@@ -38,6 +38,9 @@ sunny, 75, 70, true, Play
 overcast, 72, 90, true, Play
 overcast, 81, 75, false, Play
 rain, 71, 80, true, Don't Play
+snow, 0, 85, false, Don't Play
+snow, -9, 72, true, Play
+sunny, -2, 80, true, Don't play
 \.
 
 DROP TABLE IF EXISTS example_data_summary;