You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/12/05 18:31:55 UTC
madlib git commit: Feature: Allow NULL in rows for computing correlations and covariance JIRA: MADLIB-1166

Repository: madlib
Updated Branches:
  refs/heads/master f28c5a07e -> 4aa073294


Feature: Allow NULL in rows for computing correlations and covariance
JIRA: MADLIB-1166

Additional Authors:
Orhan Kislal <ok...@pivotal.io>

- Impute NULL values with the mean of corresponding column.

Closes #206


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/4aa07329
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/4aa07329
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/4aa07329

Branch: refs/heads/master
Commit: 4aa0732942523a38ed3e7da60e43e3307ff8a5a1
Parents: f28c5a0
Author: Swatisoni <so...@gmail.com>
Authored: Tue Dec 5 10:27:13 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Tue Dec 5 10:30:41 2017 -0800

----------------------------------------------------------------------
 .../postgres/modules/stats/correlation.py_in    | 37 +++++------
 .../postgres/modules/stats/correlation.sql_in   | 25 ++++++--
 .../modules/stats/test/correlation.sql_in       | 65 ++++++++++++++++++++
 3 files changed, 101 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/4aa07329/src/ports/postgres/modules/stats/correlation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in
index 0b08208..f658b48 100644
--- a/src/ports/postgres/modules/stats/correlation.py_in
+++ b/src/ports/postgres/modules/stats/correlation.py_in
@@ -165,7 +165,6 @@ def _populate_output_table(schema_madlib, source_table, output_table,
     with MinWarning("info" if verbose else "error"):
         start = time()
         col_len = len(col_names)
-        col_names_as_float_array = py_list_to_sql_string(col_names, "float8")
         col_names_as_text_array = py_list_to_sql_string(col_names, "varchar")
         temp_table = unique_string()
         if get_cov:
@@ -180,32 +179,31 @@ def _populate_output_table(schema_madlib, source_table, output_table,
             function_name = "Correlation"
             agg_str = "{0}.correlation_agg(x, mean)".format(schema_madlib)
 
+        cols = ','.join(["coalesce({0}, avg_{0})".format(col) for col in col_names])
+        avgs = ','.join(["avg({0}) AS avg_{0}".format(col) for col in col_names])
+        avg_array = ','.join(["avg_{0}".format(col) for col in col_names])
         # actual computation
-        plpy.execute("""
+        sql1 = """
+
             CREATE TEMP TABLE {temp_table} AS
             SELECT
-                tot_cnt,
-                count(*) AS non_null_cnt,
+                count(*) AS tot_cnt,
                 mean,
                 {agg_str} as cor_mat
             FROM
             (
-                SELECT {col_names_as_float_array} AS x
-                FROM {source_table}
-            ) src1,
-            (
-                SELECT
-                    count(*) AS tot_cnt,
-                    {schema_madlib}.avg(x) AS mean
-                FROM
+                SELECT ARRAY[ {cols} ] AS x,
+                        ARRAY [ {avg_array} ] AS mean
+                FROM {source_table},
                 (
-                    SELECT {col_names_as_float_array} AS x
+                    SELECT {avgs}
                     FROM {source_table}
-                ) src2
-            ) subq
-            WHERE NOT {schema_madlib}.array_contains_null(x)
-            GROUP BY tot_cnt, mean
-            """.format(**locals()))
+                )sub1
+            ) sub2
+            GROUP BY mean
+            """.format(**locals())
+
+        plpy.execute(sql1)
 
         # create summary table
         summary_table = add_postfix(output_table, "_summary")
@@ -217,8 +215,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
                 '{output_table}'::varchar   AS output_table,
                 {col_names_as_text_array}   AS column_names,
                 mean                        AS mean_vector,
-                non_null_cnt                AS total_rows_processed,
-                tot_cnt - non_null_cnt      AS total_rows_skipped
+                tot_cnt                     AS total_rows_processed
             FROM {temp_table}
             """.format(**locals())
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/4aa07329/src/ports/postgres/modules/stats/correlation.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in
index 25c2859..79ea82f 100644
--- a/src/ports/postgres/modules/stats/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/correlation.sql_in
@@ -20,6 +20,7 @@ m4_include(`SQLCommon.m4')
 <ul>
 <li><a href="#usage">Correlation Function</a></li>
 <li><a href="#examples">Examples</a></li>
+<li><a href="#literature">Literature</a></li>
 <li><a href="#related">Related Topics</a></li>
 </ul>
 </div>
@@ -104,10 +105,6 @@ is also created at the same time, which has the following columns:
   <th>total_rows_processed</th>
   <td>BIGINT. Total numbers of rows processed.</td>
 </tr>
-<tr>
-  <th>total_rows_skipped</th>
-  <td>BIGINT. Total numbers of rows skipped due to missing values.</td>
-</tr>
 </table>
 </dd>
 
@@ -207,8 +204,24 @@ Result:
 </pre>
 
 @par Notes
-Current implementation ignores a row that contains NULL entirely. This means
-any correlation in such a row (with NULLs) does not contribute to the final answer.
+
+Null values will be replaced by the mean of their respective columns (Mean
+imputation/substitution). Mean imputation is a method in which the missing
+value on a certain variable is replaced by the mean of the available cases.
+This method maintains the sample size and is easy to use, but the variability
+in the data is reduced, so the standard deviations and the variance estimates
+tend to be underestimated. Please refer to [1] and [2] for details.
+
+If the mean imputation method is not suitable for the target use case, it is
+advised to employ a view that handles the NULL values prior to calling the
+correlation/covariance functions.
+
+@anchor literature
+@literature
+
+[1] https://en.wikipedia.org/wiki/Imputation_(statistics)
+
+[2] https://www.iriseekhout.com/missing-data/missing-data-methods/imputation-methods/
 
 @anchor related
 @par Related Topics

http://git-wip-us.apache.org/repos/asf/madlib/blob/4aa07329/src/ports/postgres/modules/stats/test/correlation.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/test/correlation.sql_in b/src/ports/postgres/modules/stats/test/correlation.sql_in
index 163f641..d6cd014 100644
--- a/src/ports/postgres/modules/stats/test/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/test/correlation.sql_in
@@ -22,3 +22,68 @@ DROP TABLE IF EXISTS corr_output, corr_output_summary;
 SELECT * FROM correlation('rand_numeric', 'corr_output', Null);
 DROP TABLE IF EXISTS corr_output, corr_output_summary;
 SELECT * FROM correlation('rand_numeric', 'corr_output', 'a, c, e');
+
+DROP TABLE IF EXISTS example_data;
+CREATE TABLE example_data(
+    id SERIAL, outlook TEXT,
+    temperature FLOAT8, humidity FLOAT8,
+    windy TEXT, class TEXT, new_col FLOAT8);
+INSERT INTO example_data VALUES
+(1, 'sunny', 85, 85, 'false', 'Dont Play', 1),
+(2, 'sunny', 80, 90, 'true', 'Dont Play', 12),
+(3, 'overcast', 83, 78, 'false', 'Play', 13),
+(4, 'rain', 70, 96, 'false', 'Play', 16),
+(5, 'rain', 68, 80, 'false', 'Play', 17),
+(6, 'rain', 65, 70, 'true', 'Dont Play', 12),
+(7, 'overcast', 64, 65, 'true', 'Play', 15),
+(8, 'sunny', 72, 95, 'false', 'Dont Play', 19),
+(9, 'sunny', 69, 70, 'false', 'Play', 20),
+(10, 'rain', 75, 80, 'false', 'Play', 32),
+(11, 'sunny', 75, 70, 'true', 'Play', 31),
+(12, 'overcast', 72, 90, 'true', 'Play', 11),
+(13, 'overcast', 81, 75, 'false', 'Play', 31),
+(14, 'rain', 71, 80, 'true', 'Dont Play', 21),
+(15, NULL, 100, 100, 'true', NULL, 11),
+(16, NULL, 110, 100, 'true', NULL, 13);
+
+--- example_data_output will have correlations of only two columns which do
+--- not include any null columns in another columns
+DROP TABLE IF EXISTS example_data_output, example_data_output_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output',
+                    'temperature, humidity, new_col',
+                    True);
+SELECT * FROM example_data_output;
+
+INSERT INTO example_data VALUES (17, NULL, 110, 100, 'true', NULL, NULL);
+
+--- example_data_output will have correlations of only three columns which does
+--- include any columns. Hence if this
+DROP TABLE IF EXISTS example_data_output_with_null_2column, example_data_output_with_null_2column_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output_with_null_2column',
+                    'temperature, humidity',
+                    True);
+
+SELECT * FROM example_data_output_with_null_2column;
+
+DROP TABLE IF EXISTS example_data_output_with_null_3column, example_data_output_with_null_3column_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output_with_null_3column',
+                    'temperature, humidity, new_col',
+                    True);
+SELECT * FROM example_data_output_with_null_3column;
+
+SELECT assert(ABS(a.temperature - b.temperature) < 0.000001,
+       'Rows with NULL values are ignored.')
+FROM
+  example_data_output_with_null_2column a,
+  example_data_output_with_null_3column b
+WHERE a.column_position = 2 and b.column_position = 2;
+
+SELECT assert(ABS(a.temperature - b.temperature) > 0.000001,
+       'New row with NULL values is ignored.')
+FROM
+  example_data_output a,
+  example_data_output_with_null_2column b
+WHERE a.column_position = 2 and b.column_position = 2;