You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/01/21 19:06:56 UTC

incubator-madlib git commit: Correlation: Fix minor messaging error in summary table

Repository: incubator-madlib
Updated Branches:
  refs/heads/master d282e0f36 -> 0a48f3ad8


Correlation: Fix minor messaging error in summary table


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/0a48f3ad
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/0a48f3ad
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/0a48f3ad

Branch: refs/heads/master
Commit: 0a48f3ad8af8fdd904d3eca51bb4ed3489e7db72
Parents: d282e0f
Author: Rahul Iyer <ri...@pivotal.io>
Authored: Thu Jan 21 10:06:49 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Thu Jan 21 10:06:49 2016 -0800

----------------------------------------------------------------------
 .../postgres/modules/stats/correlation.py_in    | 43 ++++++++++++--------
 1 file changed, 26 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0a48f3ad/src/ports/postgres/modules/stats/correlation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in
index fc9959b..e8b64cb 100644
--- a/src/ports/postgres/modules/stats/correlation.py_in
+++ b/src/ports/postgres/modules/stats/correlation.py_in
@@ -144,7 +144,7 @@ def _analyze_target_cols(source_table, target_cols):
 
 
 def _populate_output_table(schema_madlib, source_table, output_table,
-                           col_names, get_cov=True, verbose=False):
+                           col_names, get_cov=False, verbose=False):
     """
     Creates a relation with the appropriate number of columns given a list of
     column names and populates with the correlation coefficients. If the table
@@ -167,6 +167,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
         col_names_str = ",".join(col_names)
         temp_table = unique_string()
         if get_cov:
+            function_name = "Covariance"
             agg_str = """
                 (CASE WHEN count(*) > 0
                       THEN {0}.array_scalar_mult({0}.covariance_agg(x, mean),
@@ -174,6 +175,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
                       ELSE NULL
                 END) """.format(schema_madlib)
         else:
+            function_name = "Correlation"
             agg_str = "{0}.correlation_agg(x, mean)".format(schema_madlib)
 
         # actual computation
@@ -208,7 +210,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
         q_summary = """
             CREATE TABLE {summary_table} AS
             SELECT
-                'correlation'::varchar      AS method,
+                '{function_name}'::varchar  AS method,
                 '{source_table}'::varchar   AS source,
                 '{output_table}'::varchar   AS output_table,
                 '{col_names_str}'::varchar  AS column_names,
@@ -232,21 +234,17 @@ def _populate_output_table(schema_madlib, source_table, output_table,
             CREATE TABLE {output_table} AS
             SELECT
                 *
-            FROM
-            (
+            FROM (
                 SELECT
                     generate_series(1, {num_cols}) AS column_position,
                     unnest($1) AS variable
             ) variable_subq
-            JOIN
-            (
+            JOIN (
                 SELECT
                     *
-                FROM
-                    {schema_madlib}.__deconstruct_lower_triangle(
+                FROM {schema_madlib}.__deconstruct_lower_triangle(
                         (SELECT cor_mat FROM {temp_table})
-                    )
-                    AS {as_list}
+                    ) AS {as_list}
             ) matrix_subq
             USING (column_position)
             """.format(num_cols=len(col_names), **locals()), ["varchar[]"])
@@ -346,7 +344,24 @@ SELECT madlib.{func}('example_data', 'example_data_output', 'temperature, humidi
 SELECT * from example_data_output order by column_position;
          """.format(func=func)
     else:
-        return """
+        if cov:
+            return """
+Covariance is a measure of how much two random variables change together. If the
+greater values of one variable mainly correspond with the greater values of the
+other variable, and the same holds for the smaller values, i.e., the variables
+tend to show similar behavior, the covariance is positive. In the opposite
+case, when the greater values of one variable mainly correspond to the smaller
+values of the other, i.e., the variables tend to show opposite behavior, the
+covariance is negative. The sign of the covariance therefore shows the tendency
+-------
+For an overview on usage, run:
+    SELECT {schema_madlib}.covariance('usage');
+-------
+For examples:
+    SELECT {schema_madlib}.covariance('example');
+            """.format(schema_madlib=schema_madlib)
+        else:
+            return """
 A correlation function is the degree and direction of association of
 two variables; how well can one random variable be predicted
 from the other. The coefficient of correlation varies from -1 to 1:
@@ -355,14 +370,8 @@ perfectly anti-correlated.
 -------
 For an overview on usage, run:
     SELECT {schema_madlib}.correlation('usage');
-
-To obtain the covariance values instead of correlation:
-    SELECT {schema_madlib}.covariance('usage');
 -------
 For examples:
     SELECT {schema_madlib}.correlation('example');
-OR
-    SELECT {schema_madlib}.covariance('example');
-
             """.format(schema_madlib=schema_madlib)
 # ------------------------------------------------------------------------------