You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/01/21 19:06:56 UTC
incubator-madlib git commit: Correlation: Fix minor messaging error in summary table
Repository: incubator-madlib
Updated Branches:
refs/heads/master d282e0f36 -> 0a48f3ad8
Correlation: Fix minor messaging error in summary table
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/0a48f3ad
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/0a48f3ad
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/0a48f3ad
Branch: refs/heads/master
Commit: 0a48f3ad8af8fdd904d3eca51bb4ed3489e7db72
Parents: d282e0f
Author: Rahul Iyer <ri...@pivotal.io>
Authored: Thu Jan 21 10:06:49 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Thu Jan 21 10:06:49 2016 -0800
----------------------------------------------------------------------
.../postgres/modules/stats/correlation.py_in | 43 ++++++++++++--------
1 file changed, 26 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0a48f3ad/src/ports/postgres/modules/stats/correlation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in
index fc9959b..e8b64cb 100644
--- a/src/ports/postgres/modules/stats/correlation.py_in
+++ b/src/ports/postgres/modules/stats/correlation.py_in
@@ -144,7 +144,7 @@ def _analyze_target_cols(source_table, target_cols):
def _populate_output_table(schema_madlib, source_table, output_table,
- col_names, get_cov=True, verbose=False):
+ col_names, get_cov=False, verbose=False):
"""
Creates a relation with the appropriate number of columns given a list of
column names and populates with the correlation coefficients. If the table
@@ -167,6 +167,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
col_names_str = ",".join(col_names)
temp_table = unique_string()
if get_cov:
+ function_name = "Covariance"
agg_str = """
(CASE WHEN count(*) > 0
THEN {0}.array_scalar_mult({0}.covariance_agg(x, mean),
@@ -174,6 +175,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
ELSE NULL
END) """.format(schema_madlib)
else:
+ function_name = "Correlation"
agg_str = "{0}.correlation_agg(x, mean)".format(schema_madlib)
# actual computation
@@ -208,7 +210,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
q_summary = """
CREATE TABLE {summary_table} AS
SELECT
- 'correlation'::varchar AS method,
+ '{function_name}'::varchar AS method,
'{source_table}'::varchar AS source,
'{output_table}'::varchar AS output_table,
'{col_names_str}'::varchar AS column_names,
@@ -232,21 +234,17 @@ def _populate_output_table(schema_madlib, source_table, output_table,
CREATE TABLE {output_table} AS
SELECT
*
- FROM
- (
+ FROM (
SELECT
generate_series(1, {num_cols}) AS column_position,
unnest($1) AS variable
) variable_subq
- JOIN
- (
+ JOIN (
SELECT
*
- FROM
- {schema_madlib}.__deconstruct_lower_triangle(
+ FROM {schema_madlib}.__deconstruct_lower_triangle(
(SELECT cor_mat FROM {temp_table})
- )
- AS {as_list}
+ ) AS {as_list}
) matrix_subq
USING (column_position)
""".format(num_cols=len(col_names), **locals()), ["varchar[]"])
@@ -346,7 +344,24 @@ SELECT madlib.{func}('example_data', 'example_data_output', 'temperature, humidi
SELECT * from example_data_output order by column_position;
""".format(func=func)
else:
- return """
+ if cov:
+ return """
+Covariance is a measure of how much two random variables change together. If the
+greater values of one variable mainly correspond with the greater values of the
+other variable, and the same holds for the smaller values, i.e., the variables
+tend to show similar behavior, the covariance is positive. In the opposite
+case, when the greater values of one variable mainly correspond to the smaller
+values of the other, i.e., the variables tend to show opposite behavior, the
+covariance is negative. The sign of the covariance therefore shows the tendency in the linear relationship between the variables.
+-------
+For an overview on usage, run:
+ SELECT {schema_madlib}.covariance('usage');
+-------
+For examples:
+ SELECT {schema_madlib}.covariance('example');
+ """.format(schema_madlib=schema_madlib)
+ else:
+ return """
A correlation function is the degree and direction of association of
two variables; how well can one random variable be predicted
from the other. The coefficient of correlation varies from -1 to 1:
@@ -355,14 +370,8 @@ perfectly anti-correlated.
-------
For an overview on usage, run:
SELECT {schema_madlib}.correlation('usage');
-
-To obtain the covariance values instead of correlation:
- SELECT {schema_madlib}.covariance('usage');
-------
For examples:
SELECT {schema_madlib}.correlation('example');
-OR
- SELECT {schema_madlib}.covariance('example');
-
""".format(schema_madlib=schema_madlib)
# ------------------------------------------------------------------------------