You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/08/29 20:42:14 UTC
[37/50] [abbrv] incubator-madlib git commit: Elastic Net: Fix
normalization issue
Elastic Net: Fix normalization issue
MADLIB-1094 and MADLIB-1146
avg in psql is numerically unstable
Data scaling was not occurring when
grouping is true.
Closes #164
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ceefae4f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ceefae4f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ceefae4f
Branch: refs/heads/latest_release
Commit: ceefae4f4180b88a1aa5712d0e43f0b00573c378
Parents: 6025c4b
Author: Cooper Sloan <co...@gmail.com>
Authored: Thu Aug 10 12:04:04 2017 -0700
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Fri Aug 11 11:31:12 2017 -0700
----------------------------------------------------------------------
.../elastic_net_generate_result.py_in | 6 +--
.../elastic_net/elastic_net_optimizer_igd.py_in | 4 +-
.../modules/elastic_net/elastic_net_utils.py_in | 42 ++++++++++++++------
3 files changed, 35 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
index df5489f..7a87ef6 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
@@ -38,13 +38,13 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args):
select_mean_and_std = ''
inner_join_x = ''
inner_join_y = ''
+ grouping_cols_list = split_quoted_delimited_str(grouping_column)
+ select_grp = ','.join(['n_tuples_including_nulls_subq.'+str(grp)
+ for grp in grouping_cols_list]) + ','
if data_scaled:
- grouping_cols_list = split_quoted_delimited_str(grouping_column)
select_grouping_info = ','.join([
grp_col.strip()+"\t"+cols_types[grp_col.strip()]
for grp_col in grouping_column.split(',')]) + ","
- select_grp = ','.join(['n_tuples_including_nulls_subq.'+str(grp)
- for grp in grouping_cols_list]) + ','
x_grp_cols = ' AND '.join([
'n_tuples_including_nulls_subq.{0}={1}.{2}'.format(grp,
args["x_mean_table"], grp) for grp in grouping_cols_list])
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
index d73a754..c5d21c2 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
@@ -4,7 +4,7 @@ from utilities.utilities import unique_string
from utilities.in_mem_group_control import GroupIterationController
from elastic_net_utils import _compute_means
from elastic_net_utils import _normalize_data
-from elastic_net_utils import _compute_data_scales
+from elastic_net_utils import _compute_scales
from elastic_net_utils import _tbl_dimension_rownum
from elastic_net_utils import _elastic_net_validate_args
from utilities.utilities import _array_to_string
@@ -216,7 +216,7 @@ def _elastic_net_igd_train_compute(schema_madlib, func_step_aggregate,
args["col_ind_var_new"] = args["col_ind_var_norm_new"]
args["col_dep_var_new"] = args["col_dep_var_norm_new"]
else:
- _compute_data_scales(args)
+ _compute_scales(args)
tbl_used = tbl_source
args["col_ind_var_new"] = col_ind_var
args["col_dep_var_new"] = col_dep_var
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
index b2f2505..154ac31 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
@@ -129,18 +129,27 @@ def _compute_log_likelihood(coef, intercept, **args):
Compute the log-likelihood at the end of calculation
"""
if args["family"] == "gaussian": # linear models
+ loss_query = """
+ select
+ {method}(({col_dep_var_new} - {schema_madlib}.elastic_net_gaussian_predict(
+ '{coefficients}'::double precision[],
+ {intercept}::double precision,
+ {col_ind_var_new}))^2)/({denominator})
+ as loss
+ from
+ {tbl_used}
+ """
+ # See jira 1094, avg experiences numerical instability
+ denominator = "2."
+ method = "avg"
+ if not args["normalization"]:
+ method = "sum"
+ denominator = "count(*) * 2."
loss = plpy.execute(
- """
- select
- avg(({col_dep_var_new} - {schema_madlib}.elastic_net_gaussian_predict(
- '{coefficients}'::double precision[],
- {intercept}::double precision,
- {col_ind_var_new}))^2) / 2.
- as loss
- from
- {tbl_used}
- """.format(coefficients=_array_to_string(coef),
+ loss_query.format(coefficients=_array_to_string(coef),
intercept=intercept,
+ method=method,
+ denominator=denominator,
**args))[0]["loss"]
elif args["family"] == "binomial": # logistic models
loss = plpy.execute(
@@ -192,8 +201,18 @@ def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
return None
# ------------------------------------------------------------------------
+def _compute_scales(args):
+ if args["grouping_col"]:
+ _compute_data_scales_grouping(args)
+ else:
+ _compute_data_scales(args)
def _compute_data_scales_grouping(args):
+ # When grouping_col is defined, we must find an array containing
+ # the mean of every dimension in the independent variable (x), the
+ # mean of dependent variable (y) and the standard deviation for them
+ # specific to groups. Store these results in temp tables x_mean_table
+ # and y_mean_table.
__utils_ind_var_scales_grouping(args["tbl_source"], args["col_ind_var"],
args["dimension"], args["schema_madlib"], args["grouping_col"],
args["x_mean_table"])
@@ -227,13 +246,13 @@ def _normalize_data(args):
The output is stored in tbl_data_scaled
"""
y_decenter = True if args["family"] == "gaussian" else False
+ _compute_scales(args)
if args["grouping_col"]:
# When grouping_col is defined, we must find an array containing
# the mean of every dimension in the independent variable (x), the
# mean of dependent variable (y) and the standard deviation for them
# specific to groups. Store these results in temp tables x_mean_table
# and y_mean_table.
- _compute_data_scales_grouping(args)
# __utils_normalize_data_grouping reads the various means and stds
# from the tables.
__utils_normalize_data_grouping(y_decenter=y_decenter,
@@ -251,7 +270,6 @@ def _normalize_data(args):
# When no grouping_col is defined, the mean and std for both 'x' and
# 'y' can be defined using strings, stored in x_mean_str, x_std_str
# etc. We don't need a table like how we needed for grouping.
- _compute_data_scales(args)
__utils_normalize_data(y_decenter=y_decenter,
tbl_data=args["tbl_source"],
col_ind_var=args["col_ind_var"],