Posted to commits@madlib.apache.org by ri...@apache.org on 2016/12/22 00:08:51 UTC

[1/3] incubator-madlib git commit: Elastic net: Add cross validation

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 38d1e87a8 -> 6939fd63b


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
index 6b291ec..ce6b280 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
@@ -1,4 +1,3 @@
-
 import plpy
 import math
 import re
@@ -7,22 +6,25 @@ from utilities.utilities import _array_to_string
 from convex.utils_regularization import __utils_ind_var_scales
 from convex.utils_regularization import __utils_dep_var_scale
 from convex.utils_regularization import __utils_normalize_data
-# from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_exists
-# from utilities.validate_args import scalar_col_has_no_null
-# from utilities.validate_args import array_col_has_same_dimension
-# from utilities.validate_args import array_col_has_no_null
 from utilities.control import IterationController2S
-#from convex.lasso_igd import IterationControllerNoTableDrop
-from utilities.utilities import __mad_version
 
-version_wrapper = __mad_version()
-mad_vec = version_wrapper.select_vecfunc()
+from collections import namedtuple
+
+# ------------------------------------------------------------------------
+# -- constants -----------------------------------------------------------
 
-# ========================================================================
+# The constants below are defined so that they can be used like enums:
+#   'igd' in OPTIMIZERS  (returns True)
+#   'igd' == OPTIMIZERS.igd  (returns True)
+BINOMIAL_FAMILIES = namedtuple("bin", ("binomial logistic"))('binomial', 'logistic')
+GAUSSIAN_FAMILIES = namedtuple("gau", ("gaussian linear"))('gaussian', 'linear')
+OPTIMIZERS = namedtuple("opt", ("igd fista"))('igd', 'fista')
+# -------------------------------------------------------------------------
 
 
-def __process_results(coef, intercept, outstr_array):
+def _process_results(coef, intercept, outstr_array):
     """
     Return features, features_selected, dense_coef
     """
@@ -42,10 +44,10 @@ def __process_results(coef, intercept, outstr_array):
     dense_coef = _array_to_string(dense_coef)
 
     return (features, features_selected, dense_coef, _array_to_string(coef))
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __process_warmup_lambdas(lambdas, lambda_value):
+def _process_warmup_lambdas(lambdas, lambda_value):
     """
     Convert the string of warmup_lambdas into a double array
     @param lambdas The string which will be converted to an array
@@ -57,10 +59,9 @@ def __process_warmup_lambdas(lambdas, lambda_value):
         plpy.error("Elastic Net error: warmup_lambdas must be NULL or something like {3,2,1} !")
 
     elm = _string_to_array(matched.group(1))
-    for i in range(len(elm)):
-        elm[i] = float(elm[i])
+    elm = [float(i) for i in elm]
 
-    if elm[len(elm) - 1] != lambda_value:
+    if elm[-1] != lambda_value:
         plpy.error("""
                    Elastic Net error: The last element of warmup_lambdas must
                    be equal to the lambda value that you want to compute !
@@ -75,53 +76,40 @@ def __process_warmup_lambdas(lambdas, lambda_value):
                            """)
 
     return elm
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __generate_warmup_lambda_sequence(tbl_used, col_ind_var, col_dep_var,
-                                      dimension, row_num, lambda_value,
-                                      alpha, num_steps, sq):
+def _generate_warmup_lambda_sequence(lambda_value, n_steps):
     """
     Compute the lambda sequence when warmup is True and warmup_lambdas
     are not given
     """
-    if num_steps == 1:
+    if n_steps == 1:
         return [lambda_value]
 
-    # mean_y = plpy.execute("select avg({col_dep_var}) from {tbl_used}".format(
-    #     col_dep_var = col_dep_var, tbl_used = tbl_used))[0]["avg"]
-    # xy = [0] * dimension
-    # for i in range(1,dimension+1):
-    #     xy[i-1] = plpy.execute("""
-    #                          select abs(sum({col_ind_var}[{i}] * ({col_dep_var} - {mean_y})))
-    #                          from {tbl_used}
-    #                          """.format(col_ind_var = col_ind_var,
-    #                                     col_dep_var = col_dep_var,
-    #                                     mean_y = mean_y,
-    #                                     tbl_used = tbl_used,
-    #                                     i = i))[0]["abs"]
-    # max_sq = max(sq)
-    # epsilon = 0.001
-    # effective_alpha = alpha + (1 - alpha) * epsilon
-    # largest = (max(xy)/ float(row_num) + epsilon * max_sq) / effective_alpha
     largest = 1e5
-    if lambda_value == 0.:
+
+    if abs(lambda_value - 0.) < 1e-6:
+        zero_lambda = True
         smallest = 0.001 * largest
+        n_steps -= 1
     else:
         smallest = lambda_value
-    step = math.log(largest / smallest) / (float(num_steps) - 1)
-    seq = range(num_steps)
-    seq.reverse()
-    for i in range(num_steps):
-        seq[i] = math.exp(seq[i] * step + math.log(smallest))
-    if lambda_value == 0:
-        seq.append(0)
+        zero_lambda = False
+
+    smallest, largest = min(smallest, largest), max(smallest, largest)
+    step = math.log(smallest / largest) / (float(n_steps) - 1)
+    constant = math.log(largest)
+
+    seq = [math.exp(j * step + constant) for j in range(n_steps)]
+    if zero_lambda:
+        seq.append(0.)
 
     return seq
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __compute_average_sq(**args):
+def _compute_average_sq(**args):
     """
     Compute the average squares of all features, used to estimate the largest lambda
     Actually only the largest value is used, so order does not matter here
@@ -132,10 +120,10 @@ def __compute_average_sq(**args):
             sq[i] = (args["x_scales"]["std"][i]) ** 2 + (args["x_scales"]["mean"][i]) ** 2
 
     return sq
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __compute_log_likelihood(coef, intercept, **args):
+def _compute_log_likelihood(coef, intercept, **args):
     """
     Compute the log-likelihood at the end of calculation
     """
@@ -167,23 +155,20 @@ def __compute_log_likelihood(coef, intercept, **args):
             """.format(coefficients=_array_to_string(coef),
                        intercept=intercept,
                        **args))[0]["loss"]
-    module_1 = sum(x * x for x in coef)
+    module_1 = sum(x * x for x in coef) / 2.
     module_2 = sum(abs(x) for x in coef)
 
-    log_likelihood = -(loss + args["lambda_value"] *
-                       ((1 - args["alpha"]) *
-                        module_1 / 2. + args["alpha"] * module_2))
+    log_likelihood = - (loss + args["lambda_value"] *
+                        ((1 - args["alpha"]) * module_1 + args["alpha"] * module_2))
     return log_likelihood
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
-                                tbl_result, tbl_summary, lambda_value, alpha,
-                                normalization, max_iter, tolerance):
-    if (tbl_source is None or col_ind_var is None or col_dep_var is None
-            or tbl_result is None or lambda_value is None or alpha is None
-            or normalization is None or len(tbl_source) == 0 or len(col_ind_var) == 0
-            or len(col_dep_var) == 0 or len(tbl_result) == 0):
+def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
+                               tbl_result, tbl_summary, lambda_value, alpha,
+                               normalization, max_iter, tolerance):
+    if (any(i is None for i in (lambda_value, alpha, normalization)) or
+            any(not i for i in (tbl_source, col_ind_var, col_dep_var, tbl_result))):
         plpy.error("Elastic Net error: You have unsupported NULL/empty value(s) in the arguments!")
 
     if table_exists(tbl_result, only_first_schema=True):
@@ -205,31 +190,31 @@ def __elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
         plpy.error("Elastic Net error: tolerance must be positive!")
 
     return None
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __compute_data_scales(args):
+def _compute_data_scales(args):
     args["x_scales"] = __utils_ind_var_scales(tbl_data=args["tbl_source"], col_ind_var=args["col_ind_var"],
                                               dimension=args["dimension"], schema_madlib=args["schema_madlib"])
 
     if args["family"] == "binomial":
         args["y_scale"] = dict(mean=0, std=1)
     else:
-        args["y_scale"] = __utils_dep_var_scale(schema_madlib=args["schema_madlib"], tbl_data=args["tbl_source"], 
-            col_ind_var=args["col_ind_var"], col_dep_var=args["col_dep_var"])
+        args["y_scale"] = __utils_dep_var_scale(schema_madlib=args["schema_madlib"], tbl_data=args["tbl_source"],
+                                                col_ind_var=args["col_ind_var"], col_dep_var=args["col_dep_var"])
 
     args["xmean_str"] = _array_to_string(args["x_scales"]["mean"])
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __normalize_data(args):
+def _normalize_data(args):
     """
     Compute the scaling factors for independent and dependent
     variables, and then scale the original data.
 
     The output is stored in tbl_data_scaled
     """
-    __compute_data_scales(args)
+    _compute_data_scales(args)
 
     y_decenter = True if args["family"] == "gaussian" else False
 
@@ -248,10 +233,10 @@ def __normalize_data(args):
                            grouping_col=args["grouping_col"])
 
     return None
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var):
+def _tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var):
     """
     Measure the dimension and row number of source data table
     """
@@ -270,14 +255,14 @@ def __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var):
                            select count(*) from {tbl_source}
                            WHERE not {schema_madlib}.array_contains_null({col_ind_var})
                            """.format(tbl_source=tbl_source,
-                           schema_madlib=schema_madlib,
-                           col_ind_var=col_ind_var))[0]["count"]
+                                      schema_madlib=schema_madlib,
+                                      col_ind_var=col_ind_var))[0]["count"]
 
     return (dimension, row_num)
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __compute_means(**args):
+def _compute_means(**args):
     """
     Compute the averages of dependent (y) and independent (x) variables
     """
@@ -287,7 +272,7 @@ def __compute_means(**args):
         return (xmean_str, ymean)
     else:
         return (args["xmean_str"], args["y_scale"]["mean"])
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
 class IterationControllerNoTableDrop (IterationController2S):
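A note on the warmup refactor above: _generate_warmup_lambda_sequence now
produces a log-spaced sequence from a fixed ceiling (1e5) down to the target
lambda, and _compute_log_likelihood moves the division by two into module_1,
which is algebraically identical to the old penalty
loss + lambda * ((1 - alpha) * ||coef||^2 / 2 + alpha * ||coef||_1).
A minimal standalone sketch of the sequence logic (illustrative only, not
the module code):

    import math

    def warmup_lambda_sequence(lambda_value, n_steps, largest=1e5):
        # Log-spaced sequence from `largest` down to `lambda_value`;
        # a (near-)zero target lambda is appended as an extra final
        # step instead of serving as the endpoint of the log scale.
        if n_steps == 1:
            return [lambda_value]
        zero_lambda = abs(lambda_value) < 1e-6
        smallest = 0.001 * largest if zero_lambda else lambda_value
        if zero_lambda:
            n_steps -= 1
        smallest, largest = min(smallest, largest), max(smallest, largest)
        step = math.log(smallest / largest) / (float(n_steps) - 1)
        seq = [math.exp(j * step + math.log(largest)) for j in range(n_steps)]
        if zero_lambda:
            seq.append(0.)
        return seq

    # warmup_lambda_sequence(0.1, 4) -> [100000.0, ~1000.0, ~10.0, 0.1]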

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in b/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
index e79289b..5146b93 100644
--- a/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
+++ b/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
@@ -814,3 +814,53 @@ end;
 $$ language plpgsql volatile;
 
 select check_elastic_net();
+
+CREATE VIEW lin_housing AS
+SELECT id, x AS features, y AS price, grp_by_col AS grp FROM lin_housing_wi;
+
+DROP TABLE if exists house_en, house_en_summary, house_en_cv;
+SELECT elastic_net_train(
+    'lin_housing',
+    'house_en',
+    'price',
+    'features',
+    'gaussian',
+    1,
+    0.2,
+    True,
+    NULL,
+    'fista',
+    $$ eta = 2, max_stepsize = 0.5, use_active_set = f,
+       n_folds = 3, lambda_value = {0.1, 1, 2}, alpha = {0, 1}
+    $$,
+    NULL,
+    100,
+    1e-6
+);
+SELECT * FROM house_en;
+SELECT * FROM house_en_summary;
+
+DROP TABLE if exists house_en, house_en_summary, house_en_cv;
+SELECT elastic_net_train(
+    'lin_housing_wi',
+    'house_en',
+    'y',
+    'x',
+    'gaussian',
+    0.1,
+    0.2,
+    True,
+    NULL,
+    'fista',
+    $$ eta = 2, max_stepsize = 0.5, use_active_set = f,
+       n_folds = 3, validation_result=house_en_cv,
+       n_lambdas = 3, alpha = {0, 0.1, 1},
+       warmup = True, warmup_lambdas = {10, 1, 0.1}
+    $$,
+    NULL,
+    100,
+    1e-6
+);
+SELECT * FROM house_en;
+SELECT * FROM house_en_summary;
+SELECT * FROM house_en_cv;
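+
+-- For the first call above, the cross-validation grid is the product of
+-- the swept parameters, so the number of model fits is easy to predict;
+-- a back-of-the-envelope count (counts only, not the module's scheduling):
+--   n_folds = 3, lambda_value = {0.1, 1, 2}, alpha = {0, 1}
+--   => 3 * 2 = 6 (lambda, alpha) pairs, * 3 folds = 18 trainings in total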

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/svm/svm.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in
index 5e1ea33..0dd7767 100644
--- a/src/ports/postgres/modules/svm/svm.py_in
+++ b/src/ports/postgres/modules/svm/svm.py_in
@@ -1518,9 +1518,7 @@ def _extract_params(schema_madlib, params, module='SVM'):
         'eps_table': str,
         'class_weight': str}
 
-    params_vals = extract_keyvalue_params(params,
-                                          params_types,
-                                          params_default)
+    params_vals = extract_keyvalue_params(params, params_types, params_default)
     if params_vals['n_folds'] < 0:
         plpy.error("{0} Error: n_folds must be non-negative!".format(module))
 
@@ -1556,6 +1554,7 @@ def _extract_params(schema_madlib, params, module='SVM'):
     return params_vals
 # -------------------------------------------------------------------------
 
+
 import unittest
 
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/utilities/control.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/control.py_in b/src/ports/postgres/modules/utilities/control.py_in
index a0c3d55..b031701 100644
--- a/src/ports/postgres/modules/utilities/control.py_in
+++ b/src/ports/postgres/modules/utilities/control.py_in
@@ -36,8 +36,7 @@ class EnableOptimizer(object):
         self.optimizer_enabled = False
         # we depend on the fact that all GPDB/HAWQ versions that have the
         # optimizer also define function properties
-        self.guc_exists = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>,
-                                   <!True!>, <!False!>)
+        self.guc_exists = True if HAS_FUNCTION_PROPERTIES else False
 
     def __enter__(self):
         # we depend on the fact that all GPDB/HAWQ versions that have the ORCA
@@ -106,10 +105,9 @@ class MinWarning:
     def __enter__(self):
         self.oldMsgLevel = plpy.execute("""
             SELECT setting FROM pg_settings WHERE name='client_min_messages'
-        """)[0]['setting']
-        plpy.execute("""
-            SET client_min_messages = {warningLevel}
-            """.format(warningLevel=self.warningLevel))
+            """)[0]['setting']
+        plpy.execute("SET client_min_messages = {warningLevel}".
+                     format(warningLevel=self.warningLevel))
         return self
 
     def __exit__(self, *args):
@@ -120,9 +118,8 @@ class MinWarning:
             return False
         else:
             # if no exception then we reset the client_min_messages
-            plpy.execute("""
-                SET client_min_messages = {oldMsgLevel};
-                """.format(oldMsgLevel=self.oldMsgLevel))
+            plpy.execute("SET client_min_messages = {oldMsgLevel}; ".
+                         format(oldMsgLevel=self.oldMsgLevel))
 
 
 class IterationController:
@@ -230,8 +227,8 @@ class IterationController:
                             SELECT $1 AS _state
                         ) AS _state ON True
                     """.format(expression=expression).
-                        format(iteration=self.iteration, **self.kwargs),
-                        ["DOUBLE PRECISION[]"])
+                    format(iteration=self.iteration, **self.kwargs),
+                    ["DOUBLE PRECISION[]"])
                 resultObject = plpy.execute(eval_plan,
                                             [[] if self.new_state['_state'] is None
                                              else self.new_state['_state']])
@@ -245,7 +242,7 @@ class IterationController:
                             WHERE _state._iteration = {{iteration}}
                         ) AS _state ON True
                     """.format(expression=expression).
-                        format(iteration=self.iteration, **self.kwargs))
+                    format(iteration=self.iteration, **self.kwargs))
 
         if resultObject.nrows() == 0:
             return None
@@ -294,13 +291,9 @@ class IterationController:
         if STATE_IN_MEM:
             self.old_state = self.new_state
             update_plan = plpy.prepare("""
-                SELECT
-                    {iteration} AS _iteration,
-                    ({newState}) AS _state
-                """.format(iteration=self.iteration,
-                           newState=newState, **self.kwargs).
-                    format(__state__='$1'),
-                    ["DOUBLE PRECISION[]"])
+                SELECT {0} AS _iteration, ({1}) AS _state
+                """.format(self.iteration, newState).format(__state__='$1'),
+                ["DOUBLE PRECISION[]"])
             self.new_state = plpy.execute(update_plan,
                                           [None if self.new_state['_state'] is None
                                            else self.new_state['_state']])[0]
@@ -310,10 +303,9 @@ class IterationController:
                 SELECT
                     {iteration},
                     ({newState})
-                """.format(
-                    iteration=self.iteration,
-                    newState=newState,
-                    **self.kwargs))
+                """.format(iteration=self.iteration,
+                           newState=newState,
+                           **self.kwargs))
             if self.truncAfterIteration:
                 self.runSQL("""
                     DELETE FROM {rel_state} AS _state
@@ -362,8 +354,8 @@ class IterationController2D(IterationController):
                         LEFT OUTER JOIN (
                             SELECT {{schema_madlib}}.array_to_2d($1) AS _state
                         ) AS _state ON True
-                    """.format(expression=expression).format(
-                               **self.kwargs), ["DOUBLE PRECISION[]"])
+                    """.format(expression=expression).
+                    format(**self.kwargs), ["DOUBLE PRECISION[]"])
 
                 resultObject = plpy.execute(
                     eval_plan,
@@ -378,8 +370,7 @@ class IterationController2D(IterationController):
                             WHERE _state._iteration = {{iteration}}
                         ) AS _state ON True
                     """.format(expression=expression).
-                        format(iteration=self.iteration,
-                               **self.kwargs))
+                    format(iteration=self.iteration, **self.kwargs))
         if resultObject.nrows() == 0:
             return None
         else:
@@ -429,10 +420,7 @@ class IterationController2D(IterationController):
                 SELECT
                     {iteration},
                     ({newState})
-                """.format(
-                    iteration=self.iteration,
-                    newState=newState,
-                    **self.kwargs))
+                """.format(iteration=self.iteration, newState=newState, **self.kwargs))
             if self.truncAfterIteration:
                 self.runSQL("""
                     DELETE FROM {rel_state} AS _state
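
The MinWarning changes above keep its save/set/restore contract: the old
client_min_messages setting is read in __enter__, overridden for the body
of the `with` block, and written back in __exit__ only when no exception
escaped. A self-contained sketch of that pattern, using a caller-supplied
`execute` in place of plpy.execute (which exists only inside the database):

    class MinMessageLevel(object):
        def __init__(self, execute, level='warning'):
            self.execute = execute          # e.g. plpy.execute in-database
            self.level = level
            self.old_level = None

        def __enter__(self):
            # remember the current GUC value, then override it
            self.old_level = self.execute(
                "SELECT setting FROM pg_settings "
                "WHERE name='client_min_messages'")[0]['setting']
            self.execute("SET client_min_messages = {0}".format(self.level))
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            if exc_type is None:
                # restore only on clean exit, mirroring MinWarning
                self.execute("SET client_min_messages = {0}".
                             format(self.old_level))
            return False                    # never swallow exceptions

The IterationController hunks also lean on two-pass str.format: the first
pass fills {expression} while doubled braces such as {{iteration}} collapse
into single-brace placeholders for the second pass.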

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/utilities/in_mem_group_control.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in
index ccb87f6..efebb08 100644
--- a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in
+++ b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in
@@ -14,6 +14,7 @@ from collections import Iterable
 
 class BaseState(object):
     """@brief Abstraction for intermediate iteration state"""
+
     def __init__(self, **kwargs):
         self._state = {}
         self._is_none = None
@@ -86,11 +87,11 @@ class BaseState(object):
         return result
 
     def are_last_state_value_zero(self):
-        ## This function returns a boolean value, after inspecting the last
-        ## element of the state array for each group.
-        ## This returns True, only if the last element of the state array
-        ## of every group is (almost equal to) 0. If the value is non-zero
-        ## even for a single group, it returns False.
+        # This function returns a boolean value, after inspecting the last
+        # element of the state array for each group.
+        # This returns True, only if the last element of the state array
+        # of every group is (almost equal to) 0. If the value is non-zero
+        # even for a single group, it returns False.
         return all([self.isclose_to_zero(val[-1]) for val in self._state.itervalues()])
 
     def update(self, col_grp_key, col_grp_state, ret_states):
@@ -168,6 +169,7 @@ class BaseState(object):
 
 class Bytea8State(BaseState):
     """@brief bytea8 type state"""
+
     def __init__(self, **kwargs):
         super(Bytea8State, self).__init__(**kwargs)
 
@@ -219,7 +221,7 @@ class GroupIterationController:
             grouping_col=("NULL"
                           if arg_dict["grouping_col"] is None
                           else arg_dict["grouping_col"]),
-            )
+        )
         self.grp_to_n_tuples = {}
         self.failed_grp_keys = []
 
@@ -250,9 +252,9 @@ class GroupIterationController:
                             "unnest($3) AS {col_grp_key}, "
                             "unnest($4) AS {col_n_tuples}"
                             .format(**self.kwargs))
-        _using_str="ON TRUE"
-        _grouped_state_type="float8[]"
-        _groupby_str=""
+        _using_str = "ON TRUE"
+        _grouped_state_type = "float8[]"
+        _groupby_str = ""
         if not self.is_group_null:
             _groupby_str = "GROUP BY {grouping_col}, {col_grp_key}".format(
                 **self.kwargs)
@@ -308,7 +310,7 @@ class GroupIterationController:
             # We cannot allow NULL because array_to_string cannot handle it well.
             if not self.is_group_null:
                 null_test = (" OR ".join([g.strip() + " is NULL" for g in
-                             self.kwargs['grouping_col'].split(",")]))
+                                          self.kwargs['grouping_col'].split(",")]))
                 null_count = plpy.execute("""
                     SELECT count(*) FROM {rel_state} WHERE {null_test}
                     """.format(null_test=null_test,
@@ -335,7 +337,7 @@ class GroupIterationController:
 
     def are_last_state_value_zero(self):
         return self.new_states.are_last_state_value_zero()
-    
+
     def info(self):
         """ Logging intermediate state information """
         if not self.verbose:
@@ -378,10 +380,10 @@ class GroupIterationController:
             JOIN ( {select_n_tuples} ) AS _rel_n_tuples
             USING ({col_grp_key})
             """.format(
-                iteration=self.iteration,
-                select_rel_state=group_param.select_rel_state,
-                select_n_tuples=group_param.select_n_tuples,
-                **self.kwargs)
+            iteration=self.iteration,
+            select_rel_state=group_param.select_rel_state,
+            select_n_tuples=group_param.select_n_tuples,
+            **self.kwargs)
         insert_plan = plpy.prepare(insert_sql,
                                    ["text[]", group_param.grouped_state_type,
                                     "text[]", "bigint[]"])
@@ -503,14 +505,14 @@ class GroupIterationController:
             {using_str}
             {groupby_str}
             """.format(
-                newState=newState,
-                iteration=self.iteration,
-                using_str=group_param.using_str,
-                groupby_str=group_param.groupby_str,
-                _grp_key=group_param.grp_key,
-                select_rel_state=group_param.select_rel_state,
-                select_n_tuples=group_param.select_n_tuples,
-                **self.kwargs)
+            newState=newState,
+            iteration=self.iteration,
+            using_str=group_param.using_str,
+            groupby_str=group_param.groupby_str,
+            _grp_key=group_param.grp_key,
+            select_rel_state=group_param.select_rel_state,
+            select_n_tuples=group_param.select_n_tuples,
+            **self.kwargs)
         update_plan = plpy.prepare(run_sql,
                                    ["text[]", group_param.grouped_state_type,
                                     "text[]", "integer[]"])

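The comment rewrite above documents are_last_state_value_zero; the check it
describes reduces to a tolerance test on the last element of every group's
state array. A minimal sketch (the real tolerance lives on the class):

    def isclose_to_zero(value, tolerance=1e-8):
        # True when |value| is within `tolerance` of zero
        return abs(value) < tolerance

    def are_last_state_values_zero(states, tolerance=1e-8):
        # `states` maps group key -> state array; True only if the
        # LAST element of EVERY group's array is (almost) zero --
        # a single non-zero group makes the whole check False.
        return all(isclose_to_zero(s[-1], tolerance)
                   for s in states.itervalues())

    # are_last_state_values_zero({'g1': [3., 0.], 'g2': [1., 1e-12]})
    # -> True; a group ending in 0.5 would flip it to False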
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 6a02e3d..733bbdf 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -463,7 +463,9 @@ def preprocess_keyvalue_params(input_params, split_char='='):
 def extract_keyvalue_params(input_params,
                             input_param_types,
                             default_values=None,
-                            split_char='='):
+                            split_char='=',
+                            usage_str='',
+                            ignore_invalid=False):
     """ Extract key value pairs from input parameters or set the default values
 
     Args:
@@ -480,6 +482,8 @@ def extract_keyvalue_params(input_params,
         @param default_values: dict, Default values for each allowed parameter.
         @param split_char: str, The character used to split key and value.
                             Default set to '='
+        @param usage_str: str, An optional usage string to print with error message.
+        @param ignore_invalid: bool, If True, an invalid param input is ignored silently
 
     Returns:
         Dict. Dictionary of input parameter values with key as parameter name
@@ -499,17 +503,22 @@ def extract_keyvalue_params(input_params,
     for s in preprocess_keyvalue_params(input_params, split_char=split_char):
         items = split_quoted_delimited_str(s, delimiter=split_char)
         if (len(items) != 2):
-            raise KeyError("Input parameter list has incorrect format")
+            raise KeyError("Input parameter list has incorrect format \n"
+                           "{0}".format(usage_str))
 
         param_name = items[0].strip(" \"").lower()
         param_value = items[1].strip()
         if not param_name or param_name in ('none', 'null'):
-            plpy.error("Invalid input param name: {0} ".format(str(param_name)))
+            plpy.error("Invalid input param name: {0} \n"
+                       "{1}".format(param_name, usage_str))
         try:
             param_type = input_param_types[param_name]
         except KeyError:
-            raise KeyError("Invalid input: {0} is not a valid parameter".
-                           format(param_name))
+            if not ignore_invalid:
+                raise KeyError("Invalid input: {0} is not a valid parameter \n"
+                               "{1}".format(param_name, usage_str))
+            else:
+                continue
         try:
             if param_type in (int, str, float):
                 parameter_dict[param_name] = param_type(param_value)
@@ -522,11 +531,11 @@ def extract_keyvalue_params(input_params,
                 #  Raises ValueError if anything else.
                 parameter_dict[param_name] = bool(strtobool(param_value))
             else:
-                raise TypeError("Invalid input: {0} has unsupported type".
-                                format(param_name))
+                raise TypeError("Invalid input: {0} has unsupported type \n"
+                                "{1}".format(param_name, usage_str))
         except ValueError:
-            raise ValueError("Invalid input: {0} must be {1}".
-                             format(param_name, str(param_type)))
+            raise ValueError("Invalid input: {0} must be {1} \n"
+                             "{2}".format(param_name, param_type, usage_str))
     return parameter_dict
 # -------------------------------------------------------------------------
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/validation/internal/cross_validation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
index 7f8c64e..a79b45a 100644
--- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
@@ -22,6 +22,8 @@ from utilities.utilities import __mad_version
 from utilities.utilities import unique_string
 from utilities.utilities import num_samples
 
+from utilities.validate_args import get_cols_and_types
+
 from math import sqrt
 from collections import namedtuple
 from operator import itemgetter
@@ -134,7 +136,7 @@ class ValidationResult(object):
         The columns of tbl_name are mean, std and the leaf keys in sub_args.
         All column types are assumed to be double precision.
         """
-        if tbl_name == '' or tbl_name is None:
+        if not tbl_name or not str(tbl_name).strip():
             return
 
         cv_history_f = self._flatten()
@@ -171,6 +173,7 @@ class _ValidationArgs(object):
     @classmethod
     def grid(cls, sub_args):
         def comb_dict(dicts):
+            # same as dict((k, v) for d in dicts for k, v in d.iteritems())
             return dict(chain.from_iterable(d.iteritems() for d in dicts))
 
         def make_dicts(k, vs):
@@ -227,8 +230,7 @@ def _cv_copy_data(rel_origin, dependent_varname,
 
 def _cv_split_data(rel_source, col_data, col_id, row_num,
                    rel_train, rel_valid, n_folds, which_fold):
-    """
-    """
+
     col_string = _cv_col_string(rel_source, col_data, [col_id])
 
     (start_row, end_row) = _cv_validation_rows(row_num, n_folds, which_fold)
@@ -282,6 +284,7 @@ class CrossValidator(object):
     args : dict (recursive)
            Contains all the arguments to run estimator and the data to be used for validation:
 
+           The following names are assumed to be available in the args dict:
                 - source_table: the data table
                 - independent_varname: the column for features
                 - dependent_varname: the column for target
@@ -342,16 +345,13 @@ class CrossValidator(object):
         output_table = unique_string(desp='output_table')
         model_table = args['model_table']
 
-        predictor(schema_madlib, model_table,
-                  rel_valid, col_id, output_table)
+        predictor(schema_madlib, model_table, rel_valid, col_id, output_table)
 
         score = self._score(output_table, rel_valid, scorer)
-        plpy.execute("""
-                     DROP TABLE IF EXISTS {model_table}, {model_table}_summary;
-                     """.format(model_table=model_table))
-        plpy.execute("""
-                     DROP TABLE IF EXISTS {output_table};
-                     """.format(output_table=output_table))
+        plpy.execute("DROP TABLE IF EXISTS {model_table}, {model_table}_summary;".
+                     format(model_table=model_table))
+        plpy.execute("DROP TABLE IF EXISTS {output_table};".
+                     format(output_table=output_table))
         return score
 
     def _score(self, pred, orig, method):
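
The comb_dict helper above merges a sequence of single-key dicts; together
with a cross product it yields the parameter grid that _ValidationArgs.grid
walks. An illustrative reconstruction (make_grid is hypothetical; the real
classmethod also handles nested sub_args):

    from itertools import chain, product

    def comb_dict(dicts):
        # same as dict((k, v) for d in dicts for k, v in d.iteritems())
        return dict(chain.from_iterable(d.iteritems() for d in dicts))

    def make_grid(sub_args):
        # expand {param: [values]} into one dict per combination
        dict_lists = [[{k: v} for v in vs]
                      for k, vs in sub_args.iteritems()]
        return [comb_dict(combo) for combo in product(*dict_lists)]

    # make_grid({'lambda_value': [0.1, 1, 2], 'alpha': [0, 1]}) yields
    # the six {lambda_value, alpha} dicts swept by the install check.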


[2/3] incubator-madlib git commit: Elastic net: Add cross validation

Posted by ri...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
index 8c00376..c48beca 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
@@ -1,82 +1,70 @@
-
 import plpy
-from elastic_net_utils import __process_results
-from elastic_net_utils import __compute_log_likelihood
-from utilities.utilities import __mad_version
-from utilities.utilities import _array_to_string
-from utilities.utilities import __mad_version
-from utilities.control import MinWarning
+from elastic_net_utils import _process_results
+from elastic_net_utils import _compute_log_likelihood
 from utilities.validate_args import get_cols_and_types
 
-version_wrapper = __mad_version()
-mad_vec = version_wrapper.select_vecfunc()
 
-def __elastic_net_generate_result (optimizer, iteration_run, **args):
+def _elastic_net_generate_result(optimizer, iteration_run, **args):
     """
     Generate result table for all optimizers
     """
     standardize_flag = "True" if args["normalization"] else "False"
     source_table = args["rel_source"]
     if optimizer == "fista":
-        result_func = "__gaussian_fista_result({col_grp_state})".format(col_grp_state=args["col_grp_state"])
+        result_func = "__gaussian_fista_result({0})".format(args["col_grp_state"])
     elif optimizer == "igd":
         result_func = """__gaussian_igd_result({col_grp_state},
             '{sq_str}'::double precision[],
             {threshold}::double precision,
             {tolerance}::double precision
             )
-        """.format(col_grp_state=args["col_grp_state"], 
-            tolerance=args["warmup_tolerance"],
-            threshold=args["threshold"],
-            sq_str=args["sq_str"])
+        """.format(col_grp_state=args["col_grp_state"],
+                   tolerance=args["warmup_tolerance"],
+                   threshold=args["threshold"],
+                   sq_str=args["sq_str"])
     tbl_state = "{rel_state}".format(rel_state=args["rel_state"])
 
     grouping_column = args['grouping_col']
-    grouping_str1 = ""
     if grouping_column:
         col_grp_key = args['col_grp_key']
         grouping_str = args['grouping_str']
-        groupby_str = "GROUP BY {grouping_col}, {col_grp_key}".format(
-            grouping_col=grouping_column, col_grp_key=col_grp_key)
         cols_types = dict(get_cols_and_types(args["tbl_source"]))
         grouping_str1 = grouping_column + ","
-        select_grouping_info = ','.join([grp_col.strip()+"\t" + cols_types[grp_col.strip()]
-            for grp_col in grouping_column.split(',')]) + ","
-        using_str = "USING ({col_grp_key})".format(col_grp_key=col_grp_key)
+        select_grouping_info = ','.join([grp_col.strip() + "\t" + cols_types[grp_col.strip()]
+                                         for grp_col in grouping_column.split(',')]) + ","
         out_table_qstr = """
-        SELECT
-            {grouping_str1}
-            (result).coefficients AS coef,
-            (result).intercept AS intercept
-        FROM (
-            SELECT {schema_madlib}.{result_func} AS result,
-            {col_grp_key}
-            FROM {tbl_state}
-            WHERE {col_grp_iteration} = {iteration_run}
-        ) t
-        JOIN
-        (
             SELECT
                 {grouping_str1}
-                array_to_string(ARRAY[{grouping_str}],
-                                ','
-                               ) AS {col_grp_key}
-            FROM {source_table}
-                {groupby_str}
-        ) n_tuples_including_nulls_subq
-        {using_str}
-        """.format(result_func = result_func, tbl_state = tbl_state,
-            col_grp_iteration = args["col_grp_iteration"],
-            iteration_run = iteration_run,
-            groupby_str=groupby_str,
-            grouping_str1=grouping_str1,
-            grouping_str=grouping_str,
-            using_str=using_str,
-            col_grp_key=col_grp_key,
-            source_table=source_table,
-            schema_madlib = args["schema_madlib"])
+                (result).coefficients AS coef,
+                (result).intercept AS intercept
+            FROM
+                (
+                    SELECT {schema_madlib}.{result_func} AS result, {col_grp_key}
+                    FROM {tbl_state}
+                    WHERE {col_grp_iteration} = {iteration_run}
+                ) t
+                JOIN
+                (
+                    SELECT
+                        {grouping_str1}
+                        array_to_string(ARRAY[{grouping_str}], ',') AS {col_grp_key}
+                    FROM {source_table}
+                    GROUP BY {grouping_col}, {col_grp_key}
+                ) n_tuples_including_nulls_subq
+                USING ({col_grp_key})
+            """.format(result_func=result_func,
+                       tbl_state=tbl_state,
+                       grouping_col=grouping_column,
+                       col_grp_iteration=args["col_grp_iteration"],
+                       iteration_run=iteration_run,
+                       grouping_str1=grouping_str1,
+                       grouping_str=grouping_str,
+                       col_grp_key=col_grp_key,
+                       source_table=source_table,
+                       schema_madlib=args["schema_madlib"])
     else:
-        ## Its a much simpler query when there is no grouping.
+        # It's a much simpler query when there is no grouping.
+        grouping_str1 = ""
         select_grouping_info = ""
         out_table_qstr = """
             SELECT
@@ -87,12 +75,12 @@ def __elastic_net_generate_result (optimizer, iteration_run, **args):
                 FROM {tbl_state}
                 WHERE {col_grp_iteration} = {iteration_run}
             ) t
-        """.format(result_func = result_func, tbl_state = tbl_state,
-            col_grp_iteration = args["col_grp_iteration"],
-            iteration_run = iteration_run,
-            schema_madlib = args["schema_madlib"])
+        """.format(result_func=result_func, tbl_state=tbl_state,
+                   col_grp_iteration=args["col_grp_iteration"],
+                   iteration_run=iteration_run,
+                   schema_madlib=args["schema_madlib"])
 
-    ## Create the output table
+    # Create the output table
     plpy.execute("""
              DROP TABLE IF EXISTS {tbl_result};
              CREATE TABLE {tbl_result} (
@@ -111,68 +99,77 @@ def __elastic_net_generate_result (optimizer, iteration_run, **args):
     result = plpy.execute(out_table_qstr)
     for res in result:
         build_output_table(res, grouping_column, grouping_str1,
-            standardize_flag, iteration_run, **args)
+                           standardize_flag, iteration_run, **args)
 
-    ## Create summary table, listing the grouping columns used.
-    summary_table = args["tbl_summary"]
+    # Create summary table, listing the grouping columns used.
     grouping_text = "NULL" if not grouping_column else grouping_column
+    failed_groups = plpy.execute("""
+        SELECT count(*) AS num_failed_groups FROM {0} WHERE coef_all IS NULL
+        """.format(args['tbl_result']))[0]
+    all_groups = plpy.execute("SELECT count(*) AS num_all_groups FROM {0} ".
+                              format(args['tbl_result']))[0]
+    args.update(failed_groups)
+    args.update(all_groups)
     plpy.execute("""
-        DROP TABLE IF EXISTS {summary_table};
-        CREATE TABLE {summary_table} (
-            grouping_col    text
-        )
-        """.format(summary_table=summary_table))
-    plpy.execute("""
-            INSERT INTO {summary_table} VALUES
-            ('{grouping_text}')
-        """.format(summary_table=summary_table, grouping_text=grouping_text))
+        CREATE TABLE {tbl_summary} AS
+        SELECT
+            'elastic_net'::varchar              AS method,
+            '{tbl_source}'::varchar             AS source_table,
+            '{tbl_result}'::varchar             AS out_table,
+            $madlib_super_quote${col_dep_var}$madlib_super_quote$::varchar
+                                                AS dependent_varname,
+            $madlib_super_quote${col_ind_var}$madlib_super_quote$::varchar
+                                                AS independent_varname,
+            '{family}'::varchar                 AS family,
+            {alpha}::float                      AS alpha,
+            {lambda_value}::float               AS lambda_value,
+            '{grouping_text}'::varchar          AS grouping_col,
+            {num_all_groups}::integer           AS num_all_groups,
+            {num_failed_groups}::integer        AS num_failed_groups
+        """.format(grouping_text=grouping_text,
+                   **args))
     return None
 
 
-def build_output_table(res, grouping_column, grouping_str1, 
-    standardize_flag, iteration_run, **args):
+def build_output_table(res, grouping_column, grouping_str1,
+                       standardize_flag, iteration_run, **args):
     """
     Insert model captured in "res" into the output table
     """
-    r_coef = mad_vec(res["coef"], text = False)
-    if args["normalization"]:
-        (coef, intercept) = __restore_scale(r_coef, res["intercept"], args)
-    else:
-        coef = r_coef
-        intercept = res["intercept"]
+    r_coef = res["coef"]
+    if r_coef:
+        if args["normalization"]:
+            (coef, intercept) = _restore_scale(r_coef, res["intercept"], args)
+        else:
+            coef = r_coef
+            intercept = res["intercept"]
 
-    (features, features_selected, dense_coef, sparse_coef) = __process_results(
-        coef, intercept, args["outstr_array"])
+        (features, features_selected, dense_coef, sparse_coef) = _process_results(
+            coef, intercept, args["outstr_array"])
 
-    # compute the likelihood
-    if args["normalization"]:
-        coef_str = _array_to_string(r_coef) # use un-restored coef
-    else:
-        coef_str = sparse_coef
+        log_likelihood = _compute_log_likelihood(r_coef, res["intercept"], **args)
+        if grouping_column:
+            grouping_info = ','.join([str(res[grp_col.strip()])
+                                      for grp_col in grouping_str1.split(',')
+                                      if grp_col.strip() in res.keys()]) + ","
+        else:
+            grouping_info = ""
+        fquery = """
+            INSERT INTO {tbl_result} VALUES
+                ({grouping_info} '{family}', '{features}'::text[], '{features_selected}'::text[],
+                '{dense_coef}'::double precision[], '{sparse_coef}'::double precision[],
+                {intercept}, {log_likelihood}, {standardize_flag}, {iteration})
+            """.format(features=features, features_selected=features_selected,
+                       dense_coef=dense_coef, sparse_coef=sparse_coef,
+                       intercept=intercept, log_likelihood=log_likelihood,
+                       grouping_info=grouping_info,
+                       standardize_flag=standardize_flag, iteration=iteration_run,
+                       **args)
+        plpy.execute(fquery)
+# ------------------------------------------------------------------------
 
-    log_likelihood = __compute_log_likelihood(r_coef, res["intercept"], **args)
-    if grouping_column:
-        grouping_info = ','.join([str(res[grp_col.strip()]) 
-            for grp_col in grouping_str1.split(',') 
-            if grp_col.strip() in res.keys()]) + ","
-    else:
-        grouping_info = ""
-    fquery = """
-        INSERT INTO {tbl_result} VALUES
-            ({grouping_info} '{family}', '{features}'::text[], '{features_selected}'::text[],
-            '{dense_coef}'::double precision[], '{sparse_coef}'::double precision[],
-            {intercept}, {log_likelihood}, {standardize_flag}, {iteration})
-        """.format(
-            features = features, features_selected = features_selected,
-            dense_coef = dense_coef, sparse_coef = sparse_coef,
-            intercept = intercept, log_likelihood = log_likelihood,
-            grouping_info=grouping_info,
-            standardize_flag = standardize_flag, iteration = iteration_run,
-            **args)
-    plpy.execute(fquery)
-# ========================================================================
 
-def __restore_scale (coef, intercept, args):
+def _restore_scale(coef, intercept, args):
     """
     Restore the original scales
     """

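One small idiom in the summary-table code above is worth spelling out: the
two count queries each return a one-row dict, and merging them into `args`
lets a single str.format(**args) fill the CREATE TABLE statement. A toy
sketch of the same flow (table and column names are placeholders):

    args = {'tbl_source': 'houses', 'tbl_result': 'house_en'}
    # one-row results, as plpy.execute(...)[0] would return them
    args.update({'num_failed_groups': 2})   # ... WHERE coef_all IS NULL
    args.update({'num_all_groups': 10})
    template = ('-- {num_failed_groups} of {num_all_groups} groups '
                'failed for {tbl_result}')
    print(template.format(**args))
    # -- 2 of 10 groups failed for house_en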
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net_models.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_models.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_models.py_in
index d27e766..b7ea016 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_models.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_models.py_in
@@ -1,88 +1,91 @@
-
-## Try to make every function has a useful return value !
-## Try to avoid any changes to function arguments !
-
+from elastic_net_optimizer_fista import _elastic_net_fista_train
+from elastic_net_optimizer_igd import _elastic_net_igd_train
 import plpy
-from elastic_net_optimizer_fista import __elastic_net_fista_train
-from elastic_net_optimizer_igd import __elastic_net_igd_train
 
 # ========================================================================
 
-def __elastic_net_gaussian_fista_train(schema_madlib, tbl_source, col_ind_var,
-                                       col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                       normalization, optimizer_params, max_iter,
-                                       tolerance, outstr_array, grouping_str,
-                                       grouping_col, **kwargs):
+
+def _elastic_net_gaussian_fista_train(schema_madlib, tbl_source, col_ind_var,
+                                      col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
+                                      normalization, optimizer_params, max_iter,
+                                      tolerance, outstr_array, grouping_str,
+                                      grouping_col, **kwargs):
     """
     Use FISTA to solve linear models
-    """    
-    return __elastic_net_fista_train(schema_madlib,
-                                     "__gaussian_fista_step",
-                                     "__gaussian_fista_state_diff",
-                                     "gaussian",
-                                     tbl_source, col_ind_var,
-                                     col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                     normalization, optimizer_params, max_iter,
-                                     tolerance, outstr_array, grouping_str,
-                                     grouping_col, **kwargs)
+    """
+    return _elastic_net_fista_train(schema_madlib,
+                                    "__gaussian_fista_step",
+                                    "__gaussian_fista_state_diff",
+                                    "gaussian",
+                                    tbl_source, col_ind_var,
+                                    col_dep_var, tbl_result, tbl_summary,
+                                    lambda_value, alpha,
+                                    normalization, optimizer_params, max_iter,
+                                    tolerance, outstr_array, grouping_str,
+                                    grouping_col, **kwargs)
 
 # ========================================================================
 
-def __elastic_net_gaussian_igd_train(schema_madlib, tbl_source, col_ind_var,
-                                     col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                     normalization, optimizer_params, max_iter,
-                                     tolerance, outstr_array, grouping_str,
-                                     grouping_col, **kwargs):
+
+def _elastic_net_gaussian_igd_train(schema_madlib, tbl_source, col_ind_var,
+                                    col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
+                                    normalization, optimizer_params, max_iter,
+                                    tolerance, outstr_array, grouping_str,
+                                    grouping_col, **kwargs):
     """
     Use IGD to solve linear models
     """
-    return __elastic_net_igd_train(schema_madlib,
-                                   "__gaussian_igd_step",
-                                   "__gaussian_igd_state_diff",
-                                   "gaussian",
-                                   tbl_source, col_ind_var,
-                                   col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                   normalization, optimizer_params, max_iter,
-                                   tolerance, outstr_array, grouping_str,
-                                   grouping_col, **kwargs)
+    return _elastic_net_igd_train(schema_madlib,
+                                  "__gaussian_igd_step",
+                                  "__gaussian_igd_state_diff",
+                                  "gaussian",
+                                  tbl_source, col_ind_var,
+                                  col_dep_var, tbl_result, tbl_summary,
+                                  lambda_value, alpha,
+                                  normalization, optimizer_params, max_iter,
+                                  tolerance, outstr_array, grouping_str,
+                                  grouping_col, **kwargs)
 
 # ========================================================================
 
-def __elastic_net_binomial_fista_train(schema_madlib, tbl_source, col_ind_var,
-                                       col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                       normalization, optimizer_params, max_iter,
-                                       tolerance, outstr_array, grouping_str,
-                                       grouping_col, **kwargs):
+
+def _elastic_net_binomial_fista_train(schema_madlib, tbl_source, col_ind_var,
+                                      col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
+                                      normalization, optimizer_params, max_iter,
+                                      tolerance, outstr_array, grouping_str,
+                                      grouping_col, **kwargs):
     """
     Use FISTA to solve binomial (logistic) models
-    """    
-    return __elastic_net_fista_train(schema_madlib,
-                                     "__binomial_fista_step",
-                                     "__binomial_fista_state_diff",
-                                     "binomial",
-                                     tbl_source, col_ind_var,
-                                     col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                     normalization, optimizer_params, max_iter,
-                                     tolerance, outstr_array, grouping_str,
-                                     grouping_col, **kwargs)
+    """
+    return _elastic_net_fista_train(schema_madlib,
+                                    "__binomial_fista_step",
+                                    "__binomial_fista_state_diff",
+                                    "binomial",
+                                    tbl_source, col_ind_var,
+                                    col_dep_var, tbl_result, tbl_summary,
+                                    lambda_value, alpha,
+                                    normalization, optimizer_params, max_iter,
+                                    tolerance, outstr_array, grouping_str,
+                                    grouping_col, **kwargs)
 
 # ========================================================================
 
-def __elastic_net_binomial_igd_train(schema_madlib, tbl_source, col_ind_var,
-                                     col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                     normalization, optimizer_params, max_iter,
-                                     tolerance, outstr_array, grouping_str,
-                                     grouping_col, **kwargs):
+
+def _elastic_net_binomial_igd_train(schema_madlib, tbl_source, col_ind_var,
+                                    col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
+                                    normalization, optimizer_params, max_iter,
+                                    tolerance, outstr_array, grouping_str,
+                                    grouping_col, **kwargs):
     """
     Use IGD to solve binomial (logistic) models
     """
-    return __elastic_net_igd_train(schema_madlib,
-                                   "__binomial_igd_step",
-                                   "__binomial_igd_state_diff",
-                                   "binomial",
-                                   tbl_source, col_ind_var,
-                                   col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                   normalization, optimizer_params, max_iter,
-                                   tolerance, outstr_array, grouping_str,
-                                   grouping_col, **kwargs)
-    
+    return _elastic_net_igd_train(schema_madlib,
+                                  "__binomial_igd_step",
+                                  "__binomial_igd_state_diff",
+                                  "binomial",
+                                  tbl_source, col_ind_var,
+                                  col_dep_var, tbl_result, tbl_summary,
+                                  lambda_value, alpha,
+                                  normalization, optimizer_params, max_iter,
+                                  tolerance, outstr_array, grouping_str,
+                                  grouping_col, **kwargs)

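The four renamed wrappers above differ only in the (family, optimizer) pair
they forward, so they amount to a small dispatch table. A hypothetical
consolidation, shown only to make the structure explicit (the module keeps
the four explicit wrappers):

    DISPATCH = {
        ('gaussian', 'fista'): ('__gaussian_fista_step',
                                '__gaussian_fista_state_diff'),
        ('gaussian', 'igd'):   ('__gaussian_igd_step',
                                '__gaussian_igd_state_diff'),
        ('binomial', 'fista'): ('__binomial_fista_step',
                                '__binomial_fista_state_diff'),
        ('binomial', 'igd'):   ('__binomial_igd_step',
                                '__binomial_igd_state_diff'),
    }

    def train(schema_madlib, family, optimizer, *args, **kwargs):
        # pick the step/state-diff pair and the matching trainer
        step_func, diff_func = DISPATCH[(family, optimizer)]
        trainer = (_elastic_net_fista_train if optimizer == 'fista'
                   else _elastic_net_igd_train)
        return trainer(schema_madlib, step_func, diff_func, family,
                       *args, **kwargs)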
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
index 6cc1eab..8ac6819 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
@@ -1,145 +1,53 @@
-
-## Try to make every function has a useful return value !
-## Try to avoid any changes to function arguments !
-
 import plpy
-import math
-from elastic_net_utils import __normalize_data
-from elastic_net_utils import __compute_data_scales
-from elastic_net_utils import __compute_means
-from elastic_net_utils import __tbl_dimension_rownum
-from utilities.utilities import unique_string
-from utilities.in_mem_group_control import GroupIterationController
+from elastic_net_generate_result import _elastic_net_generate_result
+from elastic_net_utils import _normalize_data
+from elastic_net_utils import _tbl_dimension_rownum
+from elastic_net_utils import _elastic_net_validate_args
+from elastic_net_utils import _compute_average_sq
+from elastic_net_utils import _generate_warmup_lambda_sequence
+from elastic_net_utils import _process_warmup_lambdas
 from utilities.control import MinWarning
-from elastic_net_utils import __elastic_net_validate_args
-from utilities.utilities import _array_to_string
-from elastic_net_utils import __compute_average_sq
-from elastic_net_utils import __generate_warmup_lambda_sequence
-from elastic_net_utils import __process_warmup_lambdas
-from elastic_net_generate_result import __elastic_net_generate_result
-from utilities.utilities import __mad_version
-from utilities.utilities import preprocess_keyvalue_params
+from utilities.in_mem_group_control import GroupIterationController
+from utilities.utilities import unique_string
+from utilities.utilities import extract_keyvalue_params
 
-version_wrapper = __mad_version()
-mad_vec = version_wrapper.select_vecfunc()
 
-## ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __fista_params_parser(optimizer_params, lambda_value, tolerance, schema_madlib):
+def _fista_params_parser(optimizer_params, lambda_value, tolerance, schema_madlib):
     """
     Parse fista parameters.
     """
-    allowed_params = set(["max_stepsize", "eta", "warmup", "warmup_lambdas",
-                          "warmup_lambda_no", "use_active_set", "random_stepsize",
-                          "activeset_tolerance", "warmup_tolerance"])
-    name_value = dict()
     # default values
-    name_value["max_stepsize"] = 2.
-    name_value["use_active_set"] = 0  # use of active set
-    name_value["eta"] = 1.2
-    name_value["warmup"] = False
-    name_value["warmup_lambdas"] = None
-    name_value["warmup_lambda_no"] = 15
-    name_value["random_stepsize"] = 0  # use random stepsize
-    name_value["activeset_tolerance"] = tolerance
-    name_value["warmup_tolerance"] = tolerance
-
-    warmup_lambdas = None
-    warmup_lambda_no = None
-
-    if optimizer_params is None or len(optimizer_params) == 0:
-        return name_value
-
-    for s in preprocess_keyvalue_params(optimizer_params):
-        items = s.split("=")
-        if (len(items) != 2):
-            plpy.error("Elastic Net error: Optimizer parameter list "
-                       "has incorrect format!")
-        param_name = items[0].strip(" \"").lower()
-        param_value = items[1].strip(" \"").lower()
-
-        if param_name not in allowed_params:
-            plpy.error(
-                """
-                Elastic Net error: {param_name} is not a valid parameter name for the FISTA optimizer.
-                Run:
-
-                SELECT {schema_madlib}.elastic_net_train('fista');
-
-                to see the parameters for FISTA algorithm.
-                """.format(param_name=param_name,
-                           schema_madlib=schema_madlib))
-
-        if param_name == "activeset_tolerance":
-            try:
-                name_value["activeset_tolerance"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: activeset_tolerance must be a "
-                           "float number!")
-
-        if param_name == "warmup_tolerance":
-            try:
-                name_value["warmup_tolerance"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: warmup_tolerance must be a "
-                           "float number!")
-
-        if param_name == "max_stepsize":
-            try:
-                name_value["max_stepsize"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: max_stepsize must be a "
-                           "float number!")
-
-        if param_name == "eta":
-            try:
-                name_value["eta"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: eta must be a float number!")
-
-        if param_name == "random_stepsize":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["random_stepsize"] = 1
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["random_stepsize"] = 0
-            else:
-                plpy.error("Elastic Net error: Do you need to add some "
-                           "randomness to step size (True/False or yes/no) ?")
-
-        if param_name == "warmup":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["warmup"] = True
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["warmup"] = False
-            else:
-                plpy.error("Elastic Net error: Do you need warmup "
-                           "(True/False or yes/no) ?")
-
-        if param_name == "warmup_lambdas" and param_value != "null":
-            warmup_lambdas = param_value
-
-        if param_name == "warmup_lambda_no":
-            warmup_lambda_no = param_value
-
-        if param_name == "use_active_set":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["use_active_set"] = 1
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["use_active_set"] = 0
-            else:
-                plpy.error("Elastic Net error: Do you need warmup "
-                           "(True/False or yes/no) ?")
-
-    if name_value["warmup"]:
-        if warmup_lambdas is not None:
-            # errors are handled in __process_warmup_lambdas
-            name_value["warmup_lambdas"] = __process_warmup_lambdas(warmup_lambdas, lambda_value)
-        if warmup_lambda_no is not None:
-            try:
-                name_value["warmup_lambda_no"] = int(warmup_lambda_no)
-            except:
-                plpy.error("Elastic Net error: warmup_lambda_no must be an integer!")
+    defaults_and_types = {
+        "max_stepsize": (2., float),
+        "eta": (1.2, float),
+        "warmup": (False, bool),
+        "warmup_lambdas": (None, list),
+        "warmup_lambda_no": (15, int),
+        "use_active_set": (False, bool),
+        "random_stepsize": (False, bool),
+        "activeset_tolerance": (tolerance, float),
+        "warmup_tolerance": (tolerance, float)
+    }
+    param_defaults = dict([(k, v[0]) for k, v in defaults_and_types.items()])
+    param_types = dict([(k, v[1]) for k, v in defaults_and_types.items()])
+
+    if not optimizer_params:
+        return param_defaults
+
+    usage_str = ("Run:\n"
+                 "   SELECT {0}.elastic_net_train('fista');\n"
+                 "   to see the parameters for FISTA algorithm.".
+                 format(schema_madlib))
+    name_value = extract_keyvalue_params(optimizer_params, param_types,
+                                         param_defaults, usage_str=usage_str,
+                                         ignore_invalid=True)
+
+    if name_value["warmup"] and name_value['warmup_lambdas'] is not None:
+        # errors are handled in _process_warmup_lambdas
+        name_value['warmup_lambdas'] = _process_warmup_lambdas(name_value['warmup_lambdas'], lambda_value)
 
     # validate the parameters
     if name_value["max_stepsize"] <= 0:
@@ -162,9 +70,10 @@ def __fista_params_parser(optimizer_params, lambda_value, tolerance, schema_madl
         plpy.error("Elastic Net error: warmup_tolerance must be positive!")
 
     return name_value
-## ========================================================================
+# -------------------------------------------------------------------------
 
-def __fista_construct_dict(
+
+def _fista_construct_dict(
         schema_madlib, family, tbl_source, col_ind_var, col_dep_var,
         tbl_result, dimension, row_num, lambda_value, alpha,
         normalization, max_iter, tolerance, outstr_array, optimizer_params_dict):
@@ -193,8 +102,8 @@ def __fista_construct_dict(
     # Table names useful when normalizing the original data
     # Note: in order to be consistent with the calling convention
     # of the normalization functions, multiple elements of the dict
-    # actually have the same value. This is a price that one has to pay
-    # if he wants to save typing argument names by using **args as the
+    # actually have the same value. This is a price one has to pay
+    # to save typing argument names by using **args as the
     # function argument.
     tbl_ind_scales = unique_string(desp='temp_ind_scales')
     tbl_dep_scale = unique_string(desp='temp_dep_scales')
@@ -204,25 +113,18 @@ def __fista_construct_dict(
                 tbl_data_scaled=tbl_data_scaled)
 
     # Table names used in IGD iterations
-    args.update(tbl_fista_state=unique_string(),
-                tbl_fista_args=unique_string())
+    args.update(tbl_fista_state=unique_string(), tbl_fista_args=unique_string())
 
     # more, for args table
-    args["dimension_name"] = unique_string()
-    args["lambda_name"] = unique_string()
-    args["alpha_name"] = unique_string()
-    args["total_rows_name"] = unique_string()
-    args["max_iter_name"] = unique_string()
-    args["tolerance_name"] = unique_string()
-    args["max_stepsize_name"] = unique_string()
-    args["eta_name"] = unique_string()
-    args["activeset_name"] = unique_string()
-
+    for name in ('dimension_name', 'lambda_name', 'alpha_name',
+                 'total_rows_name', 'max_iter_name', 'tolerance_name',
+                 'max_stepsize_name', 'eta_name', 'activeset_name'):
+        args[name] = unique_string()
     return args
-## ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __fista_cleanup_temp_tbls(**args):
+def _fista_cleanup_temp_tbls(**kwargs):
     """
     Drop all temporary tables used by FISTA optimizer,
     including tables used in the possible normalization
@@ -234,48 +136,49 @@ def __fista_cleanup_temp_tbls(**args):
                 drop table if exists {tbl_data_scaled};
                 drop table if exists {tbl_fista_args};
                 drop table if exists pg_temp.{tbl_fista_state};
-                """.format(**args))
+                """.format(**kwargs))
 
     return None
-## ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __elastic_net_fista_train(schema_madlib, func_step_aggregate,
-                              func_state_diff, family,
-                              tbl_source, col_ind_var,
-                              col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                              normalization, optimizer_params, max_iter,
-                              tolerance, outstr_array, grouping_str,
-                              grouping_col, **kwargs):
+def _elastic_net_fista_train(schema_madlib, func_step_aggregate,
+                             func_state_diff, family,
+                             tbl_source, col_ind_var,
+                             col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
+                             normalization, optimizer_params, max_iter,
+                             tolerance, outstr_array, grouping_str,
+                             grouping_col, **kwargs):
     """
     func_step_aggregate is string, and it is the name of the step function
     """
-    __elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
-                                tbl_result, tbl_summary, lambda_value, alpha,
-                                normalization, max_iter, tolerance)
-
-    return __elastic_net_fista_train_compute(schema_madlib,
-                                             func_step_aggregate,
-                                             func_state_diff,
-                                             family,
-                                             tbl_source, col_ind_var,
-                                             col_dep_var, tbl_result,
-                                             tbl_summary, lambda_value, alpha,
-                                             normalization,
-                                             optimizer_params, max_iter,
-                                             tolerance, outstr_array,
-                                             grouping_str, grouping_col,
-                                             **kwargs)
-## ========================================================================
-
-
-def __elastic_net_fista_train_compute(schema_madlib, func_step_aggregate,
-                                      func_state_diff, family,
-                                      tbl_source, col_ind_var,
-                                      col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                      normalization, optimizer_params, max_iter,
-                                      tolerance, outstr_array, grouping_str,
-                                      grouping_col, **kwargs):
+    _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
+                               tbl_result, tbl_summary, lambda_value, alpha,
+                               normalization, max_iter, tolerance)
+    return _elastic_net_fista_train_compute(schema_madlib,
+                                            func_step_aggregate,
+                                            func_state_diff,
+                                            family,
+                                            tbl_source,
+                                            col_ind_var, col_dep_var,
+                                            tbl_result, tbl_summary,
+                                            lambda_value, alpha,
+                                            normalization,
+                                            optimizer_params, max_iter,
+                                            tolerance, outstr_array,
+                                            grouping_str, grouping_col,
+                                            **kwargs)
+# ------------------------------------------------------------------------
+
+
+def _elastic_net_fista_train_compute(schema_madlib, func_step_aggregate,
+                                     func_state_diff, family,
+                                     tbl_source, col_ind_var,
+                                     col_dep_var, tbl_result, tbl_summary,
+                                     lambda_value, alpha,
+                                     normalization, optimizer_params, max_iter,
+                                     tolerance, outstr_array, grouping_str,
+                                     grouping_col, **kwargs):
     """
     Fit linear model with elastic net regularization using FISTA optimization.
 
@@ -292,124 +195,119 @@ def __elastic_net_fista_train_compute(schema_madlib, func_step_aggregate,
     @param optimizer_params  Parameters of the above optimizer, the format
                              is '{arg = value, ...}'::varchar[]
     """
-    old_msg_level = plpy.execute("""
-                                 select setting from pg_settings
-                                 where name='client_min_messages'
-                                 """)[0]['setting']
-    plpy.execute("set client_min_messages to error")
-
-    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)
-
-    # generate a full dict to ease the following string format
-    # including several temporary table names
-    args = __fista_construct_dict(schema_madlib, family, tbl_source, col_ind_var,
-                                  col_dep_var, tbl_result,
-                                  dimension, row_num, lambda_value,
-                                  alpha, normalization,
-                                  max_iter, tolerance, outstr_array,
-                                  __fista_params_parser(optimizer_params,
-                                                        lambda_value,
-                                                        tolerance,
-                                                        schema_madlib))
-
-    args.update({'grouping_col': grouping_col})
-    # use normalized data or not
-    if normalization:
-        __normalize_data(args)
-        tbl_used = args["tbl_data_scaled"]
-        args["col_ind_var_new"] = args["col_ind_var_norm_new"]
-        args["col_dep_var_new"] = args["col_dep_var_norm_new"]
-    else:
-        #####__compute_data_scales(args)
-        tbl_used = tbl_source
-        args["col_ind_var_new"] = col_ind_var
-        args["col_dep_var_new"] = col_dep_var
-
-    args["tbl_used"] = tbl_used
-
-    if args["warmup_lambdas"] is not None:
-        args["warm_no"] = len(args["warmup_lambdas"])
-        args["warmup_lambdas"] = args["warmup_lambdas"]
-
-    if args["warmup"] and args["warmup_lambdas"] is None:
-        # average squares of each feature
-        # used to estimate the largest lambda value
-        args["sq"] = __compute_average_sq(**args)
-        args["warmup_lambdas"] = \
-            __generate_warmup_lambda_sequence(
-                tbl_used, args["col_ind_var_new"], args["col_dep_var_new"],
-                dimension, row_num, lambda_value, alpha,
-                args["warmup_lambda_no"], args["sq"])
-        args["warm_no"] = len(args["warmup_lambdas"])
-        args["warmup_lambdas"] = args["warmup_lambdas"]
-    elif args["warmup"] is False:
-        args["warm_no"] = 1
-        args["warmup_lambdas"] = [lambda_value]  # only one value
-
-    ## This update is needed in __elastic_net_generate_result() after
-    ## __compute_fista is run. Some of these variables are accessed there.
-    args.update({
-        'rel_state': args["tbl_fista_state"],
-        'col_grp_iteration': unique_string(desp='col_grp_iteration'),
-        'col_grp_state': unique_string(desp='col_grp_state'),
-        'col_grp_key': unique_string(desp='col_grp_key'),
-        'col_n_tuples': unique_string(desp='col_n_tuples'),
-        'lambda_count': 1,
-        'is_active': 0,
-        'state_type': "double precision[]",
-        'rel_source': tbl_used,
-        'grouping_str': grouping_str,
-        'tbl_source': tbl_source,
-        'tbl_summary': tbl_summary
+    with MinWarning('error'):
+        (dimension, row_num) = _tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)
+
+        # generate a full dict to ease the following string format
+        # including several temporary table names
+        args = _fista_construct_dict(schema_madlib, family, tbl_source, col_ind_var,
+                                     col_dep_var, tbl_result,
+                                     dimension, row_num, lambda_value,
+                                     alpha, normalization,
+                                     max_iter, tolerance, outstr_array,
+                                     _fista_params_parser(optimizer_params,
+                                                          lambda_value,
+                                                          tolerance,
+                                                          schema_madlib))
+
+        args.update({'grouping_col': grouping_col})
+        # use normalized data or not
+        if normalization:
+            _normalize_data(args)
+            tbl_used = args["tbl_data_scaled"]
+            args["col_ind_var_new"] = args["col_ind_var_norm_new"]
+            args["col_dep_var_new"] = args["col_dep_var_norm_new"]
+        else:
+            tbl_used = tbl_source
+            args["col_ind_var_new"] = col_ind_var
+            args["col_dep_var_new"] = col_dep_var
+
+        args["tbl_used"] = tbl_used
+
+        if args["warmup_lambdas"] is not None:
+            args["warm_no"] = len(args["warmup_lambdas"])
+            args["warmup_lambdas"] = args["warmup_lambdas"]
+
+        if args["warmup"] and args["warmup_lambdas"] is None:
+            # average squares of each feature
+            # used to estimate the largest lambda value
+            args["sq"] = _compute_average_sq(**args)
+            args["warmup_lambdas"] = \
+                _generate_warmup_lambda_sequence(
+                    tbl_used, args["col_ind_var_new"], args["col_dep_var_new"],
+                    dimension, row_num, lambda_value, alpha,
+                    args["warmup_lambda_no"], args["sq"])
+            args["warm_no"] = len(args["warmup_lambdas"])
+            args["warmup_lambdas"] = args["warmup_lambdas"]
+        elif args["warmup"] is False:
+            args["warm_no"] = 1
+            args["warmup_lambdas"] = [lambda_value]  # only one value
+
+        # This update is needed in _elastic_net_generate_result() after
+        # _compute_fista is run. Some of these variables are accessed there.
+        args.update({
+            'rel_state': args["tbl_fista_state"],
+            'col_grp_iteration': unique_string(desp='col_grp_iteration'),
+            'col_grp_state': unique_string(desp='col_grp_state'),
+            'col_grp_key': unique_string(desp='col_grp_key'),
+            'col_n_tuples': unique_string(desp='col_n_tuples'),
+            'lambda_count': 1,
+            'is_active': 0,
+            'state_type': "double precision[]",
+            'rel_source': tbl_used,
+            'grouping_str': grouping_str,
+            'tbl_source': tbl_source,
+            'tbl_summary': tbl_summary
         })
 
-    # perform the actual calculation
-    iteration_run = __compute_fista(
-        schema_madlib, func_step_aggregate,
-        func_state_diff,
-        args["tbl_fista_args"],
-        args["tbl_fista_state"],
-        tbl_used,
-        args["col_ind_var_new"],
-        args["col_dep_var_new"],
-        grouping_str,
-        grouping_col,
-        tolerance,
-        start_iter=0,
-        lambda_name=args["warmup_lambdas"],
-        warmup_lambda_value = args.get('warmup_lambdas')[args["lambda_count"]-1],
-        activeset_tolerance=args["activeset_tolerance"],
-        warmup_tolerance=args["warmup_tolerance"],
-        max_iter=args["max_iter"],
-        warm_no=args["warm_no"],
-        random_stepsize=args["random_stepsize"],
-        use_active_set=args["use_active_set"],
-        alpha=args["alpha"],
-        row_num=args["row_num"],
-        dimension=args["dimension"],
-        max_stepsize=args["max_stepsize"],
-        eta=args["eta"],
-        rel_state= args["tbl_fista_state"],
-        col_grp_iteration= args["col_grp_iteration"],
-        col_grp_state= args["col_grp_state"],
-        col_grp_key= args["col_grp_key"],
-        col_n_tuples= args["col_n_tuples"],
-        lambda_count= args["lambda_count"],
-        is_active= args["is_active"],
-        state_type= args["state_type"],
-        rel_source= args["rel_source"])
-
-    __elastic_net_generate_result("fista", iteration_run, **args)
-
-    # cleanup
-    __fista_cleanup_temp_tbls(**args)
-    plpy.execute("set client_min_messages to " + old_msg_level)
+        # perform the actual calculation
+        iteration_run = _compute_fista(
+            schema_madlib, func_step_aggregate,
+            func_state_diff,
+            args["tbl_fista_args"],
+            args["tbl_fista_state"],
+            tbl_used,
+            args["col_ind_var_new"],
+            args["col_dep_var_new"],
+            grouping_str,
+            grouping_col,
+            tolerance,
+            start_iter=0,
+            lambda_name=args["warmup_lambdas"],
+            warmup_lambda_value=args.get('warmup_lambdas')[args["lambda_count"] - 1],
+            activeset_tolerance=args["activeset_tolerance"],
+            warmup_tolerance=args["warmup_tolerance"],
+            max_iter=args["max_iter"],
+            warm_no=args["warm_no"],
+            random_stepsize=args["random_stepsize"],
+            use_active_set=args["use_active_set"],
+            alpha=args["alpha"],
+            row_num=args["row_num"],
+            dimension=args["dimension"],
+            max_stepsize=args["max_stepsize"],
+            eta=args["eta"],
+            rel_state=args["tbl_fista_state"],
+            col_grp_iteration=args["col_grp_iteration"],
+            col_grp_state=args["col_grp_state"],
+            col_grp_key=args["col_grp_key"],
+            col_n_tuples=args["col_n_tuples"],
+            lambda_count=args["lambda_count"],
+            is_active=args["is_active"],
+            state_type=args["state_type"],
+            rel_source=args["rel_source"])
+
+        _elastic_net_generate_result("fista", iteration_run, **args)
+
+        # cleanup
+        _fista_cleanup_temp_tbls(**args)
     return None
-## ========================================================================
+# ------------------------------------------------------------------------
 
-def __compute_fista(schema_madlib, func_step_aggregate, func_state_diff,
-                    tbl_args, tbl_state, tbl_source, col_ind_var, 
-                    col_dep_var, grouping_str, grouping_col, tolerance, start_iter, **kwargs):
+
+def _compute_fista(schema_madlib, func_step_aggregate, func_state_diff,
+                   tbl_args, tbl_state, tbl_source, col_ind_var,
+                   col_dep_var, grouping_str, grouping_col, tolerance,
+                   start_iter, **kwargs):
     """
     Driver function for elastic net using FISTA
 
@@ -430,7 +328,7 @@ def __compute_fista(schema_madlib, func_step_aggregate, func_state_diff,
         result in \c tbl_state
     """
     args = locals()
-    
+
     for k, v in kwargs.iteritems():
         if k not in args:
             args.update({k: v})
@@ -441,23 +339,24 @@ def __compute_fista(schema_madlib, func_step_aggregate, func_state_diff,
             # manually add the intercept term
             if (it.kwargs["lambda_count"] > len(args.get('lambda_name'))):
                 break
-            it.kwargs["warmup_lambda_value"] = args.get('lambda_name')[it.kwargs["lambda_count"]-1]
+            it.kwargs["warmup_lambda_value"] = args.get('lambda_name')[it.kwargs["lambda_count"] - 1]
             it.update("""
-                  {schema_madlib}.{func_step_aggregate}(
-                      ({col_ind_var})::double precision[],
-                      ({col_dep_var}),
-                      {rel_state}.{col_grp_state},
-                      ({warmup_lambda_value})::double precision,
-                      ({alpha})::double precision,
-                      ({dimension})::integer,
-                      ({row_num})::integer,
-                      ({max_stepsize})::double precision,
-                      ({eta})::double precision,
-                      ({use_active_set})::integer,
-                      {is_active}::integer,
-                      {random_stepsize}::integer
-                  )
-            """)
+                    {schema_madlib}.{func_step_aggregate}(
+                        ({col_ind_var})::double precision[],
+                        ({col_dep_var}),
+                        {rel_state}.{col_grp_state},
+                        ({warmup_lambda_value})::double precision,
+                        ({alpha})::double precision,
+                        ({dimension})::integer,
+                        ({row_num})::integer,
+                        ({max_stepsize})::double precision,
+                        ({eta})::double precision,
+                        ({use_active_set})::integer,
+                        {is_active}::integer,
+                        {random_stepsize}::integer
+                    )
+                    """)
+
             if it.kwargs["is_active"] == 1:
                 it.kwargs["use_tolerance"] = it.kwargs["activeset_tolerance"]
             elif it.kwargs["lambda_count"] < it.kwargs["warm_no"]:
@@ -465,19 +364,21 @@ def __compute_fista(schema_madlib, func_step_aggregate, func_state_diff,
             else:
                 it.kwargs["use_tolerance"] = args["tolerance"]
 
-            if it.kwargs["use_active_set"] == 1:
+            if it.kwargs["use_active_set"]:
                 is_backtracking = it.are_last_state_value_zero()
-                if it.test(
-                    """
-                    {iteration} >= {max_iter}
-                    or
-                    {schema_madlib}.{func_state_diff}(
-                        _state_previous, _state_current) < {use_tolerance}
-                    """):
+                if it.test("""
+                        {iteration} >= {max_iter}
+                        or
+                        {schema_madlib}.{func_state_diff}(
+                            _state_previous, _state_current) < {use_tolerance}
+                        """):
                     if it.iteration < it.kwargs["max_iter"]:
                         if it.kwargs["is_active"] == 0:
                             if (it.kwargs["lambda_count"] < it.kwargs["warm_no"]):
                                 it.kwargs["lambda_count"] += 1
+                                if (len(args.get('lambda_name')) <
+                                        it.kwargs["lambda_count"]):
+                                    break
                             else:
                                 break
                         else:
@@ -490,12 +391,12 @@ def __compute_fista(schema_madlib, func_step_aggregate, func_state_diff,
                         it.kwargs["is_active"] = 1
             else:
                 if it.test("""
-                    {iteration} >= {max_iter} or
-                    {schema_madlib}.{func_state_diff}(
-                        _state_previous, _state_current) < {tolerance}
-                    """):
+                        {iteration} >= {max_iter} or
+                        {schema_madlib}.{func_state_diff}(
+                            _state_previous, _state_current) < {tolerance}
+                        """):
                     if (it.iteration < it.kwargs["max_iter"] and
-                        it.kwargs["lambda_count"] < it.kwargs["warm_no"]):
+                            it.kwargs["lambda_count"] < it.kwargs["warm_no"]):
                         it.kwargs["lambda_count"] += 1
                     else:
                         break

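The driver above runs FISTA through SQL aggregates, but the underlying
iteration is ordinary accelerated proximal gradient descent. Below is a
minimal numpy sketch of that iteration for the gaussian elastic net
objective (1/(2n))*||y - Xw||^2 + lambda*(alpha*||w||_1 +
(1-alpha)/2*||w||^2). It is an illustration under assumed conventions,
not the module's implementation; in particular, the module backtracks
the step size with `eta`, which is omitted here in favor of a fixed
step of 1/L:

    import numpy as np

    def soft_threshold(v, thresh):
        # proximal operator of the L1 term
        return np.sign(v) * np.maximum(np.abs(v) - thresh, 0.0)

    def fista(X, y, lam, alpha, max_iter=100, tol=1e-6):
        n, d = X.shape
        w = w_prev = np.zeros(d)
        t = 1.0
        # fixed step = 1/L, where L is the Lipschitz constant of the
        # smooth part (squared loss plus ridge penalty)
        step = 1.0 / (np.linalg.norm(X, 2) ** 2 / n + lam * (1.0 - alpha))
        for _ in range(max_iter):
            t_next = (1.0 + np.sqrt(1.0 + 4.0 * t * t)) / 2.0
            # Nesterov momentum: extrapolate from the last two iterates
            z = w + ((t - 1.0) / t_next) * (w - w_prev)
            # gradient of the smooth part at the extrapolated point
            grad = X.T.dot(X.dot(z) - y) / n + lam * (1.0 - alpha) * z
            w_prev, w = w, soft_threshold(z - step * grad, step * lam * alpha)
            t = t_next
            if np.linalg.norm(w - w_prev) < tol:
                break
        return w
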
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
index 187cc83..f652ba0 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
@@ -1,126 +1,56 @@
 
 import plpy
-import math
 from utilities.utilities import unique_string
 from utilities.in_mem_group_control import GroupIterationController
-from elastic_net_utils import __compute_means
-from elastic_net_utils import __normalize_data
-from elastic_net_utils import __compute_data_scales
-from elastic_net_utils import __tbl_dimension_rownum
-from elastic_net_utils import __elastic_net_validate_args
+from elastic_net_utils import _compute_means
+from elastic_net_utils import _normalize_data
+from elastic_net_utils import _compute_data_scales
+from elastic_net_utils import _tbl_dimension_rownum
+from elastic_net_utils import _elastic_net_validate_args
 from utilities.utilities import _array_to_string
-from elastic_net_utils import __compute_average_sq
-from elastic_net_utils import __generate_warmup_lambda_sequence
-from elastic_net_utils import __process_warmup_lambdas
-from elastic_net_generate_result import __elastic_net_generate_result
-from utilities.utilities import __mad_version
-from utilities.utilities import preprocess_keyvalue_params
+from elastic_net_utils import _compute_average_sq
+from elastic_net_utils import _generate_warmup_lambda_sequence
+from elastic_net_utils import _process_warmup_lambdas
+from elastic_net_generate_result import _elastic_net_generate_result
+from utilities.utilities import extract_keyvalue_params
 from utilities.control import MinWarning
 
-version_wrapper = __mad_version()
-mad_vec = version_wrapper.select_vecfunc()
-## ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __igd_params_parser(optimizer_params, lambda_value, tolerance, schema_madlib):
+def _igd_params_parser(optimizer_params, lambda_value, tolerance, schema_madlib):
     """
     Parse IGD parameters.
     """
-    allowed_params = set(["stepsize", "warmup", "warmup_lambdas",
-                          "warmup_lambda_no",
-                          "threshold", "parallel", "warmup_tolerance",
-                          "step_decay"])
-    name_value = dict()
     # default values
-    name_value["parallel"] = True
-    name_value["stepsize"] = 0.01
-    name_value["warmup"] = False
-    name_value["warmup_lambdas"] = None
-    name_value["warmup_lambda_no"] = 15
-    name_value["threshold"] = 1e-10
-    name_value["warmup_tolerance"] = tolerance
-    name_value["step_decay"] = 0
-
-    warmup_lambdas = None
-    warmup_lambda_no = None
-
-    if optimizer_params is None or len(optimizer_params) == 0:
-        return name_value
-
-    for s in preprocess_keyvalue_params(optimizer_params):
-        items = s.split("=")
-        if (len(items) != 2):
-            plpy.error("Elastic Net error: Optimizer parameter list has incorrect format!")
-        param_name = items[0].strip(" \"").lower()
-        param_value = items[1].strip(" \"").lower()
-
-        if param_name not in allowed_params:
-            plpy.error(
-                """
-                Elastic Net error: {param_name} is not a valid parameter name for the IGD optimizer.
-                Run:
-
-                SELECT {schema_madlib}.elastic_net_train('igd');
-
-                to see the parameters for IGD algorithm.
-                """.format(param_name=param_name,
-                           schema_madlib=schema_madlib))
-
-        if param_name == "step_decay":
-            try:
-                name_value["step_decay"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: Decay of step must be a float number!")
-
-        if param_name == "stepsize":
-            try:
-                name_value["stepsize"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: stepsize must be a float number!")
-
-        if param_name == "warmup_tolerance":
-            try:
-                name_value["warmup_tolerance"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: warmup_tolerance must be a float number!")
-
-        if param_name == "warmup":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["warmup"] = True
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["warmup"] = False
-            else:
-                plpy.error("Elastic Net error: Do you need warmup (True/False or yes/no) ?")
-
-        if param_name == "warmup_lambdas" and param_value != "null":
-            warmup_lambdas = param_value
-
-        if param_name == "warmup_lambda_no":
-            warmup_lambda_no = param_value
-
-        if param_name == "threshold":
-            try:
-                name_value["threshold"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: threshold must be a float number!")
-
-        if param_name == "parallel":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["parallel"] = True
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["parallel"] = False
-            else:
-                plpy.error("Elastic Net error: Do you need parallel (True/False or yes/no) ? IGD in parallel might be slower !")
-
-    if name_value["warmup"]:
-        if warmup_lambdas is not None:
-            # errors are handled in __process_warmup_lambdas
-            name_value["warmup_lambdas"] = __process_warmup_lambdas(warmup_lambdas, lambda_value)
-        if warmup_lambda_no is not None:
-            try:
-                name_value["warmup_lambda_no"] = int(warmup_lambda_no)
-            except:
-                plpy.error("Elastic Net error: warmup_lambda_no must be an integer!")
+    defaults_and_types = {
+        "stepsize": (0.01, float),
+        "warmup": (False, bool),
+        "warmup_lambdas": (None, list),
+        "warmup_lambda_no": (15, int),
+        "threshold": (1e-10, float),
+        "parallel": (True, bool),
+        "step_decay": (0.0, float),
+        "warmup_tolerance": (tolerance, float)
+    }
+    param_defaults = dict([(k, v[0]) for k, v in defaults_and_types.items()])
+    param_types = dict([(k, v[1]) for k, v in defaults_and_types.items()])
+
+    if not optimizer_params:
+        return param_defaults
+
+    usage_str = ("Run:\n"
+                 "   SELECT {0}.elastic_net_train('igd');\n"
+                 "   to see the parameters for FISTA algorithm.".
+                 format(schema_madlib))
+    name_value = extract_keyvalue_params(optimizer_params, param_types,
+                                         param_defaults,
+                                         usage_str=usage_str,
+                                         ignore_invalid=True)
+
+    if name_value["warmup"] and name_value['warmup_lambdas'] is not None:
+        # errors are handled in _process_warmup_lambdas
+        name_value['warmup_lambdas'] = _process_warmup_lambdas(name_value['warmup_lambdas'], lambda_value)
 
     # validate the parameters
     if name_value["step_decay"] < 0:
@@ -140,13 +70,14 @@ def __igd_params_parser(optimizer_params, lambda_value, tolerance, schema_madlib
         plpy.error("Elastic Net error: A positive threshold is needed to screen out tiny values around zero!")
 
     return name_value
-## ========================================================================
+# ------------------------------------------------------------------------
+
 
-def __igd_construct_dict(schema_madlib, family, tbl_source,
-                         col_ind_var, col_dep_var,
-                         tbl_result, dimension, row_num, lambda_value, alpha,
-                         normalization, max_iter, tolerance, outstr_array,
-                         optimizer_params_dict):
+def _igd_construct_dict(schema_madlib, family, tbl_source,
+                        col_ind_var, col_dep_var,
+                        tbl_result, dimension, row_num, lambda_value, alpha,
+                        normalization, max_iter, tolerance, outstr_array,
+                        optimizer_params_dict):
     """
     Construct the dict used by a series of SQL queries in IGD optimizer.
     """
@@ -198,10 +129,10 @@ def __igd_construct_dict(schema_madlib, family, tbl_source,
     args["ymean_name"] = unique_string()
 
     return args
-## ========================================================================
+# ------------------------------------------------------------------------
 
 
-def __igd_cleanup_temp_tbls(**args):
+def _igd_cleanup_temp_tbls(**args):
     """
     Drop all temporary tables used by IGD optimizer,
     including tables used in the possible normalization
@@ -215,36 +146,36 @@ def __igd_cleanup_temp_tbls(**args):
                  drop table if exists pg_temp.{tbl_igd_state};
                  """.format(**args))
     return None
-## ========================================================================
-
-
-def __elastic_net_igd_train(schema_madlib, func_step_aggregate,
-                            func_state_diff, family,
-                            tbl_source, col_ind_var,
-                            col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                            normalization, optimizer_params, max_iter,
-                            tolerance, outstr_array, grouping_str,
-                            grouping_col, **kwargs):
-    __elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var, tbl_result, tbl_summary,
-                                lambda_value, alpha, normalization, max_iter, tolerance)
-
-    return __elastic_net_igd_train_compute(schema_madlib, func_step_aggregate,
-                                           func_state_diff, family,
-                                           tbl_source, col_ind_var,
-                                           col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                           normalization, optimizer_params, max_iter,
-                                           tolerance, outstr_array, grouping_str,
-                                           grouping_col, **kwargs)
-## ========================================================================
-
-
-def __elastic_net_igd_train_compute(schema_madlib, func_step_aggregate,
-                                    func_state_diff, family,
-                                    tbl_source, col_ind_var,
-                                    col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
-                                    normalization, optimizer_params, max_iter,
-                                    tolerance, outstr_array, grouping_str,
-                                    grouping_col, **kwargs):
+# ------------------------------------------------------------------------
+
+
+def _elastic_net_igd_train(schema_madlib, func_step_aggregate,
+                           func_state_diff, family,
+                           tbl_source, col_ind_var,
+                           col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
+                           normalization, optimizer_params, max_iter,
+                           tolerance, outstr_array, grouping_str,
+                           grouping_col, **kwargs):
+    _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var, tbl_result, tbl_summary,
+                               lambda_value, alpha, normalization, max_iter, tolerance)
+
+    return _elastic_net_igd_train_compute(schema_madlib, func_step_aggregate,
+                                          func_state_diff, family,
+                                          tbl_source, col_ind_var,
+                                          col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
+                                          normalization, optimizer_params, max_iter,
+                                          tolerance, outstr_array, grouping_str,
+                                          grouping_col, **kwargs)
+# ------------------------------------------------------------------------
+
+
+def _elastic_net_igd_train_compute(schema_madlib, func_step_aggregate,
+                                   func_state_diff, family,
+                                   tbl_source, col_ind_var,
+                                   col_dep_var, tbl_result, tbl_summary, lambda_value, alpha,
+                                   normalization, optimizer_params, max_iter,
+                                   tolerance, outstr_array, grouping_str,
+                                   grouping_col, **kwargs):
     """
     Fit linear model with elastic net regularization using IGD optimization.
 
@@ -261,128 +192,123 @@ def __elastic_net_igd_train_compute(schema_madlib, func_step_aggregate,
     @param optimizer_params  Parameters of the above optimizer, the format
                              is '{arg = value, ...}'::varchar[]
     """
-    old_msg_level = plpy.execute("""
-                                 select setting from pg_settings
-                                 where name='client_min_messages'
-                                 """)[0]['setting']
-    plpy.execute("set client_min_messages to error")
-
-    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)
-
-    # generate a full dict to ease the following string format
-    # including several temporary table names
-    args = __igd_construct_dict(schema_madlib, family, tbl_source, col_ind_var,
-                                col_dep_var, tbl_result,
-                                dimension, row_num, lambda_value, alpha, normalization,
-                                max_iter, tolerance, outstr_array,
-                                __igd_params_parser(optimizer_params, lambda_value,
-                                                    tolerance, schema_madlib))
-
-    args.update({'grouping_col': grouping_col})
-    # use normalized data or not
-    if normalization:
-        __normalize_data(args)
-        tbl_used = args["tbl_data_scaled"]
-        args["col_ind_var_new"] = args["col_ind_var_norm_new"]
-        args["col_dep_var_new"] = args["col_dep_var_norm_new"]
-    else:
-        __compute_data_scales(args)
-        tbl_used = tbl_source
-        args["col_ind_var_new"] = col_ind_var
-        args["col_dep_var_new"] = col_dep_var
-
-    args["tbl_used"] = tbl_used
-
-    # average squares of each feature
-    # used to estimate the largest lambda value
-    # also used to screen out tiny values, so order is needed
-    args["sq"] = __compute_average_sq(**args)
-    args["sq_str"] = _array_to_string(args["sq"])
-
-    if args["warmup_lambdas"] is not None:
-        args["warm_no"] = len(args["warmup_lambdas"])
-        args["warmup_lambdas"] = args["warmup_lambdas"]
-
-    if args["warmup"] and args["warmup_lambdas"] is None:
-        args["warmup_lambdas"] = \
-        __generate_warmup_lambda_sequence(
-           args["tbl_used"], args["col_ind_var_new"], args["col_dep_var_new"],
-           dimension, row_num, lambda_value, alpha,
-           args["warmup_lambda_no"], args["sq"])
-        args["warm_no"] = len(args["warmup_lambdas"])
-        args["warmup_lambdas"] = args["warmup_lambdas"]
-    elif args["warmup"] is False:
-        args["warm_no"] = 1
-        args["warmup_lambdas"] = [lambda_value]  # only one value
-
-    # parameter values required by the IGD optimizer
-    (xmean, ymean) = __compute_means(**args)
-
-    args.update({
-        'rel_args': args["tbl_igd_args"],
-        'rel_state': args["tbl_igd_state"],
-        'col_grp_iteration': unique_string(desp='col_grp_iteration'),
-        'col_grp_state': unique_string(desp='col_grp_state'),
-        'col_grp_key': unique_string(desp='col_grp_key'),
-        'col_n_tuples': unique_string(desp='col_n_tuples'),
-        'lambda_count': 1,
-        'state_type': "double precision[]",
-        'rel_source': tbl_used,
-        'grouping_str': grouping_str,
-        'xmean_val': xmean,
-        'ymean_val': ymean,
-        'tbl_source': tbl_source,
-        'tbl_summary': tbl_summary
+    with MinWarning('error'):
+        (dimension, row_num) = _tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)
+
+        # generate a full dict to ease the following string format
+        # including several temporary table names
+        args = _igd_construct_dict(schema_madlib, family, tbl_source, col_ind_var,
+                                   col_dep_var, tbl_result,
+                                   dimension, row_num, lambda_value, alpha, normalization,
+                                   max_iter, tolerance, outstr_array,
+                                   _igd_params_parser(optimizer_params, lambda_value,
+                                                      tolerance, schema_madlib))
+
+        args.update({'grouping_col': grouping_col})
+        # use normalized data or not
+        if normalization:
+            _normalize_data(args)
+            tbl_used = args["tbl_data_scaled"]
+            args["col_ind_var_new"] = args["col_ind_var_norm_new"]
+            args["col_dep_var_new"] = args["col_dep_var_norm_new"]
+        else:
+            _compute_data_scales(args)
+            tbl_used = tbl_source
+            args["col_ind_var_new"] = col_ind_var
+            args["col_dep_var_new"] = col_dep_var
+
+        args["tbl_used"] = tbl_used
+
+        # average squares of each feature
+        # used to estimate the largest lambda value
+        # also used to screen out tiny values, so order is needed
+        args["sq"] = _compute_average_sq(**args)
+        args["sq_str"] = _array_to_string(args["sq"])
+
+        if args["warmup_lambdas"] is not None:
+            args["warm_no"] = len(args["warmup_lambdas"])
+            args["warmup_lambdas"] = args["warmup_lambdas"]
+
+        if args["warmup"] and args["warmup_lambdas"] is None:
+            args["warmup_lambdas"] = \
+                _generate_warmup_lambda_sequence(
+                args["tbl_used"], args["col_ind_var_new"], args["col_dep_var_new"],
+                dimension, row_num, lambda_value, alpha,
+                args["warmup_lambda_no"], args["sq"])
+            args["warm_no"] = len(args["warmup_lambdas"])
+            args["warmup_lambdas"] = args["warmup_lambdas"]
+        elif args["warmup"] is False:
+            args["warm_no"] = 1
+            args["warmup_lambdas"] = [lambda_value]  # only one value
+
+        # parameter values required by the IGD optimizer
+        (xmean, ymean) = _compute_means(**args)
+
+        args.update({
+            'rel_args': args["tbl_igd_args"],
+            'rel_state': args["tbl_igd_state"],
+            'col_grp_iteration': unique_string(desp='col_grp_iteration'),
+            'col_grp_state': unique_string(desp='col_grp_state'),
+            'col_grp_key': unique_string(desp='col_grp_key'),
+            'col_n_tuples': unique_string(desp='col_n_tuples'),
+            'lambda_count': 1,
+            'state_type': "double precision[]",
+            'rel_source': tbl_used,
+            'grouping_str': grouping_str,
+            'xmean_val': xmean,
+            'ymean_val': ymean,
+            'tbl_source': tbl_source,
+            'tbl_summary': tbl_summary
         })
-    if not args.get('parallel'):
-        func_step_aggregate += "_single_seg"
-    # perform the actual calculation
-    iteration_run = __compute_igd(schema_madlib,
-                                  func_step_aggregate,
-                                  func_state_diff,
-                                  args["tbl_igd_args"],
-                                  args["tbl_igd_state"], 
-                                  tbl_used,
-                                  args["col_ind_var_new"], 
-                                  args["col_dep_var_new"],
-                                  grouping_str,
-                                  grouping_col,
-                                  start_iter=0,
-                                  max_iter= args["max_iter"],
-                                  tolerance= args["tolerance"],
-                                  warmup_tolerance= args["warmup_tolerance"],
-                                  warm_no= args["warm_no"],
-                                  step_decay= args["step_decay"],
-                                  dimension= args["dimension"],
-                                  stepsize= args["stepsize"],
-                                  lambda_name= args["warmup_lambdas"],
-                                  warmup_lambda_value = args.get('warmup_lambdas')[args["lambda_count"]-1],
-                                  alpha= args["alpha"],
-                                  row_num= args["row_num"],
-                                  xmean_val= args["xmean_val"],
-                                  ymean_val= args["ymean_val"],
-                                  lambda_count= args["lambda_count"],
-                                  rel_state= args["tbl_igd_state"],
-                                  col_grp_iteration= args["col_grp_iteration"],
-                                  col_grp_state= args["col_grp_state"],
-                                  col_grp_key= args["col_grp_key"],
-                                  col_n_tuples= args["col_n_tuples"],
-                                  rel_source= args["rel_source"],
-                                  state_type= args["state_type"],)
-
-    __elastic_net_generate_result("igd", iteration_run, **args)
-
-    # cleanup
-    __igd_cleanup_temp_tbls(**args)
-    plpy.execute("set client_min_messages to " + old_msg_level)
+        if not args.get('parallel'):
+            func_step_aggregate += "_single_seg"
+        # perform the actual calculation
+        iteration_run = _compute_igd(schema_madlib,
+                                     func_step_aggregate,
+                                     func_state_diff,
+                                     args["tbl_igd_args"],
+                                     args["tbl_igd_state"],
+                                     tbl_used,
+                                     args["col_ind_var_new"],
+                                     args["col_dep_var_new"],
+                                     grouping_str,
+                                     grouping_col,
+                                     start_iter=0,
+                                     max_iter=args["max_iter"],
+                                     tolerance=args["tolerance"],
+                                     warmup_tolerance=args["warmup_tolerance"],
+                                     warm_no=args["warm_no"],
+                                     step_decay=args["step_decay"],
+                                     dimension=args["dimension"],
+                                     stepsize=args["stepsize"],
+                                     lambda_name=args["warmup_lambdas"],
+                                     warmup_lambda_value=args.get('warmup_lambdas')[args["lambda_count"] - 1],
+                                     alpha=args["alpha"],
+                                     row_num=args["row_num"],
+                                     xmean_val=args["xmean_val"],
+                                     ymean_val=args["ymean_val"],
+                                     lambda_count=args["lambda_count"],
+                                     rel_state=args["tbl_igd_state"],
+                                     col_grp_iteration=args["col_grp_iteration"],
+                                     col_grp_state=args["col_grp_state"],
+                                     col_grp_key=args["col_grp_key"],
+                                     col_n_tuples=args["col_n_tuples"],
+                                     rel_source=args["rel_source"],
+                                     state_type=args["state_type"],)
+
+        _elastic_net_generate_result("igd", iteration_run, **args)
+
+        # cleanup
+        _igd_cleanup_temp_tbls(**args)
     return None
 
-## ========================================================================
+# ------------------------------------------------------------------------
 
-def __compute_igd(schema_madlib, func_step_aggregate, func_state_diff,
-                  tbl_args, tbl_state, tbl_source,
-                  col_ind_var, col_dep_var, grouping_str, grouping_col, 
-                  start_iter, **kwargs):
+
+def _compute_igd(schema_madlib, func_step_aggregate, func_state_diff,
+                 tbl_args, tbl_state, tbl_source,
+                 col_ind_var, col_dep_var, grouping_str, grouping_col,
+                 start_iter, **kwargs):
     """
     Driver function for elastic net with Gaussian response using IGD
 
@@ -403,7 +329,7 @@ def __compute_igd(schema_madlib, func_step_aggregate, func_state_diff,
         result in \c tbl_state
     """
     args = locals()
-    
+
     for k, v in kwargs.iteritems():
         if k not in args:
             args.update({k: v})
@@ -414,7 +340,7 @@ def __compute_igd(schema_madlib, func_step_aggregate, func_state_diff,
             # manually add the intercept term
             if (it.kwargs["lambda_count"] > len(args.get('lambda_name'))):
                 break
-            it.kwargs["warmup_lambda_value"] = args.get('lambda_name')[it.kwargs["lambda_count"]-1]
+            it.kwargs["warmup_lambda_value"] = args.get('lambda_name')[it.kwargs["lambda_count"] - 1]
             it.update("""
                     {schema_madlib}.{func_step_aggregate}(
                         ({col_ind_var})::double precision[],


[3/3] incubator-madlib git commit: Elastic net: Add cross validation

Posted by ri...@apache.org.
Elastic net: Add cross validation

JIRA: MADLIB-996

This commit adds a cross validation feature to the main elastic_net
function. The CV parameters are added to 'optimizer_params' to ensure
backwards compatibility; they could potentially be moved to their own
parameter in a future version.

Cross validation for elastic net has one special case: the grid of
lambda values can be computed automatically. The lambda values are
generated with the same procedure used for the warmup lambdas
(described in the original glmnet function in R).
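
For illustration only, the new CV path could be exercised with a call like
the following sketch; the table, column, and output names here are
hypothetical and not part of this commit:

    SELECT madlib.elastic_net_train(
        'houses',                  -- source table (assumed)
        'houses_en_cv',            -- model output table (assumed)
        'price',                   -- dependent variable (assumed)
        'array[tax, bath, size]',  -- independent variables (assumed)
        'gaussian',                -- regression family
        0.5,                       -- alpha
        0.1,                       -- lambda_value
        TRUE,                      -- standardize
        NULL,                      -- grouping_col (CV with grouping is not supported)
        'fista',                   -- optimizer
        $$ n_folds = 5,
           lambda_value = {0.1, 0.3, 0.5},
           validation_result = houses_en_cv_val $$,
        NULL,                      -- excluded
        10000, 1e-6);              -- max_iter, tolerance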

Closes #77


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/6939fd63
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/6939fd63
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/6939fd63

Branch: refs/heads/master
Commit: 6939fd63b365e35af5610165560ed3853f4d815c
Parents: 38d1e87
Author: Rahul Iyer <ri...@apache.org>
Authored: Thu Nov 3 15:42:17 2016 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Dec 21 16:08:22 2016 -0800

----------------------------------------------------------------------
 CMakeLists.txt                                  |   7 +-
 .../modules/elastic_net/elastic_net.py_in       | 490 +++++++++-------
 .../modules/elastic_net/elastic_net.sql_in      | 183 ++++--
 .../elastic_net_gaussian_fista.py_in            | 490 ----------------
 .../elastic_net/elastic_net_gaussian_igd.py_in  | 451 ---------------
 .../elastic_net_generate_result.py_in           | 207 ++++---
 .../elastic_net/elastic_net_models.py_in        | 133 ++---
 .../elastic_net_optimizer_fista.py_in           | 559 ++++++++-----------
 .../elastic_net/elastic_net_optimizer_igd.py_in | 462 +++++++--------
 .../modules/elastic_net/elastic_net_utils.py_in | 135 ++---
 .../test/elastic_net_install_check.sql_in       |  50 ++
 src/ports/postgres/modules/svm/svm.py_in        |   5 +-
 .../postgres/modules/utilities/control.py_in    |  50 +-
 .../utilities/in_mem_group_control.py_in        |  48 +-
 .../postgres/modules/utilities/utilities.py_in  |  27 +-
 .../validation/internal/cross_validation.py_in  |  22 +-
 16 files changed, 1191 insertions(+), 2128 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d552ed..7e01b9c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -212,9 +212,10 @@ include(OSXUtils)
 # -- Add subdirectories --------------------------------------------------------
 
 add_subdirectory(src)
-if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_COMPILER_IS_GNUCXX)
-    add_subdirectory(doc) #e.g., Clang/Clang++ does not work
-endif(CMAKE_COMPILER_IS_GNUCC AND CMAKE_COMPILER_IS_GNUCXX)
+# if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_COMPILER_IS_GNUCXX)
+#e.g., Clang/Clang++ does not work
+# endif(CMAKE_COMPILER_IS_GNUCC AND CMAKE_COMPILER_IS_GNUCXX)
+add_subdirectory(doc)
 add_subdirectory(deploy)
 
 # -- Install path for specific madlib version ----------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.py_in b/src/ports/postgres/modules/elastic_net/elastic_net.py_in
index 011a0ff..15c492d 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.py_in
@@ -1,9 +1,13 @@
 
 import plpy
-from elastic_net_models import __elastic_net_gaussian_igd_train
-from elastic_net_models import __elastic_net_gaussian_fista_train
-from elastic_net_models import __elastic_net_binomial_fista_train
-from elastic_net_models import __elastic_net_binomial_igd_train
+from elastic_net_models import _elastic_net_gaussian_igd_train
+from elastic_net_models import _elastic_net_gaussian_fista_train
+from elastic_net_models import _elastic_net_binomial_fista_train
+from elastic_net_models import _elastic_net_binomial_igd_train
+from elastic_net_utils import _generate_warmup_lambda_sequence
+
+from elastic_net_utils import BINOMIAL_FAMILIES, GAUSSIAN_FAMILIES, OPTIMIZERS
+
 from utilities.validate_args import is_col_array
 from utilities.validate_args import table_exists
 from utilities.validate_args import table_is_empty
@@ -11,21 +15,19 @@ from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import get_cols_and_types
 from utilities.validate_args import cols_in_tbl_valid
 from utilities.validate_args import explicit_bool_to_text
+from utilities.utilities import extract_keyvalue_params
 
 from utilities.control import MinWarning
 
 from utilities.utilities import _string_to_array_with_quotes
 from utilities.utilities import _string_to_array
-from utilities.utilities import set_client_min_messages
-from utilities.utilities import __mad_version
 from utilities.utilities import is_psql_numeric_type
 from utilities.utilities import _assert
 from utilities.utilities import add_postfix
 import re
 
-version_wrapper = __mad_version()
-mad_vec = version_wrapper.select_vecfunc()
-# ========================================================================
+from validation.internal.cross_validation import CrossValidator
+# ------------------------------------------------------------------------
 
 
 def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs):
@@ -176,11 +178,11 @@ def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs):
           );
 
     View the results:
-    SELECT  houses.id, 
-            houses.price, 
-            houses_en1_prediction.prediction, 
-            houses.price - houses_en1_prediction.prediction AS residual 
-    FROM houses_en1_prediction, houses 
+    SELECT  houses.id,
+            houses.price,
+            houses_en1_prediction.prediction,
+            houses.price - houses_en1_prediction.prediction AS residual
+    FROM houses_en1_prediction, houses
     WHERE houses.id=houses_en1_prediction.id;
 
         """
@@ -322,20 +324,23 @@ def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs):
     modified version of IGD is actually used.
 
     Parameters --------------------------------
-    stepsize         - default is 0.01
-    threshold        - default is 1e-10. When a coefficient is really
-                       small, set it to be 0
-    warmup           - default is False
-    warmup_lambdas   - default is Null
-    warmup_lambda_no - default is 15. How many lambda's are used in
-                       warm-up, will be overridden if warmup_lambdas
-                       is not NULL
-    warmup_tolerance - default is the same as tolerance. The value
-                       of tolerance used during warmup.
-    parallel         - default is True. Run the computation on
-                       multiple segments or not.
-
-    When warmup is True and warmup_lambdas is NULL, a series
+    stepsize          - default is 0.01
+    threshold         - default is 1e-10. When a coefficient is really
+                        small, set it to be 0
+    warmup            - default is False
+    warmup_lambdas    - default is Null
+    warmup_lambda_no  - default is 15. How many lambda's are used in
+                        warm-up, will be overridden if warmup_lambdas
+                        is not NULL
+    warmup_tolerance  - default is the same as tolerance. The value
+                        of tolerance used during warmup.
+    n_folds           - default is 1. Number of cross validation folds.
+                        Set this to greater than 1 if CV over lambda is required.
+    validation_result - Name of the table to store the cross validation results.
+    parallel          - default is True. Run the computation on
+                        multiple segments or not.
+
+    When warmup is True or n_folds > 1, and warmup_lambdas is NULL, a series
     of lambda values will be automatically generated and used.
 
     Reference --------------------------------
@@ -353,25 +358,28 @@ def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs):
     Right now, it supports fitting both linear and logistic models.
 
     Parameters --------------------------------
-    max_stepsize     - default is 4.0
-    eta              - default is 1.2, if stepsize does not work
-                       stepsize/eta will be tried
-    warmup           - default is False
-    warmup_lambdas   - default is NULL, which means that lambda
-                       values will be automatically generated
-    warmup_lambda_no - default is 15. How many lambda's are used in
-                       warm-up, will be overridden if warmup_lambdas
-                       is not NULL
-    warmup_tolerance - default is the same as tolerance. The value
-                       of tolerance used during warmup.
-    use_active_set   - default is False. Sometimes active-set method
-                       can speed up the calculation.
+    max_stepsize        - default is 4.0
+    eta                 - default is 1.2; if stepsize does not work,
+                          stepsize/eta will be tried
+    warmup              - default is False
+    warmup_lambdas      - default is NULL, which means that lambda
+                          values will be automatically generated
+    warmup_lambda_no    - default is 15. How many lambda's are used in
+                          warm-up, will be overridden if warmup_lambdas
+                          is not NULL
+    warmup_tolerance    - default is the same as tolerance. The value
+                          of tolerance used during warmup.
+    use_active_set      - default is False. Sometimes active-set method
+                          can speed up the calculation.
     activeset_tolerance - default is the same as tolerance. The
                           value of tolerance used during active set
                           calculation
-    random_stepsize - default is False. Whether add some randomness
-                      to the step size. Sometimes, this can speed
-                      up the calculation.
+    random_stepsize     - default is False. Whether to add some randomness
+                          to the step size. Sometimes, this can speed
+                          up the calculation.
+    n_folds             - default is 1. Number of cross validation folds.
+                          Set this to greater than 1 if CV over lambda is required.
+    validation_result   - Name of the table to store the cross validation results.
 
     When warmup is True and warmup_lambdas is NULL, warmup_lambda_no
     of lambda values will be automatically generated and used.
@@ -391,22 +399,22 @@ def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs):
     SELECT {schema_madlib}.elastic_net_train();
     for help
     """.format(schema_madlib=schema_madlib)
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
-def elastic_net_train(schema_madlib, tbl_source, tbl_result, col_dep_var,
-                      col_ind_var, regress_family, alpha, lambda_value,
+def elastic_net_train(schema_madlib, source_table, model_table, dependent_varname,
+                      independent_varname, regress_family, alpha, lambda_value,
                       standardize, grouping_col, optimizer,
                       optimizer_params, excluded, max_iter, tolerance,
                       **kwargs):
     """
     A wrapper for all variants of elastic net regularization.
 
-    @param tbl_source        Name of data source table
-    @param col_ind_var       Name of independent variable column,
+    @param source_table        Name of data source table
+    @param independent_varname       Name of independent variable column,
                              independent variable is an array
-    @param col_dep_var       Name of dependent variable column
-    @param tbl_result        Name of the table to store the results,
+    @param dependent_varname       Name of dependent variable column
+    @param model_table        Name of the table to store the results,
                              will return fitting coefficients and
                              likelihood
     @param lambda_value      The regularization parameter
@@ -417,96 +425,173 @@ def elastic_net_train(schema_madlib, tbl_source, tbl_result, col_dep_var,
     @param optimizer_params  Parameters of the above optimizer, the format
                              is '{arg = value, ...}'::varchar[]
     @param excluded          Which variables are excluded when
-                             col_ind_var == "*"
+                             independent_varname == "*"
     """
-    if regress_family is None:
-        plpy.error("""
-                   Elastic Net error: Please enter a valid response family name!
-                   Run:
-                   SELECT {schema_madlib}.elastic_net_train();
-                   for supported response family.
-                   """.format(schema_madlib=schema_madlib))
-
-    if optimizer is None:
-        plpy.error("""
-                   Elastic Net error: Please enter a valid optimizer name!
-                   Run:
-                   SELECT {schema_madlib}.elastic_net_train('gaussian');
-                   for supported optimizers.
-                   """.format(schema_madlib=schema_madlib))
-    # handle all special cases of col_ind_var
-    col_ind_var, outstr_array = analyze_input_str(schema_madlib, tbl_source,
-                                                  col_ind_var, col_dep_var,
-                                                  excluded)
+    with MinWarning("warning"):
+        if regress_family is None:
+            plpy.error("""
+                       Elastic Net error: Please enter a valid response family name!
+                       Run:
+                       SELECT {schema_madlib}.elastic_net_train();
+                       for supported response family.
+                       """.format(schema_madlib=schema_madlib))
+
+        if optimizer is None:
+            plpy.error("""
+                       Elastic Net error: Please enter a valid optimizer name!
+                       Run:
+                       SELECT {schema_madlib}.elastic_net_train('gaussian');
+                       for supported optimizers.
+                       """.format(schema_madlib=schema_madlib))
+
+        regress_family = regress_family.lower()
+        optimizer = optimizer.lower()
+
+        if (regress_family not in (BINOMIAL_FAMILIES + GAUSSIAN_FAMILIES) or
+                optimizer not in OPTIMIZERS):
+            plpy.error("""
+                Elastic Net error: Not a supported response family or supported
+                optimizer of the given response family!
+
+                Run:
+                    SELECT {schema_madlib}.elastic_net_train();
+                for help.
+                """.format(schema_madlib=schema_madlib))
+
+        cv_param, optimizer_params = _get_cv_optimizer_params(
+            optimizer_params, alpha, lambda_value)
+
+        args = locals()
+        if cv_param['n_folds'] > 1:
+            args.update(cv_param)
+            _cross_validate_en(args)
+        _internal_elastic_net_train(**args)
+# ------------------------------------------------------------------------
+
+
+def _internal_elastic_net_train(
+        schema_madlib, source_table, model_table, dependent_varname,
+        independent_varname, grouping_col,
+        regress_family, alpha, lambda_value,
+        standardize, optimizer, optimizer_params, excluded,
+        max_iter, tolerance, **kwargs):
+
+    tbl_summary = add_postfix(model_table, "_summary")
+
+    # handle all special cases of independent_varname
+    independent_varname, outstr_array = analyze_input_str(
+        schema_madlib, source_table,
+        independent_varname, dependent_varname, excluded)
+
     # get the grouping info
     grouping_str, grouping_col = _get_grouping_col_str(schema_madlib,
-                                                           tbl_source, grouping_col)
-
-    # # Special case for ridge linear regression
-    # if ((regress_family.lower() == "gaussian" or regress_family.lower() == "linear") and
-    #     optimizer.lower() == "newton" and
-    #     alpha == 0):
-    #     plpy.execute("""select {schema_madlib}.ridge_newton_train(
-    #                         '{tbl_source}', '{col_ind_var}', '{col_dep_var}',
-    #                         '{tbl_result}', {lambda_value}, {standardize}
-    #     )""".format(schema_madlib=schema_madlib,
-    #                 tbl_source = tbl_source,
-    #                 col_ind_var = col_ind_var,
-    #                 col_dep_var = col_dep_var,
-    #                 tbl_result = tbl_result,
-    #                 lambda_value = lambda_value,
-    #                 standardize = standardize))
-    #     return None
-
-    not_supported_family = False
-    not_supported_opt = False
-    tbl_summary = add_postfix(tbl_result, "_summary")
-    if regress_family.lower() in ("gaussian", "linear"):
-        if optimizer.lower() == "igd":
-            __elastic_net_gaussian_igd_train(
-                schema_madlib, tbl_source, col_ind_var, col_dep_var,
-                tbl_result, tbl_summary, lambda_value, alpha, standardize,
-                optimizer_params, max_iter, tolerance, outstr_array, 
+                                                       source_table, grouping_col)
+
+    if regress_family in GAUSSIAN_FAMILIES:
+        if optimizer == OPTIMIZERS.igd:
+            _elastic_net_gaussian_igd_train(
+                schema_madlib, source_table, independent_varname, dependent_varname,
+                model_table, tbl_summary, lambda_value, alpha, standardize,
+                optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
-        if optimizer.lower() == "fista":
-            __elastic_net_gaussian_fista_train(
-            	schema_madlib, tbl_source, col_ind_var, col_dep_var,
-                tbl_result, tbl_summary, lambda_value, alpha, standardize,
-                optimizer_params, max_iter, tolerance, outstr_array, 
+        if optimizer == OPTIMIZERS.fista:
+            _elastic_net_gaussian_fista_train(
+                schema_madlib, source_table, independent_varname, dependent_varname,
+                model_table, tbl_summary, lambda_value, alpha, standardize,
+                optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
-        not_supported_opt = True
-    elif regress_family.lower() in ("binomial", "logistic"):
-        if optimizer.lower() == "igd":
-            col_dep_var = "(" + col_dep_var + ")::boolean"
-            __elastic_net_binomial_igd_train(
-                schema_madlib, tbl_source, col_ind_var, col_dep_var,
-                tbl_result, tbl_summary, lambda_value, alpha, standardize,
-                optimizer_params, max_iter, tolerance, outstr_array, 
+    elif regress_family in BINOMIAL_FAMILIES:
+        if optimizer == OPTIMIZERS.igd:
+            dependent_varname = "(" + dependent_varname + ")::boolean"
+            _elastic_net_binomial_igd_train(
+                schema_madlib, source_table, independent_varname, dependent_varname,
+                model_table, tbl_summary, lambda_value, alpha, standardize,
+                optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
-
-        if optimizer.lower() == "fista":
-            col_dep_var = "(" + col_dep_var + ")::boolean"
-            __elastic_net_binomial_fista_train(
-                schema_madlib, tbl_source, col_ind_var, col_dep_var,
-                tbl_result, tbl_summary, lambda_value, alpha, standardize,
+        if optimizer == OPTIMIZERS.fista:
+            dependent_varname = "(" + dependent_varname + ")::boolean"
+            _elastic_net_binomial_fista_train(
+                schema_madlib, source_table, independent_varname, dependent_varname,
+                model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
-        not_supported_opt = True
-    else:
-        not_supported_family = True
-    if not_supported_family or not_supported_opt:
-        plpy.error("""
-               Elastic Net error: Not a supported response family or supported optimizer of the given response family!
-               Run:
-               SELECT {schema_madlib}.elastic_net_train();
-               for help.
-               """.format(schema_madlib=schema_madlib))
     return None
-# ========================================================================
+# ----------------------------------------------------------------------
+
+
+def _get_cv_optimizer_params(param_str, alpha, smallest_lambda):
+    cv_params_defaults = {
+        "n_folds": (1, int),
+        "lambda_value": (None, list),
+        "alpha": ([alpha], list),
+        "n_lambdas": (15, int),
+        "validation_result": (None, str)
+    }
+    param_defaults = dict([(k, v[0]) for k, v in cv_params_defaults.items()])
+    param_types = dict([(k, v[1]) for k, v in cv_params_defaults.items()])
+
+    if not param_str:
+        return param_defaults, param_str
+
+    name_value = extract_keyvalue_params(param_str, param_types, param_defaults,
+                                         ignore_invalid=True)
+    if name_value['n_folds'] > 1:
+        if not name_value['lambda_value']:
+            if name_value['n_lambdas']:
+                name_value['lambda_value'] = _generate_warmup_lambda_sequence(
+                    smallest_lambda, name_value['n_lambdas'])
+                # no warmup when cross validating on lambda
+                param_str += ', warmup=False'
+            else:
+                name_value['lambda_value'] = [float(smallest_lambda)]
+        else:
+            name_value['lambda_value'] = map(float, name_value['lambda_value'])
+            # no warmup when cross validating on lambda
+            param_str += ', warmup=False'
+        name_value['alpha'] = map(float, name_value['alpha'])
+    return name_value, param_str
+# ------------------------------------------------------------------------
+
+
+def _cross_validate_en(args):
+    # args is updated in place, so changes made here propagate back to the caller
+    if args['n_folds'] > 1 and args['grouping_col']:
+        plpy.error('Elastic Net Error: cross validation with grouping is not supported!')
+
+    allowed_cv_params = ('lambda_value', 'alpha')  # keep trailing comma for single element
+    cv_params_values = {}
+
+    for param in allowed_cv_params:
+        if isinstance(args[param], list):
+            if len(args[param]) > 1:
+                cv_params_values[param] = args[param]
+            else:
+                args[param] = args[param][0]
+
+    if not cv_params_values and args['n_folds'] <= 1:
+        # no cross validation
+        return
+
+    if not cv_params_values and args['n_folds'] > 1:
+        plpy.warning('Elastic Net Warning: n_folds > 1 but no '
+                     'cross validation parameter provided')
+        return
+
+    if cv_params_values and args['n_folds'] <= 1:
+        plpy.error('Elastic Net Error: All parameters must be scalar '
+                   'when n_folds is 0 or 1')
+
+    scorer = 'classification' if args['regress_family'] in BINOMIAL_FAMILIES else 'regression'
+    cv = CrossValidator(_internal_elastic_net_train, elastic_net_predict_all, scorer, args)
+    val_res = cv.validate(cv_params_values, args['n_folds'])
+    if args.get('validation_result'):
+        val_res.output_tbl(args['validation_result'])
+    args.update(val_res.top('sub_args'))
+# ------------------------------------------------------------------------------
 
 
 def _get_grouping_col_str(schema_madlib, source_table, grouping_col):
@@ -517,7 +602,7 @@ def _get_grouping_col_str(schema_madlib, source_table, grouping_col):
         intersect = frozenset(
             _string_to_array(grouping_col)).intersection(
                 frozenset(
-                    ('regress_family', 'coef_all', 
+                    ('regress_family', 'coef_all',
                      'features_selected',
                      'coef_nonzero', 'intercept',
                      'log_likelihood', 'standardize',
@@ -541,7 +626,7 @@ def _get_grouping_col_str(schema_madlib, source_table, grouping_col):
 # ------------------------------------------------------------------------------
 
 
-def __check_args(tbl_source, col_ind_var, col_dep_var):
+def _check_args(tbl_source, col_ind_var, col_dep_var):
     """
     Check arguments before analyze_input_str
     """
@@ -555,7 +640,7 @@ def __check_args(tbl_source, col_ind_var, col_dep_var):
 
     if table_is_empty(tbl_source):
         plpy.error("Elastic Net error: Data table " + tbl_source + " is empty!")
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
 def analyze_input_str(schema_madlib, tbl_source,
@@ -568,7 +653,7 @@ def analyze_input_str(schema_madlib, tbl_source,
     @param col_dep_var Dependent variables
     @param excluded Which variables are excluded when col_ind_var == "*"
     """
-    __check_args(tbl_source, col_ind_var, col_dep_var)
+    _check_args(tbl_source, col_ind_var, col_dep_var)
 
     outstr_array = []
     if col_ind_var == "*":
@@ -593,38 +678,36 @@ def analyze_input_str(schema_madlib, tbl_source,
             included_col_types = [col_types_dict[i] for i in outstr_array]
             if not all(is_psql_numeric_type(i)
                        for i in included_col_types):
-                plpy.error("""
-                           Elastic Net error: All columns to be included in the
-                           independent variables should be of the numeric type.
-                           """)
+                plpy.error("Elastic Net error: All columns to be included in the"
+                           "independent variables should be of the numeric type.")
         col_ind_var_new = "ARRAY[" + ','.join(outstr_array) + "]"
         return (col_ind_var_new, outstr_array)
 
     if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib):
-        ## if the input is a column name and not an expression
+        # if the input is a column name and not an expression
         return analyze_single_input_str(schema_madlib, tbl_source,
                                         col_ind_var, excluded)
     else:
-        ## if input is an expression resulting in an array output
+        # if input is an expression resulting in an array output
         matched = re.match(r"(?i)^array\[(.*)\]", col_ind_var)
         if matched:
-            ## array expression starts with the word "ARRAY"
+            # array expression starts with the word "ARRAY"
             outstr_array = _string_to_array(matched.group(1))
         else:
-            ## any other form of array expression
+            # any other form of array expression
             n_feat = plpy.execute(""" SELECT array_upper({indep_var}, 1) as num_feat
                                       FROM {source} LIMIT 1
                                   """.format(indep_var=col_ind_var,
                                              source=tbl_source))[0]["num_feat"]
-            outstr_array = ["[" + str(i) + "]" for i in range(1, n_feat+1)]
-        ## We allow expressions for independent variables that could start with
-        ## something other than 'array'
-        ##    Example use case: input independent variable of array column
-        ##     adding an intercept could be done as '1 || x' where 'x' is array
-        ##     of independent variables.
+            outstr_array = ["[" + str(i) + "]" for i in range(1, n_feat + 1)]
+        # We allow expressions for independent variables that could start with
+        # something other than 'array'
+        # Example use case: input independent variable of array column
+        # adding an intercept could be done as '1 || x' where 'x' is array
+        # of independent variables.
         # plpy.error("Elastic Net error: Independent variable format is not quite right!")
         return (col_ind_var, outstr_array)
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
 def analyze_single_input_str(schema_madlib, tbl_source, col_ind_var,
@@ -670,7 +753,7 @@ def analyze_single_input_str(schema_madlib, tbl_source, col_ind_var,
             s = []
 
         outstr_array = ["%s[%s]" % (col_ind_var, str(i))
-                        for i in range(1, dimension+1) if i not in s]
+                        for i in range(1, dimension + 1) if i not in s]
         if s:
             col_ind_var_new = "ARRAY[" + ",".join(outstr_array) + "]"
         else:
@@ -680,7 +763,7 @@ def analyze_single_input_str(schema_madlib, tbl_source, col_ind_var,
     else:
         plpy.error("Elastic Net error: Single column name included for "
                    "independent variable is not found in source table.")
-# ========================================================================
+# ------------------------------------------------------------------------
 
 
 def elastic_net_predict_all(schema_madlib, tbl_model, tbl_new_source,
@@ -690,32 +773,52 @@ def elastic_net_predict_all(schema_madlib, tbl_model, tbl_new_source,
     """
     summary_table = add_postfix(tbl_model, "_summary")
     grouping_col = plpy.execute("SELECT grouping_col FROM {summary_table}".
-        format(summary_table=summary_table))[0]["grouping_col"]
-    old_msg_level = set_client_min_messages("error")
-    regress_family = plpy.execute("SELECT family FROM {tbl_model} ".
-                                  format(tbl_model=tbl_model))[0]["family"]
-
-    if regress_family.lower() in ("gaussian", "linear"):
-        predict_func = "elastic_net_gaussian_predict"
-    elif regress_family.lower() in ("binomial", "logistic"):
-        predict_func = "elastic_net_binomial_predict"
-    else:
-        plpy.error("Elastic Net error: Not a supported response family!")
+                                format(summary_table=summary_table))[0]["grouping_col"]
+    with MinWarning("error"):
+        regress_family = plpy.execute("SELECT family FROM {tbl_model} ".
+                                      format(tbl_model=tbl_model))[0]["family"]
+
+        if regress_family.lower() in ("gaussian", "linear"):
+            predict_func = "elastic_net_gaussian_predict"
+        elif regress_family.lower() in ("binomial", "logistic"):
+            predict_func = "elastic_net_binomial_predict"
+        else:
+            plpy.error("Elastic Net error: Not a supported response family!")
 
-    if col_id is None or col_id == '':
-        plpy.error("Elastic Net error: invalid ID column provided!")
-    if columns_exist_in_table(tbl_new_source, [col_id], schema_madlib):
-        elastic_net_predict_id = col_id
-    else:
-        elastic_net_predict_id = 'elastic_net_predict_id'
-
-    dense_vars = mad_vec(plpy.execute(""" SELECT features AS fs
-                                          FROM {tbl_model}
-                                      """.format(tbl_model=tbl_model))[0]["fs"])
-    dense_vars_str = "ARRAY[" + ", ".join(dense_vars) + "]"
-    # Must be careful to avoid possible name conflicts
-    if not grouping_col or grouping_col != 'NULL':
-        qstr = """
+        if col_id is None or col_id == '':
+            plpy.error("Elastic Net error: invalid ID column provided!")
+        if columns_exist_in_table(tbl_new_source, [col_id], schema_madlib):
+            elastic_net_predict_id = col_id
+        else:
+            elastic_net_predict_id = 'elastic_net_predict_id'
+
+        dense_vars = plpy.execute(""" SELECT features AS fs FROM {tbl_model}
+                                  """.format(tbl_model=tbl_model))[0]["fs"]
+        dense_vars_str = "ARRAY[" + ", ".join(dense_vars) + "]"
+        # Must be careful to avoid possible name conflicts
+
+        if grouping_col and grouping_col != 'NULL':
+            qstr = """
+                DROP TABLE IF EXISTS {tbl_predict};
+                CREATE TABLE {tbl_predict} AS
+                    SELECT
+                        {elastic_net_predict_id},
+                        {schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
+                             AS prediction
+                    FROM
+                        {tbl_model} as tbl1
+                        JOIN
+                        (SELECT
+                            {grouping_col},
+                            {col_id} as {elastic_net_predict_id},
+                            {dense_vars_str} as ind_var
+                        FROM
+                            {tbl_new_source}) tbl2
+                        USING ({grouping_col})
+                        ORDER BY {grouping_col}, {elastic_net_predict_id}
+                """.format(**locals())
+        else:
+            qstr = """
             DROP TABLE IF EXISTS {tbl_predict};
             CREATE TABLE {tbl_predict} AS
                 SELECT
@@ -723,34 +826,15 @@ def elastic_net_predict_all(schema_madlib, tbl_model, tbl_new_source,
                     {schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
                          AS prediction
                 FROM
-                    {tbl_model} as tbl1
-                    JOIN
-                    (SELECT
-                        {grouping_col},
-                        {col_id} as {elastic_net_predict_id},
-                        {dense_vars_str} as ind_var
-                    FROM
-                        {tbl_new_source}) tbl2
-                    USING ({grouping_col})
+                    {tbl_model} as tbl1,
+                    (
+                        SELECT
+                            {col_id} as {elastic_net_predict_id},
+                            {dense_vars_str} as ind_var
+                        FROM
+                            {tbl_new_source}
+                    ) tbl2
             """.format(**locals())
-    else:
-        qstr = """
-        DROP TABLE IF EXISTS {tbl_predict};
-        CREATE TABLE {tbl_predict} AS
-            SELECT
-                {elastic_net_predict_id},
-                {schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
-                     AS prediction
-            FROM
-                {tbl_model} as tbl1,
-                (SELECT
-                    {col_id} as {elastic_net_predict_id},
-                    {dense_vars_str} as ind_var
-                FROM
-                    {tbl_new_source}) tbl2
-        """.format(**locals())
-    plpy.execute(qstr)
-
-    set_client_min_messages(old_msg_level)
+        plpy.execute(qstr)
     return None
-# ========================================================================
+# ------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
index 4677f1c..9bed5ac 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
@@ -166,34 +166,25 @@ computation stops.</DD>
 </DL>
 
 @anchor optimizer
-@par Optimizer Parameters
-Optimizer parameters are supplied in a string containing a comma-delimited
-list of name-value pairs. All of these named parameters are optional, and
-their order does not matter. You must use the format "<param_name> = <value>"
-to specify the value of a parameter, otherwise the parameter is ignored.
+@par Other Parameters
 
-When the \ref elastic_net_train() \e optimizer argument value is \b 'fista', the \e optimizer_params argument is a string containing name-value pairs with the following format. (Line breaks are inserted for readability.)
+Multiple other optional parameters are supplied in a string containing a
+comma-delimited list of name-value pairs. The order of these parameters does
+not matter, and each uses the format "<param_name> = <value>".
+
+The parameters described below are organized by their functionality.
+
+<em><b>Warmup parameters</b></em>
 <pre class="syntax">
-  'max_stepsize = &lt;value>,
-   eta = &lt;value>,
-   warmup = &lt;value>,
-   warmup_lambdas = &lt;value>,
-   warmup_lambda_no = &lt;value>,
-   warmup_tolerance = &lt;value>,
-   use_active_set = &lt;value>,
-   activeset_tolerance = &lt;value>,
-   random_stepsize = &lt;value>'
+  $$
+    warmup = &lt;value>,
+    warmup_lambdas = &lt;value>,
+    warmup_lambda_no = &lt;value>,
+    warmup_tolerance = &lt;value>
+  $$
 </pre>
-\b Parameters
-<DL class="arglist">
-<DT>max_stepsize</dt>
-<DD>Default: 4.0. Initial backtracking step size. At each iteration, the algorithm first tries
-<em>stepsize = max_stepsize</em>, and if it does not work out, it then tries a
-smaller step size, <em>stepsize = stepsize/eta</em>, where \e eta must
-be larger than 1. At first glance, this seems to perform repeated iterations for even one step, but using a larger step size actually greatly increases the computation speed and minimizes the total number of iterations. A careful choice of \e max_stepsize can decrease the computation time by more than 10 times.</DD>
-<DT>eta</DT>
-<DD>Default: 2. If stepsize does not work \e stepsize / \e eta is tried. Must be greater than 1. </DD>
 
+<DL class="arglist">
 <DT>warmup</DT>
 <DD>Default: FALSE. If \e warmup is TRUE, a series of lambda values, which is
 strictly descending and ends at the lambda value that the user wants to calculate,
@@ -212,6 +203,86 @@ computation on only one lambda value.</DD>
 <DT>warmup_tolerance</DT>
 <DD>The value of tolerance used during warmup. The default is the same as the
 \e tolerance argument.</DD>
+</DL>
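+
+For example (parameter values here are illustrative only, not
+recommendations), a warmup configuration passed through
+\e optimizer_params could look like:
+<pre class="example">
+$$ warmup = True, warmup_lambda_no = 10, warmup_tolerance = 1e-4 $$
+</pre>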
+
+<em><b>Cross validation parameters</b></em>
+@note Cross validation is not supported if grouping is used.
+<pre class="syntax">
+  $$
+    n_folds = &lt;value>,
+    validation_result = &lt;value>,
+    lambda_value = &lt;value>,
+    n_lambdas = &lt;value>,
+    alpha = &lt;value>
+  $$
+</pre>
+
+Hyperparameter optimization can be carried out using the built-in cross
+validation mechanism, which is activated by assigning a value greater than 1 to
+the parameter \e n_folds in \e optimizer_params. Presently, misclassification
+error is used for classification and mean squared error is used for regression.
+
+The values of a parameter to cross validate should be provided in a list. For
+example, to regularize with the L1 norm and use a lambda value
+from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}' in
+\e optimizer_params. Note that both '{}' and '[]' are valid list delimiters
+here (see the example sketch after the parameter list below).
+
+
+<DL class="arglist">
+
+<DT>n_folds</DT>
+<DD>Default: 1.
+Number of folds (k). Must be at least 2 to activate cross validation.
+Each of the k folds is then used as a validation set once,
+while the other k - 1 folds form the training set.
+</DD>
+
+
+<DT>validation_result</dt>
+<DD>Default: NULL.
+Name of the table to store the cross validation results including the values of
+parameters and their averaged error values. The table is only created if the name is not NULL.
+</DD>
+
+<DT>lambda_value</DT>
+<DD>Regularization value. If a list is provided for cross validation, then warmup is
+disabled on each lambda for performance reasons. </DD>
+
+<DT>n_lambdas</DT>
+<DD>Number of lambdas to cross validate over. If a list of lambda values is not
+provided, this parameter can be used to autogenerate a list of lambdas, using
+the same procedure as for the warmup lambdas.</DD>
+
+<DT>alpha</DT>
+<DD>Elastic net control parameter. Must be provided as a list of values to
+cross validate over alpha.
+</DD>
+</DL>
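+
+As an illustrative sketch, cross validating over both lambda and alpha with
+the results stored in a table (the table name below is a placeholder) could
+be requested by passing the following string as \e optimizer_params:
+<pre class="example">
+$$ n_folds = 5,
+   lambda_value = {0.1, 0.3, 0.5},
+   alpha = {0, 0.5, 1},
+   validation_result = en_cv_results
+$$
+</pre>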
+
+<em><b>Optimizer parameters</b></em>
+
+\b FISTA Parameters
+<pre class="syntax">
+  $$
+    max_stepsize = &lt;value>,
+    eta = &lt;value>,
+    use_active_set = &lt;value>,
+    activeset_tolerance = &lt;value>,
+    random_stepsize = &lt;value>
+  $$
+</pre>
+
+<DL class="arglist">
+<DT>max_stepsize</dt>
+<DD>Default: 4.0. Initial backtracking step size. At each iteration, the algorithm first tries
+<em>stepsize = max_stepsize</em>, and if it does not work out, it then tries a
+smaller step size, <em>stepsize = stepsize/eta</em>, where \e eta must
+be larger than 1. At first glance, this seems to perform repeated iterations for even one step, but using a larger step size actually greatly increases the computation speed and minimizes the total number of iterations. A careful choice of \e max_stepsize can decrease the computation time by more than 10 times.</DD>
+
+<DT>eta</DT>
+<DD>Default: 2. If stepsize does not work \e stepsize / \e eta is tried. Must be greater than 1. </DD>
 
 <DT>use_active_set</DT>
 <DD>Default: FALSE. If \e use_active_set is TRUE, an active-set method is used to
@@ -229,25 +300,23 @@ we are done, otherwise the process is repeated.</DD>
 up the calculation.</DD>
 </DL>
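+
+For instance (values are illustrative only), a FISTA configuration passed
+through \e optimizer_params could look like:
+<pre class="example">
+$$ max_stepsize = 4.0, eta = 2, use_active_set = True,
+   activeset_tolerance = 1e-6, random_stepsize = False $$
+</pre>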
 
-When the \ref elastic_net_train() \e optimizer argument value is \b 'igd', the
-\e optimizer_params argument is a string containing name-value pairs with
-the following format. (Line breaks are inserted for readability.)
+\b IGD Parameters
 <pre class="syntax">
-  'stepsize = &lt;value>,
-   step_decay = &lt;value>,
-   threshold = &lt;value>,
-   warmup = &lt;value>,
-   warmup_lambdas = &lt;value>,
-   warmup_lambda_no = &lt;value>,
-   warmup_tolerance = &lt;value>,
-   parallel = &lt;value>'
+  $$
+    stepsize = &lt;value>,
+    step_decay = &lt;value>,
+    threshold = &lt;value>,
+    parallel = &lt;value>
+  $$
 </pre>
-\b Parameters
 <DL class="arglist">
+
 <DT>stepsize</DT>
 <DD>The default is 0.01.</DD>
+
 <DT>step_decay</DT>
-<DD>The actual setpsize used for current step is (previous stepsize) / exp(setp_decay). The default value is 0, which means that a constant stepsize is used in IGD.</DD>
+<DD>The actual stepsize used for current step is (previous stepsize) / exp(step_decay). The default value is 0, which means that a constant stepsize is used in IGD.</DD>
+
 <DT>threshold</DT>
 <DD>Default: 1e-10. When a coefficient is really small, set this coefficient to be 0.
 
@@ -259,31 +328,16 @@ standard deviation of the corresponding feature; (2) compute the average of
 absolute values of re-scaled coefficients; (3) divide each rescaled coefficient
 with the average, and if the resulting absolute value is smaller than
 \e threshold, set the original coefficient to zero.</DD>
-<DT>warmup</DT>
-<DD>Default: FALSE. If \e warmup is TRUE, a series of lambda values, which is
-strictly descent and ends at the lambda value that the user wants to calculate,
-is used. The larger lambda gives very sparse solution, and the sparse
-solution again is used as the initial guess for the next lambda's solution,
-which speeds up the computation for the next lambda. For larger data sets,
-this can sometimes accelerate the whole computation and may be faster than
-computation on only one lambda value.</DD>
-<DT>warmup_lambdas</DT>
-<DD>Default: NULL. An array of lambda values to use for warmup.</DD>
-<DT>warmup_lambda_no</DT>
-<DD>The number of lambdas used in warm-up. The default is 15. If \e
-warmup_lambdas is not NULL, this argument is overridden by the size of the \e
-warmup_lambdas array.</DD>
-<DT>warmup_tolerance</DT>
-<DD>The value of tolerance used during warmup.The default is the same as the \e tolerance argument.</DD>
+
 <DT>parallel</DT>
 <DD>Whether to run the computation on multiple segments. The default is True.
 
 SGD is a sequential algorithm in nature. When running in a distributed
 manner, each segment  of the data runs its own SGD model and then the models
-are averaged to get a model for each iteration.  This averaging might slow
+are averaged to get a model for each iteration. This averaging might slow
 down the convergence speed, although we also acquire the ability to process
 large datasets on multiple machines. This algorithm, therefore, provides the
-\e parallel option to allow you to  choose whether to do parallel computation.
+\e parallel option to allow you to choose whether to do parallel computation.
 </DD>
 </DL>
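+
+Similarly (values are illustrative only), an IGD configuration passed
+through \e optimizer_params could look like:
+<pre class="example">
+$$ stepsize = 0.01, step_decay = 0, threshold = 1e-10, parallel = True $$
+</pre>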
 
@@ -512,12 +566,12 @@ SELECT madlib.elastic_net_predict(
 </pre>
 -# View the results:
 <pre class="example">
-SELECT  houses.id, 
-        houses.price, 
-        houses_en1_prediction.prediction, 
-        houses.price - houses_en1_prediction.prediction AS residual 
-FROM houses_en1_prediction, houses 
-WHERE houses.id=houses_en1_prediction.id;
+SELECT  houses.id,
+        houses.price,
+        houses_en1_prediction.prediction,
+        houses.price - houses_en1_prediction.prediction AS residual
+FROM houses_en1_prediction, houses
+WHERE houses.id = houses_en1_prediction.id;
 </pre>
 
 @anchor additional_example
@@ -724,7 +778,12 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.elastic_net_train (
     max_iter            INTEGER,
     tolerance           DOUBLE PRECISION
 ) RETURNS VOID AS $$
-PythonFunction(elastic_net, elastic_net, elastic_net_train)
+    PythonFunctionBodyOnly(`elastic_net', `elastic_net')
+    return elastic_net.elastic_net_train(
+        schema_madlib, tbl_source, tbl_result, col_dep_var,
+        col_ind_var, regress_family, alpha, lambda_value,
+        standardize, grouping_col, optimizer, optimizer_params,
+        excluded, max_iter, tolerance)
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
deleted file mode 100644
index fd56fa0..0000000
--- a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
+++ /dev/null
@@ -1,490 +0,0 @@
-# coding=utf-8
-m4_changequote(`<!', `!>')
-
-import plpy
-import math
-from elastic_net_utils import __normalize_data
-from elastic_net_utils import __compute_data_scales
-from elastic_net_utils import __compute_means
-from elastic_net_utils import __tbl_dimension_rownum
-from utilities.utilities import unique_string
-from elastic_net_gaussian_igd import __tbl_dimension_rownum
-from utilities.control import IterationController2S
-from elastic_net_utils import IterationControllerNoTableDrop
-from elastic_net_utils import __elastic_net_validate_args
-from utilities.utilities import _array_to_string
-from elastic_net_utils import __compute_average_sq
-from elastic_net_utils import __generate_warmup_lambda_sequence
-from elastic_net_utils import __process_warmup_lambdas
-from elastic_net_generate_result import __elastic_net_generate_result
-from utilities.utilities import __mad_version
-from utilities.utilities import preprocess_keyvalue_params
-
-version_wrapper = __mad_version()
-mad_vec = version_wrapper.select_vecfunc()
-
-## ========================================================================
-
-
-def __fista_params_parser(optimizer_params, lambda_value, schema_madlib):
-    """
-    Parse fista parameters.
-    """
-    allowed_params = set(["max_stepsize", "eta", "warmup", "warmup_lambdas",
-                          "warmup_lambda_no", "use_active_set"])
-    name_value = dict()
-    # default values
-    name_value["max_stepsize"] = 2.
-    name_value["use_active_set"] = 1  # use of active set
-    name_value["eta"] = 2
-    name_value["warmup"] = False
-    name_value["warmup_lambdas"] = None
-    name_value["warmup_lambda_no"] = 15
-
-    warmup_lambdas = None
-    warmup_lambda_no = None
-
-    if optimizer_params is None:
-        return name_value
-
-    for s in preprocess_keyvalue_params(optimizer_params):
-        items = s.split("=")
-        if (len(items) != 2):
-            plpy.error("Elastic Net error: Optimizer parameter list has incorrect format!")
-        param_name = items[0].strip(" \"").lower()
-        param_value = items[1].strip(" \"").lower()
-
-        if param_name not in allowed_params:
-            plpy.error(
-                """
-                Elastic Net error: {param_name} is not a valid parameter name for the FISTA optimizer.
-                Run:
-
-                SELECT {schema_madlib}.elastic_net_train('fista');
-
-                to see the parameters for FISTA algorithm.
-                """.format(param_name=param_name,
-                           schema_madlib=schema_madlib))
-
-        if param_name == "max_stepsize":
-            try:
-                name_value["max_stepsize"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: max_stepsize must be a float number!")
-
-        if param_name == "eta":
-            try:
-                name_value["eta"] = float(param_value)
-            except:
-                plpy.error("Elastic Net error: eta must be a float number!")
-
-        if param_name == "warmup":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["warmup"] = True
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["warmup"] = False
-            else:
-                plpy.error("Elastic Net error: Do you need warmup "
-                           "(True/False or yes/no) ?")
-
-        if param_name == "warmup_lambdas" and param_value != "null":
-            warmup_lambdas = param_value
-
-        if param_name == "warmup_lambda_no":
-            warmup_lambda_no = param_value
-
-        if param_name == "use_active_set":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["use_active_set"] = 1
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["use_active_set"] = 0
-            else:
-                plpy.error("Elastic Net error: Do you need warmup "
-                           "(True/False or yes/no) ?")
-
-    if name_value["warmup"]:
-        if warmup_lambdas is not None:
-            # errors are handled in __process_warmup_lambdas
-            name_value["warmup_lambdas"] = __process_warmup_lambdas(warmup_lambdas, lambda_value)
-        if warmup_lambda_no is not None:
-            try:
-                name_value["warmup_lambda_no"] = int(warmup_lambda_no)
-            except:
-                plpy.error("Elastic Net error: warmup_lambda_no must be an "
-                           "integer!")
-
-    # validate the parameters
-    if name_value["max_stepsize"] <= 0:
-        plpy.error("Elastic Net error: backtracking parameter max_stepsize "
-                   "must be positive!")
-
-    if name_value["eta"] < 1:
-        plpy.error("Elastic Net error: backtracking parameter eta must be "
-                   "larger than 1!")
-
-    if (name_value["warmup"] and name_value["warmup_lambdas"] is None and
-            name_value["warmup_lambda_no"] < 1):
-        plpy.error("Elastic Net error: Number of warm-up lambdas must be a "
-                   "positive integer!")
-
-    return name_value
-## ========================================================================
-
-
-def __fista_create_tbl_args(**args):
-    """
-    create the temporary schema and argument table used in FISTA iterations
-    """
-    (xmean_str0, ymean) = __compute_means(**args)
-
-    plpy.execute("""
-                 drop table if exists {tbl_fista_args};
-                 create temp table {tbl_fista_args} (
-                     {dimension_name}       integer,
-                     {lambda_name}          double precision[],
-                     {alpha_name}           double precision,
-                     {total_rows_name}      integer,
-                     {max_iter_name}        integer,
-                     {tolerance_name}       double precision,
-                     {xmean_name}           double precision[],
-                     {ymean_name}           double precision,
-                     {max_stepsize_name}    double precision,
-                     {eta_name}             double precision,
-                     {activeset_name}       integer
-                 );
-                 """.format(**args))
-    plpy.execute("""
-                 insert into {tbl_fista_args} values
-                     ({dimension}, '{warmup_lambdas}'::double precision[],
-                     {alpha}, {row_num}, {max_iter}, {tolerance},
-                     '{xmean_str0}'::double precision[], {ymean}, {max_stepsize}, {eta},
-                     {use_active_set});
-                 """.format(xmean_str0=xmean_str0, ymean=ymean, **args))
-
-    return None
-## ========================================================================
-
-
-def __fista_construct_dict(schema_madlib, tbl_source, col_ind_var, col_dep_var,
-                           tbl_result, dimension, row_num, lambda_value, alpha,
-                           normalization, max_iter, tolerance,
-                           outstr_array, optimizer_params_dict):
-    """
-    Construct the dict used by a series of SQL queries in FISTA optimizer.
-    """
-    args = dict(schema_madlib=schema_madlib, tbl_source=tbl_source,
-                tbl_data=tbl_source,  # argument name used in normalization
-                col_ind_var=col_ind_var, col_dep_var=col_dep_var,
-                col_ind_var_norm_new=unique_string(),  # for normalization usage
-                col_ind_var_tmp=unique_string(),
-                col_dep_var_norm_new=unique_string(),  # for normalization usage
-                col_dep_var_tmp=unique_string(),
-                tbl_result=tbl_result,
-                lambda_value=lambda_value, alpha=alpha,
-                dimension=dimension, row_num=row_num,
-                max_iter=max_iter, tolerance=tolerance,
-                outstr_array=outstr_array,
-                normalization=normalization)
-
-    # Add the optimizer parameters
-    args.update(optimizer_params_dict)
-
-    # Table names useful when normalizing the original data
-    # Note: in order to be consistent with the calling convention
-    # of the normalization functions, multiple elements of the dict
-    # actually have the same value. This is a price that one has to pay
-    # if he wants to save typing argument names by using **args as the
-    # function argument.
-    tbl_ind_scales = unique_string()
-    tbl_dep_scale = unique_string()
-    tbl_data_scaled = unique_string()
-    args.update(tbl_scale=tbl_dep_scale, tbl_dep_scale=tbl_dep_scale,
-                tbl_scales=tbl_ind_scales, tbl_ind_scales=tbl_ind_scales,
-                tbl_data_scaled=tbl_data_scaled)
-
-    # Table names used in IGD iterations
-    args.update(tbl_fista_state=unique_string(),
-                tbl_fista_args=unique_string())
-
-    # more, for args table
-    args["dimension_name"] = unique_string()
-    args["lambda_name"] = unique_string()
-    args["alpha_name"] = unique_string()
-    args["total_rows_name"] = unique_string()
-    args["max_iter_name"] = unique_string()
-    args["tolerance_name"] = unique_string()
-    args["xmean_name"] = unique_string()
-    args["ymean_name"] = unique_string()
-    args["max_stepsize_name"] = unique_string()
-    args["eta_name"] = unique_string()
-    args["activeset_name"] = unique_string()
-
-    return args
-## ========================================================================
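-
-
-def _sketch_args_aliasing():
-    """
-    Editorial sketch, never called: a minimal illustration of the **args
-    aliasing convention used by __fista_construct_dict above.  Several keys
-    point at the same value so that helpers expecting different parameter
-    names can all be invoked as helper(**args).  The two helpers below are
-    hypothetical stand-ins for the normalization utilities.
-    """
-    args = dict(tbl_scale="scale_tbl", tbl_dep_scale="scale_tbl",
-                tbl_scales="ind_scales_tbl", tbl_ind_scales="ind_scales_tbl")
-
-    def older_helper(tbl_scale, **kwargs):
-        # the older calling convention expects the name `tbl_scale`
-        return tbl_scale
-
-    def newer_helper(tbl_dep_scale, **kwargs):
-        # the newer calling convention expects the name `tbl_dep_scale`
-        return tbl_dep_scale
-
-    # both resolve to the same underlying table name
-    assert older_helper(**args) == newer_helper(**args)
-## ========================================================================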
-
-
-def __fista_cleanup_temp_tbls(**args):
-    """
-    Drop all temporary tables used by FISTA optimizer,
-    including tables used in the possible normalization
-    and FISTA iterations.
-    """
-    plpy.execute("""
-                 drop table if exists {tbl_ind_scales};
-                 drop table if exists {tbl_dep_scale};
-                 drop table if exists {tbl_data_scaled};
-                 drop table if exists {tbl_fista_args};
-                 drop table if exists pg_temp.{tbl_fista_state};
-                 """.format(**args))
-
-    return None
-## ========================================================================
-
-
-def __elastic_net_gaussian_fista_train(schema_madlib, tbl_source, col_ind_var,
-                                       col_dep_var, tbl_result, lambda_value, alpha,
-                                       normalization, optimizer_params, max_iter,
-                                       tolerance, outstr_array, **kwargs):
-    __elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var, tbl_result,
-                                lambda_value, alpha, normalization, max_iter,
-                                tolerance)
-
-    return __elastic_net_gaussian_fista_train_compute(
-        schema_madlib, tbl_source, col_ind_var, col_dep_var, tbl_result,
-        lambda_value, alpha, normalization, optimizer_params, max_iter,
-        tolerance, outstr_array, **kwargs)
-## ========================================================================
-
-
-def __elastic_net_gaussian_fista_train_compute(
-        schema_madlib, tbl_source, col_ind_var, col_dep_var, tbl_result,
-        lambda_value, alpha, normalization, optimizer_params, max_iter,
-        tolerance, outstr_array, **kwargs):
-    """
-    Fit linear model with elastic net regularization using FISTA optimization.
-
-    @param tbl_source        Name of data source table
-    @param col_ind_var       Name of independent variable column,
-                             independent variable is an array
-    @param col_dep_var       Name of dependent variable column
-    @param tbl_result        Name of the table to store the results,
-                             will return fitting coefficients and
-                             likelihood
-    @param lambda_value      The regularization parameter
-    @param alpha             The elastic net parameter, [0, 1]
-    @param normalization     Whether to normalize the variables
-    @param optimizer_params  Parameters of the FISTA optimizer, given in the
-                             format '{arg = value, ...}'::varchar[]
-    """
-    old_msg_level = plpy.execute("""
-                                 select setting from pg_settings
-                                 where name='client_min_messages'
-                                 """)[0]['setting']
-    plpy.execute("set client_min_messages to error")
-
-    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)
-
-    # generate a full dict to ease the following string format
-    # including several temporary table names
-    args = __fista_construct_dict(
-        schema_madlib, tbl_source, col_ind_var, col_dep_var, tbl_result,
-        dimension, row_num, lambda_value, alpha, normalization,
-        max_iter, tolerance, outstr_array,
-        __fista_params_parser(optimizer_params, lambda_value, schema_madlib))
-
-    # use normalized data or not
-    if normalization:
-        __normalize_data(args)
-        tbl_used = args["tbl_data_scaled"]
-        args["col_ind_var_new"] = args["col_ind_var_norm_new"]
-        args["col_dep_var_new"] = args["col_dep_var_norm_new"]
-    else:
-        __compute_data_scales(args)
-        tbl_used = tbl_source
-        args["col_ind_var_new"] = col_ind_var
-        args["col_dep_var_new"] = col_dep_var
-
-    args["tbl_used"] = tbl_used
-
-    if args["warmup_lambdas"] is not None:
-        args["warm_no"] = len(args["warmup_lambdas"])
-        args["warmup_lambdas"] = _array_to_string(args["warmup_lambdas"])
-
-    if args["warmup"] and args["warmup_lambdas"] is None:
-        # average of the squares of each feature,
-        # used to estimate the largest lambda value
-        args["sq"] = __compute_average_sq(**args)
-        args["warmup_lambdas"] = __generate_warmup_lambda_sequence(
-            tbl_used, args["col_ind_var_new"], args["col_dep_var_new"],
-            dimension, row_num, lambda_value, alpha,
-            args["warmup_lambda_no"], args["sq"])
-        args["warm_no"] = len(args["warmup_lambdas"])
-        args["warmup_lambdas"] = _array_to_string(args["warmup_lambdas"])
-    elif args["warmup"] is False:
-        args["warm_no"] = 1
-        args["warmup_lambdas"] = _array_to_string([lambda_value])
-
-    # create the temp table that passes parameter values to FISTA optimizer
-    __fista_create_tbl_args(**args)
-
-    # perform the actual calculation
-    iteration_run = __compute_gaussian_fista(
-        schema_madlib, args["tbl_fista_args"],
-        args["tbl_fista_state"], tbl_used,
-        args["col_ind_var_new"],
-        args["col_dep_var_new"],
-        start_iter=0,
-        max_iter=args["max_iter"],
-        warm_no=args["warm_no"],
-        use_active_set=args["use_active_set"],
-        dimension_name=args["dimension_name"],
-        lambda_name=args["lambda_name"],
-        alpha_name=args["alpha_name"],
-        total_rows_name=args["total_rows_name"],
-        max_iter_name=args["max_iter_name"],
-        xmean_name=args["xmean_name"],
-        ymean_name=args["ymean_name"],
-        max_stepsize_name=args["max_stepsize_name"],
-        eta_name=args["eta_name"],
-        activeset_name=args["activeset_name"],
-        tolerance_name=args["tolerance_name"])
-
-    __elastic_net_generate_result("fista", iteration_run, **args)
-
-    # cleanup
-    __fista_cleanup_temp_tbls(**args)
-    plpy.execute("set client_min_messages to " + old_msg_level)
-    return None
-## ========================================================================
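-
-
-def _sketch_warmup_lambda_sequence(lambda_max, lambda_target, n):
-    """
-    Editorial sketch, never called: the shape of the sequence that
-    __generate_warmup_lambda_sequence produces, assuming the conventional
-    elastic-net warm start of n values spaced log-linearly from an estimated
-    lambda_max down to the target lambda.  The actual estimate of lambda_max
-    from the data (via the average feature squares computed above) is not
-    reproduced here.
-    """
-    if n == 1:
-        return [lambda_target]
-    ratio = (lambda_target / lambda_max) ** (1.0 / (n - 1))
-    return [lambda_max * ratio ** i for i in range(n)]
-
-# e.g. _sketch_warmup_lambda_sequence(1.0, 0.01, 5)
-#      -> [1.0, 0.316..., 0.1, 0.0316..., 0.01]
-## ========================================================================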
-
-
-def __compute_gaussian_fista(schema_madlib, tbl_args, tbl_state, tbl_source,
-                             col_ind_var, col_dep_var, start_iter, **kwargs):
-    """
-    Driver function for elastic net with Gaussian response using FISTA
-
-    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
-    @param tbl_args Name of the (temporary) table containing all non-template
-        arguments
-    @param tbl_state Name of the (temporary) table containing the inter-iteration
-        states
-    @param tbl_source Name of the relation containing input points
-    @param col_ind_var Name of the independent variables column
-    @param col_dep_var Name of the dependent variable column
-    @param start_iter Iteration number to start counting from
-    @param kwargs We allow the caller to specify additional arguments (all of
-        which will be ignored though). The purpose of this is to allow the
-        caller to unpack a dictionary whose element set is a superset of
-        the arguments required by this function.
-
-    @return The iteration number (i.e., the key) with which to look up the
-        result in \c tbl_state
-    """
-    iterationCtrl = IterationController2S(
-        rel_args=tbl_args,
-        rel_state=tbl_state,
-        stateType="double precision[]",
-        truncAfterIteration=False,
-        schema_madlib=schema_madlib,  # Identifiers start here
-        rel_source=tbl_source,
-        col_ind_var=col_ind_var,
-        col_dep_var=col_dep_var,
-        lambda_count=1,
-        is_active=0,
-        **kwargs)
-
-    state_size = None
-
-    with iterationCtrl as it:
-        it.iteration = start_iter
-        while True:
-            # manually add the intercept term
-            it.update("""
-                      select
-                          {schema_madlib}.__gaussian_fista_step(
-                              ({col_ind_var})::double precision[],
-                              ({col_dep_var})::double precision,
-
-                              m4_ifdef(<!__HAWQ__!>, <!{{__state__}}!>,
-                              <!(select _state from {rel_state}
-                                  where _iteration = {iteration})!>),
-
-                              (_args.{lambda_name}[{lambda_count}])::double precision,
-                              (_args.{alpha_name})::double precision,
-                              (_args.{dimension_name})::integer,
-                              (_args.{xmean_name})::double precision[],
-                              (_args.{ymean_name})::double precision,
-                              1::double precision,
-                              (_args.{total_rows_name})::integer,
-                              (_args.{max_stepsize_name})::double precision,
-                              (_args.{eta_name})::double precision,
-                              (_args.{activeset_name})::integer,
-                              {is_active}::integer
-                          )
-                      from {rel_source} as _src, {rel_args} as _args
-                      """)
-
-            if it.kwargs["use_active_set"] == 1:
-                if state_size is None:
-                    m4_ifdef(<!__HAWQ__!>,
-                    <!state_size = it.get_state_size()!>,
-                    <!state_size = plpy.execute(
-                        """
-                        select array_upper(_state, 1) as size
-                        from {rel_state} limit 1;
-                        """.format(**it.kwargs))[0]["size"]!>)
-
-                m4_ifdef(<!__HAWQ__!>,
-                <!is_backtracking = it.get_state_value(state_size - 1)!>,
-                <!is_backtracking = plpy.execute(
-                    """
-                    select _state[{state_size}] as backtracking
-                    from {rel_state}
-                    where _iteration = {iteration}
-                    """.format(state_size = state_size,
-                               iteration = it.iteration,
-                               **it.kwargs))[0]["backtracking"]!>)
-
-                if it.test(
-                    """
-                    {iteration} >= _args.{max_iter_name}
-                    or
-                    {schema_madlib}.__gaussian_fista_state_diff(
-                        _state_previous, _state_current) < _args.{tolerance_name}
-                    """):
-                    if it.iteration < it.kwargs["max_iter"]:
-                        if it.kwargs["is_active"] == 0:
-                            if (it.kwargs["lambda_count"] < it.kwargs["warm_no"]):
-                                it.kwargs["lambda_count"] += 1
-                            else:
-                                break
-                        else:
-                            it.kwargs["is_active"] = 0
-                    else:
-                        break
-                else:
-                    # change active state only outside of backtracking
-                    if is_backtracking == 0:
-                        it.kwargs["is_active"] = 1
-            else:
-                if it.test(
-                    """
-                    {iteration} >= _args.{max_iter_name} or
-                    {schema_madlib}.__gaussian_fista_state_diff(
-                        _state_previous, _state_current
-                            ) < _args.{tolerance_name}
-                    """):
-                    if (it.iteration < it.kwargs["max_iter"] and
-                        it.kwargs["lambda_count"] < it.kwargs["warm_no"]):
-                        it.kwargs["lambda_count"] += 1
-                    else:
-                        break
-
-        if it.kwargs["lambda_count"] < it.kwargs["warm_no"]:
-            plpy.error("""
-                       Elastic Net error: The final target lambda value is not
-                       reached in warm-up iterations. You need more iterations!
-                       """)
-
-    return iterationCtrl.iteration
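-## ========================================================================
-
-
-def _sketch_fista_control_flow(converged, max_iter, warm_no):
-    """
-    Editorial sketch, never called: the control flow of the driver loop
-    above with the SQL stripped out.  `converged(i)` stands in for the
-    __gaussian_fista_state_diff tolerance test; the backtracking guard and
-    the use_active_set switch are omitted for brevity.
-    """
-    iteration, lambda_count, is_active = 0, 1, 0
-    while True:
-        iteration += 1              # one __gaussian_fista_step over the data
-        if iteration >= max_iter or converged(iteration):
-            if iteration >= max_iter:
-                break               # out of iterations
-            if is_active == 1:
-                is_active = 0       # converged on active set: re-check on full set
-            elif lambda_count < warm_no:
-                lambda_count += 1   # advance to the next warm-up lambda
-            else:
-                break               # converged at the final target lambda
-        else:
-            is_active = 1           # keep optimizing over the active set only
-    return iteration, lambda_count
-
-# e.g. _sketch_fista_control_flow(lambda i: True, 100, 4) -> (4, 4)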

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6939fd63/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
deleted file mode 100644
index d9f9c2d..0000000
--- a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
+++ /dev/null
@@ -1,451 +0,0 @@
-
-import plpy
-import math
-from utilities.utilities import unique_string
-from utilities.control import IterationController2S
-from elastic_net_utils import IterationControllerNoTableDrop
-from elastic_net_utils import __compute_means
-from elastic_net_utils import __normalize_data
-from elastic_net_utils import __compute_data_scales
-from elastic_net_utils import __tbl_dimension_rownum
-from elastic_net_utils import __elastic_net_validate_args
-from utilities.utilities import _array_to_string
-from elastic_net_utils import __compute_average_sq
-from elastic_net_utils import __generate_warmup_lambda_sequence
-from elastic_net_utils import __process_warmup_lambdas
-from elastic_net_generate_result import __elastic_net_generate_result
-from utilities.utilities import __mad_version
-from utilities.utilities import preprocess_keyvalue_params
-
-version_wrapper = __mad_version()
-mad_vec = version_wrapper.select_vecfunc()
-
-## ========================================================================
-
-def __igd_params_parser(optimizer_params, lambda_value, schema_madlib):
-    """
-    Parse IGD parameters.
-    """
-    allowed_params = set(["stepsize", "warmup", "warmup_lambdas",
-                          "warmup_lambda_no", "threshold", "parallel"])
-    name_value = dict()
-    # default values
-    name_value["parallel"] = True
-    name_value["stepsize"] = 0.01
-    name_value["warmup"] = False
-    name_value["warmup_lambdas"] = None
-    name_value["warmup_lambda_no"] = 15
-    name_value["threshold"] = 1e-10
-
-    warmup_lambdas = None
-    warmup_lambda_no = None
-
-    if optimizer_params is None:
-        return name_value
-
-    for s in preprocess_keyvalue_params(optimizer_params):
-        items = s.split("=")
-        if (len(items) != 2):
-            plpy.error("Elastic Net error: Optimizer parameter list has incorrect format!")
-        param_name = items[0].strip(" \"").lower()
-        param_value = items[1].strip(" \"").lower()
-
-        if param_name not in allowed_params:
-            plpy.error(
-                """
-                Elastic Net error: {param_name} is not a valid parameter name for the IGD optimizer.
-                Run:
-
-                SELECT {schema_madlib}.elastic_net_train('igd');
-
-                to see the parameters for the IGD algorithm.
-                """.format(param_name = param_name,
-                           schema_madlib = schema_madlib))
-
-        if param_name == "stepsize":
-            try:
-                name_value["stepsize"] = float(param_value)
-            except ValueError:
-                plpy.error("Elastic Net error: stepsize must be a floating-point number!")
-
-        if param_name == "warmup":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["warmup"] = True
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["warmup"] = False
-            else:
-                plpy.error("Elastic Net error: Do you need warmup (True/False or yes/no) ?")
-
-        if param_name == "warmup_lambdas" and param_value != "null":
-            warmup_lambdas = param_value
-
-        if param_name == "warmup_lambda_no":
-            warmup_lambda_no = param_value
-
-        if param_name == "threshold":
-            try:
-                name_value["threshold"] = float(param_value)
-            except ValueError:
-                plpy.error("Elastic Net error: threshold must be a floating-point number!")
-
-        if param_name == "parallel":
-            if param_value in ["true", "t", "yes", "y"]:
-                name_value["parallel"] = True
-            elif param_value in ["false", "f", "no", "n"]:
-                name_value["parallel"] = False
-            else:
-                plpy.error("Elastic Net error: Do you need parallel (True/False or yes/no) ? IGD in parallel might be slower !")
-
-    if name_value["warmup"]:
-        if warmup_lambdas is not None:
-            # errors are handled in __process_warmup_lambdas
-            name_value["warmup_lambdas"] = __process_warmup_lambdas(warmup_lambdas, lambda_value)
-        if warmup_lambda_no is not None:
-            try:
-                name_value["warmup_lambda_no"] = int(warmup_lambda_no)
-            except ValueError:
-                plpy.error("Elastic Net error: warmup_lambda_no must be an integer!")
-
-    # validate the parameters
-    if name_value["stepsize"] <= 0:
-        plpy.error("Elastic Net error: step size must be positive!")
-
-    if (name_value["warmup"] and name_value["warmup_lambdas"] is None and
-        name_value["warmup_lambda_no"] < 1):
-        plpy.error("Elastic Net error: Number of warm-up lambdas must be a positive integer!")
-
-    if name_value["threshold"] < 0:
-        plpy.error("Elastic Net error: A positive threshold is needed to screen out tiny values around zero!")
-
-    return name_value
-
-## ========================================================================
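-
-def _sketch_parse_optimizer_params(optimizer_params):
-    """
-    Editorial sketch, never called: a simplified, dependency-free version of
-    the key=value parsing done by __igd_params_parser above.  The real
-    parser splits via preprocess_keyvalue_params (so that values such as
-    '{3,2,1}' survive), validates names against a whitelist, coerces types,
-    and reports problems through plpy.error.
-    """
-    name_value = {"parallel": True, "stepsize": 0.01, "warmup": False,
-                  "warmup_lambdas": None, "warmup_lambda_no": 15,
-                  "threshold": 1e-10}
-    if optimizer_params is None:
-        return name_value
-    for item in optimizer_params.split(","):
-        name, _, value = item.partition("=")
-        name_value[name.strip(' "').lower()] = value.strip(' "').lower()
-    return name_value
-
-# e.g. _sketch_parse_optimizer_params("stepsize = 0.1, warmup = True")
-#      -> {..., 'stepsize': '0.1', 'warmup': 'true', ...}  (values stay
-#      strings here; the real parser converts them to float/bool)
-
-## ========================================================================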
-
-def __igd_create_tbl_args(**args):
-    """
-    Create the temporary argument table used in IGD iterations.
-    """
-    (xmean_str0, ymean) = __compute_means(**args)
-
-    plpy.execute("""
-                 drop table if exists {tbl_igd_args};
-                 create temp table {tbl_igd_args} (
-                    {dimension_name}       integer,
-                    {stepsize_name}        double precision,
-                    {lambda_name}          double precision[],
-                    {alpha_name}           double precision,
-                    {total_rows_name}      integer,
-                    {max_iter_name}        integer,
-                    {tolerance_name}       double precision,
-                    {xmean_name}           double precision[],
-                    {ymean_name}           double precision
-                 );
-                 """.format(**args))
-    plpy.execute("""
-                 insert into {tbl_igd_args} values
-                    ({dimension}, {stepsize}, '{warmup_lambdas}'::double precision[],
-                     {alpha},
-                     {row_num}, {max_iter}, {tolerance},
-                     '{xmean_str0}'::double precision[], {ymean})
-                 """.format(xmean_str0 = xmean_str0, ymean = ymean,
-                            **args))
-
-    return None
-
-## ========================================================================
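-
-def _sketch_unique_column_names():
-    """
-    Editorial sketch, never called: why the args table built above uses
-    unique_string() for its column names.  That table is joined into every
-    per-iteration update query as `_args`, so its columns must not collide
-    with columns of the user's source table.  unique_string() returns a
-    fresh identifier on each call; the uuid-based name below is only a
-    stand-in for it.
-    """
-    import uuid
-    return "__madlib_temp_" + uuid.uuid4().hex
-
-## ========================================================================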
-
-def __igd_construct_dict(schema_madlib, tbl_source, col_ind_var, col_dep_var,
-                         tbl_result, dimension, row_num, lambda_value, alpha,
-                         normalization, max_iter, tolerance, outstr_array,
-                         optimizer_params_dict):
-    """
-    Construct the dict used by a series of SQL queries in IGD optimizer.
-    """
-    args = dict(schema_madlib = schema_madlib, tbl_source = tbl_source,
-                tbl_data = tbl_source, # argument name used in normalization
-                col_ind_var = col_ind_var, col_dep_var = col_dep_var,
-                col_ind_var_norm_new = unique_string(), # for normalization usage
-                col_ind_var_tmp = unique_string(),
-                col_dep_var_norm_new = unique_string(), # for normalization usage
-                col_dep_var_tmp = unique_string(),
-                tbl_result = tbl_result,
-                lambda_value = lambda_value, alpha = alpha,
-                dimension = dimension, row_num = row_num,
-                max_iter = max_iter, tolerance = tolerance,
-                outstr_array = outstr_array,
-                normalization = normalization)
-
-    # Add the optimizer parameters
-    args.update(optimizer_params_dict)
-
-    # Table names useful when normalizing the original data
-    # Note: in order to be consistent with the calling convention
-    # of the normalization functions, multiple elements of the dict
-    # actually have the same value. This is the price one pays for saving
-    # the typing of argument names by passing **args into those functions
-    # (see the sketch near __fista_construct_dict above).
-    tbl_ind_scales = unique_string()
-    tbl_dep_scale = unique_string()
-    tbl_data_scaled = unique_string()
-    args.update(tbl_dep_scale = tbl_dep_scale,
-                tbl_ind_scales = tbl_ind_scales,
-                tbl_data_scaled = tbl_data_scaled)
-
-    # Table names used in IGD iterations
-    args.update(tbl_igd_state = unique_string(),
-                tbl_igd_args = unique_string())
-
-    # more, for args table
-    args["dimension_name"] = unique_string()
-    args["stepsize_name"] = unique_string()
-    args["lambda_name"] = unique_string()
-    args["alpha_name"] = unique_string()
-    args["total_rows_name"] = unique_string()
-    args["max_iter_name"] = unique_string()
-    args["tolerance_name"] = unique_string()
-    args["xmean_name"] = unique_string()
-    args["ymean_name"] = unique_string()
-
-    return args
-
-## ========================================================================
-
-def __igd_cleanup_temp_tbls(**args):
-    """
-    Drop all temporary tables used by IGD optimizer,
-    including tables used in the possible normalization
-    and IGD iterations.
-    """
-    plpy.execute("""
-                 drop table if exists {tbl_ind_scales};
-                 drop table if exists {tbl_dep_scale};
-                 drop table if exists {tbl_data_scaled};
-                 drop table if exists {tbl_igd_args};
-                 drop table if exists pg_temp.{tbl_igd_state};
-                 """.format(**args))
-    return None
-
-## ========================================================================
-
-def __elastic_net_gaussian_igd_train(schema_madlib, tbl_source, col_ind_var,
-                                     col_dep_var, tbl_result, lambda_value, alpha,
-                                     normalization, optimizer_params, max_iter,
-                                     tolerance, outstr_array, **kwargs):
-    __elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var, tbl_result,
-                                lambda_value, alpha, normalization, max_iter, tolerance)
-
-    return __elastic_net_gaussian_igd_train_compute(schema_madlib, tbl_source, col_ind_var,
-                                                    col_dep_var, tbl_result, lambda_value, alpha,
-                                                    normalization, optimizer_params, max_iter,
-                                                    tolerance, outstr_array, **kwargs)
-
-## ========================================================================
-
-def __elastic_net_gaussian_igd_train_compute(schema_madlib, tbl_source, col_ind_var,
-                                             col_dep_var, tbl_result, lambda_value, alpha,
-                                             normalization, optimizer_params, max_iter,
-                                             tolerance, outstr_array, **kwargs):
-    """
-    Fit linear model with elastic net regularization using IGD optimization.
-
-    @param tbl_source        Name of data source table
-    @param col_ind_var       Name of independent variable column,
-                             independent variable is an array
-    @param col_dep_var       Name of dependent variable column
-    @param tbl_result        Name of the table to store the results,
-                             will return fitting coefficients and
-                             likelihood
-    @param lambda_value      The regularization parameter
-    @param alpha             The elastic net parameter, [0, 1]
-    @param normalization     Whether to normalize the variables
-    @param optimizer_params  Parameters of the IGD optimizer, given in the
-                             format '{arg = value, ...}'::varchar[]
-    """
-    old_msg_level = plpy.execute("""
-                                 select setting from pg_settings
-                                 where name='client_min_messages'
-                                 """)[0]['setting']
-    plpy.execute("set client_min_messages to error")
-
-    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)
-
-    # generate a full dict to ease the following string format
-    # including several temporary table names
-    args = __igd_construct_dict(schema_madlib, tbl_source, col_ind_var, col_dep_var, tbl_result,
-                                dimension, row_num, lambda_value, alpha, normalization,
-                                max_iter, tolerance, outstr_array,
-                                __igd_params_parser(optimizer_params, lambda_value, schema_madlib))
-
-    # use normalized data or not
-    if normalization:
-        __normalize_data(args)
-        args["tbl_used"] = args["tbl_data_scaled"]
-        args["col_ind_var_new"] = args["col_ind_var_norm_new"]
-        args["col_dep_var_new"] = args["col_dep_var_norm_new"]
-    else:
-        __compute_data_scales(args)
-        args["tbl_used"] = tbl_source
-        args["col_ind_var_new"] = col_ind_var
-        args["col_dep_var_new"] = col_dep_var
-
-    # average of the squares of each feature; used to estimate the largest
-    # lambda value and to screen out tiny values around zero, so the order
-    # of the array matters
-    args["sq"] = __compute_average_sq(**args)
-    args["sq_str"] = _array_to_string(args["sq"])
-
-    if args["warmup_lambdas"] is not None:
-        args["warm_no"] = len(args["warmup_lambdas"])
-        args["warmup_lambdas"] = _array_to_string(args["warmup_lambdas"])
-
-    if args["warmup"] and args["warmup_lambdas"] is None:
-        args["warmup_lambdas"] = __generate_warmup_lambda_sequence(args["tbl_used"], args["col_ind_var_new"],
-                                                                   args["col_dep_var_new"],
-                                                                   dimension, row_num, lambda_value, alpha,
-                                                                   args["warmup_lambda_no"], args["sq"])
-        args["warm_no"] = len(args["warmup_lambdas"])
-        args["warmup_lambdas"] = _array_to_string(args["warmup_lambdas"])
-    elif args["warmup"] is False:
-        args["warm_no"] = 1
-        args["warmup_lambdas"] = _array_to_string([lambda_value]) # only one value
-
-    # create the temp table that passes parameter values to IGD optimizer
-    __igd_create_tbl_args(**args)
-
-    # perform the actual calculation
-    iteration_run = __compute_gaussian_igd(schema_madlib, args["tbl_igd_args"],
-                                           args["tbl_igd_state"], args["tbl_used"],
-                                           args["col_ind_var_new"], args["col_dep_var_new"],
-                                           True,
-                                           max_iter = args["max_iter"],
-                                           warm_no = args["warm_no"],
-                                           parallel = args["parallel"],
-                                           col_ind_var_new = args["col_ind_var_new"],
-                                           col_dep_var_new = args["col_dep_var_new"],
-                                           dimension_name = args["dimension_name"],
-                                           stepsize_name = args["stepsize_name"],
-                                           lambda_name = args["lambda_name"],
-                                           alpha_name = args["alpha_name"],
-                                           total_rows_name = args["total_rows_name"],
-                                           max_iter_name = args["max_iter_name"],
-                                           tolerance_name = args["tolerance_name"],
-                                           xmean_name = args["xmean_name"],
-                                           ymean_name = args["ymean_name"])
-
-    __elastic_net_generate_result("igd", iteration_run, **args)
-
-    # cleanup
-    __igd_cleanup_temp_tbls(**args)
-    plpy.execute("set client_min_messages to " + old_msg_level)
-    return None
-
-## ========================================================================
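-
-# Editorial sketch, never executed: how this code path is reached from SQL.
-# The call below follows the documented public elastic_net_train() interface;
-# the exact argument order is an assumption here, so check the online help
-# (SELECT madlib.elastic_net_train()) for the installed version.
-_SKETCH_USAGE_SQL = """
-    SELECT madlib.elastic_net_train(
-        'houses',                  -- source table
-        'houses_en',               -- result table
-        'price',                   -- dependent variable
-        'array[tax, bath, size]',  -- independent variables
-        'gaussian',                -- regression family
-        0.5,                       -- alpha (L1/L2 mixing)
-        0.1,                       -- lambda (regularization strength)
-        TRUE,                      -- standardize
-        NULL,                      -- grouping columns (not used)
-        'igd',                     -- optimizer
-        'stepsize = 0.01, warmup = True, warmup_lambda_no = 15',
-        NULL,                      -- excluded columns
-        1000,                      -- max_iter
-        1e-6                       -- tolerance
-    );
-"""
-
-## ========================================================================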
-
-def __compute_gaussian_igd(schema_madlib, tbl_args, tbl_state, tbl_source,
-                           col_ind_var, col_dep_var, drop_table, **kwargs):
-    """
-    Driver function for elastic net with Gaussian response using IGD
-
-    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
-    @param tbl_args Name of the (temporary) table containing all non-template
-        arguments
-    @param tbl_state Name of the (temporary) table containing the inter-iteration
-        states
-    @param tbl_source Name of the relation containing input points
-    @param col_ind_var Name of the independent variables column
-    @param col_dep_var Name of the dependent variable column
-    @param drop_table Boolean, whether to use IterationController (True) or
-                      IterationControllerNoTableDrop (False)
-    @param kwargs We allow the caller to specify additional arguments (all of
-        which will be ignored though). The purpose of this is to allow the
-        caller to unpack a dictionary whose element set is a superset of
-        the arguments required by this function.
-
-    @return The iteration number (i.e., the key) with which to look up the
-        result in \c tbl_state
-    """
-    m4_ifdef(`__HAWQ__', `
-    iterationCtrl = IterationController2S(
-        rel_args = tbl_args,
-        rel_state = tbl_state,
-        stateType = "double precision[]",
-        truncAfterIteration = False,
-        schema_madlib = schema_madlib, # Identifiers start here
-        rel_source = tbl_source,
-        col_ind_var = col_ind_var,
-        col_dep_var = col_dep_var,
-        lambda_count = 1,
-        **kwargs)
-    ', `
-    if drop_table:
-        iterationCtrl = IterationController2S(
-            rel_args = tbl_args,
-            rel_state = tbl_state,
-            stateType = "double precision[]",
-            truncAfterIteration = False,
-            schema_madlib = schema_madlib, # Identifiers start here
-            rel_source = tbl_source,
-            col_ind_var = col_ind_var,
-            col_dep_var = col_dep_var,
-            lambda_count = 1,
-            **kwargs)
-    else:
-        iterationCtrl = IterationControllerNoTableDrop(
-            rel_args = tbl_args,
-            rel_state = tbl_state,
-            stateType = "double precision[]",
-            truncAfterIteration = False,
-            schema_madlib = schema_madlib, # Identifiers start here
-            rel_source = tbl_source,
-            col_ind_var = col_ind_var,
-            col_dep_var = col_dep_var,
-            lambda_count = 1,
-            **kwargs)
-    ')
-
-    with iterationCtrl as it:
-        it.iteration = 0
-
-        if it.kwargs["parallel"]:
-            it.kwargs["parallel_step_func"] = "__gaussian_igd_step"
-        else:
-            it.kwargs["parallel_step_func"] = "__gaussian_igd_step_single_seg"
-
-        while True:
-            # manually add the intercept term
-            it.update("""
-                      select
-                        {schema_madlib}.{parallel_step_func}(
-                            {col_ind_var}::double precision[],
-                            {col_dep_var}::double precision,
-                            m4_ifdef(`__HAWQ__', `({{__state__}})',
-                            `(select _state from {rel_state}
-                                where _iteration = {iteration})'),
-                            (_args.{lambda_name}[{lambda_count}])::double precision,
-                            (_args.{alpha_name})::double precision,
-                            (_args.{dimension_name})::integer,
-                            (_args.{stepsize_name})::double precision,
-                            (_args.{total_rows_name})::integer,
-                            (_args.{xmean_name})::double precision[],
-                            (_args.{ymean_name})::double precision)
-                      from {rel_source} as _src, {rel_args} as _args
-                      """)
-
-            if it.test("""
-                       {iteration} > _args.{max_iter_name} or
-                       {schema_madlib}.__gaussian_igd_state_diff(
-                            _state_previous, _state_current) < _args.{tolerance_name}
-                       """):
-                if (it.iteration < it.kwargs["max_iter"] and
-                    it.kwargs["lambda_count"] < it.kwargs["warm_no"]):
-                    it.kwargs["lambda_count"] += 1
-                else:
-                    break
-        if it.kwargs["lambda_count"] < it.kwargs["warm_no"]:
-            plpy.error("""
-                       Elastic Net error: The final target lambda value is not
-                       reached in warm-up iterations. You need more iterations!
-                       """)
-
-    return iterationCtrl.iteration