Posted to commits@madlib.apache.org by ri...@apache.org on 2017/01/11 23:09:00 UTC

[1/2] incubator-madlib git commit: CV: Fix order of validation output table columns

Repository: incubator-madlib
Updated Branches:
  refs/heads/master e1f37bb7f -> e75a944e3


CV: Fix order of validation output table columns


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/6f12264c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/6f12264c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/6f12264c

Branch: refs/heads/master
Commit: 6f12264c3ef34345b0b9b812afbd9bee5f6b815b
Parents: e1f37bb
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed Jan 11 15:04:53 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Jan 11 15:08:32 2017 -0800

----------------------------------------------------------------------
 .../validation/internal/cross_validation.py_in  | 25 +++++++-------------
 1 file changed, 8 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6f12264c/src/ports/postgres/modules/validation/internal/cross_validation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
index a79b45a..c1b2561 100644
--- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
@@ -139,30 +139,21 @@ class ValidationResult(object):
         if not tbl_name or not str(tbl_name).strip():
             return
 
-        cv_history_f = self._flatten()
-        header = cv_history_f[0].keys()
-        # assuming all keys are string
-        header_str = ','.join(header)
-        # assuming all values are double precision
-        header_with_type_str = ','.join([c + ' double precision'
-                                        for c in header])
-        plpy.execute("""
-                     DROP TABLE IF EXISTS {tbl_name};
-                     CREATE TABLE {tbl_name} ({header})
-                     """.format(tbl_name=tbl_name,
-                                header=header_with_type_str))
+        header = self._cv_history[0]['sub_args'].keys() + ['mean', 'std']
+        header_str = ','.join(map(str, header))
 
         data = []
-        for h in cv_history_f:
+        for h in self._flatten():
             values = ','.join([str(h[k]) for k in header])
             data.append("({0})".format(values))
         data = ','.join(data)
 
         plpy.execute("""
-                     INSERT INTO {tbl_name}({header}) VALUES
-                     {data}""".format(data=data,
-                                      header=header_str,
-                                      tbl_name=tbl_name))
+                     CREATE TABLE {tbl_name} ({header_str}) AS
+                     VALUES
+                        {data}
+                     """.
+                     format(tbl_name=tbl_name, header_str=header_str, data=data))
 
 
 class _ValidationArgs(object):

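The essence of the fix in the diff above is twofold: the column order is derived from the `sub_args` dict of the first cross-validation history entry (plus the aggregate `mean` and `std` columns) instead of from the flattened rows, and the table is created in a single `CREATE TABLE ... AS VALUES` statement rather than a `CREATE TABLE` followed by an `INSERT`. A minimal standalone sketch of that value-list construction (a hypothetical stand-in for the `ValidationResult` internals; the `plpy.execute` call and the `_cv_history` layout are assumed from the diff):

```python
def build_ctas_parts(cv_history):
    # Column order comes from the sub_args of the first history entry,
    # followed by the aggregate columns -- fixing the order once here is
    # what keeps the output-table columns consistent.
    header = list(cv_history[0]['sub_args'].keys()) + ['mean', 'std']
    header_str = ','.join(map(str, header))

    # Flatten each history entry into one row of the VALUES list,
    # reading values in the same fixed header order.
    rows = []
    for h in cv_history:
        flat = dict(h['sub_args'], mean=h['mean'], std=h['std'])
        rows.append("({0})".format(','.join(str(flat[k]) for k in header)))
    data = ','.join(rows)

    # The caller would then execute, in one statement:
    #   CREATE TABLE <tbl_name> (<header_str>) AS VALUES <data>
    return header_str, data
```

For example, a single-fold history entry such as `{'sub_args': {'lambda_value': 0.1}, 'mean': 1.5, 'std': 0.2}` yields the header `lambda_value,mean,std` and the row `(0.1,1.5,0.2)`, so parameter columns always precede the error statistics.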

[2/2] incubator-madlib git commit: Elastic Net: Add CV examples, clean user docs

Posted by ri...@apache.org.
Elastic Net: Add CV examples, clean user docs

Closes #85


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e75a944e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e75a944e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e75a944e

Branch: refs/heads/master
Commit: e75a944e33ca7e11f736e8571ded78840b29f3c4
Parents: 6f12264
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Thu Jan 5 12:14:55 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Jan 11 15:08:49 2017 -0800

----------------------------------------------------------------------
 .../modules/elastic_net/elastic_net.sql_in      | 482 ++++++++++++-------
 1 file changed, 297 insertions(+), 185 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e75a944e/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
index 9bed5ac..2949fc5 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
@@ -29,7 +29,8 @@ m4_include(`SQLCommon.m4')
 and logistic regression problems, combining the L1 and L2 penalties of the
 lasso and ridge methods.
 
-This module implements elastic net regularization for linear and logistic regression problems.
+This module implements elastic net regularization [1] for linear and logistic regression.
+Regularization is a technique often used to prevent overfitting.
 
 @anchor train
 @par Training Function
@@ -58,7 +59,7 @@ elastic_net_train( tbl_source,
 <DD>TEXT. The name of the table containing the training data.</DD>
 
 <DT>tbl_result</DT>
-<DD>TEXT. Name of the generated table containing the output model.
+<DD>TEXT. Name of the output table containing output model.
 The output table produced by the elastic_net_train() function has the following columns:
 <table class="output">
   <tr><th>regress_family</th>
@@ -66,31 +67,31 @@ The output table produced by the elastic_net_train() function has the following
   </tr>
   <tr>
     <th>features</th>
-    <td>An array of the features (independent variables) passed into the analysis.</td>
+    <td>Array of features (independent variables) passed to the algorithm.</td>
   </tr>
   <tr>
     <th>features_selected</th>
-    <td>An array of the features selected by the analysis.</td>
+    <td>Array of features selected by the algorithm.</td>
   </tr>
   <tr>
     <th>coef_nonzero</th>
-    <td>Fitting coefficients for the selected features.</td>
+    <td>Coefficients of the selected features.</td>
   </tr>
   <tr>
     <th>coef_all</th>
-    <td>Coefficients for all selected and unselected features</td>
+    <td>Coefficients of all features, both selected and unselected.</td>
   </tr>
   <tr>
     <th>intercept</th>
-    <td>Fitting intercept for the model.</td>
+    <td>Intercept for the model.</td>
   </tr>
   <tr>
     <th>log_likelihood</th>
-    <td>The negative value of the first equation above (up to a constant depending on the data set).</td>
+    <td>Log of the likelihood value produced by the algorithm.</td>
   </tr>
   <tr>
     <th>standardize</th>
-    <td>BOOLEAN. Whether the data was normalized (\e standardize argument was TRUE).</td>
+    <td>BOOLEAN. If data has been normalized, will be set to TRUE.</td>
   </tr>
   <tr>
     <th>iteration_run</th>
@@ -102,48 +103,53 @@ The output table produced by the elastic_net_train() function has the following
 <DT>col_dep_var</DT>
 <DD>TEXT. An expression for the dependent variable.
 
-Both \e col_dep_var and \e col_ind_var can be valid Postgres
+@note  Both \e col_dep_var and \e col_ind_var can be valid PostgreSQL
 expressions. For example, <tt>col_dep_var = 'log(y+1)'</tt>, and <tt>col_ind_var
-= 'array[exp(x[1]), x[2], 1/(1+x[3])]'</tt>. In the binomial case, you can
+= 'array[exp(x[1]), x[2], 1/(1+x[3])]'</tt>.  In the binomial case, you can
 use a Boolean expression, for example, <tt>col_dep_var = 'y < 0'</tt>.</DD>
 
 <DT>col_ind_var</DT>
 <DD>TEXT. An expression for the independent variables. Use \c '*' to
 specify all columns of <em>tbl_source</em> except those listed in the
-<em>excluded</em> string. If \e col_dep_var is a column name, it is
+<em>excluded</em> string described below. If \e col_dep_var is a column name, it is
 automatically excluded from the independent variables. However, if
-\e col_dep_var is a valid Postgres expression, any column names used
-within the expression are only excluded if they are explicitly included in the
-\e excluded argument. It is a good idea to add all column names involved in
+\e col_dep_var is a valid PostgreSQL expression, any column names used
+within the expression are only excluded if they are explicitly listed in the
+\e excluded argument. Therefore, it is a good idea to add all column names involved in
 the dependent variable expression to the <em>excluded</em> string.</DD>
 
 <DT>regress_family</DT>
-<DD>TEXT. The regression type, either 'gaussian' ('linear') or 'binomial' ('logistic').</DD>
+<DD>TEXT. For regression type, specify either 'gaussian' ('linear') or 'binomial' ('logistic').</DD>
 
 <DT>alpha</DT>
-<DD>FLOAT8. Elastic net control parameter, value in [0, 1], 1 for L-1 regularization, 0 for L-2.</DD>
+<DD>FLOAT8. Elastic net control parameter with a value in the range [0, 1].
+A value of 1 means L1 regularization, and a value of 0 means L2 regularization.</DD>
 
 <DT>lambda_value</DT>
-<DD>FLOAT8. Regularization parameter, positive.</DD>
+<DD>FLOAT8. Regularization parameter (must be positive).</DD>
 
 <DT>standardize (optional)</DT>
-<DD>BOOLEAN, default: TRUE. Whether to normalize the data. Setting this to TRUE usually yields better results and faster convergence.</DD>
+<DD>BOOLEAN, default: TRUE. Whether to normalize the data or not. 
+Setting to TRUE usually yields better results and faster convergence.</DD>
 
 <DT>grouping_col (optional)</DT>
 <DD>TEXT, default: NULL. A single column or a list of comma-separated
-columns that divides the input data into discrete groups, running one
+columns that divides the input data into discrete groups, resulting in one
 regression per group. When this value is NULL, no grouping is used and
-a single result model is generated.
+a single model is generated for all data.
 
-@note <em>We currently do not support expressions for grouping_col. When
-implemented, grouping_col can also be an expression, similar to the SQL
-<tt>GROUP BY</tt> clause. </em></DD>
+@note Expressions are not currently supported for 'grouping_col'.
 
 <DT>optimizer (optional)</DT>
-<DD>TEXT, default: 'fista'. Name of optimizer, either 'fista' or 'igd'.</DD>
+<DD>TEXT, default: 'fista'. Name of optimizer, either 'fista' or 'igd'.  
+FISTA [2] is an algorithm with a fast global rate of convergence for 
+solving linear inverse problems. Incremental gradient descent (IGD)
+is a stochastic approach to minimizing an objective function [4].</DD>
 
 <DT>optimizer_params (optional)</DT>
-<DD>TEXT, default: NULL. Optimizer parameters, delimited with commas. The parameters differ depending on the value of \e optimizer. See the descriptions below for details.</DD>
+<DD>TEXT, default: NULL. Optimizer parameters, delimited with commas. 
+These parameters differ depending on the value of \e optimizer parameter. 
+See the descriptions below for details.</DD>
 
 <DT>excluded (optional)</DT>
 <DD>TEXT, default: NULL. If the \e col_ind_var input is '*' then \e excluded can
@@ -152,15 +158,15 @@ from the features.
 For example, <tt>'col1, col2'</tt>. If the \e col_ind_var is an array,
 \e excluded must be a list of the integer array positions to exclude,
 for example <tt>'1,2'</tt>. If this argument is NULL or an
-empty string <tt>''</tt>, no columns are excluded.</DD>
+empty string, no columns are excluded.</DD>
 
 <DT>max_iter (optional)</DT>
-<DD>INTEGER, default: 1000. The maximum number of iterations that are allowed.</DD>
+<DD>INTEGER, default: 1000. The maximum number of iterations allowed.</DD>
 
 <DT>tolerance</DT>
-<DD>FLOAT8, default: default is 1e-6. The criteria to end iterations. Both the
-'fista' and 'igd' optimizers compute the difference between  the
-loglikelihood of two consecutive iterations, and when the difference is smaller
+<DD>FLOAT8, default: 1e-6. This is the criterion to stop iterating. Both the
+'fista' and 'igd' optimizers compute the difference between the
+log likelihood of two consecutive iterations, and when the difference is smaller
 than \e tolerance or the iteration number is larger than \e max_iter, the
 computation stops.</DD>
 </DL>
@@ -168,11 +174,13 @@ computation stops.</DD>
 @anchor optimizer
 @par Other Parameters
 
-Multiple other (optional) parameters are supplied in a string containing a
-comma-delimited list of name-value pairs. All of these named parameters are
+For \e optimizer_params, there are several 
+parameters that can be supplied in a string containing a
+comma-delimited list of name-value pairs. All of these named parameters are
 optional and use the format "<param_name> = <value>".
 
-The parameters described below are organized by their functionality.
+The parameters described below are organized by category:  warmup, cross validation and 
+optimization.
 
 <em><b>Warmup parameters</b></em>
 <pre class="syntax">
@@ -186,27 +194,31 @@ The parameters described below are organized by their functionality.
 
 <DL class="arglist">
 <DT>warmup</DT>
-<DD>Default: FALSE. If \e warmup is TRUE, a series of lambda values, which is
-strictly descent and ends at the lambda value that the user wants to calculate,
-is used. The larger lambda gives very sparse solution, and the sparse
-solution again is used as the initial guess for the next lambda's solution,
-which speeds up the computation for the next lambda. For larger data sets,
-this can sometimes accelerate the whole computation and may be faster than
-computation on only one lambda value.</DD>
+<DD>Default: FALSE. If \e warmup is TRUE, a series of strictly descending lambda values
+are used, which end with the lambda value that the user wants to calculate.
+A larger lambda gives a sparser solution, and the sparse
+solution is then used as the initial guess for the next lambda's solution,
+which can speed up the computation for the next lambda. For larger data sets,
+this can sometimes accelerate the whole computation and may in fact be faster than
+computation with only a single lambda value.</DD>
 
 <DT>warmup_lambdas</DT>
-<DD>Default: NULL. The lambda value series to use when \e warmup is True. The default is NULL, which means that lambda values will be automatically generated.</DD>
+<DD>Default: NULL. Set of lambda values to use when \e warmup is TRUE. 
+The default is NULL, which means that lambda values will be automatically generated.</DD>
 
 <DT>warmup_lambda_no</DT>
-<DD>Default: 15. How many lambdas are used in warm-up. If \e warmup_lambdas is not NULL, this value is overridden by the number of provided lambda values.</DD>
+<DD>Default: 15. Number of lambda values used in \e warm-up. 
+If \e warmup_lambdas is not NULL, this value is overridden by the number of provided lambda values.</DD>
 
 <DT>warmup_tolerance</DT>
-<DD>The value of tolerance used during warmup. The default is the same as the
-\e tolerance argument.</DD>
+<DD>The value of tolerance used during warmup. The default value is the same as the
+\e tolerance argument described above.</DD>
 </DL>
 
 <em><b>Cross validation parameters</b></em>
-@note Cross validation is not supported if grouping is used.
+@note Please note that for performance reasons, warmup is disabled whenever 
+cross validation is used.  Also, cross validation is not supported if grouping is used.
+
 <pre class="syntax">
   $$
     n_folds = &lt;value>,
@@ -219,15 +231,13 @@ computation on only one lambda value.</DD>
 
 Hyperparameter optimization can be carried out using the built-in cross
 validation mechanism, which is activated by assigning a value greater than 1 to
-the parameter \e n_folds in \e params.  Presently, misclassification error is used
+the parameter \e n_folds.  Misclassification error is used
 for classification and mean squared error is used for regression.
 
 The values of a parameter to cross validate should be provided in a list. For
 example, to regularize with the L1 norm and use a lambda value
-from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}' in
-\e other_params. Note that the use of '{}' and '[]' are both valid
-here.
-
+from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'. 
+Note that the use of '{}' and '[]' are both valid here.
 
 <DL class="arglist">
 
@@ -238,26 +248,33 @@ If a value of k > 2 is specified, each fold is then used as a validation set onc
 while the other k - 1 folds form the training set.
 </DD>
 
-
 <DT>validation_result</dt>
 <DD>Default: NULL.
-Name of the table to store the cross validation results including the values of
+Name of the table to store the cross validation results, including the values of
 parameters and their averaged error values. The table is only created if the name is not NULL.
 </DD>
 
 <DT>lambda_value</DT>
-<DD>Regularization value. If a list is provided for cross validation, then warmup is
-disabled on each lambda for performance reasons. </DD>
+<DD>Default: NULL. Set of regularization values to be used for cross validation.
+The default is NULL, which means that lambda values will be automatically generated.</DD>
 
 <DT>n_lambdas</DT>
-<DD>Number of lambdas to cross validate over. If a list of lambda values is not
-provided, this parameter can be used to autogenerate a list of lambdas (using the
-warmup procedure)
-disabled on each lambda for performance reasons. </DD>
+<DD>Default: 15. Number of lambdas to cross validate over. If a list of lambda values is not
+provided in the \e lambda_value set above, this parameter can be used to 
+autogenerate the set of lambdas.  If the \e lambda_value set is not NULL, this value
+is overridden by the number of provided lambda values. </DD>
+
+@note If you want to cross validate over alpha only and not lambda,
+then set \e lambda_value to NULL and \e n_lambdas to 0.  In this case, 
+cross validation will be done on the set of \e alpha values specified
+in the next parameter.  The lambda value used will be the one 
+specified in the main function call at the top of this page.
 
 <DT>alpha</DT>
-<DD>Elastic net control parameter. Needs to be a list of values to apply
-cross validation on it.
+<DD>Elastic net control parameter. This is a list of values to apply
+cross validation on.  (Note that alpha values are not autogenerated.)
+If not specified, the alpha value used will be the one 
+specified in the main function call at the top of this page.
 </DD>
 </DL>
 
@@ -282,18 +299,19 @@ smaller step size, <em>stepsize = stepsize/eta</em>, where \e eta must
 be larger than 1. At first glance, this seems to perform repeated iterations for even one step, but using a larger step size actually greatly increases the computation speed and minimizes the total number of iterations. A careful choice of \e max_stepsize can decrease the computation time by more than 10 times.</DD>
 
 <DT>eta</DT>
-<DD>Default: 2. If stepsize does not work \e stepsize / \e eta is tried. Must be greater than 1. </DD>
+<DD>Default: 2.0. If stepsize does not work, \e stepsize/\e eta is tried. Must be greater than 1. </DD>
 
 <DT>use_active_set</DT>
 <DD>Default: FALSE. If \e use_active_set is TRUE, an active-set method is used to
 speed up the computation. Considerable speedup is obtained by organizing the
 iterations around the active set of features&mdash;those with nonzero coefficients.
-After a complete cycle through all the variables, we iterate on only the active
+After a complete cycle through all the variables, we iterate only on the active
 set until convergence. If another complete cycle does not change the active set,
-we are done, otherwise the process is repeated.</DD>
+we are done.  Otherwise, the process is repeated.</DD>
 
 <DT>activeset_tolerance</DT>
-<DD>Default: the value of the tolerance argument. The value of tolerance used during active set calculation. </DD>
+<DD>The value of tolerance used during active set calculation. The default
+value is the same as the \e tolerance argument described above.  </DD>
 
 <DT>random_stepsize</DT>
 <DD>Default: FALSE. Whether to add some randomness to the step size. Sometimes, this can speed
@@ -330,13 +348,13 @@ with the average, and if the resulting absolute value is smaller than
 \e threshold, set the original coefficient to zero.</DD>
 
 <DT>parallel</DT>
-<DD>Whether to run the computation on multiple segments. The default is True.
+<DD>Whether to run the computation on multiple segments. The default is TRUE.
 
 SGD is a sequential algorithm in nature. When running in a distributed
 manner, each segment  of the data runs its own SGD model and then the models
 are averaged to get a model for each iteration. This averaging might slow
-down the convergence speed, although we also acquire the ability to process
-large datasets on multiple machines. This algorithm, therefore, provides the
+down the convergence speed, but it affords the ability to process
+large datasets on a cluster of machines. This algorithm, therefore, provides the
 \e parallel option to allow you to choose whether to do parallel computation.
 </DD>
 </DL>
@@ -346,7 +364,8 @@ large datasets on multiple machines. This algorithm, therefore, provides the
 @par Prediction Function
 
 <h4>Per-Tuple Prediction</h4>
-The prediction function returns a double value for Gaussian family and boolean value for Binomial family.
+The prediction function returns a double value for the Gaussian family 
+and a Boolean value for the Binomial family.
 
 The predict function has the following syntax (elastic_net_gaussian_predict() and elastic_net_binomial_predict()):
 <pre class="syntax">
@@ -360,14 +379,15 @@ elastic_net_<family>_predict(
 \b Arguments
 <DL class="arglist">
   <DT>coefficients</DT>
-  <DD>DOUBLE PRECISION[]. Fitting coefficients, usually coef_all or coef_nonzero.</DD>
+  <DD>DOUBLE PRECISION[]. Fitting coefficients, usually \e coef_all or \e coef_nonzero.</DD>
   <DT>intercept</DT>
-  <DD>DOUBLE PRECISION. The intercept for the model.</DD>
+  <DD>DOUBLE PRECISION. Intercept for the model.</DD>
   <DT>ind_var</DT>
-  <DD>DOUBLE PRECISION[]. Independent variables that correspond to coefficients, use <EM>features</EM> column in <EM>tbl_result</EM> for coef_all, and <EM>features_selected</EM> for coef_nonzero. See also <a href="#additional_example">examples</a>. Note that unexpected results or errors may be returned in the case that this argument is not given properly.</DD>
+  <DD>DOUBLE PRECISION[]. Independent variables that correspond to coefficients.  Use <EM>features</EM> column in <EM>tbl_result</EM> for \e coef_all, and <EM>features_selected</EM> for \e coef_nonzero. See the <a href="#additional_example">examples for this case below</a>. 
+  @note Unexpected results or errors may be returned in the case that this argument \e ind_var is not specified properly.</DD>
 </DL>
 
-For binomial family, there is a function (elastic_net_binomial_prob()) that outputs the probability of the instance being True:
+For the binomial family, there is a function (elastic_net_binomial_prob()) that outputs the probability of the instance being TRUE:
 <pre class="syntax">
 elastic_net_binomial_prob(
                      coefficients,
@@ -390,25 +410,25 @@ elastic_net_predict( tbl_model,
 \b Arguments
 <dl class="arglist">
 <dt>tbl_model</dt>
-<dd>TEXT. The name of the table containing the output from the training function.</dd>
+<dd>TEXT. Name of the table containing the output from the training function.</dd>
 <dt>tbl_new_sourcedata</dt>
-<dd>TEXT. The name of the table containing the new source data.</dd>
+<dd>TEXT. Name of the table containing the new source data.</dd>
 <dt>col_id</dt>
-<dd>TEXT. The unique ID associated with each row.</dd>
+<dd>TEXT. Unique ID associated with each row.</dd>
 <dt>tbl_predict</dt>
-<dd>TEXT. The name of table to store the prediction result. </dd>
+<dd>TEXT. Name of table to store the prediction result. </dd>
 </dl>
 You do not need to specify whether the model is "linear" or "logistic" because this information is already included in the \e tbl_model table.
 
 @anchor examples
 @examp
 
--# Display online help for the elastic_net_train() function.
+-# Display online help for the elastic_net_train() function:
 <pre class="example">
 SELECT madlib.elastic_net_train();
 </pre>
 
--# Create an input data set.
+-# Create an input data set of house prices and features:
 <pre class="example">
 DROP TABLE IF EXISTS houses;
 CREATE TABLE houses ( id INT,
@@ -419,56 +439,55 @@ CREATE TABLE houses ( id INT,
                       size INT,
                       lot INT,
                       zipcode INT);
-COPY houses FROM STDIN WITH DELIMITER '|';
-  1  |  590 |       2 |    1 |  50000 |  770 | 22100 | 94301
-  2  | 1050 |       3 |    2 |  85000 | 1410 | 12000  | 94301
-  3  |   20 |       3 |    1 |  22500 | 1060 |  3500 | 94301
-  4  |  870 |       2 |    2 |  90000 | 1300 | 17500  | 94301
-  5  | 1320 |       3 |    2 | 133000 | 1500 | 30000 | 94301
-  6  | 1350 |       2 |    1 |  90500 |  820 | 25700  | 94301
-  7  | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | 94301
-  8  |  680 |       2 |    1 | 142500 | 1170 | 22000  | 94301
-  9  | 1840 |       3 |    2 | 160000 | 1500 | 19000 | 94301
-  10 | 3680 |       4 |    2 | 240000 | 2790 | 20000  | 94301
-  11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | 94301
-  12 | 1620 |       3 |    2 | 118600 | 1250 | 20000  | 94301
-  13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | 94301
-  14 | 2070 |       2 |    3 | 148000 | 1550 | 14000  | 94301
-  15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | 94301
-  16 |  770 |       2 |    2 |  91000 | 1300 | 17500 | 76010
-  17 | 1220 |       3 |    2 | 132300 | 1500 | 30000  | 76010
-  18 | 1150 |       2 |    1 |  91100 |  820 | 25700 | 76010
-  19 | 2690 |       3 |  2.5 | 260011 | 2130 | 25000  | 76010
-  20 |  780 |       2 |    1 | 141800 | 1170 | 22000 | 76010
-  21 | 1910 |       3 |    2 | 160900 | 1500 | 19000  | 76010
-  22 | 3600 |       4 |    2 | 239000 | 2790 | 20000 | 76010
-  23 | 1600 |       3 |    1 |  81010 | 1030 | 17500  | 76010
-  24 | 1590 |       3 |    2 | 117910 | 1250 | 20000 | 76010
-  25 | 3200 |       3 |    2 | 141100 | 1760 | 38000  | 76010
-  26 | 2270 |       2 |    3 | 148011 | 1550 | 14000 | 76010
-  27 |  750 |       3 |  1.5 |  66000 | 1450 | 12000  | 76010
-\\.
+INSERT INTO houses (id, tax, bedroom, bath, price, size, lot, zipcode) VALUES
+(1  ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100  , 94301),
+(2  , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000  , 94301),
+(3  ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500  , 94301),
+(4  ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500  , 94301),
+(5  , 1320 ,       3 ,    2 , 133000 , 1500 , 30000  , 94301),
+(6  , 1350 ,       2 ,    1 ,  90500 ,  820 , 25700  , 94301),
+(7  , 2790 ,       3 ,  2.5 , 260000 , 2130 , 25000  , 94301),
+(8  ,  680 ,       2 ,    1 , 142500 , 1170 , 22000  , 94301),
+(9  , 1840 ,       3 ,    2 , 160000 , 1500 , 19000  , 94301),
+(10 , 3680 ,       4 ,    2 , 240000 , 2790 , 20000  , 94301),
+(11 , 1660 ,       3 ,    1 ,  87000 , 1030 , 17500  , 94301),
+(12 , 1620 ,       3 ,    2 , 118600 , 1250 , 20000  , 94301),
+(13 , 3100 ,       3 ,    2 , 140000 , 1760 , 38000  , 94301),
+(14 , 2070 ,       2 ,    3 , 148000 , 1550 , 14000  , 94301),
+(15 ,  650 ,       3 ,  1.5 ,  65000 , 1450 , 12000  , 94301),
+(16 ,  770 ,       2 ,    2 ,  91000 , 1300 , 17500  , 76010),
+(17 , 1220 ,       3 ,    2 , 132300 , 1500 , 30000  , 76010),
+(18 , 1150 ,       2 ,    1 ,  91100 ,  820 , 25700  , 76010),
+(19 , 2690 ,       3 ,  2.5 , 260011 , 2130 , 25000  , 76010),
+(20 ,  780 ,       2 ,    1 , 141800 , 1170 , 22000  , 76010),
+(21 , 1910 ,       3 ,    2 , 160900 , 1500 , 19000  , 76010),
+(22 , 3600 ,       4 ,    2 , 239000 , 2790 , 20000  , 76010),
+(23 , 1600 ,       3 ,    1 ,  81010 , 1030 , 17500  , 76010),
+(24 , 1590 ,       3 ,    2 , 117910 , 1250 , 20000  , 76010),
+(25 , 3200 ,       3 ,    2 , 141100 , 1760 , 38000  , 76010),
+(26 , 2270 ,       2 ,    3 , 148011 , 1550 , 14000  , 76010),
+(27 ,  750 ,       3 ,  1.5 ,  66000 , 1450 , 12000  , 76010);
 </pre>
--# Train the model.
+-# Train the model:
 <pre class="example">
 DROP TABLE IF EXISTS houses_en, houses_en_summary;
-SELECT madlib.elastic_net_train( 'houses',                  -- source table
-                                 'houses_en',               -- result table
-                                 'price',                   -- dependent variable
-                                 'array[tax, bath, size]',  -- independent variable
-                                 'gaussian',                -- regression family
-                                 0.5,                       -- alpha value
-                                 0.1,                       -- lambda value
-                                 TRUE,                      -- standardize
-                                 NULL,                      -- grouping column(s)
-                                 'fista',                   -- optimizer
-                                 '',                        -- optimizer parameters
-                                 NULL,                      -- excluded columns
-                                 10000,                     -- maximum iterations
-                                 1e-6                       -- tolerance value
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en',               -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
                                );
 </pre>
--# View the resulting model.
+-# View the resulting model:
 <pre class="example">
 -- Turn on expanded display to make it easier to read results.
 \\x on
@@ -487,7 +506,7 @@ log_likelihood    | -512248641.971
 standardize       | t
 iteration_run     | 10000
 </pre>
--# Use the prediction function to evaluate residuals.
+-# Use the prediction function to evaluate residuals:
 <pre class="example">
 \\x off
 SELECT id, price, predict, price - predict AS residual
@@ -495,35 +514,68 @@ FROM (
     SELECT
         houses.*,
         madlib.elastic_net_gaussian_predict(
-            m.coef_all,
-            m.intercept,
-            ARRAY[tax,bath,size]
+            m.coef_all,             -- Coefficients
+            m.intercept,            -- Intercept
+            ARRAY[tax,bath,size]    -- Features (corresponding to coefficients)
             ) AS predict
     FROM houses, houses_en m) s
 ORDER BY id;
 </pre>
+Result:
+<pre class="result">
+ id | price  |     predict      |     residual      
+----+--------+------------------+-------------------
+  1 |  50000 |  58545.391894031 |   -8545.391894031
+  2 |  85000 | 114804.077663003 |  -29804.077663003
+  3 |  22500 |  61448.835664388 |  -38948.835664388
+  4 |  90000 |  104675.17768007 |   -14675.17768007
+  5 | 133000 |  125887.70644358 |     7112.29355642
+  6 |  90500 |  78601.843595366 |   11898.156404634
+  7 | 260000 | 199257.358231079 |   60742.641768921
+  8 | 142500 |  82514.559377081 |   59985.440622919
+  9 | 160000 |  137735.93215082 |    22264.06784918
+ 10 | 240000 | 250347.627648647 |  -10347.627648647
+ 11 |  87000 |  97172.428263539 |  -10172.428263539
+ 12 | 118600 | 119024.150628605 | -424.150628604999
+ 13 | 140000 | 180692.127913358 |  -40692.127913358
+ 14 | 148000 | 156424.249824545 |   -8424.249824545
+ 15 |  65000 | 102527.938104575 |  -37527.938104575
+ 16 |  91000 |  102396.67273637 |   -11396.67273637
+ 17 | 132300 |  123609.20149988 |     8690.79850012
+ 18 |  91100 |  74044.833707966 |   17055.166292034
+ 19 | 260011 | 196978.853287379 |   63032.146712621
+ 20 | 141800 |  84793.064320781 |   57006.935679219
+ 21 | 160900 |  139330.88561141 |    21569.11438859
+ 22 | 239000 | 248524.823693687 | -9524.82369368701
+ 23 |  81010 |  95805.325297319 |  -14795.325297319
+ 24 | 117910 | 118340.599145495 | -430.599145494998
+ 25 | 141100 | 182970.632857058 |  -41870.632857058
+ 26 | 148011 | 160981.259711945 |  -12970.259711945
+ 27 |  66000 | 104806.443048275 |  -38806.443048275
+</pre>
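
What `elastic_net_gaussian_predict` computes above is just a linear combination of the features. As an illustration only (the coefficients and row values below are made up, not the fitted `houses_en` model's), the prediction and residual step can be sketched in Python:

```python
# Sketch of what elastic_net_gaussian_predict computes:
#   predict  = intercept + coef . features
#   residual = actual - predict
# Coefficients and feature values here are hypothetical, for illustration only.

def gaussian_predict(coef, intercept, features):
    """Linear prediction: intercept plus the dot product of coef and features."""
    assert len(coef) == len(features)
    return intercept + sum(c * f for c, f in zip(coef, features))

# Hypothetical model and one hypothetical row (tax, bath, size).
coef = [22.0, 11000.0, 52.0]
intercept = -5000.0
row = {"price": 50000.0, "tax": 590.0, "bath": 1.0, "size": 770.0}

predict = gaussian_predict(coef, intercept,
                           [row["tax"], row["bath"], row["size"]])
residual = row["price"] - predict
print(predict, residual)
```

The SQL query performs exactly this arithmetic per row, with the coefficient array and intercept read from the model table.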
 
-<h4>Additional Example (with grouping)</h4>
--# Reuse the <a href="#examples">houses</a> table above and train the model by grouping the data on zip code.
+<h4>Example with Grouping</h4>
+-# Reuse the <a href="#examples">houses</a> table above and train the model
+by grouping the data on zip code:
 <pre class="example">
 DROP TABLE IF EXISTS houses_en1, houses_en1_summary;
-SELECT madlib.elastic_net_train( 'houses',
-                                 'houses_en1',
-                                 'price',
-                                 'array[tax, bath, size]',
-                                 'gaussian',
-                                 0.5,
-                                 0.1,
-                                 TRUE,
-                                 'zipcode',
-                                 'fista',
-                                 '',
-                                 NULL,
-                                 10000,
-                                 1e-6
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en1',              -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 'zipcode',                 -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
                                );
 </pre>
--# View the resulting model and see a separate model for each group.
+-# View the resulting model with a separate model for each group:
 <pre class="example">
 -- Turn on expanded display to make it easier to read results.
 \\x on
@@ -554,48 +606,46 @@ log_likelihood    | -538806528.45
 standardize       | t
 iteration_run     | 10000
 </pre>
--# Use the prediction function to evaluate residuals.
+-# Use the prediction function to evaluate residuals:
 <pre class="example">
 \\x off
 SELECT madlib.elastic_net_predict(
-                'houses_en1',             -- model table
-                'houses',                 -- new source data table
-                'id',                     -- unique ID associated with each row
-                'houses_en1_prediction'   -- table to store prediction result
+                'houses_en1',             -- Model table
+                'houses',                 -- New source data table
+                'id',                     -- Unique ID associated with each row
+                'houses_en1_prediction'   -- Table to store prediction result
               );
-</pre>
--# View the results:
-<pre class="example">
 SELECT  houses.id,
         houses.price,
         houses_en1_prediction.prediction,
         houses.price - houses_en1_prediction.prediction AS residual
 FROM houses_en1_prediction, houses
-WHERE houses.id = houses_en1_prediction.id;
+WHERE houses.id = houses_en1_prediction.id ORDER BY id;
 </pre>
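
Grouped training fits one independent model per distinct value of the grouping column. A rough Python sketch of that control flow, with made-up rows and a one-feature least-squares fit standing in for the elastic net solver:

```python
# Sketch of grouped training: one model per distinct zipcode.
# A closed-form simple linear regression stands in for the elastic net
# solver; the rows and values below are hypothetical.

from collections import defaultdict

rows = [
    {"zipcode": 94301, "size": 770.0,  "price": 50000.0},
    {"zipcode": 94301, "size": 1500.0, "price": 85000.0},
    {"zipcode": 76010, "size": 1060.0, "price": 92000.0},
    {"zipcode": 76010, "size": 3000.0, "price": 260000.0},
]

def fit_slope_intercept(xs, ys):
    """Closed-form one-feature least squares (stand-in for the real solver)."""
    n = len(xs)
    mx, my = sum(xs) / n, sum(ys) / n
    sxx = sum((x - mx) ** 2 for x in xs)
    sxy = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    slope = sxy / sxx
    return slope, my - slope * mx

# Partition the rows by the grouping column, then fit each group separately.
groups = defaultdict(list)
for r in rows:
    groups[r["zipcode"]].append(r)

models = {z: fit_slope_intercept([r["size"] for r in g],
                                 [r["price"] for r in g])
          for z, g in groups.items()}
print(models)  # one (slope, intercept) pair per zipcode
```

The grouped SQL call does the partitioning for you; each row of the result table holds the coefficients for one group.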
 
 @anchor additional_example
-<h4>Another Example (when coef_nonzero is different from coef_all)</h4>
--# Reuse the <a href="#examples">houses</a> table above and train the model with alpha=1 (L-1) and a large lambda (30000).
+<h4>Example where coef_nonzero is different from coef_all</h4>
+-# Reuse the <a href="#examples">houses</a> table above and train the model
+with alpha=1 (L1 regularization) and a large lambda value (30000):
 <pre class="example">
 DROP TABLE IF EXISTS houses_en2, houses_en2_summary;
-SELECT madlib.elastic_net_train( 'houses',
-                                 'houses_en2',
-                                 'price',
-                                 'array[tax, bath, size]',
-                                 'gaussian',
-                                 1,
-                                 30000,
-                                 TRUE,
-                                 NULL,
-                                 'fista',
-                                 '',
-                                 NULL,
-                                 10000,
-                                 1e-6
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en2',              -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 1,                         -- Alpha value
+                                 30000,                     -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
                                );
 </pre>
--# View the resulting model and see coef_nonzero is different from coef_all.
+-# View the resulting model and see coef_nonzero is different from coef_all:
 <pre class="example">
 -- Turn on expanded display to make it easier to read results.
 \\x on
@@ -614,7 +664,7 @@ log_likelihood    | -1635348585.07
 standardize       | t
 iteration_run     | 151
 </pre>
--# We can still use the prediction function with coef_all to evaluate residuals.
+-# We can still use the prediction function with \e coef_all to evaluate residuals:
 <pre class="example">
 \\x off
 SELECT id, price, predict, price - predict AS residual
@@ -622,14 +672,17 @@ FROM (
     SELECT
         houses.*,
         madlib.elastic_net_gaussian_predict(
-            m.coef_all,
-            m.intercept,
-            ARRAY[tax,bath,size]
+            m.coef_all,                   -- All coefficients
+            m.intercept,                  -- Intercept
+            ARRAY[tax,bath,size]          -- All features
             ) AS predict
     FROM houses, houses_en2 m) s
 ORDER BY id;
 </pre>
--# While we can also speed up the prediction function with coef_nonzero to evaluate residuals. This requires user to examine the feature_selected column in the result table to construct the correct independent variables.
+-# We can also speed up the prediction function by using \e coef_nonzero
+to evaluate residuals. This requires the user to examine the
+\e features_selected column in the result table to construct the correct
+set of independent variables to provide to the prediction function:
 <pre class="example">
 \\x off
 SELECT id, price, predict, price - predict AS residual
@@ -637,14 +690,14 @@ FROM (
     SELECT
         houses.*,
         madlib.elastic_net_gaussian_predict(
-            m.coef_nonzero,
-            m.intercept,
-            ARRAY[tax,size]
+            m.coef_nonzero,               -- Non-zero coefficients
+            m.intercept,                  -- Intercept
+            ARRAY[tax,size]               -- Features corresponding to non-zero coefficients
             ) AS predict
     FROM houses, houses_en2 m) s
 ORDER BY id;
 </pre>
-The two queries are expected to give same residuals:
+The two queries above produce the same residuals:
 <pre class="result">
  id | price  |     predict      |     residual
 ----+--------+------------------+-------------------
@@ -678,6 +731,66 @@ The two queries are expected to give same residuals:
 (27 rows)
 </pre>
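
The equivalence of the two queries follows directly from the arithmetic: a zero coefficient contributes nothing to the dot product, so dropping it together with its feature leaves the prediction unchanged. A small sketch with made-up numbers (in the real model above it is the \e bath coefficient that L1 drives to zero):

```python
# Why coef_nonzero predictions match coef_all predictions: zero
# coefficients contribute nothing to the dot product, so dropping them
# (and their features) cannot change the result.
# All numbers here are hypothetical, for illustration only.

def predict(coef, intercept, features):
    return intercept + sum(c * f for c, f in zip(coef, features))

coef_all = [15.0, 0.0, 48.0]          # tax, bath (zeroed by L1), size
features_all = [620.0, 1.0, 1170.0]   # tax, bath, size
intercept = -7000.0

# Keep only the positions whose coefficients are non-zero.
keep = [i for i, c in enumerate(coef_all) if c != 0.0]
coef_nonzero = [coef_all[i] for i in keep]
features_nonzero = [features_all[i] for i in keep]

assert predict(coef_all, intercept, features_all) == \
       predict(coef_nonzero, intercept, features_nonzero)
```

This is also why the shorter query is faster: with many features and strong L1 regularization, `coef_nonzero` can be far smaller than `coef_all`.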
 
+<h4>Example with Cross Validation</h4>
+-# Reuse the <a href="#examples">houses</a> table above.
+Here we use 3-fold cross validation with 3 automatically generated
+lambda values and 3 specified alpha values. This can take some time to
+run, since elastic net training is effectively called 27 times
+(3 folds x 3 lambda values x 3 alpha values).
+<pre class="example">
+DROP TABLE IF EXISTS houses_en3, houses_en3_summary, houses_en3_cv;
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en3',              -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 $$ n_folds = 3,
+                                    validation_result=houses_en3_cv,
+                                    n_lambdas = 3,
+                                    alpha = {0, 0.1, 1}
+                                 $$,                        -- Cross validation parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
+                               );
+SELECT * FROM houses_en3;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-----+--------------------------------------------
+family            | gaussian
+features          | {tax,bath,size}
+features_selected | {tax,bath,size}
+coef_nonzero      | {22.4584783679,11657.0825871,52.1622899664}
+coef_all          | {22.4584783679,11657.0825871,52.1622899664}
+intercept         | -5067.27288499
+log_likelihood    | -543193170.15
+standardize       | t
+iteration_run     | 392
+</pre>
+
+-# Details of the cross validation:
+<pre class="example">
+SELECT * FROM houses_en3_cv ORDER BY lambda_value DESC, alpha ASC;
+</pre>
+<pre class="result">
+alpha | lambda_value |        mean         |     std
+------+--------------+---------------------+--------------------
+    0 |       100000 | -1.41777698585e+110 | 1.80536123195e+110
+  0.1 |       100000 | -1.19953054719e+107 | 1.72846143163e+107
+    1 |       100000 |      -4175743937.91 |      2485189261.38
+    0 |          100 |      -4054694238.18 |      2424765457.66
+  0.1 |          100 |      -4041768667.28 |      2418294966.72 
+    1 |          100 |      -1458791218.11 |      483327430.802
+    0 |          0.1 |      -1442293698.38 |      426795110.876
+  0.1 |          0.1 |       -1442705511.6 |       429680202.16
+    1 |          0.1 |      -1459206061.39 |       485107796.02
+(9 rows)
+</pre>
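
The 27 training runs come from the full cross product of folds, lambda values, and alpha values, and the best (alpha, lambda) pair is the one with the best mean validation score. A sketch of that bookkeeping, using the mean values reported in the `houses_en3_cv` table above (for this metric, higher, i.e. less negative, is better):

```python
# Cross-validation bookkeeping: 3 folds x 3 lambdas x 3 alphas = 27
# training runs, then pick the (alpha, lambda) pair with the best mean
# validation score. The scores are the means from houses_en3_cv above.

from itertools import product

alphas = [0.0, 0.1, 1.0]
lambdas = [100000.0, 100.0, 0.1]
n_folds = 3

runs = list(product(range(n_folds), alphas, lambdas))
print(len(runs))  # 27 elastic net training calls in total

mean_score = {
    (0.0, 100000.0): -1.41777698585e+110,
    (0.1, 100000.0): -1.19953054719e+107,
    (1.0, 100000.0): -4175743937.91,
    (0.0, 100.0):    -4054694238.18,
    (0.1, 100.0):    -4041768667.28,
    (1.0, 100.0):    -1458791218.11,
    (0.0, 0.1):      -1442293698.38,
    (0.1, 0.1):      -1442705511.6,
    (1.0, 0.1):      -1459206061.39,
}
best = max(mean_score, key=mean_score.get)
print(best)  # (0.0, 0.1): alpha=0, lambda=0.1 has the best mean score
```

The final model stored in `houses_en3` is then retrained on the full data set with the winning (alpha, lambda) pair.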
+
 @anchor notes
 @par Note
 It is \b strongly \b recommended that you run
@@ -721,19 +834,18 @@ Note that fitting after scaling is not equivalent to directly fitting.
 @anchor literature
 @literature
 
-[1] Elastic net regularization. http://en.wikipedia.org/wiki/Elastic_net_regularization
+[1] Elastic net regularization, http://en.wikipedia.org/wiki/Elastic_net_regularization
 
 [2] Beck, A. and M. Teboulle (2009), A fast iterative shrinkage-thresholding algorithm for linear inverse problems. SIAM J. on Imaging Sciences 2(1), 183-202.
 
-[3] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for l1 Regularized Loss Minimization. Proceedings of the 26th International Conference on Machine Learning, Montreal, Canada, 2009.
+[3] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for L1 Regularized Loss Minimization. Proceedings of the 26th International Conference on Machine Learning, Montreal, Canada, 2009.
+
+[4] Stochastic gradient descent, https://en.wikipedia.org/wiki/Stochastic_gradient_descent
 
 @anchor related
 @par Related Topics
 
 File elastic_net.sql_in documenting the SQL functions.
-
-grp_validation
-
 */
 
 ------------------------------------------------------------------------