You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2017/12/08 18:19:26 UTC
madlib git commit: Regularized Regression: Change cross validation stats

Repository: madlib
Updated Branches:
  refs/heads/master 4aa073294 -> edc93f529


Regularized Regression: Change cross validation stats

JIRA:MADLIB-1169

Cross Validation seems to be supported by Elastic Net, SVM, and
Decision Trees. If a module is run with cross validation optimization
params, the output table corresponding to it displays `mean` and
`std` of the negative loss error for each permutation of the CV
params.
- This commit changes column names: `mean`->`mean_neg_loss` and
`std`->`std_neg_loss`.
- CV now uses negative Root Mean Squared Error, instead
of the negative Mean Squared Error.
- Update Elastic Net user docs to reflect these changes.

Additional Author: Nandish Jayaram <nj...@apache.org>

Closes #210


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/edc93f52
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/edc93f52
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/edc93f52

Branch: refs/heads/master
Commit: edc93f5295256a18943aa3c0f88e9435081ff50f
Parents: 4aa0732
Author: Swati Soni <so...@gmail.com>
Authored: Wed Dec 6 11:58:46 2017 -0800
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Fri Dec 8 10:15:58 2017 -0800

----------------------------------------------------------------------
 .../modules/elastic_net/elastic_net.sql_in      | 46 ++++++++++----------
 .../validation/internal/cross_validation.py_in  | 16 +++----
 2 files changed, 31 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/edc93f52/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
index f3a8980..f367774 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
@@ -231,12 +231,12 @@ cross validation is used.  Also, cross validation is not supported if grouping i
 
 Hyperparameter optimization can be carried out using the built-in cross
 validation mechanism, which is activated by assigning a value greater than 1 to
-the parameter \e n_folds.  Misclassification error is used
-for classification and mean squared error is used for regression.
+the parameter \e n_folds.  Negative misclassification error is used
+for classification and negative root mean squared error is used for regression.
 
 The values of a parameter to cross validate should be provided in a list. For
 example, to regularize with the L1 norm and use a lambda value
-from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'. 
+from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'.
 Note that the use of '{}' and '[]' are both valid here.
 
 <DL class="arglist">
@@ -733,9 +733,9 @@ The two queries above will result in same residuals:
 
 <h4>Example with Cross Validation</h4>
 -# Reuse the houses table above.
-Here we use 3-fold cross validation with 3 automatically generated 
-lambda values and 3 specified alpha values. (This can take some time to 
-run since elastic net is effectively being called 27 times for 
+Here we use 3-fold cross validation with 3 automatically generated
+lambda values and 3 specified alpha values. (This can take some time to
+run since elastic net is effectively being called 27 times for
 these combinations, then a 28th time for the whole dataset.)
 <pre class="example">
 DROP TABLE IF EXISTS houses_en3, houses_en3_summary, houses_en3_cv;
@@ -751,9 +751,9 @@ SELECT madlib.elastic_net_train( 'houses',                  -- Source table
                                  'fista',                   -- Optimizer
                                  $$ n_folds = 3,            -- Cross validation parameters
                                     validation_result=houses_en3_cv,
-                                    n_lambdas = 3, 
+                                    n_lambdas = 3,
                                     alpha = {0, 0.1, 1}
-                                 $$,                       
+                                 $$,
                                  NULL,                      -- Excluded columns
                                  10000,                     -- Maximum iterations
                                  1e-6                       -- Tolerance value
@@ -765,12 +765,12 @@ SELECT * FROM houses_en3;
 family            | gaussian
 features          | {tax,bath,size}
 features_selected | {tax,bath,size}
-coef_nonzero      | {22.4584783679,11657.0825871,52.1622899664}
-coef_all          | {22.4584783679,11657.0825871,52.1622899664}
-intercept         | -5067.27288499
+coef_nonzero      | {22.4584188479,11657.0739045,52.1624090811}
+coef_all          | {22.4584188479,11657.0739045,52.1624090811}
+intercept         | -5067.33396522
 log_likelihood    | -543193170.15
 standardize       | t
-iteration_run     | 392
+iteration_run     | 10000
 </pre>
 
 -# Details of the cross validation:
@@ -778,17 +778,17 @@ iteration_run     | 392
 SELECT * FROM houses_en3_cv ORDER BY lambda_value DESC, alpha ASC;
 </pre>
 <pre class="result">
-alpha | lambda_value |        mean         |     std
-------+--------------+---------------------+--------------------
-    0 |       100000 | -1.41777698585e+110 | 1.80536123195e+110
-  0.1 |       100000 | -1.19953054719e+107 | 1.72846143163e+107
-    1 |       100000 |      -4175743937.91 |      2485189261.38
-    0 |          100 |      -4054694238.18 |      2424765457.66
-  0.1 |          100 |      -4041768667.28 |      2418294966.72 
-    1 |          100 |      -1458791218.11 |      483327430.802
-    0 |          0.1 |      -1442293698.38 |      426795110.876
-  0.1 |          0.1 |       -1442705511.6 |       429680202.16
-|   1 |          0.1 |      -1459206061.39 |       485107796.02
+ alpha | lambda_value |    mean_neg_loss   |   std_neg_loss
+-------+--------------+--------------------+-------------------
+   0.0 |     100000.0 | -1.617365261170+55 | 1.26711815498+55
+   0.0 |        100.0 |     -63555.0502789 |    3973.78527042
+   0.0 |          0.1 |     -37136.5397256 |    9022.78236248
+   0.1 |     100000.0 | -3.260479720340+53 | 9.10745448826+53
+   0.1 |        100.0 |     -63445.8310011 |    3965.83900962
+   0.1 |          0.1 |     -37192.0390897 |    9058.79757772
+   1.0 |     100000.0 |     -64569.8882099 |     4051.1856361
+   1.0 |        100.0 |     -38121.9154268 |    9332.65800111
+   1.0 |          0.1 |     -38117.5477067 |    9384.36765881
 (9 rows)
 </pre>
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/edc93f52/src/ports/postgres/modules/validation/internal/cross_validation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
index 11cde2f..84e52e9 100644
--- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
@@ -67,8 +67,8 @@ class ValidationResult(object):
                  List of dictionaries.
                  Each dictionary contains the following three keys:
 
-                 - mean: float, average of scores using sub_args
-                 - std: float, standard deviation of scores using sub_args
+                 - mean_neg_loss: float, average of scores using sub_args
+                 - std_neg_loss: float, standard deviation of scores using sub_args
                  - sub_args: dict, the values of arguments being validated
     """
     def __init__(self, cv_history=None):
@@ -98,12 +98,12 @@ class ValidationResult(object):
 
     def add_one(self, mean, std, sub_args):
         """Add one record to the history"""
-        record = dict(mean=mean, std=std, sub_args=sub_args)
+        record = dict(mean_neg_loss=mean, std_neg_loss=std, sub_args=sub_args)
         self._cv_history.append(record)
 
     def sorted(self):
         """Sort the history w.r.t. mean value and return a new ValidationResult object"""
-        ch = sorted(self._cv_history, reverse=True, key=itemgetter('mean'))
+        ch = sorted(self._cv_history, reverse=True, key=itemgetter('mean_neg_loss'))
         return ValidationResult(ch)
 
     def first(self, attr=None):
@@ -112,7 +112,7 @@ class ValidationResult(object):
         Parameters
         ==========
         attr : string, optional
-               Any string in {'mean', 'std', 'sub_args'} or None
+               Any string in {'mean_neg_loss', 'std_neg_loss', 'sub_args'} or None
 
         Returns
         =======
@@ -133,13 +133,13 @@ class ValidationResult(object):
     def output_tbl(self, tbl_name):
         """Create a table tbl_name that contains the history
 
-        The columns of tbl_name are mean, std and the leaf keys in sub_args.
+        The columns of tbl_name are mean_neg_loss, std_neg_loss and the leaf keys in sub_args.
         All column types are assumed to be double precision.
         """
         if not tbl_name or not str(tbl_name).strip():
             return
 
-        header = self._cv_history[0]['sub_args'].keys() + ['mean', 'std']
+        header = self._cv_history[0]['sub_args'].keys() + ['mean_neg_loss', 'std_neg_loss']
         header_str = ','.join(map(str, header))
 
         data = []
@@ -352,7 +352,7 @@ class CrossValidator(object):
             return plpy.execute(
                 """
                 SELECT
-                    -avg(({target}-prediction)^2) AS accuracy
+                    -sqrt(avg(({target}-prediction)^2)) AS accuracy
                 FROM {pred} JOIN {orig}
                 ON {pred}.{id} = {orig}.{id}
                 """.format(pred=pred,