You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2017/12/08 18:19:26 UTC
madlib git commit: Regularized Regression: Change cross validation
stats
Repository: madlib
Updated Branches:
refs/heads/master 4aa073294 -> edc93f529
Regularized Regression: Change cross validation stats
JIRA: MADLIB-1169
Cross validation is supported by Elastic Net, SVM, and
Decision Trees. If a module is run with cross validation optimization
params, the output table corresponding to it displays `mean` and
`std` of the negative loss error for each permutation of the CV
params.
- This commit changes column names: `mean`->`mean_neg_loss` and
`std`->`std_neg_loss`.
- CV now uses negative Root Mean Squared Error, instead
of the negative Mean Squared Error.
- Update Elastic Net user docs to reflect these changes.
Additional Author: Nandish Jayaram <nj...@apache.org>
Closes #210
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/edc93f52
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/edc93f52
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/edc93f52
Branch: refs/heads/master
Commit: edc93f5295256a18943aa3c0f88e9435081ff50f
Parents: 4aa0732
Author: Swati Soni <so...@gmail.com>
Authored: Wed Dec 6 11:58:46 2017 -0800
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Fri Dec 8 10:15:58 2017 -0800
----------------------------------------------------------------------
.../modules/elastic_net/elastic_net.sql_in | 46 ++++++++++----------
.../validation/internal/cross_validation.py_in | 16 +++----
2 files changed, 31 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/edc93f52/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
index f3a8980..f367774 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
@@ -231,12 +231,12 @@ cross validation is used. Also, cross validation is not supported if grouping i
Hyperparameter optimization can be carried out using the built-in cross
validation mechanism, which is activated by assigning a value greater than 1 to
-the parameter \e n_folds. Misclassification error is used
-for classification and mean squared error is used for regression.
+the parameter \e n_folds. Negative misclassification error is used
+for classification and negative root mean squared error is used for regression.
The values of a parameter to cross validate should be provided in a list. For
example, to regularize with the L1 norm and use a lambda value
-from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'.
+from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'.
Note that the use of '{}' and '[]' are both valid here.
<DL class="arglist">
@@ -733,9 +733,9 @@ The two queries above will result in same residuals:
<h4>Example with Cross Validation</h4>
-# Reuse the houses table above.
-Here we use 3-fold cross validation with 3 automatically generated
-lambda values and 3 specified alpha values. (This can take some time to
-run since elastic net is effectively being called 27 times for
+Here we use 3-fold cross validation with 3 automatically generated
+lambda values and 3 specified alpha values. (This can take some time to
+run since elastic net is effectively being called 27 times for
these combinations, then a 28th time for the whole dataset.)
<pre class="example">
DROP TABLE IF EXISTS houses_en3, houses_en3_summary, houses_en3_cv;
@@ -751,9 +751,9 @@ SELECT madlib.elastic_net_train( 'houses', -- Source table
'fista', -- Optimizer
$$ n_folds = 3, -- Cross validation parameters
validation_result=houses_en3_cv,
- n_lambdas = 3,
+ n_lambdas = 3,
alpha = {0, 0.1, 1}
- $$,
+ $$,
NULL, -- Excluded columns
10000, -- Maximum iterations
1e-6 -- Tolerance value
@@ -765,12 +765,12 @@ SELECT * FROM houses_en3;
family | gaussian
features | {tax,bath,size}
features_selected | {tax,bath,size}
-coef_nonzero | {22.4584783679,11657.0825871,52.1622899664}
-coef_all | {22.4584783679,11657.0825871,52.1622899664}
-intercept | -5067.27288499
+coef_nonzero | {22.4584188479,11657.0739045,52.1624090811}
+coef_all | {22.4584188479,11657.0739045,52.1624090811}
+intercept | -5067.33396522
log_likelihood | -543193170.15
standardize | t
-iteration_run | 392
+iteration_run | 10000
</pre>
-# Details of the cross validation:
@@ -778,17 +778,17 @@ iteration_run | 392
SELECT * FROM houses_en3_cv ORDER BY lambda_value DESC, alpha ASC;
</pre>
<pre class="result">
-alpha | lambda_value | mean | std
-------+--------------+---------------------+--------------------
- 0 | 100000 | -1.41777698585e+110 | 1.80536123195e+110
- 0.1 | 100000 | -1.19953054719e+107 | 1.72846143163e+107
- 1 | 100000 | -4175743937.91 | 2485189261.38
- 0 | 100 | -4054694238.18 | 2424765457.66
- 0.1 | 100 | -4041768667.28 | 2418294966.72
- 1 | 100 | -1458791218.11 | 483327430.802
- 0 | 0.1 | -1442293698.38 | 426795110.876
- 0.1 | 0.1 | -1442705511.6 | 429680202.16
- 1 | 0.1 | -1459206061.39 | 485107796.02
+ alpha | lambda_value | mean_neg_loss | std_neg_loss
+-------+--------------+--------------------+-------------------
+ 0.0 | 100000.0 | -1.61736526117e+55 | 1.26711815498e+55
+ 0.0 | 100.0 | -63555.0502789 | 3973.78527042
+ 0.0 | 0.1 | -37136.5397256 | 9022.78236248
+ 0.1 | 100000.0 | -3.26047972034e+53 | 9.10745448826e+53
+ 0.1 | 100.0 | -63445.8310011 | 3965.83900962
+ 0.1 | 0.1 | -37192.0390897 | 9058.79757772
+ 1.0 | 100000.0 | -64569.8882099 | 4051.1856361
+ 1.0 | 100.0 | -38121.9154268 | 9332.65800111
+ 1.0 | 0.1 | -38117.5477067 | 9384.36765881
(9 rows)
</pre>
http://git-wip-us.apache.org/repos/asf/madlib/blob/edc93f52/src/ports/postgres/modules/validation/internal/cross_validation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
index 11cde2f..84e52e9 100644
--- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
@@ -67,8 +67,8 @@ class ValidationResult(object):
List of dictionaries.
Each dictionary contains the following three keys:
- - mean: float, average of scores using sub_args
- - std: float, standard deviation of scores using sub_args
+ - mean_neg_loss: float, average of scores using sub_args
+ - std_neg_loss: float, standard deviation of scores using sub_args
- sub_args: dict, the values of arguments being validated
"""
def __init__(self, cv_history=None):
@@ -98,12 +98,12 @@ class ValidationResult(object):
def add_one(self, mean, std, sub_args):
"""Add one record to the history"""
- record = dict(mean=mean, std=std, sub_args=sub_args)
+ record = dict(mean_neg_loss=mean, std_neg_loss=std, sub_args=sub_args)
self._cv_history.append(record)
def sorted(self):
"""Sort the history w.r.t. mean value and return a new ValidationResult object"""
- ch = sorted(self._cv_history, reverse=True, key=itemgetter('mean'))
+ ch = sorted(self._cv_history, reverse=True, key=itemgetter('mean_neg_loss'))
return ValidationResult(ch)
def first(self, attr=None):
@@ -112,7 +112,7 @@ class ValidationResult(object):
Parameters
==========
attr : string, optional
- Any string in {'mean', 'std', 'sub_args'} or None
+ Any string in {'mean_neg_loss', 'std_neg_loss', 'sub_args'} or None
Returns
=======
@@ -133,13 +133,13 @@ class ValidationResult(object):
def output_tbl(self, tbl_name):
"""Create a table tbl_name that contains the history
- The columns of tbl_name are mean, std and the leaf keys in sub_args.
+ The columns of tbl_name are mean_neg_loss, std_neg_loss and the leaf keys in sub_args.
All column types are assumed to be double precision.
"""
if not tbl_name or not str(tbl_name).strip():
return
- header = self._cv_history[0]['sub_args'].keys() + ['mean', 'std']
+ header = self._cv_history[0]['sub_args'].keys() + ['mean_neg_loss', 'std_neg_loss']
header_str = ','.join(map(str, header))
data = []
@@ -352,7 +352,7 @@ class CrossValidator(object):
return plpy.execute(
"""
SELECT
- -avg(({target}-prediction)^2) AS accuracy
+ -sqrt(avg(({target}-prediction)^2)) AS accuracy
FROM {pred} JOIN {orig}
ON {pred}.{id} = {orig}.{id}
""".format(pred=pred,