You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/07/06 20:30:13 UTC

[2/4] incubator-madlib git commit: SVM: Novelty detection using 1-class SVM

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/b7484c1f/src/ports/postgres/modules/svm/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/svm.sql_in b/src/ports/postgres/modules/svm/svm.sql_in
index 4d74382..ce7ec69 100644
--- a/src/ports/postgres/modules/svm/svm.sql_in
+++ b/src/ports/postgres/modules/svm/svm.sql_in
@@ -19,6 +19,7 @@ m4_include(`SQLCommon.m4')
 <div class="toc"><b>Contents</b><ul>
 <li class="level1"><a href="#svm_classification">Classification Function</a></li>
 <li class="level1"><a href="#svm_regression">Regression Function</a></li>
+<li class="level1"><a href="#novelty_detection">Novelty Detection</a></li>
 <li class="level1"><a href="#kernel_params">Kernel Parameters</a></li>
 <li class="level1"><a href="#parameters">Other Parameters</a></li>
 <li class="level1"><a href="#predict">Prediction Functions</a></li>
@@ -76,10 +77,11 @@ svm_classification(
 
   <DT>independent_varname</DT>
   <DD>TEXT. Expression list to evaluate for the
-    independent variables. An intercept variable is not assumed. It is common to
-    provide an explicit intercept term by including a single constant \c 1 term
-    in the independent variable list. Expression should be able to be cast
-    into DOUBLE PRECISION [].
+    independent variables. An intercept variable should not be included as part
+    of this expression. See 'fit_intercept' in the kernel params for info on
+    intercepts.
+
+    @note Expression should be able to be cast into DOUBLE PRECISION[].
     </DD>
 
   <DT>kernel_func (optional)</DT>
@@ -92,19 +94,24 @@ svm_classification(
   <DT>kernel_params (optional)</DT>
   <DD>TEXT, defaults: NULL.
     Parameters for non-linear kernel in a comma-separated string of key-value pairs.
-    The actual parameters differ depending on the value of \e kernel_func. See the description below for details.
+    The actual parameters differ depending on the value of \e kernel_func.
+    See the description below for details.
   </DD>
 
   <DT>grouping_col (optional)</DT>
   <DD>TEXT, default: NULL. An expression list used to group
     the input dataset into discrete groups, which results in running one model per group.
     Similar to the SQL "GROUP BY" clause. When this value is NULL, no
-    grouping is used and a single model is generated.  Please note that cross validation is not supported if grouping is used.</DD>
+    grouping is used and a single model is generated.  Please note that
+    cross validation is not supported if grouping is used.</DD>
 
   <DT>params (optional)</DT>
   <DD>TEXT, default: NULL.
-    Parameters for optimization and regularization in a comma-separated string of key-value pairs. If a list of values is provided, then cross-validation will be performed to select the \e best value from the list.
-    See the description below for details. </DD>
+    Parameters for optimization and regularization in a comma-separated string
+    of key-value pairs. If a list of values is provided, then cross-validation
+    will be performed to select the \e best value from the list. See the
+    description below for details.
+  </DD>
 
   <DT>verbose (optional)</DT>
   <DD>BOOLEAN default: FALSE.
@@ -152,7 +159,10 @@ svm_classification(
       </tr>
     </table>
 
-    An auxiliary table named \<model_table\>_random is created if the kernel is not linear. It contains data needed to embed test data into a random feature space (see references [2,3]). This data is used internally by svm_predict and not meaningful on its own to the user, so you can ignore it.
+    An auxiliary table named \<model_table\>_random is created if the kernel is
+    not linear. It contains data needed to embed test data into a random feature
+    space (see references [2,3]). This data is used internally by svm_predict
+    and not meaningful on its own to the user, so you can ignore it.
 
 A summary table named \<model_table\>_summary is also created, which has the following columns:
     <table class="output">
@@ -251,10 +261,31 @@ arguments have specifications which differ from svm_classification:
 
 <DT>params (optional)</DT>
 <DD>TEXT, default: NULL.
-The parameters \e epsilon and \e eps_table are only meaningful for regression. See description below for more details.
+The parameters \e epsilon and \e eps_table are only meaningful for regression.
+See description below for more details.
 </DD>
 </DL>
 
+@anchor novelty_detection
+@par Novelty Detection Training Function
+The novelty detection function is a one-class SVM classifier, and has the following format:
+<pre class="syntax">
+svm_one_class(
+    source_table,
+    model_table,
+    independent_varname,
+    kernel_func,
+    kernel_params,
+    grouping_col,
+    params,
+    verbose
+    )
+</pre>
+\b Arguments
+
+Specifications for novelty detection are largely the same as for classification,
+except the dependent variable name is not specified. The model table is the same
+as that for classification.
 
 @anchor kernel_params
 @par Kernel Parameters
@@ -263,47 +294,42 @@ list of name-value pairs. All of these named parameters are optional, and
 their order does not matter. You must use the format "<param_name> = <value>"
 to specify the value of a parameter, otherwise the parameter is ignored.
 
-When the \ref svm_classification() \e kernel_func argument value is 'gaussian', the \e kernel_params argument is a string containing name-value pairs with the following format. (Line breaks are inserted for readability.)
-<pre class="syntax">
-  'gamma = &lt;value>,
-   n_components = &lt;value>,
-   random_state = &lt;value>'
-</pre>
-\b Parameters
 <DL class="arglist">
-<DT>gamma</dt>
-<DD>Default: 1/num_features. The parameter \f$\gamma\f$ in the Radius Basis Function kernel, i.e., \f$\exp(-\gamma||x-y||^2)\f$. Choosing a proper value for \e gamma is critical to the performance of kernel machine; e.g., while a large \e gamma tends to cause overfitting, a small \e gamma will make the model too constrained to capture the complexity of the data.
+<DT><i>Parameters common to all kernels</i></dt><dd></dd>
+<DT>fit_intercept</dt>
+<DD>Default: True. The parameter \e fit_intercept is an indicator to add an
+intercept to the \e independent_varname array expression. The intercept is added
+to the end of the feature list - thus the last element of the coefficient list
+is the intercept.
 </DD>
-
 <DT>n_components</DT>
-<DD>Default: 2*num_features. The dimensionality of the transformed feature space. A larger value lowers the variance of the estimate of the kernel but requires more memory and takes longer to train.</DD>
-
+<DD>Default: 2*num_features. The dimensionality of the transformed feature space.
+A larger value lowers the variance of the estimate of the kernel but requires
+more memory and takes longer to train.</DD>
 <DT>random_state</DT>
-<DD>Default: 1. Seed used by the random number generator. </DD>
+<DD>Default: 1. Seed used by a random number generator. </DD>
 </DL>
 
-When the \ref svm_classification() \e kernel_func argument value is 'polynomial\u2019, the \e kernel_params argument is a string containing name-value pairs with the following format. (Line breaks are inserted for readability.)
-<pre class="syntax">
-  'coef0 = &lt;value>,
-   degree = &lt;value>,
-   n_components = &lt;value>,
-   random_state = &lt;value>'
-</pre>
-\b Parameters
 <DL class="arglist">
-<DT>coef0</dt>
-<DD>Default: 1.0. The independent term \f$q\f$ in \f$ (\langle x,y\rangle + q)^r \f$. Must be larger than or equal to 0. When it is 0, the polynomial kernel is in homogeneous form.
+<DT><i>Parameters for 'gaussian' kernel</i></dt><dd></dd>
+<DT>gamma</dt>
+<DD> Default: 1/num_features. The parameter \f$\gamma\f$ in the Radial Basis Function
+kernel, i.e., \f$\exp(-\gamma||x-y||^2)\f$. Choosing a proper value for \e gamma
+is critical to the performance of kernel machine; e.g., while a large \e gamma
+tends to cause overfitting, a small \e gamma will make the model too constrained
+to capture the complexity of the data.
 </DD>
+</DL>
 
+<DL class="arglist">
+<DT><i>Parameters for 'polynomial' kernel</i></dt><dd></dd>
+<DT>coef0</dt>
+<DD>Default: 1.0. The independent term \f$q\f$ in \f$ (\langle x,y\rangle + q)^r \f$.
+Must be larger than or equal to 0. When it is 0, the polynomial kernel is in homogeneous form.
+</DD>
 <DT>degree</dt>
 <DD>Default: 3. The parameter \f$r\f$ in \f$ (\langle x,y\rangle + q)^r \f$.
 </DD>
-
-<DT>n_components</DT>
-<DD>Default: 2*num_features. The dimensionality of the transformed feature space. A larger value lowers the variance of the estimate of kernel but requires more memory and takes longer to train.</DD>
-
-<DT>random_state</DT>
-<DD>Default: 1. Seed used by the random number generator. </DD>
 </DL>
 
 
@@ -373,7 +399,7 @@ resulting \e init_stepsize can be run on the whole dataset.
 </DD>
 
 <DT>lambda</dt>
-<DD>Default: [0.01]. Regularization parameter.  Must be positive, can\u2019t be 0.
+<DD>Default: [0.01]. Regularization parameter.  Must be non-negative.
 </DD>
 
 <DT>norm</dt>
@@ -390,19 +416,29 @@ Generally, it has been suggested that epsilon should increase with noisier
 data, and decrease with the number of samples. See [5].
 </DD>
 
-<DT>eps_tabl</dt>
+<DT>eps_table</dt>
 <DD>Default: NULL.
-Name of the input table that contains values of epsilon for different groups. Ignored when \e grouping_col is NULL.  Define this input table if you want different epsilon values for different groups.  The table consists of a column named \e epsilon which specifies the epsilon values, and one or more columns for \e grouping_col. Extra groups are ignored, and groups not present in this table will use the epsilon value specified in parameter \e epsilon.
+Name of the input table that contains values of epsilon for different groups.
+Ignored when \e grouping_col is NULL.  Define this input table if you want
+different epsilon values for different groups.  The table consists of a column
+named \e epsilon which specifies the epsilon values, and one or more columns for
+\e grouping_col. Extra groups are ignored, and groups not present in this table
+will use the epsilon value specified in parameter \e epsilon.
 </DD>
 
 <DT>validation_result</dt>
 <DD>Default: NULL.
-Name of the table to store the cross validation results including the values of parameters and their averaged error values. For now, simple metric like 0-1 loss is used for classification and mean square error is used for regression. The table is only created if the name is not NULL.
+Name of the table to store the cross validation results including the values of
+parameters and their averaged error values. For now, a simple metric like 0-1 loss
+is used for classification and mean square error is used for regression. The
+table is only created if the name is not NULL.
 </DD>
 
 <DT>n_folds</dt>
 <DD>Default: 0.
-Number of folds (k). Must be at least 2 to activate cross validation. If a value of k > 2 is specified, each fold is then used as a validation set once, while the other k - 1 folds form the training set.
+Number of folds (k). Must be at least 2 to activate cross validation.
+If a value of k > 2 is specified, each fold is then used as a validation set once,
+while the other k - 1 folds form the training set.
 </DD>
 
 <DT>class_weight</dt>
@@ -486,7 +522,13 @@ SELECT madlib.svm_classification('houses',
                                  'ARRAY[1, tax, bath, size]'
                            );
 </pre>
--# Next generate a nonlinear model using a Gaussian kernel. This time we specify the initial step size and maximum number of iterations to run. As part of the kernel parameter, we choose 10 as the dimension of the space where we train SVM. A larger number will lead to a more powerful model but run the risk of overfitting. As a result, the model will be a 10 dimensional vector, instead of 4 as in the case of linear model, which we will verify when we examine the models.
+-# Next generate a nonlinear model using a Gaussian kernel. This time we specify
+the initial step size and maximum number of iterations to run. As part of the
+kernel parameter, we choose 10 as the dimension of the space where we train
+SVM. A larger number will lead to a more powerful model but run the risk of
+overfitting. As a result, the model will be a 10 dimensional vector, instead
+of 4 as in the case of linear model, which we will verify when we examine the
+models.
 <pre class="example">
 SELECT madlib.svm_classification( 'houses',
                                   'houses_svm_gaussian',
@@ -498,7 +540,38 @@ SELECT madlib.svm_classification( 'houses',
                                   'init_stepsize=1, max_iter=200'
                            );
 </pre>
--# View the result for the linear model.
+-#  Now train a regression model. First, use a linear model.
+<pre class="example">
+SELECT madlib.svm_regression('houses',
+                             'houses_svm_regression',
+                             'price',
+                             'ARRAY[1, tax, bath, size]'
+                           );
+</pre>
+-#  Next train a non-linear regression model using a Gaussian kernel.
+<pre class="example">
+SELECT madlib.svm_regression( 'houses',
+                              'houses_svm_gaussian_regression',
+                              'price',
+                              'ARRAY[1, tax, bath, size]',
+                              'gaussian',
+                              'n_components=10',
+                              '',
+                              'init_stepsize=1, max_iter=200'
+                           );
+</pre>
+-#  Now train a non-linear one-class SVM for novelty detection, using a Gaussian kernel.
+<pre class="example">
+SELECT madlib.svm_one_class('houses',
+                            'houses_one_class_gaussian',
+                            'ARRAY[1,tax,bedroom,bath,size,lot,price]',
+                            'gaussian',
+                            'gamma=0.01,n_components=10',
+                            NULL,
+                            'max_iter=250, init_stepsize=100,lambda=0.9'
+                            );
+</pre>
+-# View the result for the linear classification model.
 <pre class="example">
 -- Set extended display on for easier reading of output
 \\x ON
@@ -515,8 +588,7 @@ num_rows_processed | 15
 num_rows_skipped   | 0
 dep_var_mapping    | [False, True]
 </pre>
-
--# View the results from kernel SVM.
+-# View the results from kernel SVM for classification.
 <pre class="example">
 -- Set extended display on for easier reading of output
 \\x ON
@@ -533,7 +605,57 @@ num_rows_processed | 15
 num_rows_skipped   | 0
 dep_var_mapping    | [False, True]
 </pre>
-
+-# View the result for the linear regression model.
+<pre class="example">
+-- Set extended display on for easier reading of output
+\\x ON
+SELECT * FROM houses_svm_regression;
+</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]------+-----------------------------------------------------------
+coef               | {0.108331895548,38.7056695139,0.40444087325,43.3271927438}
+loss               | 342988.843452
+norm_of_gradient   | 1659.20424823876
+num_iterations     | 183
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {NULL}
+</pre>
+-# View the results from kernel SVM for regression.
+<pre class="example">
+-- Set extended display on for easier reading of output
+\\x ON
+SELECT * FROM houses_svm_gaussian_regression;
+</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]------+----------------------------------------------------------------------------------------------------------------------------------------------------
+coef               | {12.6287285299,-12.9733002831,0.848080861156,5.94293328578,-6.34552545847,-19.7726249109,5.50362313774,-10.3908022228,-3.36031145266,16.0655950574}
+loss               | 1831979.57085
+norm_of_gradient   | 3.29486194349202
+num_iterations     | 107
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {NULL}
+</pre>
+-# View the result for the Gaussian novelty detection model.
+<pre class="example">
+-- Set extended display on for easier reading of output
+\\x ON
+SELECT * FROM houses_one_class_gaussian;
+</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]------+-----------------------------------------------------------------------------------------------------------------------------------------------------------
+coef               | {0.653884911436,-0.963113029604,-0.49030220733,-0.352594122599,-1.29858359915,0.821120203574,-0.0657750847612,0.563678190211,0.822382924741,1.08544389673}
+loss               | 7.29444919305
+norm_of_gradient   | 0.76407243171229
+num_iterations     | 208
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {1,-1}
+</pre>
 -# Use the prediction function to evaluate the models. The predicted results are in the \e prediction column and the actual data is in the \e target column.
 For the linear model:
 <pre class="example">
@@ -545,7 +667,45 @@ For the Gaussian model:
 SELECT madlib.svm_predict('houses_svm_gaussian', 'houses', 'id', 'houses_pred_gaussian');
 SELECT *, price < 100000 AS target FROM houses JOIN houses_pred_gaussian USING (id) ORDER BY id;
 </pre>
-Result for the Gaussian model:
+For the linear regression model:
+<pre class="example">
+SELECT madlib.svm_predict('houses_svm_regression', 'houses', 'id', 'houses_regr');
+SELECT * FROM houses JOIN houses_regr USING (id) ORDER BY id;
+</pre>
+For the non-linear Gaussian regression model:
+<pre class="example">
+SELECT madlib.svm_predict('houses_svm_gaussian_regression', 'houses', 'id', 'houses_gaussian_regr');
+SELECT * FROM houses JOIN houses_gaussian_regr USING (id) ORDER BY id;
+</pre>
+-#  Create test data set for novelty detection.
+<pre class="example">
+DROP TABLE IF EXISTS houses_novelty_test;
+CREATE TABLE houses_novelty_test (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
+            size INT, lot INT);
+COPY houses_novelty_test FROM STDIN WITH DELIMITER '|';
+  1 |  33590 |       12 |    11 |  5000000 |  12770 | 221100
+  2 | 1050 |       31 |    21 |  85000000 | 141210 | 120010
+  3 |   233330 |     13 |    11 |  22500000 | 112060 |  351100
+  4 |  833370 |       12 |    12 |  9000000 | 130120 | 1751100
+  5 | 132330 |       31 |    12 | 133000000 | 150120 | 30011100
+  6 | 135330 |       21 |    11 |  90500000 |  8212120 | 25711100
+  7 | 279330 |       31 |  21.5 | 260000000 | 213012 | 25011100
+  8 | 6803333 |       12 |    11 | 142500000 | 117012 | 22111000
+  9 | 33331840 |       31 |    12 | 160000000 | 150120 | 19011100
+ 10 | 3780 |       4 |    2 | 220000 | 2790 | 21000
+ 11 | 1760 |       3 |    1 |  77000 | 1030 | 18500
+ 12 | 1520 |       3 |    2 | 128600 | 1250 | 21000
+ 13 | 3000 |       3 |    2 | 130000 | 1760 | 37000
+ 14 | 2170 |       2 |    3 | 138000 | 1550 | 13000
+ 15 |  750 |       3 |  1.5 |  75000 | 1450 | 13000
+\\.
+</pre>
+Evaluate the Gaussian one-class novelty detection model with the prediction function:
+<pre class="example">
+SELECT madlib.svm_predict('houses_one_class_gaussian', 'houses_novelty_test', 'id', 'houses_novelty_predict');
+SELECT * FROM houses_novelty_test JOIN houses_novelty_predict USING (id) ORDER BY id;
+</pre>
+Result for the Gaussian classification model:
 <pre class="result">
  id | tax  | bedroom | bath | price  | size |  lot  | prediction | target
 ----+------+---------+------+--------+------+-------+------------+--------
@@ -565,6 +725,46 @@ Result for the Gaussian model:
  14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | f          | f
  15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | t          | t
 </pre>
+Result for the linear regression model:
+<pre class="result">
+ id | tax  | bedroom | bath | price  | size |  lot  |    prediction
+----+------+---------+------+--------+------+-------+------------------
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | 56191.8809366501
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | 101723.720832136
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | 46774.6724252011
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 | 90001.6946038879
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | 116051.892408163
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | 87698.7223915241
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | 200136.782523288
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | 77025.2857759612
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | 136124.528457551
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 | 263134.504674981
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | 108778.731320579
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | 116782.700138583
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | 196045.611244141
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | 147173.127261586
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 |  88018.958223626
+</pre>
+Result for the Gaussian one-class novelty detection model:
+<pre class="result">
+ id |   tax    | bedroom | bath |   price   |  size   |   lot    | prediction
+----+----------+---------+------+-----------+---------+----------+------------
+  1 |    33590 |      12 |   11 |   5000000 |   12770 |   221100 |          1
+  2 |     1050 |      31 |   21 |  85000000 |  141210 |   120010 |         -1
+  3 |   233330 |      13 |   11 |  22500000 |  112060 |   351100 |          1
+  4 |   833370 |      12 |   12 |   9000000 |  130120 |  1751100 |         -1
+  5 |   132330 |      31 |   12 | 133000000 |  150120 | 30011100 |         -1
+  6 |   135330 |      21 |   11 |  90500000 | 8212120 | 25711100 |         -1
+  7 |   279330 |      31 | 21.5 | 260000000 |  213012 | 25011100 |         -1
+  8 |  6803333 |      12 |   11 | 142500000 |  117012 | 22111000 |         -1
+  9 | 33331840 |      31 |   12 | 160000000 |  150120 | 19011100 |         -1
+ 10 |     3780 |       4 |    2 |    220000 |    2790 |    21000 |          1
+ 11 |     1760 |       3 |    1 |     77000 |    1030 |    18500 |         -1
+ 12 |     1520 |       3 |    2 |    128600 |    1250 |    21000 |          1
+ 13 |     3000 |       3 |    2 |    130000 |    1760 |    37000 |          1
+ 14 |     2170 |       2 |    3 |    138000 |    1550 |    13000 |          1
+ 15 |      750 |       3 |  1.5 |     75000 |    1450 |    13000 |         -1
+</pre>
 Note the result may vary somewhat with the platform configuration you are using.
 -# Create a model for an unbalanced class-size dataset.
 <pre class="example">
@@ -912,6 +1112,82 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
     SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+-----------------------------------------------------------------
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
+    source_table text,
+    model_table text,
+    independent_varname text,
+    kernel_func text,
+    kernel_params text,
+    grouping_col text,
+    params text,
+    verbose bool)
+RETURNS void AS $$
+    PythonFunction(svm, svm, svm_one_class)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+-- all default value handling implemented in Python
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
+    source_table text,
+    model_table text,
+    independent_varname text,
+    kernel_func text,
+    kernel_params text,
+    grouping_col text,
+    params text)
+RETURNS void AS $$
+    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, $6, $7, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
+    source_table text,
+    model_table text,
+    independent_varname text,
+    kernel_func text,
+    kernel_params text,
+    grouping_col text)
+RETURNS void AS $$
+    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, $6, ''::text, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
+    source_table text,
+    model_table text,
+    independent_varname text,
+    kernel_func text,
+    kernel_params text)
+  RETURNS void AS $$
+    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, ''::text, ''::text, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
+    source_table text,
+    model_table text,
+    independent_varname text,
+    kernel_func text)
+  RETURNS void AS $$
+    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, ''::text, ''::text, ''::text, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
+    source_table text,
+    model_table text,
+    independent_varname text)
+  RETURNS void AS $$
+    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, ''::text,
+                                       ''::text, ''::text, ''::text, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
 
 ------ Prediction -------------------------------------------------------------
 /**
@@ -968,9 +1244,8 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification (
     message  TEXT
 ) RETURNS TEXT AS $$
-    global is_svc
-    is_svc = True
-    PythonFunction(svm, svm, svm_help)
+    PythonFunctionBodyOnly(svm, svm)
+    return svm.svm_help(schema_madlib, message, True)
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
@@ -980,17 +1255,30 @@ RETURNS TEXT AS $$
 $$ LANGUAGE SQL IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class (
+    message  TEXT
+) RETURNS TEXT AS $$
+    PythonFunctionBodyOnly(svm, svm)
+    return svm.svm_one_class_help(schema_madlib, message, True)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class ()
+RETURNS TEXT AS $$
+  SELECT MADLIB_SCHEMA.svm_one_class(NULL::TEXT)
+$$ LANGUAGE SQL IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression (
     message  TEXT
 ) RETURNS TEXT AS $$
-    global is_svc
-    is_svc = False
-    PythonFunction(svm, svm, svm_help)
+    PythonFunctionBodyOnly(svm, svm)
+    return svm.svm_help(schema_madlib, message, False)
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression ()
 RETURNS TEXT AS $$
-  SELECT MADLIB_SCHEMA.svm_regression(NULL::TEXT)
+  SELECT MADLIB_SCHEMA.svm_regression(''::TEXT)
 $$ LANGUAGE SQL IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/b7484c1f/src/ports/postgres/modules/svm/test/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/test/svm.sql_in b/src/ports/postgres/modules/svm/test/svm.sql_in
index 6ead311..50a5475 100644
--- a/src/ports/postgres/modules/svm/test/svm.sql_in
+++ b/src/ports/postgres/modules/svm/test/svm.sql_in
@@ -41,7 +41,6 @@ DECLARE
     temp_table text;
 BEGIN
     temp_table := 'madlib_temp_' || output_table;
-    EXECUTE 'DROP TABLE IF EXISTS ' || temp_table;
     EXECUTE '
         CREATE TABLE ' || temp_table || ' AS
             SELECT
@@ -63,7 +62,6 @@ DECLARE
     temp_table text;
 BEGIN
     temp_table := 'madlib_temp_' || output_table;
-    EXECUTE 'DROP TABLE IF EXISTS ' || temp_table;
     EXECUTE '
         CREATE TABLE ' || temp_table || ' AS
             SELECT
@@ -78,17 +76,12 @@ BEGIN
 END
 $$ LANGUAGE plpgsql;
 
-DROP TABLE IF EXISTS svm_train_data;
 SELECT svm_generate_cls_data('svm_train_data', 1000, 4);
-DROP TABLE IF EXISTS svm_test_data;
 SELECT svm_generate_cls_data('svm_test_data', 1000, 4);
-DROP TABLE IF EXISTS svr_train_data;
 SELECT svr_generate_cls_data('svr_train_data', 1000, 4);
-DROP TABLE IF EXISTS svr_test_data;
 SELECT svr_generate_cls_data('svr_test_data', 1000, 4);
 
 -- check the default values
-DROP TABLE IF EXISTS svr_model, svr_model_summary;
 SELECT svm_regression(
      'svr_train_data',
      'svr_model',
@@ -105,10 +98,9 @@ SELECT
 FROM svr_model;
 
 -- check the use of l1 norm
-DROP TABLE IF EXISTS svr_model, svr_model_summary;
 SELECT svm_regression(
      'svr_train_data',
-     'svr_model',
+     'svr_model2',
      'label',
      'ind',
      NULL,
@@ -116,7 +108,6 @@ SELECT svm_regression(
      NULL,
      'init_stepsize=0.01, max_iter=50, lambda=2, norm=l2, epsilon=0.01',
      false);
-DROP TABLE IF EXISTS svr_test_result;
 SELECT svm_predict('svr_model', 'svr_train_data', 'id', 'svr_test_result');
 \x on
 SELECT * FROM svr_model;
@@ -148,11 +139,9 @@ SELECT svm_classification(
 SELECT * FROM lclss;
 SELECT * FROM lclss_summary;
 
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
 SELECT svm_predict('lclss', 'svm_test_data', 'id', 'svm_test_predict');
 
 -- checking correctness with pre-conditioning
-DROP TABLE IF EXISTS svm_normalized CASCADE;
 CREATE TABLE svm_normalized AS
 SELECT
     id,
@@ -171,7 +160,6 @@ FROM svm_train_data,
     ) AS svm_ind_stddev
 ORDER BY random();
 
-DROP TABLE IF EXISTS svm_test_normalized CASCADE;
 CREATE TABLE svm_test_normalized AS
 SELECT
     id,
@@ -192,8 +180,6 @@ FROM svm_test_data,
 ----------------------------------------------------------------
 -- serial
 -- learning
-DROP TABLE IF EXISTS svm_model CASCADE;
-DROP TABLE IF EXISTS svm_model_summary CASCADE;
 SELECT svm_classification(
     'svm_normalized',
     'svm_model',
@@ -203,7 +189,7 @@ SELECT svm_classification(
     NULL, -- kernel_pararms
     NULL, -- grouping_col
     'init_stepsize=0.03, decay_factor=1, max_iter=5, tolerance=0, lambda=0',
-    true -- verbose
+    false -- verbose
     );
 \x on
 SELECT * FROM svm_model;
@@ -211,8 +197,6 @@ SELECT * FROM svm_model_summary;
 \x off
 
 -- l2
-DROP TABLE IF EXISTS svm_model_small_norm2 CASCADE;
-DROP TABLE IF EXISTS svm_model_small_norm2_summary CASCADE;
 SELECT svm_classification(
     'svm_normalized',
     'svm_model_small_norm2',
@@ -235,8 +219,6 @@ FROM svm_model AS noreg, svm_model_small_norm2 AS l2;
 
 
 -- l1 makes sparse models
-DROP TABLE IF EXISTS svm_model_very_sparse CASCADE;
-DROP TABLE IF EXISTS svm_model_very_sparse_summary CASCADE;
 SELECT svm_classification(
     'svm_normalized',
     'svm_model_very_sparse',
@@ -261,53 +243,48 @@ FROM
 WHERE w_i != 0;
 
 -- predicting
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('svm_model','svm_test_normalized', 'id', 'svm_test_predict');
+SELECT svm_predict('svm_model','svm_test_normalized', 'id', 'svm_test_predict2');
 
 -- calculating accuracy
 -- the accuracy is not guaranteed to be high because the stepsize & decay_factor
 -- depend on the actual number of segments
 SELECT
     count(*) AS misclassification_count
-FROM svm_test_predict NATURAL JOIN svm_test_normalized
+FROM svm_test_predict2 NATURAL JOIN svm_test_normalized
 WHERE prediction <> label;
 
 ----------------------------------------------------------------
 -- decay factor non-zero
 -- learning
-DROP TABLE IF EXISTS svm_model CASCADE;
-DROP TABLE IF EXISTS svm_model_summary CASCADE;
 SELECT svm_classification(
     'svm_normalized',
-    'svm_model',
+    'svm_model_decay_factor_non_zero',
     'label',
     'ind',
     NULL, -- kernel_func
     NULL, -- kernel_pararms
     NULL, --grouping_col
     'init_stepsize=0.03, decay_factor=0.9, max_iter=5, tolerance=0, lambda={0.001}',
-    true -- verbose
+    false -- verbose
     );
-SELECT norm_of_gradient FROM svm_model;
+SELECT norm_of_gradient FROM svm_model_decay_factor_non_zero;
 
 -- predicting
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-CREATE TABLE svm_test_predict AS
+CREATE TABLE svm_test_predict_decay_factor_nonzero AS
 SELECT
     svm_test_normalized.id,
     CASE WHEN array_dot(coef, ind) >= 0 THEN 1 ELSE -1 END AS prediction,
     label
-FROM svm_test_normalized, svm_model;
+FROM svm_test_normalized, svm_model_decay_factor_non_zero;
 
 -- stats for info
 SELECT count(*) AS misclassification_count
-FROM svm_test_predict
+FROM svm_test_predict_decay_factor_nonzero
 WHERE prediction <> label;
 
 
 -----------------------------------------------------------------
 -- labels that are not just 1,-1
-DROP TABLE IF EXISTS svm_normalized_fancy_label CASCADE;
 CREATE TABLE svm_normalized_fancy_label AS
 SELECT
     id,
@@ -333,7 +310,6 @@ INSERT INTO svm_normalized_fancy_label VALUES
 (1002, ARRAY[5,1,1,1,1]::float8[], NULL, 1002 % 4),
 (1003, ARRAY[5,1,NULL,1,1]::float8[], NULL, 1003 % 4);
 
-DROP TABLE IF EXISTS svm_test_normalized_fancy_label CASCADE;
 CREATE TABLE svm_test_normalized_fancy_label AS
 SELECT
     id,
@@ -357,8 +333,6 @@ INSERT INTO svm_test_normalized_fancy_label VALUES
 (1001, ARRAY[NULL,1,1,1,1]::float8[], 'YES', 1001 % 4);
 
 -- training
-DROP TABLE IF EXISTS svm_model_fancy_label CASCADE;
-DROP TABLE IF EXISTS svm_model_fancy_label_summary CASCADE;
 SELECT svm_classification(
     'svm_normalized_fancy_label',
     'svm_model_fancy_label',
@@ -378,8 +352,7 @@ SELECT assert(count(*)=4, '4 group exist') FROM svm_model_fancy_label;
 -- SELECT assert(total_rows_skipped=3, 'total_rows_skipped is wrong')
 -- FROM svm_model_fancy_label_summary;
 
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('svm_model_fancy_label', 'svm_test_normalized_fancy_label', 'id', 'svm_test_predict');
+SELECT svm_predict('svm_model_fancy_label', 'svm_test_normalized_fancy_label', 'id', 'svm_test_fancy_label');
-SELECT o.id, label, prediction, o.gid FROM svm_test_predict p, svm_test_normalized_fancy_label o where o.id = p.id;
+SELECT o.id, label, prediction, o.gid FROM svm_test_fancy_label p, svm_test_normalized_fancy_label o where o.id = p.id;
 
 -- calculating accuracy
@@ -391,8 +364,6 @@ SELECT o.id, label, prediction, o.gid FROM svm_test_predict p, svm_test_normaliz
 -- WHERE prediction <> label;
 
 -- tests for depend varname being expression
-DROP TABLE IF EXISTS svm_model_expression CASCADE;
-DROP TABLE IF EXISTS svm_model_expression_summary CASCADE;
 SELECT svm_classification(
     'svm_normalized',
     'svm_model_expression',
@@ -402,18 +373,43 @@ SELECT svm_classification(
     NULL, -- kernel_pararms
     NULL, --grouping_col
     'init_stepsize=0.03, decay_factor=0.9, max_iter=5, tolerance=0, lambda=0.001',
-    true -- verbose
+    false -- verbose
     );
 \x on
 SELECT * FROM svm_model_expression;
 SELECT * FROM svm_model_expression_summary;
 \x off
 
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('svm_model_expression', 'svm_test_normalized', 'id', 'svm_test_predict');
-SELECT * FROM svm_test_predict;
 
-DROP TABLE IF EXISTS abalone_train_small_tmp;
+SELECT svm_one_class(
+    'svm_normalized',
+    'svm_model_expression1',
+    'ind',
+    'gaussian'
+    );
+\x on
+SELECT * FROM svm_model_expression1;
+SELECT * FROM svm_model_expression1_summary;
+\x off
+
+SELECT svm_one_class(
+    'svm_normalized',
+    'svm_model_expression2',
+    'ind',
+    'gaussian',
+    NULL,
+    NULL,
+    'init_stepsize=0.01, max_iter=3, lambda=[0.0002, 0.2], '
+    'n_folds=3, epsilon = [0.003, 0.2]'
+    );
+\x on
+SELECT * FROM svm_model_expression2;
+SELECT * FROM svm_model_expression2_summary;
+\x off
+
+SELECT svm_predict('svm_model_expression2', 'svm_test_normalized', 'id', 'svm_test_model_expression2');
+SELECT * FROM svm_test_model_expression2;
+
 CREATE TABLE abalone_train_small_tmp (
     sex TEXT,
     id SERIAL NOT NULL,
@@ -527,52 +523,47 @@ INSERT INTO abalone_train_small_tmp(id,sex,length,diameter,height,whole,shucked,
 (831,'M',0.415,0.305,0.1,0.325,0.156,0.0505,0.091,6),
 (3359,'M',0.285,0.215,0.075,0.106,0.0415,0.023,0.035,5);
 
-DROP TABLE IF EXISTS abalone_train_small;
 CREATE TABLE abalone_train_small AS (
     SELECT * FROM abalone_train_small_tmp
 );
 
 -- create epsilon input table
-
-DROP TABLE IF EXISTS abalone_eps;
 CREATE TABLE abalone_eps (
     sex TEXT,
     epsilon DOUBLE PRECISION);
-
 INSERT INTO abalone_eps(sex, epsilon) VALUES
 ('I', 0.2),
 ('M', 0.05);
 
-DROP TABLE IF EXISTS m1, m1_summary;
 SELECT svm_regression(
      'svr_train_data',
      'm1',
      'label',
      'ind',
-     NULL,NULL,NULL,
+     'poly',
+      NULL,
+      NULL,
      'init_stepsize=0.01, max_iter=3, lambda=[0.0002, 0.2], '
      'n_folds=3, epsilon = [0.003, 0.2]',
      true);
 
-DROP TABLE IF EXISTS m1, m1_summary;
 SELECT svm_regression(
      'svr_train_data',
-     'm1',
+     'm2',
      'label',
      'ind',
      NULL,NULL,NULL,
      'init_stepsize=0.01, max_iter=2, lambda=[0.0002, 0.2], n_folds=3',
      false);
 -- check which lambda is selected
-SELECT reg_params FROM m1_summary;
+SELECT reg_params FROM m2_summary;
 
 -- epsilon values are ignored
 -- the validation table only contains
 -- init_stepsize and lambda
-DROP TABLE IF EXISTS m1, m1_summary, val_res;
 SELECT svm_classification(
      'svm_train_data',
-     'm1',
+     'm3',
      'label',
      'ind',
      NULL,NULL,NULL,
@@ -580,65 +571,53 @@ SELECT svm_classification(
      'n_folds=3, epsilon=[0.1, 1], validation_result=val_res');
 SELECT * FROM val_res;
 
-DROP TABLE IF EXISTS m1, m1_summary, val_res;
 SELECT svm_classification(
      'svm_train_data',
-     'm1',
+     'm4',
      'label',
      'ind',
      NULL,NULL,NULL,
      'init_stepsize=0.01, max_iter=20, lambda=[20, 0.0002, 0.02], '
-     'n_folds=3, validation_result=val_res');
+     'n_folds=3, validation_result=val_res2');
-SELECT * FROM val_res;
+SELECT * FROM val_res2;
 -- check which lambda is selected
-SELECT reg_params FROM m1_summary;
+SELECT reg_params FROM m4_summary;
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
--- accuracy with cv
-SELECT
-    count(*) AS misclassification_count
-FROM svm_test_predict NATURAL JOIN svm_test_data
-WHERE prediction <> label;
+SELECT svm_predict('m4','svm_test_data', 'id', 'svm_test_reg_params');
 
-DROP TABLE IF EXISTS m1, m1_summary;
 SELECT svm_classification(
      'svm_train_data',
-     'm1',
+     'm5',
      'label',
      'ind',
      NULL,NULL,NULL,
      'init_stepsize=0.01, max_iter=20, lambda=0.000002');
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT svm_predict('m5','svm_test_data', 'id', 'svm_test_5');
 -- accuracy without cv
 SELECT
     count(*) AS misclassification_count
-FROM svm_test_predict NATURAL JOIN svm_test_data
+FROM svm_test_5 NATURAL JOIN svm_test_data
 WHERE prediction <> label;
 
 -- SVM with kernels -----------------------------------------------------------
 -- verify guassian kernel mapping dimensions
-DROP TABLE IF EXISTS m1, m1_summary, m1_random;
 SELECT svm_classification(
      'svm_train_data',
-     'm1',
+     'm6',
      'label',
      'ind',
      'gaussian',
-     'n_components=3',
+     'n_components=3, fit_intercept=false',
      NULL,
      'max_iter=2');
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT svm_predict('m6','svm_test_data', 'id', 'svm_test_6');
 SELECT
     assert(
         array_upper(coef, 1) = 3,
         'The dimension of the coefficients must be equal to n_components (3)!')
-FROM m1;
+FROM m6;
 
 -- verify gaussian kernel with grouping
 -- verify partial string support in kernel specification
-DROP TABLE IF EXISTS svr_mdl_m, svr_mdl_m_summary, svr_mdl_m_random;
 SELECT svm_regression(
         'abalone_train_small',
         'svr_mdl_m',
@@ -649,8 +628,7 @@ SELECT svm_regression(
         'sex',
         'max_iter=2, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.05',
         false);
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('svr_mdl_m','abalone_train_small', 'id', 'svm_test_predict');
+SELECT svm_predict('svr_mdl_m','abalone_train_small', 'id', 'svm_test_mdl_m');
 SELECT
     assert(
         array_upper(coef, 1) = 10,
@@ -658,44 +636,40 @@ SELECT
 FROM svr_mdl_m;
 
 -- verify guassian kernel with cross validation
-DROP TABLE IF EXISTS m1, m1_summary, m1_random CASCADE;
 SELECT svm_classification(
      'svm_train_data',
-     'm1',
+     'm7',
      'label',
      'ind',
      'gaussian',
-     'n_components=3',
+     'n_components=3, fit_intercept=true',
      NULL,
      'max_iter=2, n_folds=3, lambda=[0.01, 0.1, 0.5]');
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+
+SELECT svm_predict('m7','svm_test_data', 'id', 'svm_test_7');
 SELECT
     assert(
-        array_upper(coef, 1) = 3,
-        'The dimension of the coefficients must be equal to n_components (3)!')
-FROM m1;
+        array_upper(coef, 1) = 4,
+        'The dimension of the coefficients must be equal to n_components + 1 (4)!')
+FROM m7;
 
 -- verify guassian kernel with out-of-memory support
-DROP TABLE IF EXISTS m1, m1_summary, m1_random CASCADE;
 SELECT svm_classification(
      'svm_train_data',
-     'm1',
+     'm8',
      'label',
      'ind',
      'gaussian',
-     'n_components=3, in_memory=0',
+     'n_components=3, fit_in_memory=False',
      NULL,
      'max_iter=2, n_folds=3, lambda=[0.01, 0.1, 0.5]');
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT svm_predict('m8','svm_test_data', 'id', 'svm_test_8');
 SELECT
     assert(
         array_upper(coef, 1) = 3,
         'The dimension of the coefficients must be equal to n_components (3)!')
-FROM m1;
+FROM m8;
 
-DROP TABLE IF EXISTS kernel_data;
 CREATE TABLE kernel_data (
     index bigint,
     x1 double precision,
@@ -720,75 +694,68 @@ INSERT INTO kernel_data (index, x1, x2, y) VALUES (14, 0, -2.70000000000000018,
 INSERT INTO kernel_data (index, x1, x2, y) VALUES (15, 1.30000000000000004, 2.10000000000000009, 1);
 
 -- verify poly kernel mapping dimensions
-DROP TABLE IF EXISTS m1, m1_summary, m1_random CASCADE;
 SELECT svm_classification(
      'svm_train_data',
-     'm1',
+     'poly_mapping',
      'label',
      'ind',
      'poly',
-     'n_components=3',
+     'n_components=3, fit_intercept=true',
      NULL,
      'max_iter=2');
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT svm_predict('poly_mapping','svm_test_data', 'id', 'svm_test_poly_mapping');
 SELECT
     assert(
-        array_upper(coef, 1) = 3,
-        'The dimension of the coefficients must be equal to n_components (3)!')
-FROM m1;
+        array_upper(coef, 1) = 4,
+        'The dimension of the coefficients must be equal to n_components + 1 (4)!')
+FROM poly_mapping;
 
 -- verify poly kernel with grouping
 -- verify partial string support in kernel specification
-DROP TABLE IF EXISTS svr_mdl_m, svr_mdl_m_summary, svr_mdl_m_random CASCADE;
 SELECT svm_regression(
         'abalone_train_small',
-        'svr_mdl_m',
+        'svr_mdl_poly',
         'rings',
         'ARRAY[1,diameter,shell,shucked,length]',
         'po',
-        'degree=2, n_components=10',
+        'degree=2, n_components=10, fit_intercept=true',
         'sex',
         'max_iter=2, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.05',
         false);
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('svr_mdl_m','abalone_train_small', 'id', 'svm_test_predict');
+SELECT svm_predict('svr_mdl_poly','abalone_train_small', 'id', 'svm_test_poly');
 SELECT
     assert(
-        array_upper(coef, 1) = 10,
-        'The dimension of the coefficients must be equal to n_components (10)!')
-FROM svr_mdl_m;
+        array_upper(coef, 1) = 11,
+        'The dimension of the coefficients must be equal to n_components + 1 (11)!')
+FROM svr_mdl_poly;
 
 -- verify poly kernel with cross validation
-DROP TABLE IF EXISTS m1, m1_summary, m1_random CASCADE;
 SELECT svm_classification(
      'svm_train_data',
-     'm1',
+     'm9',
      'label',
      'ind',
      'poly',
      'n_components=3',
      NULL,
      'max_iter=2, n_folds=3, lambda=[0.01, 0.1, 0.5]');
-DROP TABLE IF EXISTS svm_test_predict CASCADE;
-SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT svm_predict('m9','svm_test_data', 'id', 'svm_test_9');
 SELECT
     assert(
         array_upper(coef, 1) = 3,
         'The dimension of the coefficients must be equal to n_components (3)!')
-FROM m1;
+FROM m9;
 
-DROP TABLE IF EXISTS m1, m1_summary, m1_random;
 SELECT svm_classification(
      'kernel_data',
-     'm1',
+     'm10',
      'y',
      'array[x1, x2]',
      'gaussian',
      'gamma=1, n_components=20, random_state=2',
      NULL,
      'init_stepsize=1, max_iter=10');
-SELECT * FROM m1;
+SELECT * FROM m10;
 
 -- Test for class weight
 CREATE TABLE svm_unbalanced (

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/b7484c1f/src/ports/postgres/modules/utilities/in_mem_group_control.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in
index 67667ba..9803329 100644
--- a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in
+++ b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in
@@ -304,17 +304,14 @@ class GroupIterationController:
                                         group_param.grouped_state_type)
         for grp, t in res.iteritems():
             loss, normg = t['loss'], t['norm_of_gradient']
-            iteration = self.iteration
-            output_str = "DEBUG: \
-                    grp = {grp:10s}, \
-                    iter = {iteration:5d}, \
-                    loss = {loss:.5e}, \
-                    |gradient| = {normg:.5e}, \
-                    stepsize = {stepsize:.5e}"
+            output_str = ("DEBUG: grp = {grp:10s}, "
+                          "iter = {iteration:5d}, "
+                          "loss = {loss:.5e}, "
+                          "|gradient| = {normg:.5e}, "
+                          "stepsize = {stepsize:.5e}")
             plpy.notice(output_str.format(
-                        grp=grp, iteration=iteration,
-                        loss=loss, normg=normg,
-                        **self.kwargs))
+                        grp=grp, iteration=self.iteration,
+                        loss=loss, normg=normg, **self.kwargs))
 
     def final(self):
         """ Store the final converged state to a table for output """

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/b7484c1f/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index f0876b3..be20da1 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -432,8 +432,9 @@ def preprocess_keyvalue_params(input_params, split_char='='):
 
 
     """
-    re_str = (r"(\w+\s*" +       # key is any alphanumeric character string
-              split_char +       # key and value are separated by char
+    re_str = (r"([-\w]+\s*" +     # key is any alphanumeric character
+                                  # (including -) string
+              split_char +        # key and value are separated by split_char
               r"""
                 \s*([\(\{\[]      # value can be string or array
                 [^\[\]\(\)\{\}]*  #  if value is array then accept anything inside

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/b7484c1f/src/ports/postgres/modules/validation/internal/cross_validation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
index 8b361f2..7f8c64e 100644
--- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
@@ -5,9 +5,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -24,10 +24,8 @@ from utilities.utilities import num_samples
 
 from math import sqrt
 from collections import namedtuple
-from functools import partial
 from operator import itemgetter
-from operator import attrgetter
-from itertools import product, repeat, imap, chain
+from itertools import product, repeat, chain
 
 version_wrapper = __mad_version()
 mad_vec = version_wrapper.select_vecfunc()
@@ -154,8 +152,8 @@ class ValidationResult(object):
 
         data = []
         for h in cv_history_f:
-           values = ','.join([str(h[k]) for k in header])
-           data.append("({0})".format(values))
+            values = ','.join([str(h[k]) for k in header])
+            data.append("({0})".format(values))
         data = ','.join(data)
 
         plpy.execute("""
@@ -174,6 +172,7 @@ class _ValidationArgs(object):
     def grid(cls, sub_args):
         def comb_dict(dicts):
             return dict(chain.from_iterable(d.iteritems() for d in dicts))
+
         def make_dicts(k, vs):
             return [dict([t]) for t in zip(repeat(k), vs)]
 
@@ -223,7 +222,7 @@ def _cv_copy_data(rel_origin, dependent_varname,
                independent_varname=independent_varname,
                target_col=target_col, features_col=features_col))
     return target_col, features_col
-## ========================================================================
+# -------------------------------------------------------------------------
 
 
 def _cv_split_data(rel_source, col_data, col_id, row_num,
@@ -241,20 +240,20 @@ def _cv_split_data(rel_source, col_data, col_id, row_num,
     # which corresponds to rows outside of [start_row, end_row).
     # Extract the validation part of data,
     # which corresponds to rows inside of [start_row, end_row).
-    sql = """
+    plpy.execute("""
         drop view if exists {rel_train};
         create temp view {rel_train} as
             select {col_id}, {col_string} from {rel_source}
             where {col_id} < {start_row}
-                 or {col_id} >= {end_row};
-
+                 or {col_id} >= {end_row}
+        """.format(**kwargs))
+    plpy.execute("""
         drop view if exists {rel_valid};
         create temp view {rel_valid} as
             select {col_id}, {col_string} from {rel_source}
             where {col_id} >= {start_row}
                  and {col_id} < {end_row}
-    """.format(**kwargs)
-    plpy.execute(sql)
+    """.format(**kwargs))
     return None
 # ------------------------------------------------------------------------------
 
@@ -287,7 +286,7 @@ class CrossValidator(object):
                 - independent_varname: the column for features
                 - dependent_varname: the column for target
                 - schema_madlib: the schema where madlib is installed
-              - model_table: table created for the trained model
+                - model_table: table created for the trained model
 
     """
     def __init__(self, estimator, predictor, scorer, args):
@@ -319,8 +318,8 @@ class CrossValidator(object):
         row_num = self._row_num
         SplitData = namedtuple('SplitData', 'rel_train, rel_valid')
         for k in range(n_folds):
-            rel_train = unique_string(desp='cv_train_{0}'.format(k))
-            rel_valid = unique_string(desp='cv_valid_{0}'.format(k))
+            rel_train = unique_string(desp='cv_train_{0}_'.format(k))
+            rel_valid = unique_string(desp='cv_valid_{0}_'.format(k))
             _cv_split_data(rel_copied, col_data, col_id, row_num,
                            rel_train, rel_valid, n_folds, k+1)
             yield SplitData(rel_train=rel_train, rel_valid=rel_valid)
@@ -345,6 +344,7 @@ class CrossValidator(object):
 
         predictor(schema_madlib, model_table,
                   rel_valid, col_id, output_table)
+
         score = self._score(output_table, rel_valid, scorer)
         plpy.execute("""
                      DROP TABLE IF EXISTS {model_table}, {model_table}_summary;
@@ -359,32 +359,32 @@ class CrossValidator(object):
         col_id = self._col_id
         if method == 'regression':
             return plpy.execute(
-                    """
-                    SELECT
-                        -avg(({target}-prediction)^2) AS accuracy
-                    FROM {pred} JOIN {orig}
-                    ON {pred}.{id} = {orig}.{id}
-                    """.format(pred=pred,
-                               orig=orig,
-                               id=col_id,
-                               target=target))[0]['accuracy']
+                """
+                SELECT
+                    -avg(({target}-prediction)^2) AS accuracy
+                FROM {pred} JOIN {orig}
+                ON {pred}.{id} = {orig}.{id}
+                """.format(pred=pred,
+                           orig=orig,
+                           id=col_id,
+                           target=target))[0]['accuracy']
         elif method == 'classification':
             return plpy.execute(
-                    """
-                    SELECT (1 - miss / total) AS accuracy
-                    FROM
-                    (
-                      SELECT count(*)::float8 AS miss
-                      FROM {pred} JOIN {orig}
-                      ON {pred}.{id} = {orig}.{id}
-                      WHERE prediction <> {target}) s,
-                    (
-                      SELECT count(*)::float8 AS total
-                      FROM {orig}) r;
-                    """.format(pred=pred,
-                               orig=orig,
-                               id=col_id,
-                               target=target))[0]['accuracy']
+                """
+                SELECT (1 - miss / total) AS accuracy
+                FROM
+                (
+                  SELECT count(*)::float8 AS miss
+                  FROM {pred} JOIN {orig}
+                  ON {pred}.{id} = {orig}.{id}
+                  WHERE prediction <> {target}) s,
+                (
+                  SELECT count(*)::float8 AS total
+                  FROM {orig}) r;
+                """.format(pred=pred,
+                           orig=orig,
+                           id=col_id,
+                           target=target))[0]['accuracy']
         else:
             plpy.error("Cross Validation Error: invalid method value ({0})! "
                        "Need to be either classification "
@@ -420,18 +420,18 @@ class CrossValidator(object):
             n = len(nums)
             # no need to check against 0 division
             # because n_folds is larger than 1
-            a = sum(nums) / n
-            s = sqrt(sum([(x - a)**2 for x in nums]) / (n - 1))
-            return a, s
+            mean = sum(nums) / n
+            std_dev = sqrt(sum((x - mean)**2 for x in nums) / (n - 1))
+            return mean, std_dev
 
         if not sub_args:
             return []
 
         cv_history = ValidationResult()
-        split_data = list(self._gen_split_data(n_folds))
+        all_split_data = list(self._gen_split_data(n_folds))
+        tof = self._test_one_fold
         for sa in _ValidationArgs.grid(sub_args):
-            _test = partial(self._test_one_fold, sub_args=sa)
-            scores = map(_test, split_data)
+            scores = [tof(i, sub_args=sa) for i in all_split_data]
             a, s = _stats(scores)
             cv_history.add_one(mean=a, std=s, sub_args=sa)
-        return cv_history
\ No newline at end of file
+        return cv_history

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/b7484c1f/src/ports/postgres/modules/validation/test/cross_validation.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/test/cross_validation.sql_in b/src/ports/postgres/modules/validation/test/cross_validation.sql_in
index 3bc32cc..258be29 100644
--- a/src/ports/postgres/modules/validation/test/cross_validation.sql_in
+++ b/src/ports/postgres/modules/validation/test/cross_validation.sql_in
@@ -5,7 +5,6 @@ m4_changequote(`<!', `!>')
  * Test Cross Validation (for ridge regression)
  * -------------------------------------------------------------------------- */
 
-DROP TABLE IF EXISTS "Lin_housing_wi";
 CREATE TABLE "Lin_housing_wi" ("X" float8[], "Y" float8);
 COPY "Lin_housing_wi" FROM STDIN NULL '?';
 {1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98}	24.00
@@ -516,7 +515,6 @@ COPY "Lin_housing_wi" FROM STDIN NULL '?';
 {1,0.04741,0.00,11.930,0,0.5730,6.0300,80.80,2.5050,1,273.0,21.00,396.90,7.88}	11.90
 \.
 
-DROP TABLE IF EXISTS log_breast_cancer_wisconsin;
 CREATE TABLE log_breast_cancer_wisconsin (x float8[],y boolean);
 COPY log_breast_cancer_wisconsin FROM STDIN NULL '?' ;
 {1,5,1,1,1,2,1,3,1,1}	false
@@ -1368,7 +1366,6 @@ select check_cv0();
 -- select check_cv_ridge();
 
 m4_ifdef(<!__HAWQ__!>, <!!>, <!
-DROP TABLE IF EXISTS houses;
 CREATE TABLE houses (
     id SERIAL NOT NULL,
     tax INTEGER,
@@ -1396,7 +1393,6 @@ INSERT INTO houses(tax, bedroom, bath, price, size, lot) VALUES
 (2070, 2, 3,   148000, 1550, 14000),
 ( 650, 3, 1.5,  65000, 1450, 12000);
 
-DROP TABLE IF EXISTS valid_rst_houses CASCADE;
 SELECT cross_validation_general(
     'MADLIB_SCHEMA.elastic_net_train',   -- modelling_func
     '{%data%, %model%, (price>100000), "array[tax, bath, size]", binomial, 1, lambda, TRUE, NULL, fista, "{eta = 2, max_stepsize = 2, use_active_set = t}", NULL, 2000, 1e-6}'::varchar[],  -- modeling_params