Posted to commits@madlib.apache.org by xt...@apache.org on 2016/02/23 23:35:24 UTC

incubator-madlib git commit: SVM: Update docs and online help

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 018e159c0 -> ee035efd7


SVM: Update docs and online help

JIRA: MADLIB-956

- Moved epsilon and eps_table into the optim_params string of the model
  summary table (previously dedicated columns)
- Online help for prediction using an SVM model is available through
  svm_predict()
- Examples are available as part of the online help, e.g.,
  svm_regression('example'); see the snippet below
- Epsilon value checking (one test) is removed from the install check
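
For reference, the new help entry points can be invoked as below (this
sketch assumes MADlib is installed in a schema named "madlib"):

    SELECT madlib.svm_predict();            -- summary for prediction
    SELECT madlib.svm_predict('usage');     -- argument reference
    SELECT madlib.svm_predict('example');   -- worked example
    SELECT madlib.svm_regression('example');
    SELECT madlib.svm_classification('example');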

Closes #19


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ee035efd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ee035efd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ee035efd

Branch: refs/heads/master
Commit: ee035efd75aac95a5d2293bfa62478fbe86b4385
Parents: 018e159
Author: Xiaocheng Tang <xi...@gmail.com>
Authored: Tue Feb 23 14:30:51 2016 -0800
Committer: Xiaocheng Tang <xi...@gmail.com>
Committed: Tue Feb 23 14:30:51 2016 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/svm/svm.py_in       | 219 +++++++++++++++++++-
 src/ports/postgres/modules/svm/svm.sql_in      | 195 ++++++++++-------
 src/ports/postgres/modules/svm/test/svm.sql_in |  15 --
 3 files changed, 338 insertions(+), 91 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ee035efd/src/ports/postgres/modules/svm/svm.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in
index 4c5ac2c..9b16f83 100644
--- a/src/ports/postgres/modules/svm/svm.py_in
+++ b/src/ports/postgres/modules/svm/svm.py_in
@@ -234,19 +234,147 @@ def _build_output_tables(n_iters_run, model_table, args, transformer, **kwargs):
                 'init_stepsize={init_stepsize}, '   ||
                     'decay_factor={decay_factor}, ' ||
                     'max_iter={max_iter}, '         ||
-                    'tolerance={tolerance}'::text   AS optim_params,
+                    'tolerance={tolerance}, '       ||
+                    'epsilon={epsilon}, '           ||
+                    'eps_table={eps_table}'::text   AS optim_params,
                 'lambda={lambda}, norm={norm}, n_folds={n_folds}'::text
                                                     AS reg_params,
                 count(*)::integer                   AS num_all_groups,
                 {n_failed_groups}::integer                          AS num_failed_groups,
                 sum(num_rows_processed)::bigint     AS total_rows_processed,
-                sum(num_rows_skipped)::bigint       AS total_rows_skipped,
-                '{epsilon}'::double precision       AS epsilon,
-                '{eps_table}'::text                 AS eps_table
+                sum(num_rows_skipped)::bigint       AS total_rows_skipped
             FROM {model_table};
             """.format(**args))
 
 
+def svm_predict_help(schema_madlib, message, **kwargs):
+    args = dict(schema_madlib=schema_madlib)
+
+    summary = """
+    ----------------------------------------------------------------
+                            SUMMARY
+    ----------------------------------------------------------------
+    Prediction for SVM is used to estimate the value of the dependent
+    variable given the independent variables: a class label for
+    classification models, or a continuous value for regression models.
+
+    For more details on function usage:
+        SELECT {schema_madlib}.svm_predict('usage')
+
+    For a small example on using the function:
+        SELECT {schema_madlib}.svm_predict('example')
+    """.format(**args)
+
+    usage = """
+    ---------------------------------------------------------------------------
+                                 PREDICTION
+    ---------------------------------------------------------------------------
+    The prediction function is used to estimate the conditional mean given a
+    new predictor. It has the following syntax:
+
+    SELECT {schema_madlib}.svm_predict(
+        model_table,        -- TEXT. Model table produced by
+                               the training function.
+        new_data_table,     -- TEXT. Name of the table containing the
+                               prediction data. This table is expected to
+                               contain the same features that were used during
+                               training. The table should also contain
+                               id_col_name used for identifying each row.
+        id_col_name,        -- TEXT. The name of the id column in
+                               the input table.
+        output_table        -- TEXT. Name of the table where output
+                               predictions are written. If this table name is
+                               already in use, then an error is returned. The
+                               table contains the id_col_name column giving
+                               the 'id' for each prediction and the prediction
+                               columns for the dependent variable.
+    );
+    """.format(**args)
+
+    example_usage = """
+    ---------------------------------------------------------------------------
+                                  EXAMPLES
+    ---------------------------------------------------------------------------
+    - Create an input data set.
+
+    CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
+                size INT, lot INT);
+    COPY houses FROM STDIN WITH DELIMITER '|';
+      1 |  590 |       2 |    1 |  50000 |  770 | 22100
+      2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
+      3 |   20 |       3 |    1 |  22500 | 1060 |  3500
+      4 |  870 |       2 |    2 |  90000 | 1300 | 17500
+      5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
+      6 | 1350 |       2 |    1 |  90500 |  820 | 25700
+      7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
+      8 |  680 |       2 |    1 | 142500 | 1170 | 22000
+      9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
+     10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
+     11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
+     12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
+     13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
+     14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
+     15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
+    \.
+
+    - Train a classification model, using the default linear kernel.
+
+    SELECT {schema_madlib}.svm_classification('houses',
+                                     'houses_svm',
+                                     'price < 100000',
+                                     'ARRAY[1, tax, bath, size]');
+
+    - Generate a nonlinear model using a Gaussian kernel. This time we
+      specify the initial step size and the maximum number of iterations.
+      As part of the kernel parameters, we choose 10 as the dimension of
+      the feature space in which the SVM is trained. A larger number leads
+      to a more powerful model but runs the risk of overfitting. The model
+      will be a 10-dimensional vector, instead of 4 as for the linear model.
+
+    SELECT {schema_madlib}.svm_classification( 'houses',
+                                      'houses_svm_gaussian',
+                                      'price < 100000',
+                                      'ARRAY[1, tax, bath, size]',
+                                      'gaussian',
+                                      'n_components=10',
+                                      '',
+                                      'init_stepsize=1, max_iter=200');
+
+    - Use the prediction function to evaluate the models. The predicted
+      results are in the prediction column and the actual data is in the
+      target column.
+
+    -- For the linear model:
+    SELECT {schema_madlib}.svm_predict('houses_svm',
+                                       'houses',
+                                       'id',
+                                       'houses_pred');
+    SELECT *, price < 100000 AS target
+    FROM houses JOIN houses_pred
+    USING (id) ORDER BY id;
+
+    -- For the Gaussian model:
+    SELECT {schema_madlib}.svm_predict('houses_svm_gaussian',
+                                       'houses',
+                                       'id',
+                                       'houses_pred_gaussian');
+    SELECT *, price < 100000 AS target
+    FROM houses JOIN houses_pred_gaussian
+    USING (id) ORDER BY id;
+    """.format(**args)
+
+    if not message:
+        return summary
+    elif message.lower() in ('usage', 'help', '?'):
+        return usage
+    elif message.lower() == 'example':
+        return example_usage
+    else:
+        return """
+            No such option. Use "SELECT {schema_madlib}.svm_predict()" for help.
+        """.format(**args)
+
+
 def svm_help(schema_madlib, message, is_svc, **kwargs):
     method = 'svm_classification' if is_svc else 'svm_regression'
 
@@ -265,6 +393,9 @@ def svm_help(schema_madlib, message, is_svc, **kwargs):
 
     For more details on function usage:
         SELECT {schema_madlib}.{method}('usage')
+
+    For a small example on using the function:
+        SELECT {schema_madlib}.{method}('example')
         """.format(**args)
 
     usage = """
@@ -327,6 +458,12 @@ def svm_help(schema_madlib, message, is_svc, **kwargs):
                                        number of iterations or hit the maximum
                                        number specified in the
                                        optimization parameters.
+    loss                FLOAT8,     -- value of the objective function of
+                                       SVM.  See Technical Background section
+                                       below for more details.
+    norm_of_gradient    FLOAT8,     -- value of the L2-norm of the
+                                       (sub)-gradient of the objective
+                                       function.
     __dep_var_mapping   TEXT[],     -- vector of dependent variable labels.
                                        The first entry will correspond to -1
                                        and the second to +1, for internal use.
@@ -433,10 +570,84 @@ def svm_help(schema_madlib, message, is_svc, **kwargs):
     random_state        -- Default: 1. Seed used by the random number generator.
     """
 
+    example_usage = """
+    ---------------------------------------------------------------------------
+                                  EXAMPLES
+    ---------------------------------------------------------------------------
+    - Create an input data set.
+
+    CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
+                size INT, lot INT);
+    COPY houses FROM STDIN WITH DELIMITER '|';
+      1 |  590 |       2 |    1 |  50000 |  770 | 22100
+      2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
+      3 |   20 |       3 |    1 |  22500 | 1060 |  3500
+      4 |  870 |       2 |    2 |  90000 | 1300 | 17500
+      5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
+      6 | 1350 |       2 |    1 |  90500 |  820 | 25700
+      7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
+      8 |  680 |       2 |    1 | 142500 | 1170 | 22000
+      9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
+     10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
+     11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
+     12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
+     13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
+     14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
+     15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
+    \.
+
+    - Train a classification model, using the default linear kernel.
+
+    SELECT {schema_madlib}.svm_classification('houses',
+                                     'houses_svm',
+                                     'price < 100000',
+                                     'ARRAY[1, tax, bath, size]');
+
+    - Generate a nonlinear model using a Gaussian kernel. This time we
+      specify the initial step size and the maximum number of iterations.
+      As part of the kernel parameters, we choose 10 as the dimension of
+      the feature space in which the SVM is trained. A larger number leads
+      to a more powerful model but runs the risk of overfitting. The model
+      will be a 10-dimensional vector, instead of 4 as for the linear model.
+
+    SELECT {schema_madlib}.svm_classification( 'houses',
+                                      'houses_svm_gaussian',
+                                      'price < 100000',
+                                      'ARRAY[1, tax, bath, size]',
+                                      'gaussian',
+                                      'n_components=10',
+                                      '',
+                                      'init_stepsize=1, max_iter=200');
+
+    - Use the prediction function to evaluate the models. The predicted
+      results are in the prediction column and the actual data is in the
+      target column.
+
+    -- For the linear model:
+    SELECT {schema_madlib}.svm_predict('houses_svm',
+                                       'houses',
+                                       'id',
+                                       'houses_pred');
+    SELECT *, price < 100000 AS target
+    FROM houses JOIN houses_pred
+    USING (id) ORDER BY id;
+
+    -- For the Gaussian model:
+    SELECT {schema_madlib}.svm_predict('houses_svm_gaussian',
+                                       'houses',
+                                       'id',
+                                       'houses_pred_gaussian');
+    SELECT *, price < 100000 AS target
+    FROM houses JOIN houses_pred_gaussian
+    USING (id) ORDER BY id;
+    """.format(**args)
+
     if not message:
         return summary
     elif message.lower() in ('usage', 'help', '?'):
         return usage
+    elif message.lower() == 'example':
+        return example_usage
     elif message.lower() == 'params':
         return params_usage
     elif message.lower() == 'gaussian':
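
The n_components kernel parameter mentioned in the help text above is the
dimension of the random feature space described in references [2,3]. Below
is a minimal Python sketch of that idea, assuming NumPy is available; all
names are illustrative and this is not the MADlib implementation, which
lives in svm.py_in and the <model_table>_random table:

    # Approximate a Gaussian kernel exp(-gamma * ||x - y||^2) with
    # random Fourier features (Rahimi & Recht); a linear SVM trained on
    # the transformed features then approximates a Gaussian-kernel SVM.
    import numpy as np

    def random_feature_map(X, gamma, n_components, random_state=1):
        """Map X (n x d) into a space whose ordinary inner product
        approximates the Gaussian kernel."""
        rng = np.random.RandomState(random_state)
        d = X.shape[1]
        # W entries ~ N(0, 2*gamma), b ~ Uniform[0, 2*pi)
        W = rng.normal(0.0, np.sqrt(2.0 * gamma), size=(d, n_components))
        b = rng.uniform(0.0, 2.0 * np.pi, size=n_components)
        return np.sqrt(2.0 / n_components) * np.cos(X.dot(W) + b)

    x = np.array([1.0, 2.0])
    y = np.array([1.5, 1.0])
    Z = random_feature_map(np.vstack([x, y]), gamma=0.5, n_components=2000)
    print(Z[0].dot(Z[1]))                       # approximate kernel value
    print(np.exp(-0.5 * np.sum((x - y) ** 2)))  # exact kernel value

A larger n_components gives a better approximation of the kernel at the
cost of memory and training time, which matches the trade-off described
in the help text.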

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ee035efd/src/ports/postgres/modules/svm/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/svm.sql_in b/src/ports/postgres/modules/svm/svm.sql_in
index 74df817..e7af48a 100644
--- a/src/ports/postgres/modules/svm/svm.sql_in
+++ b/src/ports/postgres/modules/svm/svm.sql_in
@@ -31,20 +31,20 @@ m4_include(`SQLCommon.m4')
 Support Vector Machines (SVMs) are models for regression and classification
 tasks. SVM models have two particularly desirable features: robustness in the
 presence of noisy data and applicability to a variety of data configurations. At
-its core, a <em>linear</em> SVM model is a hyperplane separating the two
+its core, a <em>linear</em> SVM model is a hyperplane separating two
 distinct classes of data (in the case of classification problems), in such a way
 that the distance between the hyperplane and the nearest training data point
 (called the <em>margin</em>) is maximized. Vectors that lie on this margin are
 called support vectors. With the support vectors fixed, perturbations of vectors
-beyond the margin will not affect the model; this contributes to the models
-robutstness.  Subsituting a kernel function for the usual inner product, one can
-approximate a large variety of decision boundaries beyond linear hyperplanes.
-@brief Solves classification and regression problems by separating the data with
-a hyperplane or other nonlinear decision boundaries.
+beyond the margin will not affect the model; this contributes to the model's
+robustness.  By substituting a kernel function for the usual inner product, one can
+approximate a large variety of decision boundaries in addition to linear hyperplanes.
+@brief Solves classification and regression problems by separating data with
+a hyperplane or other nonlinear decision boundary.
 
 @anchor svm_classification
 @par Classification Training Function
-SVM classification training function has the following format:
+The SVM classification training function has the following format:
 <pre class="syntax">
 svm_classification(
     source_table,
@@ -61,11 +61,11 @@ svm_classification(
 \b Arguments
 <DL class="arglist">
   <DT>source_table</DT>
-  <DD>TEXT. The name of the table containing the training data.</DD>
+  <DD>TEXT. Name of the table containing the training data.</DD>
 
   <DT>model_table</DT>
   <DD>TEXT. Name of the output table containing the model. Details of the output
-   tables provided below.
+   tables are provided below.
   </DD>
 
   <DT>dependent_varname</DT>
@@ -78,32 +78,32 @@ svm_classification(
   <DD>TEXT. Expression list to evaluate for the
     independent variables. An intercept variable is not assumed. It is common to
     provide an explicit intercept term by including a single constant \c 1 term
-    in the independent variable list. Expression should be able to be casted
+    in the independent variable list. The expression should be castable
     into DOUBLE PRECISION [].
     </DD>
 
   <DT>kernel_func (optional)</DT>
   <DD>TEXT, default: 'linear'.
-    The type of kernel. Currently, three types are supported: 'linear',
+    Type of kernel. Currently three kernel types are supported: 'linear',
     'gaussian', and 'polynomial'. The text can be any subset of the three
-    strings; for eg. kernel_func='ga' will create a Gaussian kernel.
+    strings; e.g., kernel_func='ga' will create a Gaussian kernel.
     </DD>
 
   <DT>kernel_params (optional)</DT>
-  <DD>TEXT, defaults: NULL
-   <br>Parameters for non-linear kernel in a comma-separated string of key-value pairs.
-   The parameters differ depending on the value of \e kernel_func. See the description below for details.
+  <DD>TEXT, defaults: NULL.
+    Parameters for non-linear kernel in a comma-separated string of key-value pairs.
+    The actual parameters differ depending on the value of \e kernel_func. See the description below for details.
   </DD>
 
   <DT>grouping_col (optional)</DT>
   <DD>TEXT, default: NULL. An expression list used to group
-    the input dataset into discrete groups, running one model per group.
+    the input dataset into discrete groups, which results in running one model per group.
     Similar to the SQL "GROUP BY" clause. When this value is NULL, no
-    grouping is used and a single model is generated.</DD>
+    grouping is used and a single model is generated.  Please note that cross validation is not supported if grouping is used.</DD>
 
   <DT>params (optional)</DT>
   <DD>TEXT, default: NULL.
-    <br> Parameters for optimization and regularization in a comma-separated string of key-value pairs. If a list of values are provided, then cross-validation will be performed to select the \e best value from the list.
+    Parameters for optimization and regularization in a comma-separated string of key-value pairs. If a list of values is provided, then cross-validation will be performed to select the \e best value from the list.
     See the description below for details. </DD>
 
   <DT>verbose (optional)</DT>
@@ -113,11 +113,11 @@ svm_classification(
 
 <b>Output tables</b>
 <br>
-    The model table produced by svm contains the following columns:
+    The model table produced by SVM contains the following columns:
     <table class="output">
       <tr>
         <th>coef</th>
-        <td>FLOAT8. Vector of the coefficients.</td>
+        <td>FLOAT8. Vector of coefficients.</td>
       </tr>
       <tr>
         <th>grouping_key</th>
@@ -138,13 +138,23 @@ svm_classification(
         or hit the maximum number specified in the optimization parameters. </td>
       </tr>
       <tr>
+        <th>loss</th>
+        <td>FLOAT8. Value of the objective function of SVM.  See Technical Background section below for more details.</td>
+      </tr>
+      <tr>
+        <th>norm_of_gradient</th>
+        <td>FLOAT8. Value of the L2-norm of the (sub)-gradient of the objective function.</td>
+      </tr>
+      <tr>
         <th>__dep_var_mapping</th>
-        <td>TEXT[]. Vector of dependent variable labels. The first entry will
-        correspond to -1 and the second to +1, for internal use.</td>
+        <td>TEXT[]. Vector of dependent variable labels. The first entry
+        corresponds to -1 and the second to +1.  For internal use only.</td>
       </tr>
     </table>
 
-    An auxiliary table named \<model_table\>_random is created if the kernel is not linear. It contains data needed to embed test data into random feature space (see reference [2,3]). This data is used internally by svm_predict and not meaningful on its own. And a summary table named \<model_table\>_summary is also created at the same time, which has the following columns:
+    An auxiliary table named \<model_table\>_random is created if the kernel is not linear. It contains data needed to embed test data into a random feature space (see references [2,3]). This data is used internally by svm_predict and is not meaningful on its own, so you can ignore it.
+
+A summary table named \<model_table\>_summary is also created, which has the following columns:
     <table class="output">
     <tr>
         <th>method</th>
@@ -152,7 +162,7 @@ svm_classification(
     </tr>
     <tr>
         <th>version_number</th>
-        <td>Version of madlib which was used to generate the model.</td>
+        <td>Version of MADlib which was used to generate the model.</td>
     </tr>
     <tr>
         <th>source_table</th>
@@ -192,11 +202,11 @@ svm_classification(
     </tr>
     <tr>
         <th>num_all_groups</th>
-        <td>Number of groups in svm training.</td>
+        <td>Number of groups in SVM training.</td>
     </tr>
     <tr>
         <th>num_failed_groups</th>
-        <td>Number of failed groups in svm training.</td>
+        <td>Number of failed groups in SVM training.</td>
     </tr>
     <tr>
       <th>total_rows_processed</th>
@@ -212,7 +222,7 @@ svm_classification(
 
 @anchor svm_regression
 @par Regression Training Function
-SVM regression training function has the following format:
+The SVM regression training function has the following format:
 <pre class="syntax">
 svm_regression(source_table,
     model_table,
@@ -228,20 +238,20 @@ svm_regression(source_table,
 \b Arguments
 
 Specifications for regression are largely the same as for classification. In the
-model table, there is no dependent variable mapping. Also, the following
+model table, there is no dependent variable mapping. The following
 arguments have specifications which differ from svm_classification:
 
 <DL class="arglist">
 
 <DT>dependent_varname</DT>
   <DD>TEXT. Name of the dependent variable column. For regression, this column
-  can contain only values or expressions that can be cast as DOUBLE PRECISION.
+  can contain only values or expressions that can be cast to DOUBLE PRECISION.
   Otherwise, an error will be thrown.
   </DD>
 
 <DT>params (optional)</DT>
 <DD>TEXT, default: NULL.
-The parameters \e epsilon and \e eps_table are only meaningful for regression. See descriptions below for more details.
+The parameters \e epsilon and \e eps_table are only meaningful for regression. See description below for more details.
 </DD>
 </DL>
 
@@ -253,7 +263,7 @@ list of name-value pairs. All of these named parameters are optional, and
 their order does not matter. You must use the format "<param_name> = <value>"
 to specify the value of a parameter, otherwise the parameter is ignored.
 
-When the \ref svm_classification() \e kernel_func argument value is \b 'gaussian', the \e kernel_params argument is a string containing name-value pairs with the following format. (Line breaks are inserted for readability.)
+When the \ref svm_classification() \e kernel_func argument value is 'gaussian', the \e kernel_params argument is a string containing name-value pairs with the following format. (Line breaks are inserted for readability.)
 <pre class="syntax">
   'gamma = &lt;value>,
    n_components = &lt;value>,
@@ -262,17 +272,17 @@ When the \ref svm_classification() \e kernel_func argument value is \b 'gaussian
 \b Parameters
 <DL class="arglist">
 <DT>gamma</dt>
-<DD>Default: 1/num_features. The parameter \f$\gamma\f$ in the Radius Basis Function kernel, i.e., \f$\exp(-\gamma||x-y||^2)\f$. Choosing a proper value for \e gamma is critical to the performance of kernel machine, e.g., while a large \e gamma tends to cause overfitting, a small \e gamma will make the model too constrained to capture the complexity of the data.
-.</DD>
+<DD>Default: 1/num_features. The parameter \f$\gamma\f$ in the Radial Basis Function kernel, i.e., \f$\exp(-\gamma||x-y||^2)\f$. Choosing a proper value for \e gamma is critical to the performance of the kernel machine; e.g., while a large \e gamma tends to cause overfitting, a small \e gamma will make the model too constrained to capture the complexity of the data.
+</DD>
 
 <DT>n_components</DT>
-<DD>Default: 2*num_features. The dimensionality of the transformed feature space. A larger value lowers the variance of the estimate of kernel but requires more memory and takes longer to train.</DD>
+<DD>Default: 2*num_features. The dimensionality of the transformed feature space. A larger value lowers the variance of the estimate of the kernel but requires more memory and takes longer to train.</DD>
 
 <DT>random_state</DT>
 <DD>Default: 1. Seed used by the random number generator. </DD>
 </DL>
 
-When the \ref svm_classification() \e kernel_func argument value is \b 'poly', the \e kernel_params argument is a string containing name-value pairs with the following format. (Line breaks are inserted for readability.)
+When the \ref svm_classification() \e kernel_func argument value is 'polynomial', the \e kernel_params argument is a string containing name-value pairs with the following format. (Line breaks are inserted for readability.)
 <pre class="syntax">
   'coef0 = &lt;value>,
    degree = &lt;value>,
@@ -282,7 +292,7 @@ When the \ref svm_classification() \e kernel_func argument value is \b 'poly', t
 \b Parameters
 <DL class="arglist">
 <DT>coef0</dt>
-<DD>Default: 1.0. The independent term \f$q\f$ in \f$ (\langle x,y\rangle + q)^r \f$. Must be larger or equal to 0. When it is 0, the polynomial kernel is in homogeneous form.
+<DD>Default: 1.0. The independent term \f$q\f$ in \f$ (\langle x,y\rangle + q)^r \f$. Must be larger than or equal to 0. When it is 0, the polynomial kernel is in homogeneous form.
 </DD>
 
 <DT>degree</dt>
@@ -299,14 +309,15 @@ When the \ref svm_classification() \e kernel_func argument value is \b 'poly', t
 
 @anchor parameters
 @par Other Parameters
-Parameters in this sections are supplied in \e params argument as a string containing a comma-delimited
-list of name-value pairs. Hyperparameter optimization can be carried out through the built-in cross validation mechanism, which is activated by assigning a value greater than 1 to the parameter \e n_folds in \e params.
-All of these named parameters are optional, and their order does not matter.
-You must use the format "<param_name> = <value>" to specify the value of a parameter, otherwise the parameter is ignored.
-The validating values of the parameter are provided in a list, i.e., "lambda = [0.01, 0.1, 1]".
-For example, if one wanted to regularize with the L1 norm and use a lambda value from the set {0.3, 0.4, 0.5}, one would input 'lambda={0.3, 0.4, 0.5}, norm=L1, n_folds=10' in \e params. Note that the use of '{}' and '[]' are both valid here.
-
-Not all parameters below can be cross-validated. And for those who do, their default values, as described below, are specified in a list, i.e., [0.01].
+Parameters in this section are supplied in the \e params argument as a string containing a comma-delimited
+list of name-value pairs. All of these named parameters are optional, and
+their order does not matter. You must use the format "<param_name> = <value>"
+to specify the value of a parameter, otherwise the parameter is ignored.
+
+Hyperparameter optimization can be carried out using the built-in cross validation mechanism, which is activated by assigning a value greater than 1 to the parameter \e n_folds in \e params. Please note that cross validation is not supported if grouping is used.
+The values of a parameter to cross validate should be provided in a list. For example, if one wanted to regularize with the L1 norm and use a lambda value from the set {0.3, 0.4, 0.5}, one might input 'lambda={0.3, 0.4, 0.5}, norm=L1, n_folds=10' in \e params. Note that both '{}' and '[]' are valid here.
+
+Please note that not all of the parameters below can be cross-validated. For parameters where cross validation is allowed, their default values are presented in list format; e.g., [0.01].
 <pre class="syntax">
   'init_stepsize = &lt;value>,
    decay_factor = &lt;value>,
@@ -323,7 +334,7 @@ Not all parameters below can be cross-validated. And for those who do, their def
 <DL class="arglist">
 
 <DT>init_stepsize</dt>
-<DD>Default: [0.01]. Also known as the inital learning rate. A small value is usually desirable to ensure convergence, while a large value provides more room for progress during training. Since the best value depends on the condition number of the data, in practice one can search in a exponential grid using the built-in cross validation, i.e., "init_stepsize = [1, 0.1, 0.001]". This can be done on a subsampled dataset which usually provides a good estimate of the condition number of the whole dataset.
+<DD>Default: [0.01]. Also known as the initial learning rate. A small value is usually desirable to ensure convergence, while a large value provides more room for progress during training. Since the best value depends on the condition number of the data, in practice one often searches in an exponential grid using built-in cross validation; e.g., "init_stepsize = [1, 0.1, 0.001]". To reduce training time, it is common to run cross validation on a subsampled dataset, since this usually provides a good estimate of the condition number of the whole dataset. The resulting \e init_stepsize can then be used on the whole dataset.
 </DD>
 
 <DT>decay_factor</DT>
@@ -335,11 +346,11 @@ Not all parameters below can be cross-validated. And for those who do, their def
 </DD>
 
 <DT>tolerance</dt>
-<DD>Default: 1e-10. The criteria to end iterations. The training stops whenever the difference between the training models of two consecutive iterations is smaller than \e tolerance or the iteration number is larger than \e max_iter.
+<DD>Default: 1e-10. The criterion to end iterations. The training stops whenever the difference between the training models of two consecutive iterations is smaller than \e tolerance or the iteration number is larger than \e max_iter.
 </DD>
 
 <DT>lambda</dt>
-<DD>Default: [0.01]. Regularization parameter, positive.
+<DD>Default: [0.01]. Regularization parameter. Must be positive (cannot be 0).
 </DD>
 
 <DT>norm</dt>
@@ -358,25 +369,24 @@ data, and decrease with the number of samples. See [5].
 
 <DT>eps_table</dt>
 <DD>Default: NULL.
-Name of the table that contains values of epsilon for different groups. Ignored when \e grouping_col is NULL.
-The table consists of the column named epsilon which specifies the epsilon values, as well as those columns used in \e grouping_col. Extra groups are ignored, and groups not present in this table will use the epsilon value specified in parameter \e epsilon.
+Name of the input table that contains values of epsilon for different groups. Ignored when \e grouping_col is NULL.  Define this input table if you want different epsilon values for different groups.  The table consists of a column named \e epsilon which specifies the epsilon values, and one or more columns for \e grouping_col. Extra groups are ignored, and groups not present in this table will use the epsilon value specified in parameter \e epsilon.
 </DD>
 
 <DT>validation_result</dt>
 <DD>Default: NULL.
-Name of the table to store the cross validation results including the values of parameters and their averaged error values. For now simple metric like 0-1 loss is used for classification and mean square error is used for regression. The table is only created if the name is not NULL.
+Name of the table to store the cross validation results, including the values of parameters and their averaged error values. For now, a simple metric is used: 0-1 loss for classification and mean squared error for regression. The table is only created if the name is not NULL.
 </DD>
 
 <DT>n_folds</dt>
 <DD>Default: 0.
-Number of folds. Must be at least 2 to activate cross validation. If a value of k > 2 is specified, each fold is then used as a validation set once while the k - 1 remaining fold form the training set.
+Number of folds (k). Must be at least 2 to activate cross validation. If a value of k >= 2 is specified, each fold is then used as a validation set once, while the other k - 1 folds form the training set.
 </DD>
 </DL>
 
 
 @anchor predict
 @par Prediction Function
-The prediction function is provided to estimate the conditional mean given a new
+The prediction function is used to estimate the conditional mean given a new
 predictor. It has the following syntax:
 <pre class="syntax">
 svm_predict(model_table,
@@ -391,7 +401,7 @@ svm_predict(model_table,
   <DD>TEXT. Model table produced by the training function.</DD>
 
   <DT>new_data_table</DT>
-  <DD>TEXT. Name of the table containing prediction data. This table is expected
+  <DD>TEXT. Name of the table containing the prediction data. This table is expected
   to contain the same features that were used during training. The table should
   also contain id_col_name used for identifying each row.</DD>
 
@@ -399,9 +409,8 @@ svm_predict(model_table,
   <DD>TEXT. The name of the id column in the input table.</DD>
 
   <DT>output_table</DT>
-
-  <DD>TEXT. Name of the table to which output predictions are written. If this
-table name is already in use then an error is returned. The table contains the
+  <DD>TEXT. Name of the table where output predictions are written. If this
+table name is already in use, then an error is returned. The table contains the
 id_col_name column giving the 'id' for each prediction and the prediction
 columns for the dependent variable.</DD>
 </DL>
@@ -430,7 +439,7 @@ COPY houses FROM STDIN WITH DELIMITER '|';
  15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
 \\.
 </pre>
--#  Train a classification model. First, a linear model.
+-#  Train a classification model. First, use a linear model.
 <pre class="example">
 SELECT madlib.svm_classification('houses',
                                  'houses_svm',
@@ -438,19 +447,19 @@ SELECT madlib.svm_classification('houses',
                                  'ARRAY[1, tax, bath, size]'
                            );
 </pre>
--# Generate a nonlinear model using gaussian kernel. This time we specify the intital step size and maximum number of iteration to run. As part of the kernel parameter, we choose 10 as the dimension of the space where we train svm. A larger number will lead to a more powerful model but run the risk of overfitting. As a result, the model will be a 10 dimensional vector, instead of 4 as in the case of linear model, which we will verify when we examine the models.
+-# Next, generate a nonlinear model using a Gaussian kernel. This time we specify the initial step size and the maximum number of iterations to run. As part of the kernel parameters, we choose 10 as the dimension of the feature space in which the SVM is trained. A larger number leads to a more powerful model but runs the risk of overfitting. As a result, the model will be a 10-dimensional vector, instead of 4 as for the linear model, which we will verify when we examine the models.
 <pre class="example">
-SELECT madlib.linregr_train( 'houses',
-                             'houses_svm_gaussian',
-                             'price < 100000',
-                             'ARRAY[1, tax, bath, size]',
-                             'gaussian',
-                             'n_components=10',
-                             '',
-                             'init_stepsize=1, max_iter=200'
+SELECT madlib.svm_classification( 'houses',
+                                  'houses_svm_gaussian',
+                                  'price < 100000',
+                                  'ARRAY[1, tax, bath, size]',
+                                  'gaussian',
+                                  'n_components=10',
+                                  '',
+                                  'init_stepsize=1, max_iter=200'
                            );
 </pre>
--# Examine the resulting models.
+-# View the result for the linear model.
 <pre class="example">
 -- Set extended display on for easier reading of output
 \\x ON
@@ -468,7 +477,7 @@ num_rows_skipped   | 0
 dep_var_mapping    | [False, True]
 </pre>
 
--# View the results from kernel svm.
+-# View the results from kernel SVM.
 <pre class="example">
 -- Set extended display on for easier reading of output
 \\x ON
@@ -490,13 +499,34 @@ dep_var_mapping    | [False, True]
 For the linear model:
 <pre class="example">
 SELECT madlib.svm_predict('houses_svm', 'houses', 'id', 'houses_pred');
-SELECT *, price < 100000 AS target FROM houses JOIN houses_pred USING (id)
+SELECT *, price < 100000 AS target FROM houses JOIN houses_pred USING (id) ORDER BY id;
 </pre>
-and for the gaussian model:
+For the Gaussian model:
 <pre class="example">
 SELECT madlib.svm_predict('houses_svm_gaussian', 'houses', 'id', 'houses_pred_gaussian');
-SELECT *, price < 100000 AS target FROM houses JOIN houses_pred_gaussian USING (id)
+SELECT *, price < 100000 AS target FROM houses JOIN houses_pred_gaussian USING (id) ORDER BY id;
+</pre>
+Result for the Gaussian model:
+<pre class="result">
+ id | tax  | bedroom | bath | price  | size |  lot  | prediction | target
+----+------+---------+------+--------+------+-------+------------+--------
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | t          | t
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | t          | t
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | t          | t
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 | t          | t
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | f          | f
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | t          | t
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | f          | f
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | f          | f
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | f          | f
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 | f          | f
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | t          | t
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | f          | f
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | f          | f
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | f          | f
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | t          | t
 </pre>
+Note that the result may vary somewhat with the platform configuration you are using.
 
 
 @anchor background
@@ -524,7 +554,7 @@ the feature space approximates the kernel function in the input space. The
 linear SVM training function is then run on the resulting data. See the papers
 [2,3] for more information on random feature maps.
 
-Also, see the book [4] by Scholkopf and Smola  for more details of SVMs in general.
+Also, see the book [4] by Scholkopf and Smola for more details on SVMs in general.
 
 @anchor literature
 @literature
@@ -850,6 +880,20 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 /**
  * @brief Help function
  */
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict(
+   message TEXT
+) RETURNS TEXT AS $$
+PythonFunction(svm, svm, svm_predict_help)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict()
+RETURNS TEXT AS $$
+  SELECT MADLIB_SCHEMA.svm_predict(NULL::TEXT)
+$$ LANGUAGE SQL IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification (
     message  TEXT
 ) RETURNS TEXT AS $$
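
The eps_table parameter documented above holds per-group epsilon values
for SVM regression, but its shape is easy to miss. A minimal hedged
sketch follows, assuming a single grouping column named "zipcode"; the
table and column names here are illustrative, not part of the commit:

    -- One epsilon per group; the table must contain a column named
    -- epsilon plus the column(s) used in grouping_col.
    CREATE TABLE houses_eps (zipcode INT, epsilon DOUBLE PRECISION);
    INSERT INTO houses_eps VALUES (94301, 0.02), (94302, 0.05);

    -- Groups absent from houses_eps fall back to the scalar epsilon.
    SELECT madlib.svm_regression('houses', 'houses_svr', 'price',
                                 'ARRAY[1, tax, bath, size]',
                                 NULL, NULL,
                                 'zipcode',
                                 'epsilon=0.1, eps_table=houses_eps');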

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ee035efd/src/ports/postgres/modules/svm/test/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/test/svm.sql_in b/src/ports/postgres/modules/svm/test/svm.sql_in
index 7fdc26c..48b8bab 100644
--- a/src/ports/postgres/modules/svm/test/svm.sql_in
+++ b/src/ports/postgres/modules/svm/test/svm.sql_in
@@ -134,21 +134,6 @@ FROM
         WHERE train.id = test.id
     ) AS subq;
 
--- by default using epsilon == 0.1
-DROP TABLE IF EXISTS svr_model, svr_model_summary;
-SELECT svm_regression(
-     'svr_train_data',
-     'svr_model',
-     'label',
-     'ind',
-     NULL,
-     NULL,
-     NULL,
-     'init_stepsize=1, max_iter=10, lambda=2');
-SELECT
-    assert(epsilon > 0,'default epsilon is positive!')
-FROM svr_model_summary;
-
 -- Example usage for LINEAR classification, replace the above by
 SELECT svm_classification(
     'svm_train_data',
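
Since the removed install-check test asserted on a dedicated epsilon
column of the summary table, note that after this commit the value is
reported inside the summary table's parameter strings instead. A hedged
sketch of the replacement query, reusing the table names from the removed
test:

    DROP TABLE IF EXISTS svr_model, svr_model_summary;
    SELECT svm_regression(
        'svr_train_data',
        'svr_model',
        'label',
        'ind',
        NULL,
        NULL,
        NULL,
        'init_stepsize=1, max_iter=10, lambda=2, epsilon=0.05');
    -- epsilon and eps_table now appear inside optim_params rather than
    -- as dedicated columns.
    SELECT optim_params FROM svr_model_summary;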