Posted to commits@madlib.apache.org by ri...@apache.org on 2017/08/14 17:37:20 UTC

[1/2] incubator-madlib git commit: MLP: Add multiple enhancements

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 6f6f804b2 -> ff1b0f883


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/ports/postgres/modules/convex/mlp_igd.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 6cea7b0..550d630 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -16,7 +16,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
 """
 @file mlp_igd.py_in
 
@@ -24,17 +23,18 @@
 
 @namespace mlp_igd
 """
+import math
 import plpy
 
-from utilities.control import MinWarning
 from utilities.utilities import add_postfix
 from utilities.utilities import py_list_to_sql_string
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import _assert
+from utilities.utilities import _assert_equal
 from utilities.utilities import unique_string
 from utilities.utilities import strip_end_quotes
-
 from utilities.validate_args import cols_in_tbl_valid
+from utilities.validate_args import table_exists
 from utilities.validate_args import input_tbl_valid
 from utilities.validate_args import is_var_valid
 from utilities.validate_args import output_tbl_valid
@@ -42,10 +42,14 @@ from utilities.validate_args import get_expr_type
 from utilities.validate_args import array_col_has_same_dimension
 from utilities.validate_args import array_col_dimension
 
+from convex.utils_regularization import __utils_ind_var_scales
+
+from elastic_net.elastic_net_utils import _tbl_dimension_rownum
+
 
 def mlp(schema_madlib, source_table, output_table, independent_varname,
-        dependent_varname, hidden_layer_sizes,
-        optimizer_param_str, activation, is_classification, **kwargs):
+        dependent_varname, hidden_layer_sizes, optimizer_param_str, activation,
+        is_classification, weights, warm_start, verbose=False):
     """
     Args:
         @param schema_madlib
@@ -59,62 +63,128 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
     Returns:
         None
     """
-    with MinWarning('warning'):
-        optimizer_params = _get_optimizer_params(optimizer_param_str or "")
-        summary_table = add_postfix(output_table, "_summary")
-        _validate_args(source_table, output_table, summary_table, independent_varname,
-                       dependent_varname, hidden_layer_sizes,
-                       optimizer_params, is_classification)
-
-        current_iteration = 1
-        prev_state = None
-        tolerance = optimizer_params["tolerance"]
-        n_iterations = optimizer_params["n_iterations"]
-        step_size = optimizer_params["step_size"]
-        n_tries = optimizer_params["n_tries"]
-        activation_name = _get_activation_function_name(activation)
-        activation_index = _get_activation_index(activation_name)
-        num_input_nodes = array_col_dimension(
-            source_table, independent_varname)
-        num_output_nodes = 0
-        classes = []
-        dependent_type = get_expr_type(dependent_varname, source_table)
-        original_dependent_varname = dependent_varname
-
-        if is_classification:
-            dependent_variable_sql = """
-                SELECT DISTINCT {dependent_varname}
-                FROM {source_table}
-                """.format(dependent_varname=dependent_varname,
-                           source_table=source_table)
-            labels = plpy.execute(dependent_variable_sql)
-            one_hot_dependent_varname = 'ARRAY['
-            num_output_nodes = len(labels)
-            for label_obj in labels:
-                label = _format_label(label_obj[dependent_varname])
-                classes.append(label)
-                one_hot_dependent_varname += dependent_varname + \
-                    "=" + str(label) + ","
-            # Remove the last comma
-            one_hot_dependent_varname = one_hot_dependent_varname[:-1]
-            one_hot_dependent_varname += ']::integer[]'
-            dependent_varname = one_hot_dependent_varname
-        else:
-            if "[]" not in dependent_type:
-                dependent_varname = "ARRAY[" + dependent_varname + "]"
-            num_output_nodes = array_col_dimension(
-                source_table, dependent_varname)
-        layer_sizes = [num_input_nodes] + \
-            hidden_layer_sizes + [num_output_nodes]
+    warm_start = bool(warm_start)
+    optimizer_params = _get_optimizer_params(optimizer_param_str or "")
+    summary_table = add_postfix(output_table, "_summary")
+    weights = '1' if not weights or not weights.strip() else weights.strip()
+    hidden_layer_sizes = hidden_layer_sizes or []
+    activation = _get_activation_function_name(activation)
+    learning_rate_policy = _get_learning_rate_policy_name(
+        optimizer_params["learning_rate_policy"])
+    activation_index = _get_activation_index(activation)
+
+    _validate_args(source_table, output_table, summary_table, independent_varname,
+                   dependent_varname, hidden_layer_sizes,
+                   optimizer_params, is_classification, weights,
+                   warm_start, activation)
+
+    current_iteration = 1
+    prev_state = None
+    tolerance = optimizer_params["tolerance"]
+    n_iterations = optimizer_params["n_iterations"]
+    step_size_init = optimizer_params["learning_rate_init"]
+    iterations_per_step = optimizer_params["iterations_per_step"]
+    power = optimizer_params["power"]
+    gamma = optimizer_params["gamma"]
+    step_size = step_size_init
+    n_tries = optimizer_params["n_tries"]
+    # "lambda" is a reserved word in Python
+    lmbda = optimizer_params["lambda"]
+    num_input_nodes = array_col_dimension(source_table,
+                                          independent_varname)
+    num_output_nodes = 0
+    classes = []
+    dependent_type = get_expr_type(dependent_varname, source_table)
+    original_dependent_varname = dependent_varname
+    dimension, n_tuples = _tbl_dimension_rownum(
+        schema_madlib, source_table, independent_varname)
+    x_scales = __utils_ind_var_scales(
+        source_table, independent_varname, dimension, schema_madlib)
+    x_means = py_list_to_sql_string(
+        x_scales["mean"], array_type="DOUBLE PRECISION")
+    filtered_stds = [x if x != 0 else 1 for x in x_scales["std"]]
+    x_stds = py_list_to_sql_string(
+        filtered_stds, array_type="DOUBLE PRECISION")
 
+    if is_classification:
+        dependent_variable_sql = """
+        SELECT DISTINCT {dependent_varname}
+        FROM {source_table}
+        """.format(
+            dependent_varname=dependent_varname, source_table=source_table)
+        labels = plpy.execute(dependent_variable_sql)
+        one_hot_dependent_varname = 'ARRAY['
+        num_output_nodes = len(labels)
+        for label_obj in labels:
+            label = _format_label(label_obj[dependent_varname])
+            classes.append(label)
+        classes.sort()
+        for c in classes:
+            one_hot_dependent_varname += dependent_varname + \
+                "=" + str(c) + ","
+        # Remove the last comma
+        one_hot_dependent_varname = one_hot_dependent_varname[:-1]
+        one_hot_dependent_varname += ']::integer[]'
+        dependent_varname = one_hot_dependent_varname
+    else:
+        if "[]" not in dependent_type:
+            dependent_varname = "ARRAY[" + dependent_varname + "]"
+        num_output_nodes = array_col_dimension(
+            source_table, dependent_varname)
+    layer_sizes = [num_input_nodes] + \
+        hidden_layer_sizes + [num_output_nodes]
+
+    # Need layer sizes before validating warm_start
+    coeff = []
+    for i in range(len(layer_sizes) - 1):
+        fan_in = layer_sizes[i]
+        fan_out = layer_sizes[i + 1]
+        # Initialize according to Glorot and Bengio (2010)
+        # See design doc for more info
+        span = math.sqrt(6.0 / (fan_in + fan_out))
+        dim = (layer_sizes[i] + 1) * layer_sizes[i + 1]
+        rand = plpy.execute("""SELECT array_agg({span}*(random()-0.5))
+                               AS random
+                               FROM generate_series(0,{dim})
+                """.format(span=span, dim=dim))[0]["random"]
+        coeff += rand
+
+    if warm_start:
+        coeff, x_means, x_stds = _validate_warm_start(
+                source_table, output_table, summary_table, independent_varname,
+                original_dependent_varname, layer_sizes, optimizer_params,
+                is_classification, weights, warm_start, activation)
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(output_table))
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(summary_table))
+    best_state = []
+    best_loss = float('inf')
+    prev_loss = float('inf')
+    loss = None
+    for _ in range(n_tries):
         while True:
             if prev_state:
                 prev_state_str = py_list_to_sql_string(
                     prev_state, array_type="double precision")
             else:
                 prev_state_str = "(NULL)::DOUBLE PRECISION[]"
+            # No branch needed for "constant": step_size keeps its initial value
+            zero_indexed_iteration = current_iteration - 1
+            if learning_rate_policy == "exp":
+                step_size = step_size_init * gamma**zero_indexed_iteration
+            elif learning_rate_policy == "inv":
+                step_size = step_size_init * (current_iteration)**(-power)
+            elif learning_rate_policy == "step":
+                step_size = step_size_init * gamma**(
+                    math.floor(zero_indexed_iteration / iterations_per_step))
+
             train_sql = """
             SELECT
+                (result).state as state,
+                (result).loss  as loss
+            FROM (
+            SELECT
                 {schema_madlib}.mlp_igd_step(
                     ({independent_varname})::DOUBLE PRECISION[],
                     ({dependent_varname})::DOUBLE PRECISION[],
@@ -122,105 +192,153 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
                     {layer_sizes},
                     ({step_size})::FLOAT8,
                     {activation},
-                    {is_classification}) as curr_state
-            FROM {source_table} AS _src
-            """.format(schema_madlib=schema_madlib,
-                       independent_varname=independent_varname,
-                       dependent_varname=dependent_varname,
-                       prev_state=prev_state_str,
-                       # C++ uses double internally
-                       layer_sizes=py_list_to_sql_string(layer_sizes,
-                                                         array_type="double precision"),
-                       step_size=step_size,
-                       source_table=source_table,
-                       activation=activation_index,
-                       is_classification=int(is_classification))
-            curr_state = plpy.execute(train_sql)[0]["curr_state"]
-            dist_sql = """
-                SELECT {schema_madlib}.internal_mlp_igd_distance(
-                        {prev_state},
-                        {curr_state}) as state_dist
-                """.format(schema_madlib=schema_madlib,
-                           prev_state=prev_state_str,
-                           curr_state=py_list_to_sql_string(curr_state, "double precision"))
-            state_dist = plpy.execute(dist_sql)[0]["state_dist"]
-            if ((state_dist and state_dist < tolerance) or
-                    current_iteration > n_iterations):
+                    {is_classification},
+                    ({weights})::DOUBLE PRECISION,
+                    {warm_start},
+                    ({warm_start_coeff})::DOUBLE PRECISION[],
+                    {n_tuples},
+                    {lmbda},
+                    {x_means},
+                    {x_stds}
+                    ) as result
+            FROM {source_table} as _src) _step_q
+            """.format(
+                schema_madlib=schema_madlib,
+                independent_varname=independent_varname,
+                dependent_varname=dependent_varname,
+                prev_state=prev_state_str,
+                # c++ uses double internally
+                layer_sizes=py_list_to_sql_string(
+                    layer_sizes, array_type="DOUBLE PRECISION"),
+                step_size=step_size,
+                source_table=source_table,
+                activation=activation_index,
+                is_classification=int(is_classification),
+                weights=weights,
+                warm_start=warm_start,
+                warm_start_coeff=py_list_to_sql_string(
+                    coeff, array_type="DOUBLE PRECISION"),
+                n_tuples=n_tuples,
+                lmbda=lmbda,
+                x_means=x_means,
+                x_stds=x_stds)
+            step_result = plpy.execute(train_sql)[0]
+            curr_state = step_result['state']
+            loss = step_result['loss']
+            if verbose and 1 < current_iteration <= n_iterations:
+                plpy.info("Iteration: " + str(current_iteration -
+                                              1) + ", Loss: " + str(loss))
+            state_dist = abs(loss-prev_loss)
+            if ((state_dist and state_dist < tolerance)
+                    or current_iteration > n_iterations):
                 break
             prev_state = curr_state
+            prev_loss = loss
             current_iteration += 1
-        _build_model_table(schema_madlib, output_table,
-                           curr_state, n_iterations)
-        layer_sizes_str = py_list_to_sql_string(
-            layer_sizes, array_type="integer")
-        classes_str = py_list_to_sql_string(
-            [strip_end_quotes(cl, "'") for cl in classes],
-            array_type=dependent_type)
-        summary_table_creation_query = """
-        CREATE TABLE {summary_table}(
-            source_table TEXT,
-            independent_varname TEXT,
-            dependent_varname TEXT,
-            tolerance FLOAT,
-            step_size FLOAT,
-            n_iterations INTEGER,
-            n_tries INTEGER,
-            layer_sizes INTEGER[],
-            activation_function TEXT,
-            is_classification BOOLEAN,
-            classes {dependent_type}[]
-        )""".format(summary_table=summary_table,
-                    dependent_type=dependent_type)
-
-        summary_table_update_query = """
-            INSERT INTO {summary_table} VALUES(
-                '{source_table}',
-                '{independent_varname}',
-                '{original_dependent_varname}',
-                {tolerance},
-                {step_size},
-                {n_iterations},
-                {n_tries},
-                {layer_sizes_str},
-                '{activation_name}',
-                {is_classification},
-                {classes_str}
-            )
-            """.format(**locals())
-        plpy.execute(summary_table_creation_query)
-        plpy.execute(summary_table_update_query)
-# ----------------------------------------------------------------------
-
-
-def _build_model_table(schema_madlib, output_table, final_state, n_iterations):
+        # We use previous state because the last iteration
+        # just calculates loss
+        if loss < best_loss:
+            best_state = prev_state
+            best_loss = loss
+        current_iteration = 1
+        prev_state = None
+    _build_model_table(schema_madlib, output_table, best_state,
+                       best_loss, n_iterations)
+    layer_sizes_str = py_list_to_sql_string(
+        layer_sizes, array_type="integer")
+    classes_str = py_list_to_sql_string(
+        [strip_end_quotes(cl, "'") for cl in classes],
+        array_type=dependent_type)
+    summary_table_creation_query = """
+    CREATE TABLE {summary_table}(
+        source_table TEXT,
+        independent_varname TEXT,
+        dependent_varname TEXT,
+        tolerance FLOAT,
+        learning_rate_init FLOAT,
+        learning_rate_policy TEXT,
+        n_iterations INTEGER,
+        n_tries INTEGER,
+        layer_sizes INTEGER[],
+        activation TEXT,
+        is_classification BOOLEAN,
+        classes {dependent_type}[],
+        weights VARCHAR,
+        x_means DOUBLE PRECISION[],
+        x_stds DOUBLE PRECISION[]
+    )""".format(summary_table=summary_table,
+                dependent_type=dependent_type)
+
+    summary_table_update_query = """
+    INSERT INTO {summary_table} VALUES(
+        '{source_table}',
+        '{independent_varname}',
+        '{original_dependent_varname}',
+        {tolerance},
+        {step_size_init},
+        '{learning_rate_policy}',
+        {n_iterations},
+        {n_tries},
+        {layer_sizes_str},
+        '{activation}',
+        {is_classification},
+        {classes_str},
+        '{weights}',
+        {x_means},
+        {x_stds}
+    )
+    """.format(**locals())
+    plpy.execute(summary_table_creation_query)
+    plpy.execute(summary_table_update_query)
+    return None
+
+
+def _get_loss(schema_madlib, state):
+    return plpy.execute("""
+    SELECT
+        (result).loss  AS loss
+    FROM (
+        SELECT
+            {schema_madlib}.internal_mlp_igd_result(
+                {final_state_str}
+            ) AS result
+    ) rel_state_subq
+    """.format(
+        schema_madlib=schema_madlib,
+        final_state_str=py_list_to_sql_string(state)))[0]["loss"]
+
+
+def _build_model_table(schema_madlib, output_table, final_state, loss, n_iterations):
     final_state_str = py_list_to_sql_string(
         final_state, array_type="double precision")
 
     model_table_query = """
-        CREATE TABLE {output_table} AS
+    CREATE TABLE {output_table} AS
+        SELECT
+            (result).coeff as coeff,
+            {loss}  as loss,
+            {n_iterations} as num_iterations
+        FROM (
             SELECT
-                (result).coeff AS coeff,
-                (result).loss  AS loss,
-                {n_iterations} AS num_iterations
-                -- (result).num_rows_processed     AS num_rows_processed,
-                -- n_tuples_including_nulls - (result).num_rows_processed
-            FROM (
-                SELECT
-                    {schema_madlib}.internal_mlp_igd_result(
-                        {final_state_str}
-                    ) AS result
-            ) rel_state_subq
-        """.format(**locals())
+                {schema_madlib}.internal_mlp_igd_result(
+                    {final_state_str}
+                ) AS result
+        ) rel_state_subq
+    """.format(**locals())
     plpy.execute(model_table_query)
-# ----------------------------------------------------------------------
 
 
 def _get_optimizer_params(param_str):
     params_defaults = {
-        "step_size": (0.001, float),
+        "learning_rate_init": (0.001, float),
         "n_iterations": (100, int),
         "n_tries": (1, int),
         "tolerance": (0.001, float),
+        "learning_rate_policy": ("constant", str),
+        "gamma": (0.1, float),
+        "iterations_per_step": (100, int),
+        "power": (0.5, float),
+        "lambda": (0, float)
     }
     param_defaults = dict([(k, v[0]) for k, v in params_defaults.items()])
     param_types = dict([(k, v[1]) for k, v in params_defaults.items()])
@@ -228,10 +346,9 @@ def _get_optimizer_params(param_str):
     if not param_str:
         return param_defaults
 
-    name_value = extract_keyvalue_params(param_str, param_types, param_defaults,
-                                         ignore_invalid=False)
+    name_value = extract_keyvalue_params(
+        param_str, param_types, param_defaults, ignore_invalid=False)
     return name_value
-# ----------------------------------------------------------------------
 
 
 def _validate_args_classification(source_table, dependent_varname):
@@ -239,89 +356,174 @@ def _validate_args_classification(source_table, dependent_varname):
     int_types = ['integer', 'smallint', 'bigint']
     text_types = ['text', 'varchar', 'character varying', 'char', 'character']
     boolean_types = ['boolean']
-    _assert("[]" in expr_type or expr_type in int_types + text_types + boolean_types,
+    _assert("[]" in expr_type
+            or expr_type in int_types + text_types + boolean_types,
             "Dependent variable column should refer to an "
             "integer, boolean, text, varchar, or character type.")
-# ----------------------------------------------------------------------
 
 
 def _validate_args_regression(source_table, dependent_varname):
     expr_type = get_expr_type(dependent_varname, source_table)
     int_types = ['integer', 'smallint', 'bigint']
     float_types = ['double precision', 'real']
-    _assert("[]" in expr_type or expr_type in int_types + float_types,
-            "Dependent variable column should refer to an array or numeric type")
+    _assert(
+        "[]" in expr_type or expr_type in int_types + float_types,
+        "Dependent variable column should refer to an array or numeric type")
     if "[]" in expr_type:
-        _assert(array_col_has_same_dimension(source_table, dependent_varname),
-                "Dependent variable column should refer to arrays of the same length")
-# ----------------------------------------------------------------------
+        _assert(
+            array_col_has_same_dimension(source_table, dependent_varname),
+            "Dependent variable column should refer to arrays of the same length"
+        )
+
+
+def _validate_summary_table(summary_table):
+    input_tbl_valid(summary_table, 'MLP')
+    cols_in_tbl_valid(summary_table, [
+        'dependent_varname', 'independent_varname', 'activation',
+        'tolerance', 'learning_rate_init', 'n_iterations', 'n_tries',
+        'classes', 'layer_sizes', 'source_table', 'x_means', 'x_stds'
+    ], 'MLP')
+
+
+def _validate_warm_start(source_table, output_table, summary_table, independent_varname,
+                         dependent_varname, layer_sizes,
+                         optimizer_params, is_classification, weights,
+                         warm_start, activation):
+    _assert(table_exists(output_table),
+            "MLP error: Warm start failed due to missing model table: " + output_table)
+    _assert(table_exists(summary_table),
+            "MLP error: Warm start failed due to missing summary table: " + summary_table)
+
+    _assert(optimizer_params["n_tries"] == 1,
+            "MLP error: warm_start is only compatible for n_tries = 1")
+
+    summary = plpy.execute("SELECT * FROM {0}".format(summary_table))[0]
+    params = [
+        "independent_varname", "dependent_varname", "layer_sizes",
+        "is_classification", "weights", "activation"
+    ]
+    for param in params:
+        _assert_equal(eval(param), summary[param],
+                      "MLP error: warm start failed due to different parameter value: " +
+                      param)
+    output = plpy.execute("SELECT * FROM {0}".format(output_table))[0]
+    coeff = output['coeff']
+    num_coeffs = sum(
+        map(lambda i: (layer_sizes[i] + 1) * (layer_sizes[i + 1]),
+            range(len(layer_sizes) - 1)))
+    _assert_equal(num_coeffs,
+                  len(coeff),
+                  "MLP error: Warm start failed to invalid output_table: " +
+                  output_table + ". Invalid number of coefficients in model.")
+    x_means = py_list_to_sql_string(
+        summary["x_means"], array_type="DOUBLE PRECISION")
+    x_stds = py_list_to_sql_string(
+        summary["x_stds"], array_type="DOUBLE PRECISION")
+
+    return coeff, x_means, x_stds
 
 
 def _validate_args(source_table, output_table, summary_table, independent_varname,
                    dependent_varname, hidden_layer_sizes,
-                   optimizer_params, is_classification):
+                   optimizer_params, is_classification, weights, warm_start, activation):
     input_tbl_valid(source_table, "MLP")
-    output_tbl_valid(output_table, "MLP")
-    output_tbl_valid(summary_table, "MLP")
-    _assert(is_var_valid(source_table, independent_varname),
-            "MLP error: invalid independent_varname "
-            "('{independent_varname}') for source_table "
-            "({source_table})!".format(independent_varname=independent_varname,
-                                       source_table=source_table))
-
-    _assert(is_var_valid(source_table, dependent_varname),
-            "MLP error: invalid dependent_varname "
-            "('{dependent_varname}') for source_table "
-            "({source_table})!".format(dependent_varname=dependent_varname,
-                                       source_table=source_table))
-    _assert(hidden_layer_sizes is not None,
-            "hidden_layer_sizes may not be null")
-    _assert(isinstance(hidden_layer_sizes, list),
-            "hidden_layer_sizes must be an array of integers")
-    _assert(all(isinstance(value, int) for value in hidden_layer_sizes),
-            "MLP error: Hidden layers sizes must be integers")
-    _assert(all(value >= 0 for value in hidden_layer_sizes),
-            "MLP error: Hidden layers sizes must be greater than 0.")
+    if not warm_start:
+        output_tbl_valid(output_table, "MLP")
+        output_tbl_valid(summary_table, "MLP")
+
+    _assert(
+        is_var_valid(source_table, independent_varname),
+        "MLP error: invalid independent_varname "
+        "('{independent_varname}') for source_table "
+        "({source_table})!".format(
+            independent_varname=independent_varname,
+            source_table=source_table))
+
+    _assert(
+        is_var_valid(source_table, dependent_varname),
+        "MLP error: invalid dependent_varname "
+        "('{dependent_varname}') for source_table "
+        "({source_table})!".format(
+            dependent_varname=dependent_varname, source_table=source_table))
+    _assert(
+        isinstance(hidden_layer_sizes, list),
+        "hidden_layer_sizes must be an array of integers")
+    # TODO put this check earlier
+    _assert(
+        all(isinstance(value, int) for value in hidden_layer_sizes),
+        "MLP error: Hidden layers sizes must be integers")
+    _assert(
+        all(value >= 0 for value in hidden_layer_sizes),
+        "MLP error: Hidden layers sizes must be greater than 0.")
+    _assert(optimizer_params["lambda"] >= 0,
+            "MLP error: lambda should be greater than or equal to 0.")
     _assert(optimizer_params["tolerance"] >= 0,
-            "MLP error: Tolerance should be greater than or equal to 0.")
+            "MLP error: tolerance should be greater than or equal to 0.")
     _assert(optimizer_params["n_tries"] >= 1,
-            "MLP error: Number of tries should be greater than or equal to 1")
-    _assert(optimizer_params["n_iterations"] >= 1,
-            "MLP error: Number of iterations should be greater than or equal to 1")
-    _assert(optimizer_params["step_size"] > 0,
-            "MLP error: Stepsize should be greater than 0.")
+            "MLP error: n_tries should be greater than or equal to 1")
+    _assert(
+        optimizer_params["n_iterations"] >= 1,
+        "MLP error: n_iterations should be greater than or equal to 1")
+    _assert(optimizer_params["power"] > 0,
+            "MLP error: power should be greater than 0.")
+    _assert(0 < optimizer_params["gamma"] <= 1,
+            "MLP error: gamma should be between 0 and 1.")
+    _assert(optimizer_params["iterations_per_step"] > 0,
+            "MLP error: iterations_per_step should be greater than 0.")
+    _assert(optimizer_params["learning_rate_init"] > 0,
+            "MLP error: learning_rate_init should be greater than 0.")
     _assert("[]" in get_expr_type(independent_varname, source_table),
             "Independent variable column should refer to an array")
-    _assert(array_col_has_same_dimension(source_table, independent_varname),
-            "Independent variable column should refer to arrays of the same length")
+    _assert(
+        array_col_has_same_dimension(source_table, independent_varname),
+        "Independent variable column should refer to arrays of the same length"
+    )
+
+    int_types = ['integer', 'smallint', 'bigint']
+    float_types = ['double precision', 'real']
+    _assert(
+        get_expr_type(weights, source_table) in int_types + float_types,
+        "MLP error: Weights should be a numeric type")
 
     if is_classification:
         _validate_args_classification(source_table, dependent_varname)
     else:
         _validate_args_regression(source_table, dependent_varname)
-# ----------------------------------------------------------------------
 
 
-def _get_activation_function_name(activation_function):
-    if not activation_function:
-        activation_function = 'sigmoid'
+def _get_learning_rate_policy_name(learning_rate_policy):
+    if not learning_rate_policy:
+        learning_rate_policy = 'constant'
+    else:
+        supported_learning_rate_policies = ['constant', 'exp', 'inv', 'step']
+        try:
+            learning_rate_policy = next(
+                x for x in supported_learning_rate_policies
+                if x.startswith(learning_rate_policy))
+        except StopIteration:
+            plpy.error(
+                "MLP Error: Invalid learning rate policy: "
+                "{0}. Supported learning rate policies are ({1})".format(
+                    learning_rate_policy,
+                    ','.join(sorted(supported_learning_rate_policies))))
+    return learning_rate_policy
+
+
+def _get_activation_function_name(activation):
+    if not activation:
+        activation = 'sigmoid'
     else:
-        # Add non-linear kernels below after implementing them.
         supported_activation_function = ['sigmoid', 'tanh', 'relu']
         try:
-            # allow user to specify a prefix substring of
-            # supported kernels. This works because the supported
-            # kernels have unique prefixes.
-            activation_function = next(x for x in supported_activation_function
-                                       if x.startswith(activation_function))
+            activation = next(
+                x for x in supported_activation_function
+                if x.startswith(activation))
         except StopIteration:
-            # next() returns a StopIteration if no element found
             plpy.error("MLP Error: Invalid activation function: "
-                       "{0}. Supported activation functions are ({1})"
-                       .format(activation_function, ','.join(
-                           sorted(supported_activation_function))))
-    return activation_function
-# ------------------------------------------------------------------------------
+                       "{0}. Supported activation functions are ({1})".format(
+                           activation,
+                           ','.join(sorted(supported_activation_function))))
+    return activation
 
 
 def _get_activation_index(activation_name):
@@ -333,12 +535,15 @@ def _format_label(label):
     if isinstance(label, str):
         return "'" + label + "'"
     return label
-# -------------------------------------------------------------------------
 
 
-def mlp_predict(schema_madlib, model_table, data_table,
-                id_col_name, output_table,
-                pred_type='response', **kwargs):
+def mlp_predict(schema_madlib,
+                model_table,
+                data_table,
+                id_col_name,
+                output_table,
+                pred_type='response',
+                **kwargs):
     """ Score new observations using a trained neural network
 
     @param schema_madlib Name of the schema where MADlib is installed
@@ -356,13 +561,7 @@ def mlp_predict(schema_madlib, model_table, data_table,
     input_tbl_valid(model_table, 'MLP')
     cols_in_tbl_valid(model_table, ['coeff'], 'MLP')
     summary_table = add_postfix(model_table, "_summary")
-    input_tbl_valid(summary_table, 'MLP')
-    cols_in_tbl_valid(summary_table,
-                      ['dependent_varname', 'independent_varname',
-                       'activation_function',
-                       'tolerance', 'step_size', 'n_iterations',
-                       'n_tries', 'classes', 'layer_sizes', 'source_table'],
-                      'MLP')
+    _validate_summary_table(summary_table)
 
     summary = plpy.execute("SELECT * FROM {0}".format(summary_table))[0]
     coeff = py_list_to_sql_string(plpy.execute(
@@ -370,106 +569,116 @@ def mlp_predict(schema_madlib, model_table, data_table,
     dependent_varname = summary['dependent_varname']
     independent_varname = summary['independent_varname']
     source_table = summary['source_table']
-    activation_function = _get_activation_index(summary['activation_function'])
+    activation = _get_activation_index(summary['activation'])
     layer_sizes = py_list_to_sql_string(
         summary['layer_sizes'], array_type="DOUBLE PRECISION")
     is_classification = int(summary["is_classification"])
     is_response = int(pred_type == 'response')
+    x_means = py_list_to_sql_string(
+        summary["x_means"], array_type="DOUBLE PRECISION")
+    x_stds = py_list_to_sql_string(
+        summary["x_stds"], array_type="DOUBLE PRECISION")
 
-    pred_name = ('"prob_{0}"' if pred_type == "prob" else
-                 '"estimated_{0}"').format(dependent_varname.replace('"', '').strip())
+    pred_name = (
+        '"prob_{0}"' if pred_type == "prob" else
+        '"estimated_{0}"').format(dependent_varname.replace('"', '').strip())
 
     input_tbl_valid(data_table, 'MLP')
 
-    _assert(is_var_valid(data_table, independent_varname),
-            "MLP Error: independent_varname ('{0}') is invalid for data_table ({1})".
-            format(independent_varname, data_table))
+    _assert(
+        is_var_valid(data_table, independent_varname),
+        "MLP Error: independent_varname ('{0}') is invalid for data_table ({1})".
+        format(independent_varname, data_table))
     _assert(id_col_name is not None, "MLP Error: id_col_name is NULL")
-    _assert(is_var_valid(data_table, id_col_name),
-            "MLP Error: id_col_name ('{0}') is invalid for {1}".
-            format(id_col_name, data_table))
+    _assert(
+        is_var_valid(data_table, id_col_name),
+        "MLP Error: id_col_name ('{0}') is invalid for {1}".format(
+            id_col_name, data_table))
     output_tbl_valid(output_table, 'MLP')
 
-    with MinWarning("warning"):
-        header = "CREATE TABLE " + output_table + " AS "
-        # Regression
-        if not is_classification:
-            dependent_type = get_expr_type(dependent_varname, source_table)
-            unnest_if_not_array = ""
-            # Return the same type as the user provided.  Internally we always use an array, but
-            # if they provided a scaler, unnest it for the user
-            if "[]" not in dependent_type:
-                unnest_if_not_array = "UNNEST"
+    header = "CREATE TABLE " + output_table + " AS "
+    # Regression
+    if not is_classification:
+        dependent_type = get_expr_type(dependent_varname, source_table)
+        unnest_if_not_array = ""
+        # Return the same type as the user provided.  Internally we always
+        # use an array, but if they provided a scalar, unnest it for
+        # the user
+        if "[]" not in dependent_type:
+            unnest_if_not_array = "UNNEST"
+        sql = header + """
+            SELECT {id_col_name},
+                   {unnest_if_not_array}({schema_madlib}.internal_predict_mlp(
+                        {coeff},
+                        {independent_varname}::DOUBLE PRECISION[],
+                        {is_classification},
+                        {activation},
+                        {layer_sizes},
+                        {is_response},
+                        {x_means},
+                        {x_stds}
+                    )) as {pred_name}
+            FROM {data_table}
+            """
+    else:
+        summary_query = """
+        SELECT classes FROM {0}
+        """.format(summary_table)
+        classes = plpy.execute(summary_query)[0]['classes']
+        if pred_type == "response":
+            classes_with_index_table = unique_string()
+            classes_table = unique_string()
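+            # internal_predict_mlp returns a class index; the (ARRAY{classes})
+            # lookup in the query below maps it back to the original label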
             sql = header + """
-                SELECT {id_col_name},
-                       {unnest_if_not_array}({schema_madlib}.internal_predict_mlp(
-                            {coeff},
-                            {independent_varname}::DOUBLE PRECISION[],
-                            {is_classification},
-                            {activation_function},
-                            {layer_sizes},
-                            {is_response}
-                        )) as {pred_name}
-                FROM {data_table}
+                    SELECT
+                         q.{id_col_name}
+                        ,(ARRAY{classes})[pred_idx[1]+1] as {pred_name}
+                    FROM (
+                         SELECT
+                            {id_col_name},
+                            {schema_madlib}.internal_predict_mlp(
+                                    {coeff}::DOUBLE PRECISION[],
+                                    {independent_varname}::DOUBLE PRECISION[],
+                                    {is_classification},
+                                    {activation},
+                                    {layer_sizes},
+                                    {is_response},
+                                    {x_means},
+                                    {x_stds}
+                                    )
+                           as pred_idx
+                        FROM {data_table}
+                    ) q
                 """
         else:
-            summary_query = """
-            SELECT classes FROM {0}
-            """.format(summary_table)
-            classes = plpy.execute(summary_query)[0]['classes']
-            if pred_type == "response":
-                # This join is to recover the class name from the summary table,
-                #  as prediction just returns an index
-                classes_with_index_table = unique_string()
-                classes_table = unique_string()
-                sql = header + """
-                        SELECT
-                             q.{id_col_name}
-                            ,(ARRAY{classes})[pred_idx[1]+1] as {pred_name}
-                        FROM (
-                             SELECT
-                                {id_col_name},
-                                {schema_madlib}.internal_predict_mlp(
-                                        {coeff}::DOUBLE PRECISION[],
-                                        {independent_varname}::DOUBLE PRECISION[],
-                                        {is_classification},
-                                        {activation_function},
-                                        {layer_sizes},
-                                        {is_response}
-                                        )
-                               as pred_idx
-                            FROM {data_table}
-                        ) q
-                    """
-            else:
-                # Incomplete
-                intermediate_col = unique_string()
-                score_format = ',\n'.join([
-                    'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'.
-                    format(j=i + 1, c_str=str(c).strip(' "'),
-                           interim=intermediate_col)
-                    for i, c in enumerate(classes)])
-                sql = header + """
-                    SELECT
-                        {id_col_name},
-                        {score_format}
-                        FROM (
-                            SELECT {id_col_name},
-                                   {schema_madlib}.internal_predict_mlp(
-                                       {coeff}::DOUBLE PRECISION[],
-                                       {independent_varname}::DOUBLE PRECISION[],
-                                       {is_classification},
-                                       {activation_function},
-                                       {layer_sizes},
-                                       {is_response}
-                                       )::TEXT[]
-                                            AS {intermediate_col}
-                            FROM {data_table}
-                        ) q
-                    """
+            # Incomplete
+            intermediate_col = unique_string()
+            score_format = ',\n'.join([
+                'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'.
+                format(j=i + 1, c_str=str(c).strip(' "'),
+                       interim=intermediate_col)
+                for i, c in enumerate(classes)])
+            sql = header + """
+                SELECT
+                    {id_col_name},
+                    {score_format}
+                    FROM (
+                        SELECT {id_col_name},
+                               {schema_madlib}.internal_predict_mlp(
+                                   {coeff}::DOUBLE PRECISION[],
+                                   {independent_varname}::DOUBLE PRECISION[],
+                                   {is_classification},
+                                   {activation},
+                                   {layer_sizes},
+                                   {is_response},
+                                   {x_means},
+                                   {x_stds}
+                                   )::TEXT[]
+                                        AS {intermediate_col}
+                        FROM {data_table}
+                    ) q
+                """
     sql = sql.format(**locals())
     plpy.execute(sql)
-# ----------------------------------------------------------------------
 
 
 def mlp_help(schema_madlib, message, is_classification):
@@ -511,34 +720,44 @@ def mlp_help(schema_madlib, message, is_classification):
                                     USAGE
     ---------------------------------------------------------------------------
     SELECT {schema_madlib}.{method}(
-        source_table,         -- name of input table
-        output_table,         -- name of output model table
-        independent_varname,  -- name of independent variable
-        dependent_varname,    -- {label_description}
-        hidden_layer_sizes,   -- Array of integers indicating the
+        source_table,         -- TEXT. name of input table
+        output_table,         -- TEXT. name of output model table
+        independent_varname,  -- TEXT. name of independent variable
+        dependent_varname,    -- TEXT. {label_description}
+        hidden_layer_sizes,   -- INTEGER[]. Array of integers indicating the
                                  number of hidden units per layer.
                                  Length equal to the number of hidden layers.
-        optimizer_params,     -- optional, default NULL
+        optimizer_params,     -- TEXT. optional, default NULL
                                  parameters for optimization in
                                  a comma-separated string of key-value pairs.
+                                 To find out more:
+
+                      SELECT {schema_madlib}.{method}('optimizer_params')
 
-            step_size DOUBLE PRECISION, -- Default: 0.001
-                                           Learning rate
-            n_iterations INTEGER,       -- Default: 100
-                                           Number of iterations per try
-            n_tries INTEGER,            -- Default: 1
-                                           Total number of training cycles,
-                                           with random initializations to avoid
-                                           local minima.
-            tolerance DOUBLE PRECISION, -- Default: 0.001
-                                           If the distance in loss between
-                                           two iterations is less than the
-                                           tolerance training will stop, even if
-                                           n_iterations has not been reached
-
-        activation            -- optional, default: 'sigmoid'.
+        activation            -- TEXT. optional, default: 'sigmoid'.
                                  supported activations: 'relu', 'sigmoid',
                                  and 'tanh'
+
+        weights               -- TEXT. optional, default: NULL.
+                                 Column name specifying the weight for each
+                                 input row. The weight is incorporated into
+                                 the update during SGD, but not into the loss
+                                 calculation. If not specified, the weight for
+                                 each row defaults to 1. The column should be
+                                 a numeric type.
+
+        warm_start            -- BOOLEAN. optional, default: FALSE.
+                                 Initialize weights with the coefficients from
+                                 the last call. If TRUE, weights are initialized
+                                 from output_table. Note that all parameters
+                                 other than optimizer_params and verbose must
+                                 remain constant between calls that use
+                                 warm_start.
+
+        verbose               -- BOOLEAN. optional, default: FALSE
+                                 Provides verbose output of the results of
+                                 training.
     );
 
 
@@ -576,22 +795,29 @@ def mlp_help(schema_madlib, message, is_classification):
     {1,0.09378,12.50,7.870,0,0.5240,5.8890,39.00,5.4509,5,311.0,15.20,390.50,15.71} | 1 | 21.70
     \.
 
-    - Generate a multilayer perception with a two hidden layers of 5 units
+    - Generate a multilayer perceptron with two hidden layers of 25 units
     each. Use the x column as the independent variables, and use the class
-    column as the classification. Set the tolerance to 0 so that 300
+    column as the classification. Set the tolerance to 0 so that 500
     iterations will be run. Use a sigmoid activation function.
     The model will be written to mlp_regress_result.
 
-    SELECT mlp_regression(
-        'lin_housing_wi',           -- Source table
-        'mlp_regress_result',  -- Desination table
-        'x',                        -- Independent variable
-        'y',                        -- Dependent variable
-        ARRAY[5,5],                 -- Number of hidden units per layer
-        'step_size=0.007,
-        n_iterations=300,
+    DROP TABLE IF EXISTS mlp_regress;
+    DROP TABLE IF EXISTS mlp_regress_summary;
+    SELECT madlib.mlp_regression(
+        'lin_housing',         -- Source table
+        'mlp_regress',         -- Destination table
+        'x',                   -- Input features
+        'y',                   -- Dependent variable
+        ARRAY[25,25],            -- Number of units per layer
+        'learning_rate_init=0.001,
+        n_iterations=500,
+        lambda=0.001,
         tolerance=0',
-        'sigmoid');                 -- Activation
+        'relu',           -- Activation function
+        NULL,             -- Default weight (1)
+        FALSE,            -- No warm start
+        TRUE              -- Verbose
+    );
 
     """
 
@@ -630,29 +856,78 @@ def mlp_help(schema_madlib, message, is_classification):
 
     -- Generate a multilayer perception with a single hidden layer of 5 units.
     Use the attributes column as the independent variables, and use the class
-    column as the classification. Set the tolerance to 0 so that 1000
+    column as the classification. Set the tolerance to 0 so that 500
     iterations will be run. Use a hyperbolic tangent activation function.
-    The model will be written to mlp_result.
+    The model will be written to mlp_model.
 
-    SELECT {schema_madlib}.mlp_classification(
+    DROP TABLE IF EXISTS mlp_model;
+    DROP TABLE IF EXISTS mlp_model_summary;
+    SELECT madlib.mlp_classification(
         'iris_data',      -- Source table
         'mlp_model',      -- Destination table
         'attributes',     -- Input features
         'class_text',     -- Label
         ARRAY[5],         -- Number of units per layer
-        'step_size=0.003,
-        n_iterations=5000,
+        'learning_rate_init=0.003,
+        n_iterations=500,
         tolerance=0',     -- Optimizer params
-        'tanh');          -- Activation function
+        'tanh',           -- Activation function
+        NULL,             -- Default weight (1)
+        FALSE,            -- No warm start
+        TRUE              -- Verbose
+    );
+
+    SELECT * FROM mlp_model;
 
     """.format(**args)
     example = classification_example if is_classification else regression_example
+    optimizer_params = """
+    ------------------------------------------------------------------------------------------------
+                                               OPTIMIZER PARAMS
+    ------------------------------------------------------------------------------------------------
+    learning_rate_init DOUBLE PRECISION, -- Default: 0.001
+                                            Initial learning rate
+    learning_rate_policy VARCHAR,        -- Default: 'constant'
+                                            One of 'constant','exp','inv','step'
+                                            'constant': learning_rate =
+                                            learning_rate_init
+                                            'exp': learning_rate =
+                                            learning_rate_init * gamma^(iter)
+                                            'inv': learning_rate =
+                                            learning_rate_init * (iter+1)^(-power)
+                                            'step': learning_rate =
+                                            learning_rate_init * gamma^(floor(iter/iterations_per_step))
+                                            Where iter is the current iteration of SGD.
+    gamma DOUBLE PRECISION,              -- Default: 0.1
+                                            Decay rate for the learning rate.
+                                            Valid for learning_rate_policy = 'exp' or 'step'
+    power DOUBLE PRECISION,              -- Default: 0.5
+                                            Exponent for learning_rate_policy = 'inv'
+    iterations_per_step INTEGER,         -- Default: 100
+                                            Number of iterations to run before decreasing the
+                                            learning rate by a factor of gamma.  Valid for
+                                            learning_rate_policy = 'step'
+    n_iterations INTEGER,                -- Default: 100
+                                            Number of iterations per try
+    n_tries INTEGER,                     -- Default: 1
+                                            Total number of training cycles,
+                                            with random initializations to avoid
+                                            local minima.
+    tolerance DOUBLE PRECISION,          -- Default: 0.001
+                                            If the change in loss between
+                                            two iterations is less than the
+                                            tolerance, training will stop, even
+                                            if n_iterations has not been reached.
+    lambda DOUBLE PRECISION,             -- Default: 0
+                                            Regularization parameter.
+    """.format(**args)
+
     if not message:
         return summary
     elif message.lower() in ('usage', 'help', '?'):
         return usage
     elif message.lower() == 'example':
         return example
+    elif message.lower() == 'optimizer_params':
+        return optimizer_params
     return """
         No such option. Use "SELECT {schema_madlib}.{method}()" for help.
     """.format(**args)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/ports/postgres/modules/convex/test/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index 97541a9..2302252 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -28,7 +28,7 @@
 -- Classification
 
 
-SELECT setseed(0.5);
+SELECT setseed(0.6);
 DROP TABLE IF EXISTS iris_data, iris_test, mlp_class, mlp_class_summary CASCADE;
 CREATE TABLE iris_data(
     id integer,
@@ -191,21 +191,27 @@ INSERT INTO iris_data VALUES
 
 
 SELECT mlp_classification(
-    'iris_data',      -- Source table
+    'iris_data',    -- Source table
     'mlp_class',    -- Desination table
-    'attributes',     -- Input features
-    'class',   -- Label
-    ARRAY[5],     -- Number of units per layer
-    'step_size=0.001,
-    n_iterations=1000,
+    'attributes',   -- Input features
+    'class',        -- Label
+    ARRAY[5],   -- Number of units per layer
+    'learning_rate_init=0.1,
+    learning_rate_policy=constant,
+    n_iterations=800,
+    n_tries=2,
     tolerance=0',
-    'tanh');
+    'sigmoid',
+    '',
+    FALSE,
+    TRUE
+);
 
 
 SELECT assert(
     -- Loss will improve much more if more iterations are run
-    loss < 30,
-    'MLP: Loss is too high (> 30). Wrong result.'
+    loss < 0.1,
+    'MLP: Loss is too high (> 0.1). Wrong result.'
 ) FROM mlp_class;
 
 DROP TABLE IF EXISTS mlp_prediction;
@@ -239,9 +245,8 @@ SELECT mlp_predict(
     'mlp_prediction',
     'response');
 
-select * from mlp_prediction;
+SELECT * FROM mlp_prediction;
 SELECT assert(
-    -- Accuracy greater than 90%
     COUNT(*)/150.0 > 0.95,
     'MLP: Accuracy is too low (< 95%). Wrong result.'
 ) FROM
@@ -766,65 +771,30 @@ COPY lin_housing_wi (x, grp_by_col, y) FROM STDIN NULL '?' DELIMITER '|';
 {1,0.04741,0.00,11.930,0,0.5730,6.0300,80.80,2.5050,1,273.0,21.00,396.90,7.88} | 2 | 11.90
 \.
 
--- Normalize the columns
-CREATE TEMPORARY TABLE maxs as(
-    SELECT
-    max(x[1]) m1,
-    max(x[2]) m2,
-    max(x[3]) m3,
-    max(x[4]) m4,
-    max(x[5]) m5,
-    max(x[6]) m6,
-    max(x[7]) m7,
-    max(x[8]) m8,
-    max(x[9]) m9,
-    max(x[10]) m10,
-    max(x[11]) m11,
-    max(x[12]) m12,
-    max(x[13]) m13,
-    max(x[14]) m14
-    from lin_housing_wi
-);
 
-CREATE TABLE lin_housing_wi_scaled AS
-SELECT ARRAY[
-    x[1]/(SELECT m1 from maxs),
-    x[2]/(SELECT m2 from maxs),
-    x[3]/(SELECT m3 from maxs),
-    x[4]/(SELECT m4 from maxs),
-    x[5]/(SELECT m5 from maxs),
-    x[6]/(SELECT m6 from maxs),
-    x[7]/(SELECT m7 from maxs),
-    x[8]/(SELECT m8 from maxs),
-    x[9]/(SELECT m9 from maxs),
-    x[10]/(SELECT m10 from maxs),
-    x[11]/(SELECT m11 from maxs),
-    x[12]/(SELECT m12 from maxs),
-    x[13]/(SELECT m13 from maxs),
-    x[14]/(SELECT m14 from maxs)] as x,
-    id,y
-FROM lin_housing_wi;
-
-DROP TABLE IF EXISTS maxs;
 DROP TABLE IF EXISTS mlp_regress;
 DROP TABLE IF EXISTS mlp_regress_summary;
 
 
 SELECT setseed(0);
 SELECT mlp_regression(
-    'lin_housing_wi_scaled',           -- Source table
+    'lin_housing_wi',           -- Source table
     'mlp_regress',              -- Desination table
     'x',                        -- Input features
     'y',                        -- Dependent variable
-    ARRAY[5,5],                 -- Number of units per layer
-    'step_size=0.005,
-    n_iterations=800,
+    ARRAY[40],                 -- Number of units per layer
+    'learning_rate_init=0.015,
+    learning_rate_policy=inv,
+    n_iterations=300,
     tolerance=0',
-    'sigmoid');
+    'sigmoid',
+    '',
+    False,
+    TRUE);
 
 
 SELECT assert(
-    loss < 10,
+    loss < 2,
-    'MLP: Loss is too high (> 10). Wrong result.'
+    'MLP: Loss is too high (> 2). Wrong result.'
 ) FROM mlp_regress;
 
@@ -832,14 +802,14 @@ SELECT assert(
 DROP TABLE IF EXISTS mlp_prediction_regress;
 SELECT mlp_predict(
     'mlp_regress',
-    'lin_housing_wi_scaled',
+    'lin_housing_wi',
     'id',
     'mlp_prediction_regress',
     'output');
 SELECT assert(
-    0.5*SUM(pow(mlp_prediction_regress.estimated_y-lin_housing_wi_scaled.y,2.0))/506 < 10.0,
+    0.5*SUM(pow(mlp_prediction_regress.estimated_y-lin_housing_wi.y,2.0))/506 < 2.0,
     'MLP: Predict MSE is too high (> 10). Wrong result'
 )
-FROM mlp_prediction_regress JOIN lin_housing_wi_scaled
-ON mlp_prediction_regress.id = lin_housing_wi_scaled.id;
-DROP TABLE IF EXISTS lin_housing_wi_scaled;
+FROM mlp_prediction_regress JOIN lin_housing_wi
+ON mlp_prediction_regress.id = lin_housing_wi.id;
+DROP TABLE IF EXISTS lin_housing_wi;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index b28a5f3..c1670b5 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -54,6 +54,18 @@ def is_orca():
 # ------------------------------------------------------------------------------
 
 
+def _assert_equal(o1, o2, msg):
+    """
+    @brief if the given objects are not equal, then raise an error with the message
+    @param o1           the first object
+    @param o2           the second object
+    @param msg          the error message to be reported
+    """
+    if not o1 == o2:
+        plpy.error(msg)
+# ------------------------------------------------------------------------------
+
+
 def _assert(condition, msg):
     """
     @brief if the given condition is false, then raise an error with the message

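A small usage sketch of the new _assert_equal helper; the surrounding function
name and message are hypothetical and shown only to illustrate the calling
convention:

    # Hypothetical illustration of _assert_equal; plpy.error() aborts the
    # query with the supplied message when the two values differ.
    def _check_coeff_size(coeff, expected_size):
        _assert_equal(len(coeff), expected_size,
                      "MLP error: coefficient array has length {0}, "
                      "expected {1}".format(len(coeff), expected_size))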

[2/2] incubator-madlib git commit: MLP: Add multiple enhancements

Posted by ri...@apache.org.
MLP: Add multiple enhancements

JIRA: MADLIB-1134

This commit adds the following:
 - Weights: Each tuple in training data can be individually weighted
 - Warm start: Network weights can be initialized from the output of a
    previous call.
 - n_tries: Allows calling the train function multiple times to avoid
    local minima.
 - Learning rate policy: Allows the user to specify a policy to decay the
    learning rate.
 - Standardization: Inputs are standardized to zero mean and unit standard
    deviation (see the sketch after this list).
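
A minimal Python sketch of that standardization step, mirroring the
(x - x_means) / x_stds computation visible in the mlp_igd.cpp diff below; the
sample values are illustrative, and the exact standard-deviation convention is
an implementation detail not shown here:

    import numpy as np

    X = np.array([[1.0, 200.0],
                  [2.0, 400.0],
                  [3.0, 600.0]])

    x_means = X.mean(axis=0)
    x_stds = X.std(axis=0)
    X_std = (X - x_means) / x_stds   # each column now has mean 0 and std 1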

Closes #162


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ff1b0f88
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ff1b0f88
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ff1b0f88

Branch: refs/heads/master
Commit: ff1b0f883c7a178323670b83b14069e06bf1b808
Parents: 6f6f804
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Aug 14 09:50:25 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Aug 14 09:50:25 2017 -0700

----------------------------------------------------------------------
 .gitignore                                      |   1 +
 doc/design/modules/neural-network.tex           | 144 ++-
 doc/literature.bib                              |   8 +-
 doc/mainpage.dox.in                             |   3 +-
 src/modules/convex/mlp_igd.cpp                  |  74 +-
 src/modules/convex/task/l2.hpp                  |   3 +-
 src/modules/convex/task/mlp.hpp                 | 259 ++----
 src/modules/convex/type/model.hpp               |  70 +-
 src/modules/convex/type/state.hpp               |  30 +-
 src/modules/convex/type/tuple.hpp               |   2 +-
 src/ports/postgres/modules/convex/mlp.sql_in    | 497 +++++++---
 src/ports/postgres/modules/convex/mlp_igd.py_in | 923 ++++++++++++-------
 .../postgres/modules/convex/test/mlp.sql_in     |  94 +-
 .../postgres/modules/utilities/utilities.py_in  |  12 +
 14 files changed, 1297 insertions(+), 823 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index abfccfa..00dc016 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Ignore build directory
 /build*
+/build-docker*
 
 # Ignore generated code files
 *.so

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/design/modules/neural-network.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/neural-network.tex b/doc/design/modules/neural-network.tex
index 8802361..9f8110b 100644
--- a/doc/design/modules/neural-network.tex
+++ b/doc/design/modules/neural-network.tex
@@ -22,7 +22,7 @@
 \chapter{Neural Network}
 
 \begin{moduleinfo}
-\item[Authors] {Xixuan Feng}
+\item[Authors] {Xixuan Feng, Cooper Sloan}
 \end{moduleinfo}
 
 % Abstract. What is the problem we want to solve?
@@ -30,7 +30,8 @@ This module implements artificial neural network \cite{ann_wiki}.
 
 \section{Multilayer Perceptron}
 Multilayer perceptron is arguably the most popular model among many neural network models \cite{mlp_wiki}.
-Here, we learn the coefficients by minimizing a least square objective function (\cite{bertsekas1999nonlinear}, example 1.5.3).
+Here, we learn the coefficients by minimizing a least-squares objective function for regression, or a cross-entropy objective for classification (\cite{bertsekas1999nonlinear}, example 1.5.3).
+The parallel architecture is based on the paper by Zhiheng Huang \cite{mlp_parallel}.
 
 % Background. Why can we solve the problem with gradient-based methods?
 \subsection{Solving as a Convex Program}
@@ -46,41 +47,47 @@ For multilayer perceptron, we choose incremental gradient descent (IGD).
 In the remaining part of this section, we will give a formal description of the derivation of objective function and its gradient.
 
 \paragraph{Objective function.}
-We mostly follow the notations in example 1.5.3 from Bertsekas \cite{bertsekas1999nonlinear}, for a multilayer perceptron that has $N$ layers (stages), and the $k$th stage has $n_k$ activation units ($\phi : \mathbb{R} \to \mathbb{R}$), the objective function is given as
-\[f_{(y, z)}(u) = \frac{1}{2} \|h(u, y) - z\|_2^2,\]
-where $y \in \mathbb{R}^{n_0}$ is the input vector, $z \in \mathbb{R}^{n_N}$ is the output vector,
-\footnote{Of course, the objective function can be defined over a set of input-output vector pairs, which is simply given as the addition of the above $f$.}
+We mostly follow the notation in example 1.5.3 from Bertsekas \cite{bertsekas1999nonlinear}. For a multilayer perceptron that has $N$ layers (stages), where the $k^{th}$ stage has $n_k$ activation units ($\phi : \mathbb{R} \to \mathbb{R}$), the objective function for regression is given as
+\[f_{(x, y)}(u) = \frac{1}{2} \|h(u, x) - y\|_2^2,\]
+and for classification the objective function is given as
+\[f_{(x, y)}(u) = -\sum_i \left( y_i \log(h_i(u, x)) + (1 - y_i) \log(1 - h_i(u, x)) \right),\]
+where $x \in \mathbb{R}^{n_0}$ is the input vector, $y \in \mathbb{R}^{n_N}$ is the output vector (one hot encoded for classification),~\footnote{Of course, the objective function can be defined over a set of input-output vector pairs, which is simply given as the addition of the above $f$.}
 and the coefficients are given as
-\[u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}\]
+\[u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\},\]
+and are initialized from a uniform distribution as follows:
+\[u_{k}^{sj} = \mathit{uniform}(-r,r),\]
+where $r$ is defined as follows:
+\[r = \sqrt{\frac{6}{n_k+n_{k+1}}}\]
+With regularization, an additional term enters the objective function, given as
+\[\sum_{u_k^{sj}} \frac{1}{2} \lambda \left(u_k^{sj}\right)^2 \]
 This still leaves $h : \mathbb{R}^{n_0} \to \mathbb{R}^{n_N}$ as an open item.
-Let $x_k \in \mathbb{R}^{n_k}, k = 1,...,N$ be the output vector of the $k$th layer. Then we define $h(u, y) = x_N$, based on setting $x_0 = y$ and the $j$th component of $x_k$ is given in an iterative fashion as
-\footnote{$x_k^0 \equiv 1$ is used to simplified the notations, and $x_k^0$ is not a component of $x_k$, for any $k = 0,...,N$.}
+Let $o_k \in \mathbb{R}^{n_k}, k = 1,...,N$ be the output vector of the $k^{th}$ layer. Then we define $h(u, x) = o_N$, based on setting $o_0 = x$ and the $j^{th}$ component of $o_k$ is given in an iterative fashion as~\footnote{$o_k^0 \equiv 1$ is used to simplify the notation, and $o_k^0$ is not a component of $o_k$, for any $k = 0,...,N$.}
 \[\begin{alignedat}{5}
-    x_k^j = \phi \left( \sum_{s=0}^{n_{k-1}} x_{k-1}^s u_{k-1}^{sj} \right), &\quad k = 1,...,N, \; j = 1,...,n_k
+    o_k^j = \phi \left( \sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj} \right), &\quad k = 1,...,N, \; j = 1,...,n_k
 \end{alignedat}\]
 
 \paragraph{Gradient of the End Layer.}
 Let's first handle $u_{N-1}^{st}, s = 0,...,n_{N-1}, t = 1,...,n_N$.
-Let $z^t$ denote the $t$th component of $z \in \mathbb{R}^{n_N}$, and $h^t$ the $t$th component of output of $h$.
+Let $y^t$ denote the $t^{th}$ component of $y \in \mathbb{R}^{n_N}$, and $h^t$ the $t^{th}$ component of output of $h$.
 \[\begin{aligned}
     \frac{\partial f}{\partial u_{N-1}^{st}}
-    &= \left( h^t(u, y) - z^t \right) \cdot \frac{\partial h^t(u, y)}{\partial u_{N-1}^{st}} \\
-    &= \left( x_N^t - z^t \right) \cdot \frac{\partial x_N^t}{\partial u_{N-1}^{st}} \\
-    &= \left( x_N^t - z^t \right) \cdot \frac{\partial \phi \left( \sum_{s=0}^{n_{N-1}} x_{N-1}^s u_{N-1}^{st} \right)}{\partial u_{N-1}^{st}} \\
-    &= \left( x_N^t - z^t \right) \cdot \phi' \left( \sum_{s=0}^{n_{N-1}} x_{N-1}^s u_{N-1}^{st} \right) \cdot x_{N-1}^s \\
+    &= \left( h^t(u, x) - y^t \right) \cdot \frac{\partial h^t(u, x)}{\partial u_{N-1}^{st}} \\
+    &= \left( o_N^t - y^t \right) \cdot \frac{\partial o_N^t}{\partial u_{N-1}^{st}} \\
+    &= \left( o_N^t - y^t \right) \cdot \frac{\partial \phi \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right)}{\partial u_{N-1}^{st}} \\
+    &= \left( o_N^t - y^t \right) \cdot \phi' \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right) \cdot o_{N-1}^s \\
 \end{aligned}\]
-To ease the notation, let the input vector of the $j$th activation unit of the $(k+1)$th layer be
-\[\mathit{net}_k^j =\sum_{s=0}^{n_{k-1}} x_{k-1}^s u_{k-1}^{sj},\]
-where $k = 1,...,N, \; j = 1,...,n_k$, and note that $x_k^j =\phi(\mathit{net}_k^j)$. Finally, the gradient
-\[\frac{\partial f}{\partial u_{N-1}^{st}} = \left( x_N^t - z^t \right) \cdot \phi' ( \mathit{net}_N^t ) \cdot x_{N-1}^s\]
-For any $s = 0,...,n_{N-1}, t =1,...,n_N$, we are given $z^t$, and $x_N^t, \mathit{net}_N^t, x_{N-1}^s$ can be computed by forward iterating the network layer by layer (also called the feed-forward pass). Therefore, we now know how to compute the coefficients for the end layer $u_{N-1}^{st}, s = 0,...,n_{N-1}, t =1,...,n_N$.
+To ease the notation, let the input vector of the $j^{th}$ activation unit of the $(k+1)^{th}$ layer be
+\[\mathit{net}_k^j =\sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj},\]
+where $k = 1,...,N, \; j = 1,...,n_k$, and note that $o_k^j =\phi(\mathit{net}_k^j)$. Finally, the gradient
+\[\frac{\partial f}{\partial u_{N-1}^{st}} = \left( o_N^t - y^t \right) \cdot \phi' ( \mathit{net}_N^t ) \cdot o_{N-1}^s\]
+For any $s = 0,...,n_{N-1}, t =1,...,n_N$, we are given $y^t$, and $o_N^t, \mathit{net}_N^t, o_{N-1}^s$ can be computed by forward iterating the network layer by layer (also called the feed-forward pass). Therefore, we now know how to compute the coefficients for the end layer $u_{N-1}^{st}, s = 0,...,n_{N-1}, t =1,...,n_N$.
 
 \subsubsection{Backpropagation}
 For inner (hidden) layers, it is more difficult to compute the partial derivative over the input of activation units (i.e., $\mathit{net}_k, k = 1,...,N-1$).
-That said, $\frac{\partial f}{\partial \mathit{net}_N^t} = (x_N^t - z^t) \phi'(\mathit{net}_N^t)$ is easy, where $t = 1,...,n_N$, but $\frac{\partial f}{\partial \mathit{net}_k^j}$ is hard, where $k = 1,...,N-1, j = 1,..,n_k$.
+That said, $\frac{\partial f}{\partial \mathit{net}_N^t} = (o_N^t - y^t) \phi'(\mathit{net}_N^t)$ is easy, where $t = 1,...,n_N$, but $\frac{\partial f}{\partial \mathit{net}_k^j}$ is hard, where $k = 1,...,N-1, j = 1,..,n_k$.
 This hard-to-compute statistic is referred to as \textit{delta error}, and let $\delta_k^j = \frac{\partial f}{\partial \mathit{net}_k^j}$, where $k = 1,...,N-1, j = 1,..,n_k$.
 If this is solved, the gradient can be easily computed as follow
-\[\frac{\partial f}{\partial u_{k-1}^{sj}} = \boxed{\frac{\partial f}{\partial \mathit{net}_k^j}} \cdot \frac{\partial \mathit{net}_k^j}{\partial u_{k-1}^{sj}} = \boxed{\delta_k^j} x_{k-1}^s,\]
+\[\frac{\partial f}{\partial u_{k-1}^{sj}} = \boxed{\frac{\partial f}{\partial \mathit{net}_k^j}} \cdot \frac{\partial \mathit{net}_k^j}{\partial u_{k-1}^{sj}} = \boxed{\delta_k^j} o_{k-1}^s,\]
 where $k = 1,...,N-1, s = 0,...,n_{k-1}, j = 1,..,n_k$.
 To solve this, we introduce the popular backpropagation below.
 
@@ -90,20 +97,20 @@ First,
 \[
     \delta_{k}^j
     = \frac{\partial f}{\partial \mathit{net}_{k}^j}
-    = \frac{\partial f}{\partial x_{k}^j} \cdot \frac{\partial x_{k}^j}{\partial \mathit{net}_{k}^j}
-    = \frac{\partial f}{\partial x_{k}^j} \cdot \phi'(\mathit{net}_{k}^j)
+    = \frac{\partial f}{\partial o_{k}^j} \cdot \frac{\partial o_{k}^j}{\partial \mathit{net}_{k}^j}
+    = \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j)
 \]
 And here comes the only equation that is needed but the author, I (Aaron), do not understand but it looks reasonable and repeats in different online notes \cite{mlp_gradient_wisc},
 \[\begin{alignedat}{5}
-    \frac{\partial f}{\partial x_{k}^j} = \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial x_{k}^j} \right),
+    \frac{\partial f}{\partial o_{k}^j} = \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right),
     &\quad k = 1,...,N-1, \: j = 1,...,n_{k}
 \end{alignedat}\]
 Assuming the above equation is true, we can solve delta error backward iteratively
 \[\begin{aligned}
     \delta_{k}^j
-    &= \frac{\partial f}{\partial x_{k}^j} \cdot \phi'(\mathit{net}_{k}^j) \\
-    &= \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial x_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
-    &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot \frac{\partial \left( \sum_{s=0}^{n_{k}} x_{k}^s u_{k}^{st} \right) }{\partial x_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
+    &= \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j) \\
+    &= \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
+    &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot \frac{\partial \left( \sum_{s=0}^{n_{k}} o_{k}^s u_{k}^{st} \right) }{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
     &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot u_{k}^{jt} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
 \end{aligned}\]
 To sum up, we need the following equation for error back propagation
@@ -111,20 +118,20 @@ To sum up, we need the following equation for error back propagation
 where $k = 1,...,N-1$, and $j = 1,...,n_{k}$.
 
 \subsubsection{The $\mathit{Gradient}$ Function}
-\begin{algorithm}[mlp-gradient$(u, y, z)$] \label{alg:mlp-gradient}
+\begin{algorithm}[mlp-gradient$(u, x, y)$] \label{alg:mlp-gradient}
 \alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
-start vector $y \in \mathbb{R}^{n_0}$,\\
-end vector $z \in \mathbb{R}^{n_N}$,\\
+start vector $x \in \mathbb{R}^{n_0}$,\\
+end vector $y \in \mathbb{R}^{n_N}$,\\
 activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
 \algoutput{Gradient value $\nabla f(u)$ that consists of components $\nabla f(u)_{k-1}^{sj} = \frac{\partial f}{\partial u_{k-1}^{sj}}$}
 \begin{algorithmic}[1]
-    \State $(\mathit{net}, x) \set$ \texttt{feed-forward}$(u, y, \phi)$
-    \State $\delta_N \set$ \texttt{end-layer-delta-error}$(\mathit{net}, x, z, \phi')$
+    \State $(\mathit{net}, o) \set$ \texttt{feed-forward}$(u, x, \phi)$
+    \State $\delta_N \set$ \texttt{end-layer-delta-error}$(\mathit{net}, o, y, \phi')$
     \State $\delta \set$ \texttt{error-back-propagation}$(\delta_N, \mathit{net}, u, \phi')$
     \For{$k = 1,...,N$}
         \For{$s = 0,...,n_{k-1}$}
             \For{$j = 1,...,n_k$}
-                \State $\nabla f(u)_{k-1}^{sj} \set \delta_k^j x_{k-1}^s$
+                \State $\nabla f(u)_{k-1}^{sj} \set \delta_k^j o_{k-1}^s$
                 \Comment{Can be put together with the computation of delta $\delta$}
             \EndFor
         \EndFor
@@ -138,46 +145,47 @@ Common examples of activation units are
 \[\begin{alignedat}{3}
 \phi(\xi) &= \frac{1}{1 + e^{-\xi}}, &\quad \text{ (logistic function),}\\
 \phi(\xi) &= \frac{e^{\xi} - e^{-\xi}}{e^{\xi} + e^{-\xi}}, &\quad \text{ (hyperbolic tangent function)}\\
+\phi(\xi) &= \max(\xi,0), &\quad \text{ (rectified linear function)}\\
 \end{alignedat}\]
 
-\begin{algorithm}[feed-forward$(u, y, \phi)$] \label{alg:feed-forward}
+\begin{algorithm}[feed-forward$(u, x, \phi)$] \label{alg:feed-forward}
 \alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
-input vector $y \in \mathbb{R}^{n_0}$,\\
+input vector $x \in \mathbb{R}^{n_0}$,\\
 activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
 \algoutput{Input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
-output vectors $x = \{x_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$}
+output vectors $o = \{o_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$}
 \begin{algorithmic}[1]
     \For{$k = 0,...,N$}
-        \State $x_k^0 \set 1$
+        \State $o_k^0 \set 1$
     \EndFor
-    \State $x_0 \set y$ \Comment{For all components $x_0^j, y^j, \; j = 1,...,n_0$}
+    \State $o_0 \set x$ \Comment{For all components $o_0^j, x^j, \; j = 1,...,n_0$}
     \For{$k = 1,...,N$}
         \For{$j = 1,...,n_k$}
             \State $\mathit{net}_k^j \set 0$
             \For{$s = 0,...,n_{k-1}$}
-                \State $\mathit{net}_k^j \set \mathit{net}_k^j + x_{k-1}^s u_{k-1}^{sj}$
+                \State $\mathit{net}_k^j \set \mathit{net}_k^j + o_{k-1}^s u_{k-1}^{sj}$
             \EndFor
-            \State $x_k^j = \phi(\mathit{net}_k^j)$
+            \State $o_k^j = \phi(\mathit{net}_k^j)$ \Comment{Where the activation function for the final layer is identity for regression and softmax for classification.}
         \EndFor
     \EndFor
-    \State \Return $(\mathit{net}, x)$
+    \State \Return $(\mathit{net}, o)$
 \end{algorithmic}
 \end{algorithm}
 
-\begin{algorithm}[end-layer-delta-error$(\mathit{net}, x, z, \phi')$] \label{alg:end-layer-delta-error}
+\clearpage
+\begin{algorithm}[end-layer-delta-error$(\mathit{net}, o, y, \phi')$] \label{alg:end-layer-delta-error}
 \alginput{Input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
-output vectors $x = \{x_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$,\\
-end vector $z \in \mathbb{R}^{n_N}$,\\
+output vectors $o = \{o_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$,\\
+end vector $y \in \mathbb{R}^{n_N}$,\\
 derivative of activation unit $\phi' : \mathbb{R} \to \mathbb{R}$}
 \algoutput{End layer delta $\delta_N = \{\delta_N^t \; | \; t = 1,...,n_N\}$}
 \begin{algorithmic}[1]
     \For{$t = 1,...,n_N$}
-            \State $\delta_N^t \set (x_N^t - z^t) \phi'(\mathit{net}_N^t)$
+            \State $\delta_N^t \set (o_N^t - y^t)$ \Comment{This applies for identity activation with mean square error loss, and for softmax activation with cross entropy loss}
     \EndFor
     \State \Return $\delta_N$
 \end{algorithmic}
 \end{algorithm}
-
 \begin{algorithm}[error-back-propagation$(\delta_N, \mathit{net}, u, \phi')$] \label{alg:error-back-propagation}
 \alginput{End layer delta $\delta_N = \{\delta_N^t \; | \; t = 1,...,n_N\}$,\\
 input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
@@ -197,3 +205,45 @@ derivative of activation unit $\phi' : \mathbb{R} \to \mathbb{R}$}
     \State \Return $\delta$
 \end{algorithmic}
 \end{algorithm}
+
+\begin{algorithm}[mlp-train-iteration$(X, Y, \eta)$] \label{alg:mlp-train-iteration}
+\alginput{
+start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
+end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
+learning rate $\eta$,\\}
+\algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
+\begin{algorithmic}[1]
+    \State \texttt{Randomly initialize u}
+    \For{$i = 1,...,m$}
+        \State $\nabla f(u) \set \texttt{mlp-gradient}(u,X_i,Y_i)$
+        \State $u \set u - \eta (\nabla f(u) + \lambda u)$
+    \EndFor
+    \State \Return $u$
+\end{algorithmic}
+\end{algorithm}
+
+\clearpage
+\begin{algorithm}[mlp-train-parallel$(X, Y, \eta, s, t)$] \label{alg:mlp-train-parallel}
+\alginput{
+start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
+end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
+learning rate $\eta$,\\
+segments $s$,\\
+iterations $t$,\\}
+\algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
+\begin{algorithmic}[1]
+    \State \texttt{Randomly initialize u}
+    \For{$j = 1,...,s$}
+        \State $X_j \set \texttt{subset-of-X}$
+        \State $Y_j \set \texttt{subset-of-Y}$
+    \EndFor
+    \For{$i = 1,...,t$}
+        \For{$j = 1,...,s$}
+            \State $u_j \set copy(u)$
+            \State $u_j \set \texttt{mlp-train-iteration}(X_j, Y_j, \eta)$
+        \EndFor
+        \State $u \set \texttt{weighted-avg}(u_{1...s})$
+    \EndFor
+    \State \Return $u$
+\end{algorithmic}
+\end{algorithm}

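A note on the end-layer-delta-error algorithm above: its comment states that
$\delta_N^t = o_N^t - y^t$ covers both the identity/mean-square-error case and
the softmax/cross-entropy case, but the derivation is not shown. A short worked
sketch, assuming the categorical form of the cross-entropy loss
$f = -\sum_t y^t \log o_N^t$ with softmax output and one-hot $y$ (here $[j=t]$
denotes the Iverson bracket):

\[
\frac{\partial f}{\partial \mathit{net}_N^t}
  = \sum_{j=1}^{n_N} \frac{\partial f}{\partial o_N^j} \cdot
    \frac{\partial o_N^j}{\partial \mathit{net}_N^t}
  = \sum_{j=1}^{n_N} \left(-\frac{y^j}{o_N^j}\right)
    o_N^j \left( [j=t] - o_N^t \right)
  = o_N^t \sum_{j=1}^{n_N} y^j - y^t
  = o_N^t - y^t
\]

since $\sum_j y^j = 1$ for a one-hot label. For regression with an identity
output layer and $f = \frac{1}{2}\|o_N - y\|_2^2$, the same expression follows
directly.
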
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index 225622d..6784f5e 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -953,4 +953,10 @@ Applied Survival Analysis},
 @online{bfs_wikipedia,
    title = {Breadth-first search},
    url={https://en.wikipedia.org/wiki/Breadth-first_search}
-}
\ No newline at end of file
+}
+
+@misc{mlp_parallel,
+    Url = {https://www.microsoft.com/en-us/research/publication/accelerating-recurrent-neural-network-training-via-two-stage-classes-and-parallelization/},
+    Title = {{Accelerating Recurrent Neural Network Training via Two Stage Classes and Parallelization}},
+    Author = {{Zhiheng Huang}}
+}

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index ccf58a8..e27e14a 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -183,7 +183,7 @@ Contains graph algorithms.
     @defgroup grp_crf Conditional Random Field
     @ingroup grp_super
 
-    @defgroup grp_mlp Multilayer Perceptron
+    @defgroup grp_nn Neural Network
     @ingroup grp_super
 
     @defgroup grp_regml Regression Models
@@ -202,7 +202,6 @@ Contains graph algorithms.
         @defgroup grp_robust Robust Variance
     @}
 
-
     @defgroup grp_svm Support Vector Machines
     @ingroup grp_super
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/mlp_igd.cpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/mlp_igd.cpp b/src/modules/convex/mlp_igd.cpp
index 3647d5f..9e9e665 100644
--- a/src/modules/convex/mlp_igd.cpp
+++ b/src/modules/convex/mlp_igd.cpp
@@ -29,6 +29,7 @@
 #include "mlp_igd.hpp"
 
 #include "task/mlp.hpp"
+#include "task/l2.hpp"
 #include "algo/igd.hpp"
 #include "algo/loss.hpp"
 
@@ -51,6 +52,8 @@ typedef Loss<MLPIGDState<MutableArrayHandle<double> >, MLPIGDState<ArrayHandle<d
 
 typedef MLP<MLPModel<MutableArrayHandle<double> >,MLPTuple> MLPTask;
 
+typedef MLPModel<MutableArrayHandle<double> > MLPModelType;
+
 /**
  * @brief Perform the multilayer perceptron transition step
  *
@@ -63,6 +66,7 @@ mlp_igd_transition::run(AnyType &args) {
     // For other tuples: args[0] holds the computation state until last tuple
     MLPIGDState<MutableArrayHandle<double> > state = args[0];
 
+
     // initilize the state if first tuple
     if (state.algo.numRows == 0) {
         if (!args[3].isNull()) {
@@ -74,20 +78,30 @@ mlp_igd_transition::run(AnyType &args) {
         } else {
             // configuration parameters
             ArrayHandle<double> numbersOfUnits = args[4].getAs<ArrayHandle<double> >();
+            int numberOfStages = numbersOfUnits.size() - 1;
 
             double stepsize = args[5].getAs<double>();
 
-            state.allocate(*this, numbersOfUnits.size() - 1,
+            state.allocate(*this, numberOfStages,
                            reinterpret_cast<const double *>(numbersOfUnits.ptr()));
             state.task.stepsize = stepsize;
 
 
-            int activation = args[6].getAs<int>();
-
-            int is_classification = args[7].getAs<int>();
-            state.task.model.initialize(is_classification, activation);
+            const int activation = args[6].getAs<int>();
+            const int is_classification = args[7].getAs<int>();
+
+            const bool warm_start = args[9].getAs<bool>();
+            const int n_tuples = args[11].getAs<int>();
+            const double lambda = args[12].getAs<double>();
+            state.task.lambda = lambda;
+            MLPTask::lambda = lambda;
+            double is_classification_double = (double) is_classification;
+            double activation_double = (double) activation;
+            MappedColumnVector coeff = args[10].getAs<MappedColumnVector>();
+            state.task.model.rebind(&is_classification_double,&activation_double,
+                                    &coeff.data()[0], numberOfStages,
+                                    &numbersOfUnits[0]);
         }
-
         // resetting in either case
         state.reset();
     }
@@ -96,25 +110,23 @@ mlp_igd_transition::run(AnyType &args) {
     const uint16_t N = state.task.numberOfStages;
     const double *n = state.task.numbersOfUnits;
 
+    MappedColumnVector x_means = args[13].getAs<MappedColumnVector>();
+    MappedColumnVector x_stds = args[14].getAs<MappedColumnVector>();
     // tuple
-    MappedColumnVector indVar;
+    ColumnVector indVar;
     MappedColumnVector depVar;
     try {
-        // an exception is raised in the backend if args[2] contains nulls
-        MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-        // x is a const reference, we can only rebind to change its pointer
-        indVar.rebind(x.memoryHandle(), x.size());
+        indVar = (args[1].getAs<MappedColumnVector>()-x_means).cwiseQuotient(x_stds);
         MappedColumnVector y = args[2].getAs<MappedColumnVector>();
         depVar.rebind(y.memoryHandle(), y.size());
-
     } catch (const ArrayWithNullException &e) {
         return args[0];
     }
     MLPTuple tuple;
-    tuple.indVar.rebind(indVar.memoryHandle(), indVar.size());
+    tuple.indVar = indVar;
     tuple.depVar.rebind(depVar.memoryHandle(), depVar.size());
+    tuple.weight = args[8].getAs<double>();
 
-    // Now do the transition step
     MLPIGDAlgorithm::transition(state, tuple);
     MLPLossAlgorithm::transition(state, tuple);
     state.algo.numRows ++;
@@ -130,14 +142,12 @@ mlp_igd_merge::run(AnyType &args) {
     MLPIGDState<MutableArrayHandle<double> > stateLeft = args[0];
     MLPIGDState<ArrayHandle<double> > stateRight = args[1];
 
-    // We first handle the trivial case where this function is called with one
-    // of the states being the initial state
     if (stateLeft.algo.numRows == 0) { return stateRight; }
     else if (stateRight.algo.numRows == 0) { return stateLeft; }
 
-    // Merge states together
     MLPIGDAlgorithm::merge(stateLeft, stateRight);
     MLPLossAlgorithm::merge(stateLeft, stateRight);
+
     // The following numRows update, cannot be put above, because the model
     // averaging depends on their original values
     stateLeft.algo.numRows += stateRight.algo.numRows;
@@ -154,20 +164,17 @@ mlp_igd_final::run(AnyType &args) {
     // a deep copy.
     MLPIGDState<MutableArrayHandle<double> > state = args[0];
 
-    // Aggregates that haven't seen any data just return Null.
     if (state.algo.numRows == 0) { return Null(); }
 
-    // finalizing
-    MLPIGDAlgorithm::final(state);
-
-    // Return the mean loss
+    L2<MLPModelType>::lambda = state.task.lambda;
     state.algo.loss = state.algo.loss/static_cast<double>(state.algo.numRows);
+    state.algo.loss += L2<MLPModelType>::loss(state.task.model);
+    MLPIGDAlgorithm::final(state);
 
-    // for stepsize tuning
-    std::stringstream debug;
-    debug << "loss: " << state.algo.loss;
-    elog(INFO,"%s",debug.str().c_str());
-    return state;
+    AnyType tuple;
+    tuple << state
+          << (double)state.algo.loss;
+    return tuple;
 }
 
 /**
@@ -191,10 +198,9 @@ internal_mlp_igd_result::run(AnyType &args) {
         flattenU;
     flattenU.rebind(&state.task.model.u[0](0, 0),
             state.task.model.arraySize(state.task.numberOfStages,
-                    state.task.numbersOfUnits)-2); // -2 for is_classification and activation
+                    state.task.numbersOfUnits));
     double loss = state.algo.loss;
 
-
     AnyType tuple;
     tuple << flattenU
           << loss;
@@ -204,27 +210,25 @@ internal_mlp_igd_result::run(AnyType &args) {
 AnyType
 internal_predict_mlp::run(AnyType &args) {
     MLPModel<MutableArrayHandle<double> > model;
-    MappedColumnVector indVar;
+    ColumnVector indVar;
     int is_response = args[5].getAs<int>();
+    MappedColumnVector x_means = args[6].getAs<MappedColumnVector>();
+    MappedColumnVector x_stds = args[7].getAs<MappedColumnVector>();
     MappedColumnVector coeff = args[0].getAs<MappedColumnVector>();
     MappedColumnVector layerSizes = args[4].getAs<MappedColumnVector>();
     // Input layer doesn't count
     size_t numberOfStages = layerSizes.size()-1;
-    //#TODO this should be an int not a double
     double is_classification = args[2].getAs<double>();
     double activation = args[3].getAs<double>();
     bool get_class = is_classification && is_response;
 
     model.rebind(&is_classification,&activation,&coeff.data()[0],numberOfStages,&layerSizes.data()[0]);
     try {
-        MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-        // x is a const reference, we can only rebind to change its pointer
-        indVar.rebind(x.memoryHandle(), x.size());
+        indVar = (args[1].getAs<MappedColumnVector>()-x_means).cwiseQuotient(x_stds);
     } catch (const ArrayWithNullException &e) {
         return args[0];
     }
     ColumnVector prediction = MLPTask::predict(model, indVar, get_class);
-
     return prediction;
 }
 

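For intuition, a rough Python sketch of what the transition / merge / final
aggregate above does conceptually, in the spirit of the mlp-train-parallel
algorithm from the design document: each segment takes incremental gradient
steps over its rows, partial models are combined by a row-count-weighted
average, and the final step reports the mean loss plus the L2 penalty. The
dict-based state and the grad_fn / loss_fn stubs are illustrative, not the
actual MADlib API.

    import numpy as np

    def transition(state, x, y, stepsize, grad_fn, loss_fn):
        # One IGD step on a single (already standardized) row.
        state["loss"] += loss_fn(state["model"], x, y)
        state["model"] -= stepsize * grad_fn(state["model"], x, y)
        state["num_rows"] += 1
        return state

    def merge(left, right):
        # Row-count-weighted model averaging across segments.
        n_l, n_r = left["num_rows"], right["num_rows"]
        if n_l == 0:
            return right
        if n_r == 0:
            return left
        left["model"] = (n_l * left["model"] + n_r * right["model"]) / (n_l + n_r)
        left["loss"] += right["loss"]
        left["num_rows"] = n_l + n_r
        return left

    def final(state, lam):
        # Mean loss plus 1/2 * lambda * ||w||^2 (the committed code skips the
        # bias rows when computing this norm).
        mean_loss = state["loss"] / state["num_rows"]
        return mean_loss + 0.5 * lam * float(np.sum(state["model"] ** 2))
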
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/task/l2.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/task/l2.hpp b/src/modules/convex/task/l2.hpp
index a2e7f2f..308cfd9 100644
--- a/src/modules/convex/task/l2.hpp
+++ b/src/modules/convex/task/l2.hpp
@@ -84,7 +84,8 @@ double
 L2<Model, Hessian>::loss(
         const model_type &model) {
     // 1/2 * lambda * || w ||^2
-    return lambda * model.norm()*model.norm() / 2;
+    double norm = model.norm();
+    return lambda * norm*norm / 2;
 }
 
 } // namespace convex

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/task/mlp.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/task/mlp.hpp b/src/modules/convex/task/mlp.hpp
index e66492b..0032b81 100644
--- a/src/modules/convex/task/mlp.hpp
+++ b/src/modules/convex/task/mlp.hpp
@@ -26,6 +26,8 @@
 #ifndef MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
 #define MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
 
+#include <dbconnector/dbconnector.hpp>
+
 namespace madlib {
 
 namespace modules {
@@ -46,24 +48,26 @@ public:
 
     static void gradientInPlace(
             model_type                          &model,
-            const independent_variables_type    &y,
-            const dependent_variable_type       &z,
+            const independent_variables_type    &x,
+            const dependent_variable_type       &y,
             const double                        &stepsize);
 
     static double loss(
             const model_type                    &model,
-            const independent_variables_type    &y,
-            const dependent_variable_type       &z);
+            const independent_variables_type    &x,
+            const dependent_variable_type       &y);
 
     static ColumnVector predict(
             const model_type                    &model,
-            const independent_variables_type    &y,
+            const independent_variables_type    &x,
             const bool                          get_class);
 
     const static int RELU = 0;
     const static int SIGMOID = 1;
     const static int TANH = 2;
+    static double lambda;
 
+private:
     static double sigmoid(const double &xi) {
         return 1. / (1. + std::exp(-xi));
     }
@@ -76,9 +80,6 @@ public:
         return std::tanh(xi);
     }
 
-
-private:
-
     static double sigmoidDerivative(const double &xi) {
         double value = sigmoid(xi);
         return value * (1. - value);
@@ -95,59 +96,39 @@ private:
 
     static void feedForward(
             const model_type                    &model,
-            const independent_variables_type    &y,
+            const independent_variables_type    &x,
             std::vector<ColumnVector>           &net,
-            std::vector<ColumnVector>           &x);
-
-    static void endLayerDeltaError(
-            const std::vector<ColumnVector>     &net,
-            const std::vector<ColumnVector>     &x,
-            const dependent_variable_type       &z,
-            ColumnVector                        &delta_N);
+            std::vector<ColumnVector>           &o);
 
-    static void errorBackPropagation(
-            const ColumnVector                  &delta_N,
+    static void backPropogate(
+            const ColumnVector                  &y_true,
+            const ColumnVector                  &y_estimated,
             const std::vector<ColumnVector>     &net,
             const model_type                    &model,
             std::vector<ColumnVector>           &delta);
 };
 
 template <class Model, class Tuple>
+double MLP<Model, Tuple>::lambda = 0;
+
+template <class Model, class Tuple>
 void
 MLP<Model, Tuple>::gradientInPlace(
         model_type                          &model,
-        const independent_variables_type    &y,
-        const dependent_variable_type       &z,
+        const independent_variables_type    &x,
+        const dependent_variable_type       &y_true,
         const double                        &stepsize) {
-    (void) model;
-    (void) z;
-    (void) y;
-    (void) stepsize;
-    std::vector<ColumnVector> net;
-    std::vector<ColumnVector> x;
-    std::vector<ColumnVector> delta;
-    ColumnVector delta_N;
-
-    feedForward(model, y, net, x);
-    endLayerDeltaError(net, x, z, delta_N);
-    errorBackPropagation(delta_N, net, model, delta);
-
     uint16_t N = model.u.size(); // assuming nu. of layers >= 1
-    uint16_t k, s, j;
+    uint16_t k;
+    std::vector<ColumnVector> net, o, delta;
 
-    std::vector<uint16_t> n; n.clear(); //nu. of units in each layer
+    feedForward(model, x, net, o);
+    backPropogate(y_true, o.back(), net, model, delta);
 
-    n.push_back(model.u[0].rows() - 1);
-    for (k = 1; k <= N; k ++) {
-        n.push_back(model.u[k-1].cols() - 1);
-    }
-
-    for (k=1; k <= N; k++){
-        for (s=0; s <= n[k-1]; s++){
-            for (j=1; j <= n[k]; j++){
-                model.u[k-1](s,j) -= stepsize *  (delta[k](j) * x[k-1](s));
-            }
-        }
+    for (k=0; k < N; k++){
+        Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k];
+        regularization.row(0).setZero(); // Do not update bias
+        model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
     }
 }
 
@@ -155,54 +136,40 @@ template <class Model, class Tuple>
 double
 MLP<Model, Tuple>::loss(
         const model_type                    &model,
-        const independent_variables_type    &y,
-        const dependent_variable_type       &z) {
+        const independent_variables_type    &x,
+        const dependent_variable_type       &y_true) {
     // Here we compute the loss. In the case of regression we use sum of square errors
     // In the case of classification the loss term is cross entropy.
-    std::vector<ColumnVector> net;
-    std::vector<ColumnVector> x;
-
-    feedForward(model, y, net, x);
-    double loss = 0.;
-    uint16_t j;
-
-    for (j = 1; j < z.rows() + 1; j ++) {
-        if(model.is_classification){
-            // Cross entropy: RHS term is negative
-            loss -= z(j-1)*std::log(x.back()(j)) + (1-z(j-1))*std::log(1-x.back()(j));
-        }else{
-            double diff = x.back()(j) - z(j-1);
-            loss += diff * diff;
-        }
+    std::vector<ColumnVector> net, o;
+    feedForward(model, x, net, o);
+    ColumnVector y_estimated = o.back();
+
+    if(model.is_classification){
+        double clip = 1.e-10;
+        y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip);
+        return - (y_true.array()*y_estimated.array().log()
+               + (-y_true.array()+1)*(-y_estimated.array()+1).log()).sum();
     }
-    if(!model.is_classification){
-        loss /= 2.;
-    }else{
-        loss /= z.rows();
+    else{
+        return 0.5 * (y_estimated-y_true).squaredNorm();
     }
-    return loss;
 }
 
 template <class Model, class Tuple>
 ColumnVector
 MLP<Model, Tuple>::predict(
         const model_type                    &model,
-        const independent_variables_type    &y,
-        const bool                          get_class
-        ) {
-    (void) model;
-    (void) y;
-    std::vector<ColumnVector> net;
-    std::vector<ColumnVector> x;
-
-    feedForward(model, y, net, x);
-    // Don't return the offset
-    ColumnVector output = x.back().tail(x.back().size()-1);
-    if(get_class){
+        const independent_variables_type    &x,
+        const bool                          get_class) {
+    std::vector<ColumnVector> net, o;
+
+    feedForward(model, x, net, o);
+    ColumnVector output = o.back();
+    if(get_class){ // Return a length 1 array with the predicted index
         int max_idx;
         output.maxCoeff(&max_idx);
         output.resize(1);
-        output[0] = (double)max_idx;
+        output[0] = (double) max_idx;
     }
     return output;
 }
@@ -212,113 +179,65 @@ template <class Model, class Tuple>
 void
 MLP<Model, Tuple>::feedForward(
         const model_type                    &model,
-        const independent_variables_type    &y,
+        const independent_variables_type    &x,
         std::vector<ColumnVector>           &net,
-        std::vector<ColumnVector>           &x){
-    // meta data and x_k^0 = 1
-    uint16_t k, j, s;
-    uint16_t N = model.u.size(); // assuming >= 1
+        std::vector<ColumnVector>           &o){
+    uint16_t k, N;
+    N = model.u.size(); // assuming >= 1
     net.resize(N + 1);
-    x.resize(N + 1);
-
-    std::vector<uint16_t> n; n.clear();
-    n.push_back(model.u[0].rows() - 1);
-    x[0].resize(n[0] + 1);
-    x[0](0) = 1.;
-    for (k = 1; k <= N; k ++) {
-        n.push_back(model.u[k-1].cols() - 1);
-        net[k].resize(n[k] + 1);
-        x[k].resize(n[k] + 1);
-        // Bias
-        x[k](0) = 1.;
-    }
+    o.resize(N + 1);
+
+    double (*activation)(const double&);
+    if(model.activation==RELU)
+        activation = &relu;
+    else if(model.activation==SIGMOID)
+        activation = &sigmoid;
+    else
+        activation = &tanh;
 
-    // y is a mapped parameter from DB, aligning with x here
-    for (j = 1; j <= n[0]; j ++) { x[0](j) = y(j-1); }
+    o[0].resize(x.size()+1);
+    o[0] << 1.,x;
 
     for (k = 1; k < N; k ++) {
-        for (j = 1; j <= n[k]; j ++) {
-            net[k](j) = 0.;
-            for (s = 0; s <= n[k-1]; s ++) {
-                net[k](j) += x[k-1](s) * model.u[k-1](s, j);
-            }
-            if(model.activation==RELU)
-                x[k](j) = relu(net[k](j));
-            else if(model.activation==SIGMOID)
-                x[k](j) = sigmoid(net[k](j));
-            else
-                x[k](j) = tanh(net[k](j));
-        }
+        net[k] = model.u[k-1].transpose() * o[k-1];
+        o[k] = ColumnVector(model.u[k-1].cols()+1);
+        o[k] << 1., net[k].unaryExpr(activation);
     }
+    o[N] = model.u[N-1].transpose() * o[N-1];
 
-    // output layer computation
-    for (j = 1; j <= n[N]; j ++) {
-        x[N](j) = 0.;
-        for (s = 0; s <= n[N-1]; s ++) {
-            x[N](j) += x[N-1](s) * model.u[N-1](s, j);
-        }
-    }
     // Numerically stable calculation of softmax
-    ColumnVector last_x = x[N].tail(n[N]);
     if(model.is_classification){
-        double max_x = last_x.maxCoeff();
-        last_x = (last_x.array() - max_x).exp();
-        last_x /= last_x.sum();
+        double max_x = o[N].maxCoeff();
+        o[N] = (o[N].array() - max_x).exp();
+        o[N] /= o[N].sum();
     }
-    x[N].tail(n[N]) = last_x;
 }
 
 template <class Model, class Tuple>
 void
-MLP<Model, Tuple>::endLayerDeltaError(
-        const std::vector<ColumnVector>     &net,
-        const std::vector<ColumnVector>     &x,
-        const dependent_variable_type       &z,
-        ColumnVector                        &delta_N) {
-    //meta data
-    uint16_t t;
-    uint16_t N = x.size() - 1; // assuming >= 1
-    uint16_t n_N = x[N].rows() - 1;
-    delta_N.resize(n_N + 1);
-
-    for (t = 1; t <= n_N; t ++) {
-		delta_N(t) = (x[N](t) - z(t-1));
-    }
-}
-
-template <class Model, class Tuple>
-void
-MLP<Model, Tuple>::errorBackPropagation(
-        const ColumnVector                  &delta_N,
+MLP<Model, Tuple>::backPropogate(
+        const ColumnVector                  &y_true,
+        const ColumnVector                  &y_estimated,
         const std::vector<ColumnVector>     &net,
         const model_type                    &model,
         std::vector<ColumnVector>           &delta) {
-    // meta data
-    uint16_t k, j, t;
-    uint16_t N = model.u.size(); // assuming >= 1
-    delta.resize(N + 1);
-
-    std::vector<uint16_t> n; n.clear();
-    n.push_back(model.u[0].rows() - 1);
-    for (k = 1; k <= N; k ++) {
-        n.push_back(model.u[k-1].cols() - 1);
-        delta[k].resize(n[k]+1);
-    }
-    delta[N] = delta_N;
-
+    uint16_t k, N;
+    N = model.u.size(); // assuming >= 1
+    delta.resize(N);
+
+    double (*activationDerivative)(const double&);
+    if(model.activation==RELU)
+        activationDerivative = &reluDerivative;
+    else if(model.activation==SIGMOID)
+        activationDerivative = &sigmoidDerivative;
+    else
+        activationDerivative = &tanhDerivative;
+
+    delta.back() = y_estimated - y_true;
     for (k = N - 1; k >= 1; k --) {
-        for (j = 0; j <= n[k]; j ++) {
-            delta[k](j) = 0.;
-            for (t = 1; t <= n[k+1]; t ++) {
-                delta[k](j) += delta[k+1](t) * model.u[k](j, t);
-            }
-            if(model.activation==RELU)
-                delta[k](j) = delta[k](j) * reluDerivative(net[k](j));
-            else if(model.activation==SIGMOID)
-                delta[k](j) = delta[k](j) * sigmoidDerivative(net[k](j));
-            else
-                delta[k](j) = delta[k](j) * tanhDerivative(net[k](j));
-        }
+        // Do not include the bias terms
+        delta[k-1] = model.u[k].bottomRows(model.u[k].rows()-1) * delta[k];
+        delta[k-1] = delta[k-1].array() * net[k].unaryExpr(activationDerivative).array();
     }
 }
 

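As a reading aid for the Eigen code above, a NumPy transcription of the
vectorized feed-forward, backpropagation and gradient step is sketched below.
Layer matrices u[k] have shape (n_k + 1, n_{k+1}) with row 0 holding the bias
weights, matching model.u[k]; this is a sketch, not the actual implementation.

    import numpy as np

    def feed_forward(u, x, activation, is_classification):
        N = len(u)
        o = [np.concatenate(([1.0], x))]
        net = [None]                          # net[k] defined for k = 1..N-1
        for k in range(1, N):
            net_k = u[k - 1].T @ o[k - 1]
            net.append(net_k)
            o.append(np.concatenate(([1.0], activation(net_k))))
        out = u[N - 1].T @ o[N - 1]
        if is_classification:                 # numerically stable softmax
            out = np.exp(out - out.max())
            out /= out.sum()
        o.append(out)
        return net, o

    def back_propagate(y_true, y_est, net, u, activation_deriv):
        N = len(u)
        delta = [None] * N
        delta[N - 1] = y_est - y_true         # end-layer delta: o_N - y
        for k in range(N - 1, 0, -1):
            # Drop the bias row of u[k] when propagating the error backwards.
            d = u[k][1:, :] @ delta[k]
            delta[k - 1] = d * activation_deriv(net[k])
        return delta

    def sgd_step(u, x, y, stepsize, lam, activation, activation_deriv,
                 is_classification):
        net, o = feed_forward(u, x, activation, is_classification)
        delta = back_propagate(y, o[-1], net, u, activation_deriv)
        for k in range(len(u)):
            reg = lam * u[k]
            reg[0, :] = 0.0                   # do not regularize the bias row
            u[k] -= stepsize * (np.outer(o[k], delta[k]) + reg)

For example, activation=np.tanh with activation_deriv=lambda v: 1.0 - np.tanh(v)**2
corresponds to the tanh branch of the dispatch above.
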
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/model.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/model.hpp b/src/modules/convex/type/model.hpp
index 9b68af8..679dab4 100644
--- a/src/modules/convex/type/model.hpp
+++ b/src/modules/convex/type/model.hpp
@@ -121,51 +121,9 @@ struct MLPModel {
         const double *n = inNumbersOfUnits;
         size_t k;
         for (k = 1; k <= N; k ++) {
-            size += (n[k-1] + 1) * (n[k] + 1);
-        }
-        return 1 +       // is_classification
-               1 +       // activation
-               size;     // weights (u)
-    }
-
-    /**
-     * @brief Initialize the model randomly
-     */
-    void initialize(int is_classification_in, int activation_in) {
-        is_classification = is_classification_in;
-        activation = activation_in;
-        // using madlib::dbconnector::$database::NativeRandomNumberGenerator
-        NativeRandomNumberGenerator rng;
-
-        // Scaling factor for weight initialization
-        double epsilon = 0.0001;
-
-
-        double base = rng.min();
-        double span = rng.max() - base;
-
-        uint16_t N = u.size(); // assuming nu. of layers >= 1
-        uint16_t k, s, j;
-
-        std::vector<uint16_t> n; n.clear(); //nu. of units in each layer
-
-        n.push_back(u[0].rows() - 1);
-        for (k = 1; k <= N; k ++) {
-            n.push_back(u[k-1].cols() - 1);
-        }
-
-        for (k=1; k <= N; k++){
-            for (s=0; s <= n[k-1]; s++){
-                u[k-1](s,0)=1;
-                for (j=1; j <= n[k]; j++){
-                    // Generate normal(0,epsilon) value using Box-Muller transform
-                    double u1 = (rng()-base)/span;
-                    double u2 = (rng()-base)/span;
-                    double z = std::sqrt(-2*std::log(u1))*std::cos(2*M_PI*u2);
-                    u[k-1](s,j) = epsilon*z;
-                }
-            }
+            size += (n[k-1] + 1) * (n[k]);
         }
+        return size;     // weights (u)
     }
 
     uint32_t rebind(const double *is_classification_in,
@@ -185,20 +143,38 @@ struct MLPModel {
         for (k = 1; k <= N; k ++) {
             u.push_back(Eigen::Map<Matrix >(
                     const_cast<double*>(data + sizeOfU),
-                    n[k-1] + 1, n[k] + 1));
-            sizeOfU += (n[k-1] + 1) * (n[k] + 1);
+                    n[k-1] + 1, n[k]));
+            sizeOfU += (n[k-1] + 1) * (n[k]);
         }
 
         return sizeOfU;
     }
 
+    double norm() const {
+        double norm = 0.;
+        size_t k;
+        for (k = 0; k < u.size(); k ++) {
+            norm+=u[k].bottomRows(u[k].rows()-1).squaredNorm();
+        }
+        return std::sqrt(norm);
+    }
+
+    void setZero(){
+        size_t k;
+        for (k = 1; k <= u.size(); k ++) {
+            u[k-1].setZero();
+        }
+    }
+
     /*
      *  Some operator wrappers for u.
      */
     MLPModel &operator*=(const double &c) {
+        // Note that when scaling the model, you should
+        // not update the bias.
         size_t k;
         for (k = 1; k <= u.size(); k ++) {
-            u[k-1] *= c;
+           u[k-1] *= c;
         }
 
         return *this;

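Two details of the model layout above, restated as a small Python sketch for
clarity: the coefficient storage now sizes each layer as (n_{k-1} + 1) x n_k
(no padding column), and the norm used for regularization skips the bias row of
every layer. The layer sizes below are illustrative only.

    import numpy as np

    def coeff_array_size(layer_sizes):
        # Matches arraySize(): sum over layers of (n_{k-1} + 1) * n_k.
        return sum((layer_sizes[k - 1] + 1) * layer_sizes[k]
                   for k in range(1, len(layer_sizes)))

    def model_norm(u):
        # Matches MLPModel::norm(): L2 norm over all weights, excluding the
        # first (bias) row of every layer.
        return np.sqrt(sum(np.sum(layer[1:, :] ** 2) for layer in u))

    print(coeff_array_size([4, 5, 3]))   # (4+1)*5 + (5+1)*3 = 43
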
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/state.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/state.hpp b/src/modules/convex/type/state.hpp
index 66f5023..2cb2643 100644
--- a/src/modules/convex/type/state.hpp
+++ b/src/modules/convex/type/state.hpp
@@ -629,6 +629,9 @@ public:
         return 1                        // numberOfStages = N
             + (inNumberOfStages + 1)    // numbersOfUnits: size is (N + 1)
             + 1                         // stepsize
+            + 1                         // lambda
+            + 1                         // is_classification
+            + 1                         // activation
             + sizeOfModel               // model
 
             + 1                         // numRows
@@ -645,17 +648,16 @@ private:
      * - 0: numberOfStages (number of stages (layers), design doc: N)
      * - 1: numbersOfUnits (numbers of activation units, design doc: n_0,...,n_N)
      * - N + 2: stepsize (step size of gradient steps)
-     * - N + 3: is_classification (do classification)
-     * - N + 4: activation (activation function)
-     * - N + 5: coeff (coefficients, design doc: u)
+     * - N + 3: lambda (regularization term)
+     * - N + 4: is_classification (do classification)
+     * - N + 5: activation (activation function)
+     * - N + 6: coeff (coefficients, design doc: u)
      *
      * Intra-iteration components (updated in transition step):
      *   sizeOfModel = # of entries in u + 2, (\sum_1^N n_{k-1} n_k)
-     * - N + 3 + sizeOfModel: numRows (number of rows processed in this iteration)
-     * - N + 4 + sizeOfModel: loss (loss value, the sum of squared errors)
-     * - N + 5 + sizeOfModel: is_classification (do classification)
-     * - N + 6 + sizeOfModel: activation (activation function)
-     * - N + 7 + sizeOfModel: coeff (volatile model for incrementally update)
+     * - N + 6 + sizeOfModel: coeff (volatile model for incrementally update)
+     * - N + 6 + 2*sizeOfModel: numRows (number of rows processed in this iteration)
+     * - N + 7 + 2*sizeOfModel: loss (loss value, the sum of squared errors)
      */
     void rebind() {
         task.numberOfStages.rebind(&mStorage[0]);
@@ -663,13 +665,14 @@ private:
         task.numbersOfUnits =
             reinterpret_cast<dimension_pointer_type>(&mStorage[1]);
         task.stepsize.rebind(&mStorage[N + 2]);
-        uint32_t sizeOfModel = task.model.rebind(&mStorage[N + 3],&mStorage[N + 4],&mStorage[N + 5],
+        task.lambda.rebind(&mStorage[N + 3]);
+        uint32_t sizeOfModel = task.model.rebind(&mStorage[N + 4],&mStorage[N + 5],&mStorage[N + 6],
                 task.numberOfStages, task.numbersOfUnits);
 
-        algo.numRows.rebind(&mStorage[N + 5 + sizeOfModel]);
-        algo.loss.rebind(&mStorage[N + 6 + sizeOfModel]);
-        algo.incrModel.rebind(&mStorage[N + 3],&mStorage[N + 4],&mStorage[N + 7 + sizeOfModel],
+        algo.incrModel.rebind(&mStorage[N + 4],&mStorage[N + 5],&mStorage[N + 6 + sizeOfModel],
                 task.numberOfStages, task.numbersOfUnits);
+        algo.numRows.rebind(&mStorage[N + 6 + 2*sizeOfModel]);
+        algo.loss.rebind(&mStorage[N + 7 + 2*sizeOfModel]);
 
     }
 
@@ -685,13 +688,14 @@ public:
         dimension_type numberOfStages;
         dimension_pointer_type numbersOfUnits;
         numeric_type stepsize;
+        numeric_type lambda;
         MLPModel<Handle> model;
     } task;
 
     struct AlgoState {
+        MLPModel<Handle> incrModel;
         count_type numRows;
         numeric_type loss;
-        MLPModel<Handle> incrModel;
     } algo;
 };
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/tuple.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/tuple.hpp b/src/modules/convex/type/tuple.hpp
index 4b9c55e..824ed90 100644
--- a/src/modules/convex/type/tuple.hpp
+++ b/src/modules/convex/type/tuple.hpp
@@ -64,7 +64,7 @@ typedef ExampleTuple<MappedColumnVector, double> GLMTuple;
 // madlib::modules::convex::MatrixIndex
 typedef ExampleTuple<MatrixIndex, double> LMFTuple;
 
-typedef ExampleTuple<MappedColumnVector, MappedColumnVector> MLPTuple;
+typedef ExampleTuple<ColumnVector, MappedColumnVector> MLPTuple;
 
 } // namespace convex
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/ports/postgres/modules/convex/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index 400f892..6b9d828 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -29,23 +29,23 @@
 m4_include(`SQLCommon.m4')
 
 /**
-@addtogroup grp_mlp
+@addtogroup grp_nn
 
 <div class="toc"><b>Contents</b><ul>
 <li class="level1"><a href="#mlp_classification">Classification</a></li>
 <li class="level1"><a href="#mlp_regression">Regression</a></li>
-<li class="level1"><a href="#optimization_params">Optimizer Parameters</a></li>
-<li class="level1"><a href="#predict">Prediction Functions/a></li>
+<li class="level1"><a href="#optimizer_params">Optimizer Parameters</a></li>
+<li class="level1"><a href="#predict">Prediction Functions</a></li>
 <li class="level1"><a href="#example">Examples</a></li>
 <li class="level1"><a href="#background">Technical Background</a></li>
 <li class="level1"><a href="#literature">Literature</a></li>
 <li class="level1"><a href="#related">Related Topics</a></li>
 </ul></div>
 
-Multilayer Perceptron (MLP) is a model for regression and
-classification.
+Multilayer Perceptron (MLP) is a type of neural network that can be
+used for regression and classification.
 
-Also called "vanilla neural networks", they consist of several
+Also called "vanilla neural networks", MLPs consist of several
 fully connected hidden layers with non-linear activation
 functions.  In the case of classification, the final layer of the
 neural net has as many nodes as classes, and the output of the
@@ -67,7 +67,8 @@ mlp_classification(
     dependent_varname,
     hidden_layer_sizes,
     optimizer_params,
-    activation
+    activation,
+    weights
     )
 </pre>
 \b Arguments
@@ -75,6 +76,7 @@ mlp_classification(
   <DT>source_table</DT>
   <DD>TEXT. Name of the table containing the training data.</DD>
 
+
   <DT>output_table</DT>
   <DD>TEXT. Name of the output table containing the model. Details of the output
    tables are provided below.
@@ -83,19 +85,22 @@ mlp_classification(
   <DT>independent_varname</DT>
   <DD>TEXT. Expression list to evaluate for the
     independent variables. An intercept variable should not be included as part
-    of this expression. Please note that expression should be able to be cast
-    to DOUBLE PRECISION[].
+    of this expression. <b>Please note that the expression should be encoded properly.</b>
+    All values are cast to DOUBLE PRECISION, so categorical variables should be
+    one-hot or dummy encoded.  See <a href="group__grp__encode__categorical.html">here</a>
+    for more details.
   </DD>
 
+
   <DT>dependent_varname</DT>
   <DD> TEXT. Name of the dependent variable column. For classification, supported types are:
   text, varchar, character varying, char, character
   integer, smallint, bigint, and boolean.  </DD>
 
-  <DT>hidden_layer_sizes (optional)</DT>
-  <DD>INTEGER[], default: ARRAY[].
+  <DT>hidden_layer_sizes</DT>
+  <DD>INTEGER[].
   The number of neurons in each hidden layer.  The length of this array will
-  determine the number of hidden layers.  Empty for no hidden layers.
+  determine the number of hidden layers.  NULL for no hidden layers.
   </DD>
 
 
@@ -111,6 +116,25 @@ mlp_classification(
     'relu', and 'tanh'. The text can be any prefix of the three
     strings; for e.g., activation='s' will use the sigmoid activation.
   </DD>
+
+
+  <DT>weights (optional)</DT>
+  <DD>TEXT, default: NULL.
+    Weights for input rows. Column name which specifies the weight for each input row.
+    This weight will be incorporated into the update during SGD, and will not be used
+    for loss calculations. If not specified, weight for each row will default to 1.
+    Column should be a numeric type.
+  </DD>
+
+  <DT>warm_start (optional)</DT>
+  <DD>BOOLEAN, default: FALSE.
+    Initialize weights with the coefficients from the last call.  If true, weights will
+    be initialized from output_table. Note that all parameters other than optimizer_params
+    and verbose must remain constant between calls that use warm_start.
+  </DD>
+
+  <DT>verbose (optional)</DT>
+  <DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training.</DD>
 </DL>
 
 <b>Output tables</b>
@@ -142,24 +166,28 @@ A summary table named \<output_table\>_summary is also created, which has the fo
         <td>The source table.</td>
     </tr>
     <tr>
-        <th>dependent_varname</th>
-        <td>The dependent variable.</td>
-    </tr>
-    <tr>
         <th>independent_varname</th>
         <td>The independent variables.</td>
     </tr>
     <tr>
+        <th>dependent_varname</th>
+        <td>The dependent variable.</td>
+    </tr>
+    <tr>
         <th>tolerance</th>
         <td>The tolerance as given in optimizer_params.</td>
     </tr>
     <tr>
-        <th>step_size</th>
-        <td>The step size as given in optimizer_params.</td>
+        <th>learning_rate_init</th>
+        <td>The initial learning rate as given in optimizer_params.</td>
+    </tr>
+    <tr>
+        <th>learning_rate_policy</th>
+        <td>The learning rate policy as given in optimizer_params.</td>
     </tr>
     <tr>
         <th>n_iterations</th>
-        <td>The number of iterations run</td>
+        <td>The number of iterations run.</td>
     </tr>
     <tr>
         <th>n_tries</th>
@@ -170,17 +198,29 @@ A summary table named \<output_table\>_summary is also created, which has the fo
         <td>The number of units in each layer including the input and output layer.</td>
     </tr>
     <tr>
-        <th>activation_function</th>
+        <th>activation</th>
         <td>The activation function.</td>
     </tr>
     <tr>
         <th>is_classification</th>
         <td>True if the model was trained for classification, False if it was trained
-        for regression</td>
+        for regression.</td>
     </tr>
     <tr>
         <th>classes</th>
-        <td>The classes which were trained against (empty for regression)</td>
+        <td>The classes which were trained against (empty for regression).</td>
+    </tr>
+    <tr>
+        <th>weights</th>
+        <td>The weight column used during training.</td>
+    </tr>
+    <tr>
+        <th>x_means</th>
+        <td>The mean for all input features (used for normalization).</td>
+    </tr>
+    <tr>
+        <th>x_stds</th>
+        <td>The standard deviation for all input features (used for normalization).</td>
     </tr>
 
    </table>
@@ -197,7 +237,10 @@ mlp_regression(source_table,
     dependent_varname,
     hidden_layer_sizes,
     optimizer_params,
-    activation
+    activation,
+    weights,
+    warm_start,
+    verbose
     )
 </pre>
 
@@ -205,7 +247,7 @@ mlp_regression(source_table,
 
 Specifications for regression are largely the same as for classification. In the
 model table, the loss will refer to mean square error instead of cross entropy. In the
-summary table, there is classes column. The following
+summary table, there is no classes column. The following
 arguments have specifications which differ from mlp_classification:
 <DL class="arglist">
 <DT>dependent_varname</DT>
@@ -226,7 +268,7 @@ the parameter is ignored.
 
 
 <pre class="syntax">
-  'step_size = &lt;value>,
+  'learning_rate_init = &lt;value>,
    n_iterations = &lt;value>,
    n_tries = &lt;value>,
    tolerance = &lt;value>'
@@ -234,27 +276,57 @@ the parameter is ignored.
 \b Optimizer Parameters
 <DL class="arglist">
 
-<DT>step_size</dt>
-<DD>Default: [0.001].
+<DT>learning_rate_init</dt>
+<DD>Default: 0.001.
 Also known as the learning rate. A small value is usually desirable to
 ensure convergence, while a large value provides more room for progress during
 training. Since the best value depends on the condition number of the data, in
 practice one often tunes this parameter.
 </DD>
 
+<DT>learning_rate_policy</dt>
+<DD>Default: constant.
+One of 'constant', 'exp', 'inv', or 'step', or any prefix of these.
+'constant': learning_rate = learning_rate_init
+'exp': learning_rate = learning_rate_init * gamma^(iter)
+'inv': learning_rate = learning_rate_init * (iter+1)^(-power)
+'step': learning_rate = learning_rate_init * gamma^(floor(iter/iterations_per_step))
+where iter is the current iteration of SGD.  A hedged sketch of these schedules
+is given after this parameter list.
+</DD>
+
+<DT>gamma</dt>
+<DD>Default: 0.1.
+Decay rate for learning rate when learning_rate_policy is 'exp' or 'step'.
+</DD>
+
+<DT>power</dt>
+<DD>Default: 0.5.
+Exponent for learning_rate_policy = 'inv'.
+</DD>
+
+<DT>iterations_per_step</dt>
+<DD>Default: 100.
+Number of iterations to run before decreasing the learning rate by
+a factor of gamma.  Valid for learning_rate_policy = 'step'.
+</DD>
 
 <DT>n_iterations</dt>
 <DD>Default: 100. The maximum number of iterations allowed.
 </DD>
+
 <DT>n_tries</dt>
 <DD>Default: [1]. Number of times to retrain the network with randomly initialized
-weights
+weights.
+</DD>
+
+<DT>lambda</dt>
+<DD>Default: 0. The regularization coefficient for L2 regularization.
 </DD>
 
 <DT>tolerance</dt>
 <DD>Default: 0.001. The criterion to end iterations. The training stops whenever
-<the difference between the training models of two consecutive iterations is
-<smaller than \e tolerance or the iteration number is larger than \e max_iter.
+the difference between the training models of two consecutive iterations is
+smaller than \e tolerance or the iteration number is larger than \e n_iterations.
 </DD>
 
 </DL>
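+
+To illustrate these policies, here is a hedged sketch (not part of the module)
+that computes the schedules directly in SQL, assuming learning_rate_init=0.001,
+gamma=0.1, power=0.5 and iterations_per_step=100:
+<pre class="example">
+-- Learning rate at iterations 0, 100, ..., 500 under each policy
+SELECT iter,
+       0.001                                    AS lr_constant,
+       0.001 * power(0.1, iter)                 AS lr_exp,
+       0.001 * power(iter + 1, -0.5)            AS lr_inv,
+       0.001 * power(0.1, floor(iter / 100.0))  AS lr_step
+FROM generate_series(0, 500, 100) AS iter;
+</pre>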
@@ -293,19 +365,19 @@ table name is already in use, then an error is returned.  Table contains:</DD>
         <td>Gives the 'id' for each prediction, corresponding to each row from the data_table.</td>
       </tr>
       <tr>
-        <th>estimated_<COL_NAME></th>
+        <th>estimated_COL_NAME</th>
         <td>
         (For pred_type='response') The estimated class
          for classification or value for regression, where
-         <COL_NAME> is the name of the column to be
-         predicted from training data
+         COL_NAME is the name of the column to be
+         predicted from training data.
         </td>
       </tr>
       <tr>
-        <th>prob_<CLASS></th>
+        <th>prob_CLASS</th>
         <td>
         (For pred_type='prob' for classification) The
-        probability of a given class <CLASS> as given by
+        probability of a given class CLASS as given by
         softmax. There will be one column for each class
         in the training data.
         </td>
@@ -315,10 +387,10 @@ table name is already in use, then an error is returned.  Table contains:</DD>
   <DT>pred_type</DT>
   <DD>TEXT.
 
-the type of output requested:
+The type of output requested:
 'response' gives the actual prediction,
 'prob' gives the probability of each class.
-for regression, only type='response' is defined.
+For regression, only pred_type='response' is defined.
 </DD>
 </DL>
 </table>
@@ -363,30 +435,36 @@ The model will be written to mlp_model.
 <pre class="example">
 DROP TABLE IF EXISTS mlp_model;
 DROP TABLE IF EXISTS mlp_model_summary;
+-- Set seed so results are reproducible
+SELECT setseed(0);
 SELECT madlib.mlp_classification(
     'iris_data',      -- Source table
     'mlp_model',      -- Destination table
     'attributes',     -- Input features
     'class_text',     -- Label
     ARRAY[5],         -- Number of units per layer
-    'step_size=0.003,
-    n_iterations=5000,
+    'learning_rate_init=0.003,
+    n_iterations=500,
     tolerance=0',     -- Optimizer params
-    'tanh');          -- Activation function
+    'tanh',           -- Activation function
+    NULL,             -- Default weight (1)
+    FALSE,            -- No warm start
+    TRUE              -- Verbose
+);
 </pre>
 -# View the result for the model.
 <pre class="example">
 -- Set extended display on for easier reading of output
 \\x ON
--- Neural net Initialization is non-deterministic, so your results may vary
+-- Results may vary depending on platform
 SELECT * FROM mlp_model;
 </pre>
 Result:
 <pre class="result">
--[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-coeff          | {1,1,1,1,1,0.136374930803,0.188739676875,0.662387810001,-1.03381622734,-0.469961067046,0.0614006983397,0.0811504589436,0.299008228258,-0.47391918521,-0.215098143699,0.10519213944,0.145844617525,0.511683525606,-0.800215552382,-0.36417142683,0.120751709056,0.167531106521,0.587074895969,-0.916946198095,-0.417055067449,0.0539541885146,0.0694359704131,0.262598585854,-0.419234805076,-0.189915344282,1,1,1,1,1,1,0.105645702152,1.46247470474,0.484457903226,0.965962824478,1.19361986431,0.419805760087,-0.105696503487,-1.46245956666,-0.484427811691,-0.965730981426,-1.19365280555,-0.419973628863}
-loss           | 0.0184092375519
-num_iterations | 5000
+-[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coeff          | {-0.172392477419,-0.0836446652758,-0.0162194484142,-0.647268294231,-0.504884325538,0.184825723596,0.351728174731,-0.601148967035,0.720999542651,0.26521898248,0.245760922013,0.264645322438,-0.349957739904,0.797653395667,0.725747963566,-0.344498001796,0.261481840947,0.329074383545,0.379503434339,-0.267398086353,-0.0238069072658,0.330239268187,-0.178736289201,-0.0563356339946,-0.0333791780453,0.262137386864,0.491390436498,-1.02635831573,-1.29541478382,0.246017274,-0.0623575215434,0.0826297373887,-0.671671189842,0.853494672576,1.21671423502,0.296424359217,0.15294606861}
+loss           | 0.0136695756314
+num_iterations | 500
 </pre>
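+-# Continue training the same model using warm start. This is a hedged sketch:
+it reuses the existing mlp_model table (do not drop it first) and keeps every
+argument other than the optimizer parameters identical to the previous call.
+<pre class="example">
+SELECT madlib.mlp_classification(
+    'iris_data',      -- Source table
+    'mlp_model',      -- Existing model table, reused for warm start
+    'attributes',     -- Input features
+    'class_text',     -- Label
+    ARRAY[5],         -- Number of units per layer (must match previous call)
+    'learning_rate_init=0.003,
+    n_iterations=100,
+    tolerance=0',     -- Optimizer params (may change between calls)
+    'tanh',           -- Activation function (must match previous call)
+    NULL,             -- Default weight (1)
+    TRUE,             -- Warm start from mlp_model
+    FALSE             -- Verbose
+);
+</pre>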
 -# Next, train a regression model.  First create some test data.  This dataset
 contains housing price data.
@@ -419,30 +497,36 @@ COPY lin_housing (x, grp_by_col, y) FROM STDIN NULL '?' DELIMITER '|';
 <pre class="example">
 DROP TABLE IF EXISTS mlp_regress;
 DROP TABLE IF EXISTS mlp_regress_summary;
+SELECT setseed(0);
 SELECT madlib.mlp_regression(
-    'lin_housing',            -- Source table
-    'mlp_regress',              -- Desination table
-    'x',                        -- Input features
-    'y',                        -- Dependent variable
-    ARRAY[5,5],                 -- Number of units per layer
-    'step_size=0.000007,
-    n_iterations=10000,
+    'lin_housing',         -- Source table
+    'mlp_regress',         -- Destination table
+    'x',                   -- Input features
+    'y',                   -- Dependent variable
+    ARRAY[25,25],          -- Number of units per layer
+    'learning_rate_init=0.001,
+    n_iterations=500,
+    lambda=0.001,
     tolerance=0',
-    'relu');
+    'relu',
+    NULL,             -- Default weight (1)
+    FALSE,            -- No warm start
+    TRUE              -- Verbose
+);
 </pre>
 -# Check the results of the model.
 <pre class="example">
--- Set extended display on for easier reading of output
+-- Set extended display on for easier reading of output.
 \\x ON
--- Neural net Initialization is non-deterministic, so your results may vary
+-- Results may vary depending on platform.
 SELECT * FROM mlp_regress;
 </pre>
 Result:
 <pre class="result">
--[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 -----------------------------------
-coeff          | {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.79506311399e-05,3.56715008915e-05,-6.09333559685e-05,0.000251228318768,-0.000224772841379,-3.71863030857e-05,-3.5757865148e-06,5.27936784854e-05,-2.48474166186e-05,6.19731184294e-05,3.07638968743e-05,6.8964698578e-06,0.000106016701083,-1.71484730318e-05,1.18691881812e-05,-0.000163975464208,0.000170026304906,3.11688265279e-05,0.000177050148787,-1.58265976603e-05,2.70144422657e-05,0.000112667883422,3.77575139073e-05,8.12474658795e-05,-7.90458917626e-05,0.000107566386158,-2.63771171506e-06,2.47996880915e-05,-0.00012642310887,0.000203827391081,0.000139315565565,4.86147243454e-05,-0.000176126471913,-6.47820782916e-05,-8.51592776447e-06,-6.60601176758e-05,2.91421874156e-05,6.3556873752e-05,0.000197557443129,0.000220531367259,0.000135036310289,0.000143735913975,-4.75034117786e-05,-0.000179547345838,-1.6919846786e-05,0.000162784312994,0.000268595819851,-0.000460066553287,8.69756071591e-05,-0.00311762727057,0.000126024763103,0.000205988242921
 ,0.003463432426,-0.00729789075286,0.00151625867549,-0.000890852767597,-0.00525016037249,0.0031043106659,0.00798041103839,-0.00552693050079,0.0232180415786,0.0230489850143,-0.0437890272341,0.0165765426407,-0.248554261758,-7.81336427846e-05,0.00558145591752,0.283465844585,-0.571699956182,0.133474351994,-0.0785181945605,-0.419269930709,0.249547772912,0.631761009875,-0.431305975666,1,1,1,1,1,1,0.0158747497572,-9.02809160806e-05,0.00015574347618,4.10805373863e-06,0.00121532434965,0.101790351335,0.0647558401493,-0.00013654998677,-9.92872075948e-06,-5.5319694394e-05,0.00519320756484,0.412736586036,0.0011998026977,-1.53688189815e-05,1.94817888201e-05,-4.63111489966e-05,7.24547899029e-05,0.00880394144485,5.45309822095e-05,-0.000140943219275,-7.96211486227e-05,-1.04337307472e-05,0.000161936762028,0.00136273797767,-4.54737243585e-05,-3.4083840736e-05,3.69286883662e-05,9.9047243188e-08,3.75014011824e-06,-9.45366086368e-08,1,1,1,1,1,1,6.67488547054,0.102754199001,0.41668912471,0.00886867296479,0
 .00136206007228,-9.88642499013e-05}
-loss           | 144.965776158
-num_iterations | 10000
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------
+coeff          | {-0.135647108464,0.0315402969485,-0.117580589352,-0.23084537701,-0.10868726702...
+loss           | 0.114125125042
+num_iterations | 500
 </pre>
 -# Now let's look at the prediction functions. In the following examples we will
 use the training data set for prediction as well, which is not usual but serves to
@@ -458,8 +542,6 @@ SELECT madlib.mlp_predict(
          'mlp_prediction',    -- Output table for predictions
          'response'           -- Output classes, not probabilities
      );
--# View results
-<pre class="example">
 SELECT * FROM mlp_prediction JOIN iris_data USING (id);
 </pre>
 Result for the classification model:
@@ -487,7 +569,7 @@ Result for the classification model:
  19 | Iris-versicolor      | {6.6,2.9,4.6,1.3} | Iris-versicolor |     2
  20 | Iris-versicolor      | {5.2,2.7,3.9,1.4} | Iris-versicolor |     2
 </pre>
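+-# To get class probabilities instead of the predicted label, use pred_type='prob'.
+This is a hedged sketch; it assumes the same mlp_predict argument order as the call
+above, and the prob_* column names depend on the classes in the training data.
+<pre class="example">
+DROP TABLE IF EXISTS mlp_prob_prediction;
+SELECT madlib.mlp_predict(
+         'mlp_model',           -- Model table
+         'iris_data',           -- Test data table
+         'id',                  -- Id column in test table
+         'mlp_prob_prediction', -- Output table for predictions
+         'prob'                 -- Output probabilities, not classes
+     );
+SELECT * FROM mlp_prob_prediction JOIN iris_data USING (id);
+</pre>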
-Prediction using the regression model:
+-# Prediction using the regression model:
 <pre class="example">
 DROP TABLE IF EXISTS mlp_regress_prediction;
 SELECT madlib.mlp_predict(
@@ -498,34 +580,35 @@ SELECT madlib.mlp_predict(
          'response'                   -- Output values, not probabilities
      );
 </pre>
--# View results
+View results:
 <pre class="example">
 SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (id);
 </pre>
 Result for the regression model:
 <pre class="result">
- id |                                    x                                    | grp_by_col |  y   |    estimated_y
-----+-------------------------------------------------------------------------+------------+------+--------------------
- 1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98}       |          1 |   24 | {23.2627062018087}
- 2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14}      |          1 | 21.6 | {25.7088419115781}
- 3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03}     |          1 | 34.7 | {27.5587003901404}
- 4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94}     |          1 | 33.4 | {31.1812237427816}
- 5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33}      |          1 | 36.2 | {30.3696873085477}
- 6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21}      |          1 | 28.7 | {29.5290259241882}
- 7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43}  |          1 | 22.9 | {21.1576051716888}
- 8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15}  |          1 | 27.1 | {17.6194200563055}
- 9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93}  |          1 | 16.5 | {15.1366297774139}
-10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1}  |          1 | 18.9 | {17.6528662199369}
-11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} |          1 |   15 | {17.2017487668181}
-12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27}  |          1 | 18.9 | {19.4893860319992}
-13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71}    |          1 | 21.7 | {23.2917226708039}
-14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26}        |          1 | 20.4 | {22.8904812605193}
-15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26}      |          1 | 18.2 | {18.2386754423677}
-16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47}       |          1 | 19.9 | {23.28949550874}
-17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58}       |          1 | 23.1 | {25.3288762085473}
-18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67}        |          1 | 17.5 | {19.0203738118451}
-19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69}      |          1 | 20.2 | {12.3162005347545}
-20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28}       |          1 | 18.2 | {21.0902211848747}
+ id |                                    x                                    | grp_by_col |  y   |   estimated_y
+----+-------------------------------------------------------------------------+------------+------+------------------
+  1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98}       |          1 |   24 |  23.973628645041
+  2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14}      |          1 | 21.6 | 21.6389086856109
+  3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03}     |          1 | 34.7 | 34.6766441639675
+  4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94}     |          1 | 33.4 | 33.4521871118756
+  5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33}      |          1 | 36.2 | 36.2899491706428
+  6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21}      |          1 | 28.7 | 28.6994076427827
+  7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43}  |          1 | 22.9 | 22.4882117113923
+  8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15}  |          1 | 27.1 | 26.5148927040405
+  9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93}  |          1 | 16.5 | 16.0669778867327
+ 10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1}  |          1 | 18.9 | 17.4237448788601
+ 11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} |          1 |   15 | 14.5944028616784
+ 12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27}  |          1 | 18.9 | 19.6071061560237
+ 13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71}    |          1 | 21.7 | 21.7585638578804
+ 14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26}        |          1 | 20.4 | 20.2832271533629
+ 15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26}      |          1 | 18.2 | 18.3440540662206
+ 16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47}       |          1 | 19.9 | 20.0246074554594
+ 17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58}       |          1 | 23.1 | 23.1458505146148
+ 18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67}        |          1 | 17.5 | 17.4602306566804
+ 19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69}      |          1 | 20.2 | 20.1785296856357
+ 20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28}       |          1 | 18.2 | 18.1810300625137
+(20 rows)
 </pre>
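+-# As a quick sanity check (a hedged sketch, assuming estimated_y is returned as a
+scalar DOUBLE PRECISION, as shown above), the mean squared error of the regression
+predictions can be computed directly:
+<pre class="example">
+SELECT avg((y - estimated_y)^2) AS mean_squared_error
+FROM lin_housing JOIN mlp_regress_prediction USING (id);
+</pre>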
 Note that the results you get for all examples may vary with the platform you are using.
 
@@ -561,6 +644,10 @@ File mlp.sql_in documenting the training function
 
 */
 
+CREATE TYPE MADLIB_SCHEMA.mlp_step_result AS (
+        state    DOUBLE PRECISION[],
+        loss     DOUBLE PRECISION
+);
 
 CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
         coeff    DOUBLE PRECISION[],
@@ -571,14 +658,22 @@ CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
 -- create SQL functions for IGD optimizer
 --------------------------------------------------------------------------
 CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_transition(
-        state           DOUBLE PRECISION[],
-        start_vec       DOUBLE PRECISION[],
-        end_vec         DOUBLE PRECISION[],
-        previous_state  DOUBLE PRECISION[],
-        layer_sizes     DOUBLE PRECISION[],
-        stepsize        DOUBLE PRECISION,
-        activation      INTEGER,
-        is_classification INTEGER)
+        state              DOUBLE PRECISION[],
+        ind_var            DOUBLE PRECISION[],
+        dep_var            DOUBLE PRECISION[],
+        previous_state     DOUBLE PRECISION[],
+        layer_sizes        DOUBLE PRECISION[],
+        learning_rate_init DOUBLE PRECISION,
+        activation         INTEGER,
+        is_classification  INTEGER,
+        weight             DOUBLE PRECISION,
+        warm_start         BOOLEAN,
+        warm_start_coeff   DOUBLE PRECISION[],
+        n_tuples           INTEGER,
+        lambda             DOUBLE PRECISION,
+        x_means            DOUBLE PRECISION[],
+        x_stds             DOUBLE PRECISION[]
+    )
 RETURNS DOUBLE PRECISION[]
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE;
@@ -592,7 +687,7 @@ LANGUAGE C IMMUTABLE STRICT;
 
 CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_final(
         state DOUBLE PRECISION[])
-RETURNS DOUBLE PRECISION[]
+RETURNS MADLIB_SCHEMA.mlp_step_result
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;
 
@@ -601,16 +696,24 @@ LANGUAGE C IMMUTABLE STRICT;
  * @brief Perform one iteration of backprop
  */
 CREATE AGGREGATE MADLIB_SCHEMA.mlp_igd_step(
-        /* start_vec*/        DOUBLE PRECISION[],
-        /* end_vec */         DOUBLE PRECISION[],
-        /* previous_state */  DOUBLE PRECISION[],
-        /* layer_sizes */     DOUBLE PRECISION[],
-        /* stepsize */        DOUBLE PRECISION,
-        /* activation */      INTEGER,
-        /* is_classification */ INTEGER )(
+        /* ind_var */             DOUBLE PRECISION[],
+        /* dep_var */             DOUBLE PRECISION[],
+        /* previous_state */      DOUBLE PRECISION[],
+        /* layer_sizes */         DOUBLE PRECISION[],
+        /* learning_rate_init */  DOUBLE PRECISION,
+        /* activation */          INTEGER,
+        /* is_classification */   INTEGER,
+        /* weight */              DOUBLE PRECISION,
+        /* warm_start */          BOOLEAN,
+        /* warm_start_coeff */    DOUBLE PRECISION[],
+        /* n_tuples */            INTEGER,
+        /* lambda */              DOUBLE PRECISION,
+        /* x_means */             DOUBLE PRECISION[],
+        /* x_stds */              DOUBLE PRECISION[]
+        )(
     STYPE=DOUBLE PRECISION[],
     SFUNC=MADLIB_SCHEMA.mlp_igd_transition,
-    m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
+    m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
     FINALFUNC=MADLIB_SCHEMA.mlp_igd_final,
     INITCOND='{0,0,0,0,0,0,0,0}'
 );
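+-- The aggregate above follows the usual transition/merge/final pattern:
+-- mlp_igd_transition folds one row into the IGD state, mlp_igd_merge combines
+-- states across segments (omitted on plain PostgreSQL), and mlp_igd_final
+-- unpacks the state into an mlp_step_result (state array plus loss).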
@@ -631,13 +734,16 @@ LANGUAGE c IMMUTABLE STRICT;
 -------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
-    source_table      VARCHAR,
-    output_table      VARCHAR,
-    independent_varname    VARCHAR,
-    dependent_varname   VARCHAR,
-    hidden_layer_sizes         INTEGER[],
-    optimizer_params   VARCHAR,
-    activation      VARCHAR
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR,
+    activation           VARCHAR,
+    weights              VARCHAR,
+    warm_start           BOOLEAN,
+    verbose              BOOLEAN
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`convex', `mlp_igd')
     mlp_igd.mlp(
@@ -649,19 +755,96 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
         hidden_layer_sizes,
         optimizer_params,
         activation,
-        True
+        True,
+        weights,
+        warm_start,
+        verbose
     )
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR,
+    activation           VARCHAR,
+    weights              VARCHAR,
+    warm_start           BOOLEAN
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR,
+    activation           VARCHAR,
+    weights              VARCHAR
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR,
+    activation           VARCHAR
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, NULL, NULL, NULL, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[]
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, NULL, NULL, NULL, FALSE, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
-    source_table      VARCHAR,
-    output_table      VARCHAR,
-    independent_varname    VARCHAR,
-    dependent_varname   VARCHAR,
-    hidden_layer_sizes         INTEGER[],
-    optimizer_params   VARCHAR,
-    activation      VARCHAR
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR,
+    activation           VARCHAR,
+    weights              VARCHAR,
+    warm_start           BOOLEAN,
+    verbose              BOOLEAN
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`convex', `mlp_igd')
     mlp_igd.mlp(
@@ -673,11 +856,83 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
         hidden_layer_sizes,
         optimizer_params,
         activation,
-        False
+        False,
+        weights,
+        warm_start,
+        verbose
     )
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR,
+    activation           VARCHAR,
+    weights              VARCHAR,
+    warm_start           BOOLEAN
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR,
+    activation           VARCHAR,
+    weights              VARCHAR
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR,
+    activation           VARCHAR
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[],
+    optimizer_params     VARCHAR
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, NULL, NULL, NULL, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+    source_table         VARCHAR,
+    output_table         VARCHAR,
+    independent_varname  VARCHAR,
+    dependent_varname    VARCHAR,
+    hidden_layer_sizes   INTEGER[]
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, NULL, NULL, NULL, FALSE, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
     model_table      VARCHAR,
     data_table      VARCHAR,
@@ -700,9 +955,11 @@ CREATE FUNCTION MADLIB_SCHEMA.internal_predict_mlp(
         coeff DOUBLE PRECISION[],
         independent_varname DOUBLE PRECISION[],
         is_classification DOUBLE PRECISION,
-        activation_function DOUBLE PRECISION,
+        activation DOUBLE PRECISION,
         layer_sizes DOUBLE PRECISION[],
-        is_response INTEGER
+        is_response INTEGER,
+        x_means DOUBLE PRECISION[],
+        x_stds DOUBLE PRECISION[]
     )
 RETURNS DOUBLE PRECISION[]
 AS 'MODULE_PATHNAME'