Posted to commits@madlib.apache.org by ri...@apache.org on 2017/08/14 17:37:21 UTC
[2/2] incubator-madlib git commit: MLP: Add multiple enhancements
MLP: Add multiple enhancements
JIRA: MADLIB-1134
This commit adds the following:
- Weights: Each tuple in the training data can be individually weighted.
- Warm start: Network weights can be initialized from the output of a
previous call.
- n_tries: Allows calling the train function multiple times to avoid
local minima.
- Learning rate policy: Allows the user to specify a policy to decay the
learning rate.
- Standardization: Inputs are standardized to zero mean and unit standard
deviation.
Closes #162
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ff1b0f88
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ff1b0f88
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ff1b0f88
Branch: refs/heads/master
Commit: ff1b0f883c7a178323670b83b14069e06bf1b808
Parents: 6f6f804
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Aug 14 09:50:25 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Aug 14 09:50:25 2017 -0700
----------------------------------------------------------------------
.gitignore | 1 +
doc/design/modules/neural-network.tex | 144 ++-
doc/literature.bib | 8 +-
doc/mainpage.dox.in | 3 +-
src/modules/convex/mlp_igd.cpp | 74 +-
src/modules/convex/task/l2.hpp | 3 +-
src/modules/convex/task/mlp.hpp | 259 ++----
src/modules/convex/type/model.hpp | 70 +-
src/modules/convex/type/state.hpp | 30 +-
src/modules/convex/type/tuple.hpp | 2 +-
src/ports/postgres/modules/convex/mlp.sql_in | 497 +++++++---
src/ports/postgres/modules/convex/mlp_igd.py_in | 923 ++++++++++++-------
.../postgres/modules/convex/test/mlp.sql_in | 94 +-
.../postgres/modules/utilities/utilities.py_in | 12 +
14 files changed, 1297 insertions(+), 823 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index abfccfa..00dc016 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
# Ignore build directory
/build*
+/build-docker*
# Ignore generated code files
*.so
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/design/modules/neural-network.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/neural-network.tex b/doc/design/modules/neural-network.tex
index 8802361..9f8110b 100644
--- a/doc/design/modules/neural-network.tex
+++ b/doc/design/modules/neural-network.tex
@@ -22,7 +22,7 @@
\chapter{Neural Network}
\begin{moduleinfo}
-\item[Authors] {Xixuan Feng}
+\item[Authors] {Xixuan Feng, Cooper Sloan}
\end{moduleinfo}
% Abstract. What is the problem we want to solve?
@@ -30,7 +30,8 @@ This module implements artificial neural network \cite{ann_wiki}.
\section{Multilayer Perceptron}
Multilayer perceptron is arguably the most popular model among many neural network models \cite{mlp_wiki}.
-Here, we learn the coefficients by minimizing a least square objective function (\cite{bertsekas1999nonlinear}, example 1.5.3).
+Here, we learn the coefficients by minimizing a least square objective function, or cross entropy (\cite{bertsekas1999nonlinear}, example 1.5.3).
+The parallel architecture is based on the paper by Zhiheng Huang \cite{mlp_parallel}.
% Background. Why can we solve the problem with gradient-based methods?
\subsection{Solving as a Convex Program}
@@ -46,41 +47,47 @@ For multilayer perceptron, we choose incremental gradient descent (IGD).
In the remaining part of this section, we will give a formal description of the derivation of objective function and its gradient.
\paragraph{Objective function.}
-We mostly follow the notations in example 1.5.3 from Bertsekas \cite{bertsekas1999nonlinear}, for a multilayer perceptron that has $N$ layers (stages), and the $k$th stage has $n_k$ activation units ($\phi : \mathbb{R} \to \mathbb{R}$), the objective function is given as
-\[f_{(y, z)}(u) = \frac{1}{2} \|h(u, y) - z\|_2^2,\]
-where $y \in \mathbb{R}^{n_0}$ is the input vector, $z \in \mathbb{R}^{n_N}$ is the output vector,
-\footnote{Of course, the objective function can be defined over a set of input-output vector pairs, which is simply given as the addition of the above $f$.}
+We mostly follow the notation in example 1.5.3 from Bertsekas \cite{bertsekas1999nonlinear}. For a multilayer perceptron that has $N$ layers (stages), where the $k^{th}$ stage has $n_k$ activation units ($\phi : \mathbb{R} \to \mathbb{R}$), the objective function for regression is given as
+\[f_{(x, y)}(u) = \frac{1}{2} \|h(u, x) - y\|_2^2,\]
+and for classification the objective function is given as
+\[f_{(x, y)}(u) = -\sum_i \left( y_i \log(h_i(u, x)) + (1 - y_i) \log(1 - h_i(u, x)) \right),\]
+where $x \in \mathbb{R}^{n_0}$ is the input vector, $y \in \mathbb{R}^{n_N}$ is the output vector (one hot encoded for classification),~\footnote{Of course, the objective function can be defined over a set of input-output vector pairs, which is simply given as the addition of the above $f$.}
and the coefficients are given as
-\[u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}\]
+\[u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\},\]
+and are initialized from a uniform distribution as follows:
+\[u_{k}^{sj} = \mathit{uniform}(-r, r),\]
+where $r$ is defined as
+\[r = \sqrt{\frac{6}{n_k+n_{k+1}}}.\]
+With regularization, an additional term enters the objective function, given as
+\[\sum_{u_k^{sj}} \frac{1}{2} \lambda \left(u_k^{sj}\right)^2.\]
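As a rough illustration of the two objective functions and the uniform weight initialization above, here is a minimal NumPy sketch; the function and variable names are illustrative and not taken from the MADlib sources:

    import numpy as np

    def regression_loss(h_ux, y):
        # f_{(x,y)}(u) = 1/2 * ||h(u,x) - y||_2^2
        return 0.5 * np.sum((h_ux - y) ** 2)

    def classification_loss(h_ux, y):
        # Cross entropy against a one-hot encoded target y.
        return -np.sum(y * np.log(h_ux) + (1 - y) * np.log(1 - h_ux))

    def init_layer(n_in, n_out):
        # u_k^{sj} ~ uniform(-r, r) with r = sqrt(6 / (n_k + n_{k+1})).
        # One extra row holds the coefficients of the bias unit o_k^0 = 1.
        r = np.sqrt(6.0 / (n_in + n_out))
        return np.random.uniform(-r, r, size=(n_in + 1, n_out))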
This still leaves $h : \mathbb{R}^{n_0} \to \mathbb{R}^{n_N}$ as an open item.
-Let $x_k \in \mathbb{R}^{n_k}, k = 1,...,N$ be the output vector of the $k$th layer. Then we define $h(u, y) = x_N$, based on setting $x_0 = y$ and the $j$th component of $x_k$ is given in an iterative fashion as
-\footnote{$x_k^0 \equiv 1$ is used to simplified the notations, and $x_k^0$ is not a component of $x_k$, for any $k = 0,...,N$.}
+Let $o_k \in \mathbb{R}^{n_k}, k = 1,...,N$ be the output vector of the $k^{th}$ layer. Then we define $h(u, x) = o_N$, based on setting $o_0 = x$, and the $j^{th}$ component of $o_k$ is given in an iterative fashion as~\footnote{$o_k^0 \equiv 1$ is used to simplify the notation, and $o_k^0$ is not a component of $o_k$, for any $k = 0,...,N$.}
\[\begin{alignedat}{5}
- x_k^j = \phi \left( \sum_{s=0}^{n_{k-1}} x_{k-1}^s u_{k-1}^{sj} \right), &\quad k = 1,...,N, \; j = 1,...,n_k
+ o_k^j = \phi \left( \sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj} \right), &\quad k = 1,...,N, \; j = 1,...,n_k
\end{alignedat}\]
\paragraph{Gradient of the End Layer.}
Let's first handle $u_{N-1}^{st}, s = 0,...,n_{N-1}, t = 1,...,n_N$.
-Let $z^t$ denote the $t$th component of $z \in \mathbb{R}^{n_N}$, and $h^t$ the $t$th component of output of $h$.
+Let $y^t$ denote the $t^{th}$ component of $y \in \mathbb{R}^{n_N}$, and $h^t$ the $t^{th}$ component of output of $h$.
\[\begin{aligned}
\frac{\partial f}{\partial u_{N-1}^{st}}
- &= \left( h^t(u, y) - z^t \right) \cdot \frac{\partial h^t(u, y)}{\partial u_{N-1}^{st}} \\
- &= \left( x_N^t - z^t \right) \cdot \frac{\partial x_N^t}{\partial u_{N-1}^{st}} \\
- &= \left( x_N^t - z^t \right) \cdot \frac{\partial \phi \left( \sum_{s=0}^{n_{N-1}} x_{N-1}^s u_{N-1}^{st} \right)}{\partial u_{N-1}^{st}} \\
- &= \left( x_N^t - z^t \right) \cdot \phi' \left( \sum_{s=0}^{n_{N-1}} x_{N-1}^s u_{N-1}^{st} \right) \cdot x_{N-1}^s \\
+ &= \left( h^t(u, x) - y^t \right) \cdot \frac{\partial h^t(u, x)}{\partial u_{N-1}^{st}} \\
+ &= \left( o_N^t - y^t \right) \cdot \frac{\partial o_N^t}{\partial u_{N-1}^{st}} \\
+ &= \left( o_N^t - y^t \right) \cdot \frac{\partial \phi \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right)}{\partial u_{N-1}^{st}} \\
+ &= \left( o_N^t - y^t \right) \cdot \phi' \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right) \cdot o_{N-1}^s \\
\end{aligned}\]
-To ease the notation, let the input vector of the $j$th activation unit of the $(k+1)$th layer be
-\[\mathit{net}_k^j =\sum_{s=0}^{n_{k-1}} x_{k-1}^s u_{k-1}^{sj},\]
-where $k = 1,...,N, \; j = 1,...,n_k$, and note that $x_k^j =\phi(\mathit{net}_k^j)$. Finally, the gradient
-\[\frac{\partial f}{\partial u_{N-1}^{st}} = \left( x_N^t - z^t \right) \cdot \phi' ( \mathit{net}_N^t ) \cdot x_{N-1}^s\]
-For any $s = 0,...,n_{N-1}, t =1,...,n_N$, we are given $z^t$, and $x_N^t, \mathit{net}_N^t, x_{N-1}^s$ can be computed by forward iterating the network layer by layer (also called the feed-forward pass). Therefore, we now know how to compute the coefficients for the end layer $u_{N-1}^{st}, s = 0,...,n_{N-1}, t =1,...,n_N$.
+To ease the notation, let the input vector of the $j^{th}$ activation unit of the $(k+1)^{th}$ layer be
+\[\mathit{net}_k^j =\sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj},\]
+where $k = 1,...,N, \; j = 1,...,n_k$, and note that $o_k^j =\phi(\mathit{net}_k^j)$. Finally, the gradient
+\[\frac{\partial f}{\partial u_{N-1}^{st}} = \left( o_N^t - y^t \right) \cdot \phi' ( \mathit{net}_N^t ) \cdot o_{N-1}^s\]
+For any $s = 0,...,n_{N-1}, t =1,...,n_N$, we are given $y^t$, and $o_N^t, \mathit{net}_N^t, o_{N-1}^s$ can be computed by forward iterating the network layer by layer (also called the feed-forward pass). Therefore, we now know how to compute the coefficients for the end layer $u_{N-1}^{st}, s = 0,...,n_{N-1}, t =1,...,n_N$.
\subsubsection{Backpropagation}
For inner (hidden) layers, it is more difficult to compute the partial derivative over the input of activation units (i.e., $\mathit{net}_k, k = 1,...,N-1$).
-That said, $\frac{\partial f}{\partial \mathit{net}_N^t} = (x_N^t - z^t) \phi'(\mathit{net}_N^t)$ is easy, where $t = 1,...,n_N$, but $\frac{\partial f}{\partial \mathit{net}_k^j}$ is hard, where $k = 1,...,N-1, j = 1,..,n_k$.
+That said, $\frac{\partial f}{\partial \mathit{net}_N^t} = (o_N^t - y^t) \phi'(\mathit{net}_N^t)$ is easy, where $t = 1,...,n_N$, but $\frac{\partial f}{\partial \mathit{net}_k^j}$ is hard, where $k = 1,...,N-1, j = 1,..,n_k$.
This hard-to-compute statistic is referred to as \textit{delta error}, and let $\delta_k^j = \frac{\partial f}{\partial \mathit{net}_k^j}$, where $k = 1,...,N-1, j = 1,..,n_k$.
If this is solved, the gradient can be easily computed as follow
-\[\frac{\partial f}{\partial u_{k-1}^{sj}} = \boxed{\frac{\partial f}{\partial \mathit{net}_k^j}} \cdot \frac{\partial \mathit{net}_k^j}{\partial u_{k-1}^{sj}} = \boxed{\delta_k^j} x_{k-1}^s,\]
+\[\frac{\partial f}{\partial u_{k-1}^{sj}} = \boxed{\frac{\partial f}{\partial \mathit{net}_k^j}} \cdot \frac{\partial \mathit{net}_k^j}{\partial u_{k-1}^{sj}} = \boxed{\delta_k^j} o_{k-1}^s,\]
where $k = 1,...,N-1, s = 0,...,n_{k-1}, j = 1,..,n_k$.
To solve this, we introduce the popular backpropagation below.
@@ -90,20 +97,20 @@ First,
\[
\delta_{k}^j
= \frac{\partial f}{\partial \mathit{net}_{k}^j}
- = \frac{\partial f}{\partial x_{k}^j} \cdot \frac{\partial x_{k}^j}{\partial \mathit{net}_{k}^j}
- = \frac{\partial f}{\partial x_{k}^j} \cdot \phi'(\mathit{net}_{k}^j)
+ = \frac{\partial f}{\partial o_{k}^j} \cdot \frac{\partial o_{k}^j}{\partial \mathit{net}_{k}^j}
+ = \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j)
\]
And here comes the only equation that is needed but the author, I (Aaron), do not understand but it looks reasonable and repeats in different online notes \cite{mlp_gradient_wisc},
\[\begin{alignedat}{5}
- \frac{\partial f}{\partial x_{k}^j} = \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial x_{k}^j} \right),
+ \frac{\partial f}{\partial o_{k}^j} = \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right),
&\quad k = 1,...,N-1, \: j = 1,...,n_{k}
\end{alignedat}\]
Assuming the above equation is true, we can solve delta error backward iteratively
\[\begin{aligned}
\delta_{k}^j
- &= \frac{\partial f}{\partial x_{k}^j} \cdot \phi'(\mathit{net}_{k}^j) \\
- &= \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial x_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
- &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot \frac{\partial \left( \sum_{s=0}^{n_{k}} x_{k}^s u_{k}^{st} \right) }{\partial x_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
+ &= \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j) \\
+ &= \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
+ &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot \frac{\partial \left( \sum_{s=0}^{n_{k}} o_{k}^s u_{k}^{st} \right) }{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
&= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot u_{k}^{jt} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
\end{aligned}\]
To sum up, we need the following equation for error back propagation
@@ -111,20 +118,20 @@ To sum up, we need the following equation for error back propagation
where $k = 1,...,N-1$, and $j = 1,...,n_{k}$.
\subsubsection{The $\mathit{Gradient}$ Function}
-\begin{algorithm}[mlp-gradient$(u, y, z)$] \label{alg:mlp-gradient}
+\begin{algorithm}[mlp-gradient$(u, x, y)$] \label{alg:mlp-gradient}
\alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
-start vector $y \in \mathbb{R}^{n_0}$,\\
-end vector $z \in \mathbb{R}^{n_N}$,\\
+start vector $x \in \mathbb{R}^{n_0}$,\\
+end vector $y \in \mathbb{R}^{n_N}$,\\
activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
\algoutput{Gradient value $\nabla f(u)$ that consists of components $\nabla f(u)_{k-1}^{sj} = \frac{\partial f}{\partial u_{k-1}^{sj}}$}
\begin{algorithmic}[1]
- \State $(\mathit{net}, x) \set$ \texttt{feed-forward}$(u, y, \phi)$
- \State $\delta_N \set$ \texttt{end-layer-delta-error}$(\mathit{net}, x, z, \phi')$
+ \State $(\mathit{net}, o) \set$ \texttt{feed-forward}$(u, x, \phi)$
+ \State $\delta_N \set$ \texttt{end-layer-delta-error}$(\mathit{net}, o, y, \phi')$
\State $\delta \set$ \texttt{error-back-propagation}$(\delta_N, \mathit{net}, u, \phi')$
\For{$k = 1,...,N$}
\For{$s = 0,...,n_{k-1}$}
\For{$j = 1,...,n_k$}
- \State $\nabla f(u)_{k-1}^{sj} \set \delta_k^j x_{k-1}^s$
+ \State $\nabla f(u)_{k-1}^{sj} \set \delta_k^j o_{k-1}^s$
\Comment{Can be put together with the computation of delta $\delta$}
\EndFor
\EndFor
@@ -138,46 +145,47 @@ Common examples of activation units are
\[\begin{alignedat}{3}
\phi(\xi) &= \frac{1}{1 + e^{-\xi}}, &\quad \text{ (logistic function),}\\
\phi(\xi) &= \frac{e^{\xi} - e^{-\xi}}{e^{\xi} + e^{-\xi}}, &\quad \text{ (hyperbolic tangent function)}\\
+\phi(\xi) &= \max(\xi, 0), &\quad \text{ (rectified linear function)}\\
\end{alignedat}\]
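For reference, the three activation units and the derivatives used later during backpropagation can be written as a small Python sketch (illustrative only; the module implements these in C++):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    # (activation, derivative) pairs, keyed by the names used in the module
    ACTIVATIONS = {
        'sigmoid': (sigmoid, lambda z: sigmoid(z) * (1.0 - sigmoid(z))),
        'tanh':    (np.tanh, lambda z: 1.0 - np.tanh(z) ** 2),
        'relu':    (lambda z: np.maximum(z, 0.0), lambda z: (z > 0).astype(float)),
    }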
-\begin{algorithm}[feed-forward$(u, y, \phi)$] \label{alg:feed-forward}
+\begin{algorithm}[feed-forward$(u, x, \phi)$] \label{alg:feed-forward}
\alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
-input vector $y \in \mathbb{R}^{n_0}$,\\
+input vector $x \in \mathbb{R}^{n_0}$,\\
activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
\algoutput{Input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
-output vectors $x = \{x_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$}
+output vectors $o = \{o_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$}
\begin{algorithmic}[1]
\For{$k = 0,...,N$}
- \State $x_k^0 \set 1$
+ \State $o_k^0 \set 1$
\EndFor
- \State $x_0 \set y$ \Comment{For all components $x_0^j, y^j, \; j = 1,...,n_0$}
+ \State $o_0 \set x$ \Comment{For all components $o_0^j, x^j, \; j = 1,...,n_0$}
\For{$k = 1,...,N$}
\For{$j = 1,...,n_k$}
\State $\mathit{net}_k^j \set 0$
\For{$s = 0,...,n_{k-1}$}
- \State $\mathit{net}_k^j \set \mathit{net}_k^j + x_{k-1}^s u_{k-1}^{sj}$
+ \State $\mathit{net}_k^j \set \mathit{net}_k^j + o_{k-1}^s u_{k-1}^{sj}$
\EndFor
- \State $x_k^j = \phi(\mathit{net}_k^j)$
+ \State $o_k^j = \phi(\mathit{net}_k^j)$ \Comment{The activation function for the final layer is the identity for regression and softmax for classification.}
\EndFor
\EndFor
- \State \Return $(\mathit{net}, x)$
+ \State \Return $(\mathit{net}, o)$
\end{algorithmic}
\end{algorithm}
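A compact NumPy version of the feed-forward pass above, assuming layers[k] has shape (n_k + 1, n_{k+1}) with row 0 holding the bias coefficients (a sketch under those assumptions, not the module's implementation):

    import numpy as np

    def feed_forward(layers, x, phi, is_classification):
        o = [np.concatenate(([1.0], x))]    # o_0 with the bias unit o_0^0 = 1
        net = [None]                        # net is 1-indexed by layer
        for k, u in enumerate(layers):
            net_k = u.T.dot(o[-1])
            net.append(net_k)
            if k < len(layers) - 1:
                o.append(np.concatenate(([1.0], phi(net_k))))
            elif is_classification:
                e = np.exp(net_k - net_k.max())   # numerically stable softmax
                o.append(e / e.sum())
            else:
                o.append(net_k)                   # identity output for regression
        return net, o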
-\begin{algorithm}[end-layer-delta-error$(\mathit{net}, x, z, \phi')$] \label{alg:end-layer-delta-error}
+\clearpage
+\begin{algorithm}[end-layer-delta-error$(\mathit{net}, o, y, \phi')$] \label{alg:end-layer-delta-error}
\alginput{Input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
-output vectors $x = \{x_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$,\\
-end vector $z \in \mathbb{R}^{n_N}$,\\
+output vectors $o = \{o_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$,\\
+end vector $y \in \mathbb{R}^{n_N}$,\\
derivative of activation unit $\phi' : \mathbb{R} \to \mathbb{R}$}
\algoutput{End layer delta $\delta_N = \{\delta_N^t \; | \; t = 1,...,n_N\}$}
\begin{algorithmic}[1]
\For{$t = 1,...,n_N$}
- \State $\delta_N^t \set (x_N^t - z^t) \phi'(\mathit{net}_N^t)$
+ \State $\delta_N^t \set (o_N^t - y^t)$ \Comment{This holds for the identity activation with mean square error loss, and for the softmax activation with cross entropy loss.}
\EndFor
\State \Return $\delta_N$
\end{algorithmic}
\end{algorithm}
-
\begin{algorithm}[error-back-propagation$(\delta_N, \mathit{net}, u, \phi')$] \label{alg:error-back-propagation}
\alginput{End layer delta $\delta_N = \{\delta_N^t \; | \; t = 1,...,n_N\}$,\\
input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
@@ -197,3 +205,45 @@ derivative of activation unit $\phi' : \mathbb{R} \to \mathbb{R}$}
\State \Return $\delta$
\end{algorithmic}
\end{algorithm}
+
+\begin{algorithm}[mlp-train-iteration$(X, Y, \eta)$] \label{alg:mlp-train-iteration}
+\alginput{
+start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
+end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
+learning rate $\eta$,\\}
+\algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
+\begin{algorithmic}[1]
+ \State \texttt{Randomly initialize u}
+ \For{$i = 1,...,m$}
+ \State $\nabla f(u) \set \texttt{mlp-gradient}(u,X_i,Y_i)$
+ \State $u \set u - \eta \left( \nabla f(u) + \lambda u \right)$
+ \EndFor
+ \State \Return $u$
+\end{algorithmic}
+\end{algorithm}
+
+\clearpage
+\begin{algorithm}[mlp-train-parallel$(X, Y, \eta, s, t)$] \label{alg:mlp-train-parallel}
+\alginput{
+start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
+end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
+learning rate $\eta$,\\
+segments $s$,\\
+iterations $t$,\\}
+\algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
+\begin{algorithmic}[1]
+ \State \texttt{Randomly initialize u}
+ \For{$j = 1,...,s$}
+ \State $X_j \set \texttt{subset-of-X}$
+ \State $Y_j \set \texttt{subset-of-Y}$
+ \EndFor
+ \For{$i = 1,...,t$}
+ \For{$j = 1,...,s$}
+ \State $u_j \set copy(u)$
+ \State $u_j \set \texttt{mlp-train-iteration}(X_j, Y_j, \eta)$
+ \EndFor
+ \State $u \set \texttt{weighted-avg}(u_{1...s})$
+ \EndFor
+ \State \Return $u$
+\end{algorithmic}
+\end{algorithm}
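The two training algorithms above can be sketched in Python as follows; gradient() stands in for mlp-gradient, and the weighted average is reduced to a plain mean here for brevity (both are illustrative assumptions):

    import numpy as np

    def mlp_train_iteration(layers, X, Y, eta, lam, gradient):
        # One IGD pass over (X, Y); the L2 term is not applied to the bias row.
        for x, y in zip(X, Y):
            grads = gradient(layers, x, y)
            for k in range(len(layers)):
                reg = lam * layers[k]
                reg[0, :] = 0.0
                layers[k] -= eta * (grads[k] + reg)
        return layers

    def mlp_train_parallel(layers, X_parts, Y_parts, eta, lam, iterations, gradient):
        # Each segment trains on its own shard; models are averaged every iteration.
        for _ in range(iterations):
            local = [mlp_train_iteration([u.copy() for u in layers], Xj, Yj,
                                         eta, lam, gradient)
                     for Xj, Yj in zip(X_parts, Y_parts)]
            layers = [np.mean([m[k] for m in local], axis=0)
                      for k in range(len(layers))]
        return layers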
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index 225622d..6784f5e 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -953,4 +953,10 @@ Applied Survival Analysis},
@online{bfs_wikipedia,
title = {Breadth-first search},
url={https://en.wikipedia.org/wiki/Breadth-first_search}
-}
\ No newline at end of file
+}
+
+@misc{mlp_parallel,
+ Url = {https://www.microsoft.com/en-us/research/publication/accelerating-recurrent-neural-network-training-via-two-stage-classes-and-parallelization/},
+ Title = {{Accelerating Recurrent Neural Network Training via Two Stage Classes and Parallelization}},
+ Author = {{Zhiheng Huang}}
+}
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index ccf58a8..e27e14a 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -183,7 +183,7 @@ Contains graph algorithms.
@defgroup grp_crf Conditional Random Field
@ingroup grp_super
- @defgroup grp_mlp Multilayer Perceptron
+ @defgroup grp_nn Neural Network
@ingroup grp_super
@defgroup grp_regml Regression Models
@@ -202,7 +202,6 @@ Contains graph algorithms.
@defgroup grp_robust Robust Variance
@}
-
@defgroup grp_svm Support Vector Machines
@ingroup grp_super
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/mlp_igd.cpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/mlp_igd.cpp b/src/modules/convex/mlp_igd.cpp
index 3647d5f..9e9e665 100644
--- a/src/modules/convex/mlp_igd.cpp
+++ b/src/modules/convex/mlp_igd.cpp
@@ -29,6 +29,7 @@
#include "mlp_igd.hpp"
#include "task/mlp.hpp"
+#include "task/l2.hpp"
#include "algo/igd.hpp"
#include "algo/loss.hpp"
@@ -51,6 +52,8 @@ typedef Loss<MLPIGDState<MutableArrayHandle<double> >, MLPIGDState<ArrayHandle<d
typedef MLP<MLPModel<MutableArrayHandle<double> >,MLPTuple> MLPTask;
+typedef MLPModel<MutableArrayHandle<double> > MLPModelType;
+
/**
* @brief Perform the multilayer perceptron transition step
*
@@ -63,6 +66,7 @@ mlp_igd_transition::run(AnyType &args) {
// For other tuples: args[0] holds the computation state until last tuple
MLPIGDState<MutableArrayHandle<double> > state = args[0];
+
// initilize the state if first tuple
if (state.algo.numRows == 0) {
if (!args[3].isNull()) {
@@ -74,20 +78,30 @@ mlp_igd_transition::run(AnyType &args) {
} else {
// configuration parameters
ArrayHandle<double> numbersOfUnits = args[4].getAs<ArrayHandle<double> >();
+ int numberOfStages = numbersOfUnits.size() - 1;
double stepsize = args[5].getAs<double>();
- state.allocate(*this, numbersOfUnits.size() - 1,
+ state.allocate(*this, numberOfStages,
reinterpret_cast<const double *>(numbersOfUnits.ptr()));
state.task.stepsize = stepsize;
- int activation = args[6].getAs<int>();
-
- int is_classification = args[7].getAs<int>();
- state.task.model.initialize(is_classification, activation);
+ const int activation = args[6].getAs<int>();
+ const int is_classification = args[7].getAs<int>();
+
+ const bool warm_start = args[9].getAs<bool>();
+ const int n_tuples = args[11].getAs<int>();
+ const double lambda = args[12].getAs<double>();
+ state.task.lambda = lambda;
+ MLPTask::lambda = lambda;
+ double is_classification_double = (double) is_classification;
+ double activation_double = (double) activation;
+ MappedColumnVector coeff = args[10].getAs<MappedColumnVector>();
+ state.task.model.rebind(&is_classification_double,&activation_double,
+ &coeff.data()[0], numberOfStages,
+ &numbersOfUnits[0]);
}
-
// resetting in either case
state.reset();
}
@@ -96,25 +110,23 @@ mlp_igd_transition::run(AnyType &args) {
const uint16_t N = state.task.numberOfStages;
const double *n = state.task.numbersOfUnits;
+ MappedColumnVector x_means = args[13].getAs<MappedColumnVector>();
+ MappedColumnVector x_stds = args[14].getAs<MappedColumnVector>();
// tuple
- MappedColumnVector indVar;
+ ColumnVector indVar;
MappedColumnVector depVar;
try {
- // an exception is raised in the backend if args[2] contains nulls
- MappedColumnVector x = args[1].getAs<MappedColumnVector>();
- // x is a const reference, we can only rebind to change its pointer
- indVar.rebind(x.memoryHandle(), x.size());
+ indVar = (args[1].getAs<MappedColumnVector>()-x_means).cwiseQuotient(x_stds);
MappedColumnVector y = args[2].getAs<MappedColumnVector>();
depVar.rebind(y.memoryHandle(), y.size());
-
} catch (const ArrayWithNullException &e) {
return args[0];
}
MLPTuple tuple;
- tuple.indVar.rebind(indVar.memoryHandle(), indVar.size());
+ tuple.indVar = indVar;
tuple.depVar.rebind(depVar.memoryHandle(), depVar.size());
+ tuple.weight = args[8].getAs<double>();
- // Now do the transition step
MLPIGDAlgorithm::transition(state, tuple);
MLPLossAlgorithm::transition(state, tuple);
state.algo.numRows ++;
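The standardization applied to indVar in the transition step above amounts to the following (a Python sketch; x_means and x_stds are computed once over the training data elsewhere in the module):

    import numpy as np

    def standardize_row(x, x_means, x_stds):
        # Shift by the per-feature mean and divide by the per-feature
        # standard deviation before feeding the row to the network.
        return (np.asarray(x, dtype=float) - x_means) / x_stds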
@@ -130,14 +142,12 @@ mlp_igd_merge::run(AnyType &args) {
MLPIGDState<MutableArrayHandle<double> > stateLeft = args[0];
MLPIGDState<ArrayHandle<double> > stateRight = args[1];
- // We first handle the trivial case where this function is called with one
- // of the states being the initial state
if (stateLeft.algo.numRows == 0) { return stateRight; }
else if (stateRight.algo.numRows == 0) { return stateLeft; }
- // Merge states together
MLPIGDAlgorithm::merge(stateLeft, stateRight);
MLPLossAlgorithm::merge(stateLeft, stateRight);
+
// The following numRows update, cannot be put above, because the model
// averaging depends on their original values
stateLeft.algo.numRows += stateRight.algo.numRows;
@@ -154,20 +164,17 @@ mlp_igd_final::run(AnyType &args) {
// a deep copy.
MLPIGDState<MutableArrayHandle<double> > state = args[0];
- // Aggregates that haven't seen any data just return Null.
if (state.algo.numRows == 0) { return Null(); }
- // finalizing
- MLPIGDAlgorithm::final(state);
-
- // Return the mean loss
+ L2<MLPModelType>::lambda = state.task.lambda;
state.algo.loss = state.algo.loss/static_cast<double>(state.algo.numRows);
+ state.algo.loss += L2<MLPModelType>::loss(state.task.model);
+ MLPIGDAlgorithm::final(state);
- // for stepsize tuning
- std::stringstream debug;
- debug << "loss: " << state.algo.loss;
- elog(INFO,"%s",debug.str().c_str());
- return state;
+ AnyType tuple;
+ tuple << state
+ << (double)state.algo.loss;
+ return tuple;
}
/**
@@ -191,10 +198,9 @@ internal_mlp_igd_result::run(AnyType &args) {
flattenU;
flattenU.rebind(&state.task.model.u[0](0, 0),
state.task.model.arraySize(state.task.numberOfStages,
- state.task.numbersOfUnits)-2); // -2 for is_classification and activation
+ state.task.numbersOfUnits));
double loss = state.algo.loss;
-
AnyType tuple;
tuple << flattenU
<< loss;
@@ -204,27 +210,25 @@ internal_mlp_igd_result::run(AnyType &args) {
AnyType
internal_predict_mlp::run(AnyType &args) {
MLPModel<MutableArrayHandle<double> > model;
- MappedColumnVector indVar;
+ ColumnVector indVar;
int is_response = args[5].getAs<int>();
+ MappedColumnVector x_means = args[6].getAs<MappedColumnVector>();
+ MappedColumnVector x_stds = args[7].getAs<MappedColumnVector>();
MappedColumnVector coeff = args[0].getAs<MappedColumnVector>();
MappedColumnVector layerSizes = args[4].getAs<MappedColumnVector>();
// Input layer doesn't count
size_t numberOfStages = layerSizes.size()-1;
- //#TODO this should be an int not a double
double is_classification = args[2].getAs<double>();
double activation = args[3].getAs<double>();
bool get_class = is_classification && is_response;
model.rebind(&is_classification,&activation,&coeff.data()[0],numberOfStages,&layerSizes.data()[0]);
try {
- MappedColumnVector x = args[1].getAs<MappedColumnVector>();
- // x is a const reference, we can only rebind to change its pointer
- indVar.rebind(x.memoryHandle(), x.size());
+ indVar = (args[1].getAs<MappedColumnVector>()-x_means).cwiseQuotient(x_stds);
} catch (const ArrayWithNullException &e) {
return args[0];
}
ColumnVector prediction = MLPTask::predict(model, indVar, get_class);
-
return prediction;
}
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/task/l2.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/task/l2.hpp b/src/modules/convex/task/l2.hpp
index a2e7f2f..308cfd9 100644
--- a/src/modules/convex/task/l2.hpp
+++ b/src/modules/convex/task/l2.hpp
@@ -84,7 +84,8 @@ double
L2<Model, Hessian>::loss(
const model_type &model) {
// 1/2 * lambda * || w ||^2
- return lambda * model.norm()*model.norm() / 2;
+ double norm = model.norm();
+ return lambda * norm*norm / 2;
}
} // namespace convex
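The L2 term computed here is 1/2 * lambda * ||w||^2, where the norm (see MLPModel::norm() later in this commit) skips the bias row of every layer; a short sketch of the same computation under that assumption:

    import numpy as np

    def l2_loss(layers, lam):
        # Sum of squared coefficients, excluding row 0 (the bias row) of each layer.
        norm_sq = sum(np.sum(u[1:] ** 2) for u in layers)
        return 0.5 * lam * norm_sq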
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/task/mlp.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/task/mlp.hpp b/src/modules/convex/task/mlp.hpp
index e66492b..0032b81 100644
--- a/src/modules/convex/task/mlp.hpp
+++ b/src/modules/convex/task/mlp.hpp
@@ -26,6 +26,8 @@
#ifndef MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
#define MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
+#include <dbconnector/dbconnector.hpp>
+
namespace madlib {
namespace modules {
@@ -46,24 +48,26 @@ public:
static void gradientInPlace(
model_type &model,
- const independent_variables_type &y,
- const dependent_variable_type &z,
+ const independent_variables_type &x,
+ const dependent_variable_type &y,
const double &stepsize);
static double loss(
const model_type &model,
- const independent_variables_type &y,
- const dependent_variable_type &z);
+ const independent_variables_type &x,
+ const dependent_variable_type &y);
static ColumnVector predict(
const model_type &model,
- const independent_variables_type &y,
+ const independent_variables_type &x,
const bool get_class);
const static int RELU = 0;
const static int SIGMOID = 1;
const static int TANH = 2;
+ static double lambda;
+private:
static double sigmoid(const double &xi) {
return 1. / (1. + std::exp(-xi));
}
@@ -76,9 +80,6 @@ public:
return std::tanh(xi);
}
-
-private:
-
static double sigmoidDerivative(const double &xi) {
double value = sigmoid(xi);
return value * (1. - value);
@@ -95,59 +96,39 @@ private:
static void feedForward(
const model_type &model,
- const independent_variables_type &y,
+ const independent_variables_type &x,
std::vector<ColumnVector> &net,
- std::vector<ColumnVector> &x);
-
- static void endLayerDeltaError(
- const std::vector<ColumnVector> &net,
- const std::vector<ColumnVector> &x,
- const dependent_variable_type &z,
- ColumnVector &delta_N);
+ std::vector<ColumnVector> &o);
- static void errorBackPropagation(
- const ColumnVector &delta_N,
+ static void backPropogate(
+ const ColumnVector &y_true,
+ const ColumnVector &y_estimated,
const std::vector<ColumnVector> &net,
const model_type &model,
std::vector<ColumnVector> &delta);
};
template <class Model, class Tuple>
+double MLP<Model, Tuple>::lambda = 0;
+
+template <class Model, class Tuple>
void
MLP<Model, Tuple>::gradientInPlace(
model_type &model,
- const independent_variables_type &y,
- const dependent_variable_type &z,
+ const independent_variables_type &x,
+ const dependent_variable_type &y_true,
const double &stepsize) {
- (void) model;
- (void) z;
- (void) y;
- (void) stepsize;
- std::vector<ColumnVector> net;
- std::vector<ColumnVector> x;
- std::vector<ColumnVector> delta;
- ColumnVector delta_N;
-
- feedForward(model, y, net, x);
- endLayerDeltaError(net, x, z, delta_N);
- errorBackPropagation(delta_N, net, model, delta);
-
uint16_t N = model.u.size(); // assuming nu. of layers >= 1
- uint16_t k, s, j;
+ uint16_t k;
+ std::vector<ColumnVector> net, o, delta;
- std::vector<uint16_t> n; n.clear(); //nu. of units in each layer
+ feedForward(model, x, net, o);
+ backPropogate(y_true, o.back(), net, model, delta);
- n.push_back(model.u[0].rows() - 1);
- for (k = 1; k <= N; k ++) {
- n.push_back(model.u[k-1].cols() - 1);
- }
-
- for (k=1; k <= N; k++){
- for (s=0; s <= n[k-1]; s++){
- for (j=1; j <= n[k]; j++){
- model.u[k-1](s,j) -= stepsize * (delta[k](j) * x[k-1](s));
- }
- }
+ for (k=0; k < N; k++){
+ Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k];
+ regularization.row(0).setZero(); // Do not update bias
+ model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
}
}
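The rewritten gradientInPlace reduces the triple loop to one matrix update per layer: the gradient for layer k is the outer product of that layer's bias-augmented input o[k] and its delta error, and the regularization term skips the bias row. A NumPy sketch of the same update (names are illustrative):

    import numpy as np

    def gradient_in_place(layers, o, delta, stepsize, lam):
        # o[k]: bias-augmented output of layer k; delta[k]: delta error of layer k+1.
        for k in range(len(layers)):
            reg = lam * layers[k]
            reg[0, :] = 0.0                 # do not regularize the bias row
            layers[k] -= stepsize * (np.outer(o[k], delta[k]) + reg)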
@@ -155,54 +136,40 @@ template <class Model, class Tuple>
double
MLP<Model, Tuple>::loss(
const model_type &model,
- const independent_variables_type &y,
- const dependent_variable_type &z) {
+ const independent_variables_type &x,
+ const dependent_variable_type &y_true) {
// Here we compute the loss. In the case of regression we use sum of square errors
// In the case of classification the loss term is cross entropy.
- std::vector<ColumnVector> net;
- std::vector<ColumnVector> x;
-
- feedForward(model, y, net, x);
- double loss = 0.;
- uint16_t j;
-
- for (j = 1; j < z.rows() + 1; j ++) {
- if(model.is_classification){
- // Cross entropy: RHS term is negative
- loss -= z(j-1)*std::log(x.back()(j)) + (1-z(j-1))*std::log(1-x.back()(j));
- }else{
- double diff = x.back()(j) - z(j-1);
- loss += diff * diff;
- }
+ std::vector<ColumnVector> net, o;
+ feedForward(model, x, net, o);
+ ColumnVector y_estimated = o.back();
+
+ if(model.is_classification){
+ double clip = 1.e-10;
+ y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip);
+ return - (y_true.array()*y_estimated.array().log()
+ + (-y_true.array()+1)*(-y_estimated.array()+1).log()).sum();
}
- if(!model.is_classification){
- loss /= 2.;
- }else{
- loss /= z.rows();
+ else{
+ return 0.5 * (y_estimated-y_true).squaredNorm();
}
- return loss;
}
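The clipping in the classification branch keeps the softmax outputs away from exact 0 and 1 so the logarithms stay finite; in Python terms (illustrative only):

    import numpy as np

    def cross_entropy_loss(y_true, y_estimated, clip=1.e-10):
        y_est = np.clip(y_estimated, clip, 1.0 - clip)
        return -np.sum(y_true * np.log(y_est) + (1.0 - y_true) * np.log(1.0 - y_est))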
template <class Model, class Tuple>
ColumnVector
MLP<Model, Tuple>::predict(
const model_type &model,
- const independent_variables_type &y,
- const bool get_class
- ) {
- (void) model;
- (void) y;
- std::vector<ColumnVector> net;
- std::vector<ColumnVector> x;
-
- feedForward(model, y, net, x);
- // Don't return the offset
- ColumnVector output = x.back().tail(x.back().size()-1);
- if(get_class){
+ const independent_variables_type &x,
+ const bool get_class) {
+ std::vector<ColumnVector> net, o;
+
+ feedForward(model, x, net, o);
+ ColumnVector output = o.back();
+ if(get_class){ // Return a length 1 array with the predicted index
int max_idx;
output.maxCoeff(&max_idx);
output.resize(1);
- output[0] = (double)max_idx;
+ output[0] = (double) max_idx;
}
return output;
}
@@ -212,113 +179,65 @@ template <class Model, class Tuple>
void
MLP<Model, Tuple>::feedForward(
const model_type &model,
- const independent_variables_type &y,
+ const independent_variables_type &x,
std::vector<ColumnVector> &net,
- std::vector<ColumnVector> &x){
- // meta data and x_k^0 = 1
- uint16_t k, j, s;
- uint16_t N = model.u.size(); // assuming >= 1
+ std::vector<ColumnVector> &o){
+ uint16_t k, N;
+ N = model.u.size(); // assuming >= 1
net.resize(N + 1);
- x.resize(N + 1);
-
- std::vector<uint16_t> n; n.clear();
- n.push_back(model.u[0].rows() - 1);
- x[0].resize(n[0] + 1);
- x[0](0) = 1.;
- for (k = 1; k <= N; k ++) {
- n.push_back(model.u[k-1].cols() - 1);
- net[k].resize(n[k] + 1);
- x[k].resize(n[k] + 1);
- // Bias
- x[k](0) = 1.;
- }
+ o.resize(N + 1);
+
+ double (*activation)(const double&);
+ if(model.activation==RELU)
+ activation = &relu;
+ else if(model.activation==SIGMOID)
+ activation = &sigmoid;
+ else
+ activation = &tanh;
- // y is a mapped parameter from DB, aligning with x here
- for (j = 1; j <= n[0]; j ++) { x[0](j) = y(j-1); }
+ o[0].resize(x.size()+1);
+ o[0] << 1.,x;
for (k = 1; k < N; k ++) {
- for (j = 1; j <= n[k]; j ++) {
- net[k](j) = 0.;
- for (s = 0; s <= n[k-1]; s ++) {
- net[k](j) += x[k-1](s) * model.u[k-1](s, j);
- }
- if(model.activation==RELU)
- x[k](j) = relu(net[k](j));
- else if(model.activation==SIGMOID)
- x[k](j) = sigmoid(net[k](j));
- else
- x[k](j) = tanh(net[k](j));
- }
+ net[k] = model.u[k-1].transpose() * o[k-1];
+ o[k] = ColumnVector(model.u[k-1].cols()+1);
+ o[k] << 1., net[k].unaryExpr(activation);
}
+ o[N] = model.u[N-1].transpose() * o[N-1];
- // output layer computation
- for (j = 1; j <= n[N]; j ++) {
- x[N](j) = 0.;
- for (s = 0; s <= n[N-1]; s ++) {
- x[N](j) += x[N-1](s) * model.u[N-1](s, j);
- }
- }
// Numerically stable calculation of softmax
- ColumnVector last_x = x[N].tail(n[N]);
if(model.is_classification){
- double max_x = last_x.maxCoeff();
- last_x = (last_x.array() - max_x).exp();
- last_x /= last_x.sum();
+ double max_x = o[N].maxCoeff();
+ o[N] = (o[N].array() - max_x).exp();
+ o[N] /= o[N].sum();
}
- x[N].tail(n[N]) = last_x;
}
template <class Model, class Tuple>
void
-MLP<Model, Tuple>::endLayerDeltaError(
- const std::vector<ColumnVector> &net,
- const std::vector<ColumnVector> &x,
- const dependent_variable_type &z,
- ColumnVector &delta_N) {
- //meta data
- uint16_t t;
- uint16_t N = x.size() - 1; // assuming >= 1
- uint16_t n_N = x[N].rows() - 1;
- delta_N.resize(n_N + 1);
-
- for (t = 1; t <= n_N; t ++) {
- delta_N(t) = (x[N](t) - z(t-1));
- }
-}
-
-template <class Model, class Tuple>
-void
-MLP<Model, Tuple>::errorBackPropagation(
- const ColumnVector &delta_N,
+MLP<Model, Tuple>::backPropogate(
+ const ColumnVector &y_true,
+ const ColumnVector &y_estimated,
const std::vector<ColumnVector> &net,
const model_type &model,
std::vector<ColumnVector> &delta) {
- // meta data
- uint16_t k, j, t;
- uint16_t N = model.u.size(); // assuming >= 1
- delta.resize(N + 1);
-
- std::vector<uint16_t> n; n.clear();
- n.push_back(model.u[0].rows() - 1);
- for (k = 1; k <= N; k ++) {
- n.push_back(model.u[k-1].cols() - 1);
- delta[k].resize(n[k]+1);
- }
- delta[N] = delta_N;
-
+ uint16_t k, N;
+ N = model.u.size(); // assuming >= 1
+ delta.resize(N);
+
+ double (*activationDerivative)(const double&);
+ if(model.activation==RELU)
+ activationDerivative = &reluDerivative;
+ else if(model.activation==SIGMOID)
+ activationDerivative = &sigmoidDerivative;
+ else
+ activationDerivative = &tanhDerivative;
+
+ delta.back() = y_estimated - y_true;
for (k = N - 1; k >= 1; k --) {
- for (j = 0; j <= n[k]; j ++) {
- delta[k](j) = 0.;
- for (t = 1; t <= n[k+1]; t ++) {
- delta[k](j) += delta[k+1](t) * model.u[k](j, t);
- }
- if(model.activation==RELU)
- delta[k](j) = delta[k](j) * reluDerivative(net[k](j));
- else if(model.activation==SIGMOID)
- delta[k](j) = delta[k](j) * sigmoidDerivative(net[k](j));
- else
- delta[k](j) = delta[k](j) * tanhDerivative(net[k](j));
- }
+ // Do not include the bias terms
+ delta[k-1] = model.u[k].bottomRows(model.u[k].rows()-1) * delta[k];
+ delta[k-1] = delta[k-1].array() * net[k].unaryExpr(activationDerivative).array();
}
}
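The vectorized backPropogate above walks the layers backwards, dropping each weight matrix's bias row before propagating the delta error; the same recursion in NumPy (indices follow the feed-forward sketch earlier, so net[k] is the input of layer k):

    import numpy as np

    def back_propagate(y_true, y_estimated, net, layers, phi_prime):
        N = len(layers)
        delta = [None] * N
        delta[N - 1] = y_estimated - y_true          # end-layer delta error
        for k in range(N - 1, 0, -1):
            # layers[k][1:] drops the bias row before propagating backwards.
            delta[k - 1] = layers[k][1:].dot(delta[k]) * phi_prime(net[k])
        return delta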
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/model.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/model.hpp b/src/modules/convex/type/model.hpp
index 9b68af8..679dab4 100644
--- a/src/modules/convex/type/model.hpp
+++ b/src/modules/convex/type/model.hpp
@@ -121,51 +121,9 @@ struct MLPModel {
const double *n = inNumbersOfUnits;
size_t k;
for (k = 1; k <= N; k ++) {
- size += (n[k-1] + 1) * (n[k] + 1);
- }
- return 1 + // is_classification
- 1 + // activation
- size; // weights (u)
- }
-
- /**
- * @brief Initialize the model randomly
- */
- void initialize(int is_classification_in, int activation_in) {
- is_classification = is_classification_in;
- activation = activation_in;
- // using madlib::dbconnector::$database::NativeRandomNumberGenerator
- NativeRandomNumberGenerator rng;
-
- // Scaling factor for weight initialization
- double epsilon = 0.0001;
-
-
- double base = rng.min();
- double span = rng.max() - base;
-
- uint16_t N = u.size(); // assuming nu. of layers >= 1
- uint16_t k, s, j;
-
- std::vector<uint16_t> n; n.clear(); //nu. of units in each layer
-
- n.push_back(u[0].rows() - 1);
- for (k = 1; k <= N; k ++) {
- n.push_back(u[k-1].cols() - 1);
- }
-
- for (k=1; k <= N; k++){
- for (s=0; s <= n[k-1]; s++){
- u[k-1](s,0)=1;
- for (j=1; j <= n[k]; j++){
- // Generate normal(0,epsilon) value using Box-Muller transform
- double u1 = (rng()-base)/span;
- double u2 = (rng()-base)/span;
- double z = std::sqrt(-2*std::log(u1))*std::cos(2*M_PI*u2);
- u[k-1](s,j) = epsilon*z;
- }
- }
+ size += (n[k-1] + 1) * (n[k]);
}
+ return size; // weights (u)
}
uint32_t rebind(const double *is_classification_in,
@@ -185,20 +143,38 @@ struct MLPModel {
for (k = 1; k <= N; k ++) {
u.push_back(Eigen::Map<Matrix >(
const_cast<double*>(data + sizeOfU),
- n[k-1] + 1, n[k] + 1));
- sizeOfU += (n[k-1] + 1) * (n[k] + 1);
+ n[k-1] + 1, n[k]));
+ sizeOfU += (n[k-1] + 1) * (n[k]);
}
return sizeOfU;
}
+ double norm() const {
+ double norm = 0.;
+ size_t k;
+ for (k = 0; k < u.size(); k ++) {
+ norm+=u[k].bottomRows(u[k].rows()-1).squaredNorm();
+ }
+ return std::sqrt(norm);
+ }
+
+ void setZero(){
+ size_t k;
+ for (k = 1; k <= u.size(); k ++) {
+ u[k-1].setZero();
+ }
+ }
+
/*
* Some operator wrappers for u.
*/
MLPModel &operator*=(const double &c) {
+ // Note that when scaling the model, you should
+ // not update the bias.
size_t k;
for (k = 1; k <= u.size(); k ++) {
- u[k-1] *= c;
+ u[k-1] *= c;
}
return *this;
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/state.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/state.hpp b/src/modules/convex/type/state.hpp
index 66f5023..2cb2643 100644
--- a/src/modules/convex/type/state.hpp
+++ b/src/modules/convex/type/state.hpp
@@ -629,6 +629,9 @@ public:
return 1 // numberOfStages = N
+ (inNumberOfStages + 1) // numbersOfUnits: size is (N + 1)
+ 1 // stepsize
+ + 1 // lambda
+ + 1 // is_classification
+ + 1 // activation
+ sizeOfModel // model
+ 1 // numRows
@@ -645,17 +648,16 @@ private:
* - 0: numberOfStages (number of stages (layers), design doc: N)
* - 1: numbersOfUnits (numbers of activation units, design doc: n_0,...,n_N)
* - N + 2: stepsize (step size of gradient steps)
- * - N + 3: is_classification (do classification)
- * - N + 4: activation (activation function)
- * - N + 5: coeff (coefficients, design doc: u)
+ * - N + 3: lambda (regularization term)
+ * - N + 4: is_classification (do classification)
+ * - N + 5: activation (activation function)
+ * - N + 6: coeff (coefficients, design doc: u)
*
* Intra-iteration components (updated in transition step):
* sizeOfModel = # of entries in u + 2, (\sum_1^N n_{k-1} n_k)
- * - N + 3 + sizeOfModel: numRows (number of rows processed in this iteration)
- * - N + 4 + sizeOfModel: loss (loss value, the sum of squared errors)
- * - N + 5 + sizeOfModel: is_classification (do classification)
- * - N + 6 + sizeOfModel: activation (activation function)
- * - N + 7 + sizeOfModel: coeff (volatile model for incrementally update)
+ * - N + 6 + sizeOfModel: coeff (volatile model for incrementally update)
+ * - N + 6 + 2*sizeOfModel: numRows (number of rows processed in this iteration)
+ * - N + 7 + 2*sizeOfModel: loss (loss value, the sum of squared errors)
*/
void rebind() {
task.numberOfStages.rebind(&mStorage[0]);
@@ -663,13 +665,14 @@ private:
task.numbersOfUnits =
reinterpret_cast<dimension_pointer_type>(&mStorage[1]);
task.stepsize.rebind(&mStorage[N + 2]);
- uint32_t sizeOfModel = task.model.rebind(&mStorage[N + 3],&mStorage[N + 4],&mStorage[N + 5],
+ task.lambda.rebind(&mStorage[N + 3]);
+ uint32_t sizeOfModel = task.model.rebind(&mStorage[N + 4],&mStorage[N + 5],&mStorage[N + 6],
task.numberOfStages, task.numbersOfUnits);
- algo.numRows.rebind(&mStorage[N + 5 + sizeOfModel]);
- algo.loss.rebind(&mStorage[N + 6 + sizeOfModel]);
- algo.incrModel.rebind(&mStorage[N + 3],&mStorage[N + 4],&mStorage[N + 7 + sizeOfModel],
+ algo.incrModel.rebind(&mStorage[N + 4],&mStorage[N + 5],&mStorage[N + 6 + sizeOfModel],
task.numberOfStages, task.numbersOfUnits);
+ algo.numRows.rebind(&mStorage[N + 6 + 2*sizeOfModel]);
+ algo.loss.rebind(&mStorage[N + 7 + 2*sizeOfModel]);
}
@@ -685,13 +688,14 @@ public:
dimension_type numberOfStages;
dimension_pointer_type numbersOfUnits;
numeric_type stepsize;
+ numeric_type lambda;
MLPModel<Handle> model;
} task;
struct AlgoState {
+ MLPModel<Handle> incrModel;
count_type numRows;
numeric_type loss;
- MLPModel<Handle> incrModel;
} algo;
};
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/tuple.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/tuple.hpp b/src/modules/convex/type/tuple.hpp
index 4b9c55e..824ed90 100644
--- a/src/modules/convex/type/tuple.hpp
+++ b/src/modules/convex/type/tuple.hpp
@@ -64,7 +64,7 @@ typedef ExampleTuple<MappedColumnVector, double> GLMTuple;
// madlib::modules::convex::MatrixIndex
typedef ExampleTuple<MatrixIndex, double> LMFTuple;
-typedef ExampleTuple<MappedColumnVector, MappedColumnVector> MLPTuple;
+typedef ExampleTuple<ColumnVector, MappedColumnVector> MLPTuple;
} // namespace convex
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/ports/postgres/modules/convex/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index 400f892..6b9d828 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -29,23 +29,23 @@
m4_include(`SQLCommon.m4')
/**
-@addtogroup grp_mlp
+@addtogroup grp_nn
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#mlp_classification">Classification</a></li>
<li class="level1"><a href="#mlp_regression">Regression</a></li>
-<li class="level1"><a href="#optimization_params">Optimizer Parameters</a></li>
-<li class="level1"><a href="#predict">Prediction Functions/a></li>
+<li class="level1"><a href="#optimizer_params">Optimizer Parameters</a></li>
+<li class="level1"><a href="#predict">Prediction Functions</a></li>
<li class="level1"><a href="#example">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
-Multilayer Perceptron (MLP) is a model for regression and
-classification.
+Multilayer Perceptron (MLP) is a type of neural network that can be
+used for regression and classification.
-Also called "vanilla neural networks", they consist of several
+Also called "vanilla neural networks", MLPs consist of several
fully connected hidden layers with non-linear activation
functions. In the case of classification, the final layer of the
neural net has as many nodes as classes, and the output of the
@@ -67,7 +67,8 @@ mlp_classification(
dependent_varname,
hidden_layer_sizes,
optimizer_params,
- activation
+ activation,
+ weights
)
</pre>
\b Arguments
@@ -75,6 +76,7 @@ mlp_classification(
<DT>source_table</DT>
<DD>TEXT. Name of the table containing the training data.</DD>
+
<DT>output_table</DT>
<DD>TEXT. Name of the output table containing the model. Details of the output
tables are provided below.
@@ -83,19 +85,22 @@ mlp_classification(
<DT>independent_varname</DT>
<DD>TEXT. Expression list to evaluate for the
independent variables. An intercept variable should not be included as part
- of this expression. Please note that expression should be able to be cast
- to DOUBLE PRECISION[].
+    of this expression. <b>Please note that the expression should be encoded properly.</b>
+ All values are cast to DOUBLE PRECISION, so categorical variables should be
+ one-hot or dummy encoded. See <a href="group__grp__encode__categorical.html">here</a>
+ for more details.
</DD>
+
<DT>dependent_varname</DT>
<DD> TEXT. Name of the dependent variable column. For classification, supported types are:
text, varchar, character varying, char, character,
integer, smallint, bigint, and boolean. </DD>
- <DT>hidden_layer_sizes (optional)</DT>
- <DD>INTEGER[], default: ARRAY[].
+  <DT>hidden_layer_sizes</DT>
+  <DD>INTEGER[].
The number of neurons in each hidden layer. The length of this array will
- determine the number of hidden layers. Empty for no hidden layers.
+ determine the number of hidden layers. NULL for no hidden layers.
</DD>
@@ -111,6 +116,25 @@ mlp_classification(
'relu', and 'tanh'. The text can be any prefix of the three
strings; for e.g., activation='s' will use the sigmoid activation.
</DD>
+
+
+ <DT>weights (optional)</DT>
+ <DD>TEXT, default: NULL.
+ Weights for input rows. Column name which specifies the weight for each input row.
+ This weight will be incorporated into the update during SGD, and will not be used
+ for loss calculations. If not specified, weight for each row will default to 1.
+ Column should be a numeric type.
+ </DD>
+
+ <DT>warm_start (optional)</DT>
+ <DD>BOOLEAN, default: FALSE.
+  Initialize weights with the coefficients from the last call. If true, weights will
+  be initialized from output_table. Note that all parameters other than optimizer_params
+  and verbose must remain constant between calls that use warm_start.
+ </DD>
+
+ <DT>verbose (optional)</DT>
+ <DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training.</DD>
</DL>
<b>Output tables</b>
@@ -142,24 +166,28 @@ A summary table named \<output_table\>_summary is also created, which has the fo
<td>The source table.</td>
</tr>
<tr>
- <th>dependent_varname</th>
- <td>The dependent variable.</td>
- </tr>
- <tr>
<th>independent_varname</th>
<td>The independent variables.</td>
</tr>
<tr>
+ <th>dependent_varname</th>
+ <td>The dependent variable.</td>
+ </tr>
+ <tr>
<th>tolerance</th>
<td>The tolerance as given in optimizer_params.</td>
</tr>
<tr>
- <th>step_size</th>
- <td>The step size as given in optimizer_params.</td>
+ <th>learning_rate_init</th>
+ <td>The initial learning rate as given in optimizer_params.</td>
+ </tr>
+ <tr>
+ <th>learning_rate_policy</th>
+ <td>The learning rate policy as given in optimizer_params.</td>
</tr>
<tr>
<th>n_iterations</th>
- <td>The number of iterations run</td>
+ <td>The number of iterations run.</td>
</tr>
<tr>
<th>n_tries</th>
@@ -170,17 +198,29 @@ A summary table named \<output_table\>_summary is also created, which has the fo
<td>The number of units in each layer including the input and output layer.</td>
</tr>
<tr>
- <th>activation_function</th>
+ <th>activation</th>
<td>The activation function.</td>
</tr>
<tr>
<th>is_classification</th>
<td>True if the model was trained for classification, False if it was trained
- for regression</td>
+ for regression.</td>
</tr>
<tr>
<th>classes</th>
- <td>The classes which were trained against (empty for regression)</td>
+ <td>The classes which were trained against (empty for regression).</td>
+ </tr>
+ <tr>
+ <th>weights</th>
+ <td>The weight column used during training.</td>
+ </tr>
+ <tr>
+ <th>x_means</th>
+ <td>The mean for all input features (used for normalization).</td>
+ </tr>
+ <tr>
+ <th>x_stds</th>
+ <td>The standard deviation for all input features (used for normalization).</td>
</tr>
</table>
@@ -197,7 +237,9 @@ mlp_regression(source_table,
dependent_varname,
hidden_layer_sizes,
optimizer_params,
- activation
+ activation,
+ weights,
+ verbose
)
</pre>
@@ -205,7 +247,7 @@ mlp_regression(source_table,
Specifications for regression are largely the same as for classification. In the
model table, the loss will refer to mean square error instead of cross entropy. In the
-summary table, there is classes column. The following
+summary table, there is no classes column. The following
arguments have specifications which differ from mlp_classification:
<DL class="arglist">
<DT>dependent_varname</DT>
@@ -226,7 +268,7 @@ the parameter is ignored.
<pre class="syntax">
- 'step_size = <value>,
+ 'learning_rate_init = <value>,
n_iterations = <value>,
n_tries = <value>,
tolerance = <value>'
@@ -234,27 +276,57 @@ the parameter is ignored.
\b Optimizer Parameters
<DL class="arglist">
-<DT>step_size</dt>
-<DD>Default: [0.001].
+<DT>learning_rate_init</dt>
+<DD>Default: 0.001.
Also known as the learning rate. A small value is usually desirable to
ensure convergence, while a large value provides more room for progress during
training. Since the best value depends on the condition number of the data, in
practice one often tunes this parameter.
</DD>
+<DT>learning_rate_policy</dt>
+<DD>Default: constant.
+One of 'constant', 'exp', 'inv' or 'step' or any prefix of these.
+'constant': learning_rate = learning_rate_init
+'exp': learning_rate = learning_rate_init * gamma^(iter)
+'inv': learning_rate = learning_rate_init * (iter+1)^(-power)
+'step': learning_rate = learning_rate_init * gamma^(floor(iter/iterations_per_step))
+where iter is the current iteration of SGD (see the sketch after this parameter list).
+</DD>
+
+<DT>gamma</dt>
+<DD>Default: 0.1.
+Decay rate for learning rate when learning_rate_policy is 'exp' or 'step'.
+</DD>
+
+<DT>power</dt>
+<DD>Default: 0.5.
+Exponent for learning_rate_policy = 'inv'.
+</DD>
+
+<DT>iterations_per_step</dt>
+<DD>Default: 100.
+Number of iterations to run before decreasing the learning rate by
+a factor of gamma. Valid for learning rate policy = 'step'.
+</DD>
<DT>n_iterations</dt>
<DD>Default: [100]. The maximum number of iterations allowed.
</DD>
+
<DT>n_tries</dt>
<DD>Default: [1]. Number of times to retrain the network with randomly initialized
-weights
+weights.
+</DD>
+
+<DT>lambda</dt>
+<DD>Default: 0. The regularization coefficient for L2 regularization.
</DD>
<DT>tolerance</dt>
<DD>Default: 0.001. The criterion to end iterations. The training stops whenever
-<the difference between the training models of two consecutive iterations is
-<smaller than \e tolerance or the iteration number is larger than \e max_iter.
+the difference between the training models of two consecutive iterations is
+smaller than \e tolerance or the iteration number is larger than \e max_iter.
</DD>
</DL>
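For clarity, the four learning rate policies listed above reduce to the following schedule (a plain Python sketch; parameter names mirror the optimizer parameters, and `it` is the current SGD iteration):

    def learning_rate(policy, learning_rate_init, it,
                      gamma=0.1, power=0.5, iterations_per_step=100):
        if policy == 'constant':
            return learning_rate_init
        if policy == 'exp':
            return learning_rate_init * gamma ** it
        if policy == 'inv':
            return learning_rate_init * (it + 1) ** (-power)
        if policy == 'step':
            return learning_rate_init * gamma ** (it // iterations_per_step)
        raise ValueError("unknown learning_rate_policy: %s" % policy)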
@@ -293,19 +365,19 @@ table name is already in use, then an error is returned. Table contains:</DD>
<td>Gives the 'id' for each prediction, corresponding to each row from the data_table.</td>
</tr>
<tr>
- <th>estimated_<COL_NAME></th>
+ <th>estimated_COL_NAME</th>
<td>
(For pred_type='response') The estimated class
for classification or value for regression, where
- <COL_NAME> is the name of the column to be
- predicted from training data
+ COL_NAME is the name of the column to be
+ predicted from training data.
</td>
</tr>
<tr>
- <th>prob_<CLASS></th>
+ <th>prob_CLASS</th>
<td>
(For pred_type='prob' for classification) The
- probability of a given class <CLASS> as given by
+ probability of a given class CLASS as given by
softmax. There will be one column for each class
in the training data.
</td>
@@ -315,10 +387,10 @@ table name is already in use, then an error is returned. Table contains:</DD>
<DT>pred_type</DT>
<DD>TEXT.
-the type of output requested:
+The type of output requested:
'response' gives the actual prediction,
'prob' gives the probability of each class.
-for regression, only type='response' is defined.
+For regression, only type='response' is defined.
The name of the id column in the input table.</DD>
</DL>
</table>
@@ -363,30 +435,36 @@ The model will be written to mlp_model.
<pre class="example">
DROP TABLE IF EXISTS mlp_model;
DROP TABLE IF EXISTS mlp_model_summary;
+-- Set seed so results are reproducible
+SELECT setseed(0);
SELECT madlib.mlp_classification(
'iris_data', -- Source table
'mlp_model', -- Destination table
'attributes', -- Input features
'class_text', -- Label
ARRAY[5], -- Number of units per layer
- 'step_size=0.003,
- n_iterations=5000,
+ 'learning_rate_init=0.003,
+ n_iterations=500,
tolerance=0', -- Optimizer params
- 'tanh'); -- Activation function
+ 'tanh', -- Activation function
+ NULL, -- Default weight (1)
+ FALSE, -- No warm start
+ TRUE -- Verbose
+);
</pre>
-# View the result for the model.
<pre class="example">
-- Set extended display on for easier reading of output
\\x ON
--- Neural net Initialization is non-deterministic, so your results may vary
+-- Results may vary depending on platform
SELECT * FROM mlp_model;
</pre>
Result:
<pre class="result">
--[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-coeff | {1,1,1,1,1,0.136374930803,0.188739676875,0.662387810001,-1.03381622734,-0.469961067046,0.0614006983397,0.0811504589436,0.299008228258,-0.47391918521,-0.215098143699,0.10519213944,0.145844617525,0.511683525606,-0.800215552382,-0.36417142683,0.120751709056,0.167531106521,0.587074895969,-0.916946198095,-0.417055067449,0.0539541885146,0.0694359704131,0.262598585854,-0.419234805076,-0.189915344282,1,1,1,1,1,1,0.105645702152,1.46247470474,0.484457903226,0.965962824478,1.19361986431,0.419805760087,-0.105696503487,-1.46245956666,-0.484427811691,-0.965730981426,-1.19365280555,-0.419973628863}
-loss | 0.0184092375519
-num_iterations | 5000
+-[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coeff | {-0.172392477419,-0.0836446652758,-0.0162194484142,-0.647268294231,-0.504884325538,0.184825723596,0.351728174731,-0.601148967035,0.720999542651,0.26521898248,0.245760922013,0.264645322438,-0.349957739904,0.797653395667,0.725747963566,-0.344498001796,0.261481840947,0.329074383545,0.379503434339,-0.267398086353,-0.0238069072658,0.330239268187,-0.178736289201,-0.0563356339946,-0.0333791780453,0.262137386864,0.491390436498,-1.02635831573,-1.29541478382,0.246017274,-0.0623575215434,0.0826297373887,-0.671671189842,0.853494672576,1.21671423502,0.296424359217,0.15294606861}
+loss | 0.0136695756314
+num_iterations | 500
</pre>
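Training also writes a companion summary table, dropped at the top of this example as mlp_model_summary; it typically records the arguments the model was trained with. A minimal check (output omitted, since its exact columns are not reproduced in this example):
<pre class="example">
SELECT * FROM mlp_model_summary;
</pre>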
-# Next train a regression example. First create some test data. This dataset
contains housing prices data.
@@ -419,30 +497,36 @@ COPY lin_housing (x, grp_by_col, y) FROM STDIN NULL '?' DELIMITER '|';
<pre class="example">
DROP TABLE IF EXISTS mlp_regress;
DROP TABLE IF EXISTS mlp_regress_summary;
+SELECT setseed(0);
SELECT madlib.mlp_regression(
- 'lin_housing', -- Source table
- 'mlp_regress', -- Desination table
- 'x', -- Input features
- 'y', -- Dependent variable
- ARRAY[5,5], -- Number of units per layer
- 'step_size=0.000007,
- n_iterations=10000,
+ 'lin_housing', -- Source table
+ 'mlp_regress', -- Destination table
+ 'x', -- Input features
+ 'y', -- Dependent variable
+ ARRAY[25,25], -- Number of units per layer
+ 'learning_rate_init=0.001,
+ n_iterations=500,
+ lambda=0.001,
tolerance=0',
- 'relu');
+ 'relu',
+ NULL, -- Default weight (1)
+ FALSE, -- No warm start
+ TRUE -- Verbose
+);
</pre>
-# Check the results of the model
<pre class="example">
--- Set extended display on for easier reading of output
+-- Set extended display on for easier reading of output.
\\x ON
--- Neural net Initialization is non-deterministic, so your results may vary
+-- Results may vary depending on platform.
SELECT * FROM mlp_regress;
</pre>
Result:
<pre class="result">
--[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------
-coeff | {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.79506311399e-05,3.56715008915e-05,-6.09333559685e-05,0.000251228318768,-0.000224772841379,-3.71863030857e-05,-3.5757865148e-06,5.27936784854e-05,-2.48474166186e-05,6.19731184294e-05,3.07638968743e-05,6.8964698578e-06,0.000106016701083,-1.71484730318e-05,1.18691881812e-05,-0.000163975464208,0.000170026304906,3.11688265279e-05,0.000177050148787,-1.58265976603e-05,2.70144422657e-05,0.000112667883422,3.77575139073e-05,8.12474658795e-05,-7.90458917626e-05,0.000107566386158,-2.63771171506e-06,2.47996880915e-05,-0.00012642310887,0.000203827391081,0.000139315565565,4.86147243454e-05,-0.000176126471913,-6.47820782916e-05,-8.51592776447e-06,-6.60601176758e-05,2.91421874156e-05,6.3556873752e-05,0.000197557443129,0.000220531367259,0.000135036310289,0.000143735913975,-4.75034117786e-05,-0.000179547345838,-1.6919846786e-05,0.000162784312994,0.000268595819851,-0.000460066553287,8.69756071591e-05,-0.00311762727057,0.000126024763103,0.000205988242921
,0.003463432426,-0.00729789075286,0.00151625867549,-0.000890852767597,-0.00525016037249,0.0031043106659,0.00798041103839,-0.00552693050079,0.0232180415786,0.0230489850143,-0.0437890272341,0.0165765426407,-0.248554261758,-7.81336427846e-05,0.00558145591752,0.283465844585,-0.571699956182,0.133474351994,-0.0785181945605,-0.419269930709,0.249547772912,0.631761009875,-0.431305975666,1,1,1,1,1,1,0.0158747497572,-9.02809160806e-05,0.00015574347618,4.10805373863e-06,0.00121532434965,0.101790351335,0.0647558401493,-0.00013654998677,-9.92872075948e-06,-5.5319694394e-05,0.00519320756484,0.412736586036,0.0011998026977,-1.53688189815e-05,1.94817888201e-05,-4.63111489966e-05,7.24547899029e-05,0.00880394144485,5.45309822095e-05,-0.000140943219275,-7.96211486227e-05,-1.04337307472e-05,0.000161936762028,0.00136273797767,-4.54737243585e-05,-3.4083840736e-05,3.69286883662e-05,9.9047243188e-08,3.75014011824e-06,-9.45366086368e-08,1,1,1,1,1,1,6.67488547054,0.102754199001,0.41668912471,0.00886867296479,0
.00136206007228,-9.88642499013e-05}
-loss | 144.965776158
-num_iterations | 10000
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------
+coeff | {-0.135647108464,0.0315402969485,-0.117580589352,-0.23084537701,-0.10868726702...
+loss | 0.114125125042
+num_iterations | 500
</pre>
-# Now let's look at the prediction functions. In the following examples we will
use the training data set for prediction as well, which is not usual but serves to
@@ -458,8 +542,6 @@ SELECT madlib.mlp_predict(
'mlp_prediction', -- Output table for predictions
'response' -- Output classes, not probabilities
);
--# View results
-<pre class="example">
SELECT * FROM mlp_prediction JOIN iris_data USING (id);
</pre>
Result for the classification model:
@@ -487,7 +569,7 @@ Result for the classification model:
19 | Iris-versicolor | {6.6,2.9,4.6,1.3} | Iris-versicolor | 2
20 | Iris-versicolor | {5.2,2.7,3.9,1.4} | Iris-versicolor | 2
</pre>
-Prediction using the regression model:
+-# Prediction using the regression model:
<pre class="example">
DROP TABLE IF EXISTS mlp_regress_prediction;
SELECT madlib.mlp_predict(
@@ -498,34 +580,35 @@ SELECT madlib.mlp_predict(
'response' -- Output values, not probabilities
);
</pre>
--# View results
+View results
<pre class="example">
SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (id);
</pre>
Result for the regression model:
<pre class="result">
- id | x | grp_by_col | y | estimated_y
-----+-------------------------------------------------------------------------+------------+------+--------------------
- 1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98} | 1 | 24 | {23.2627062018087}
- 2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14} | 1 | 21.6 | {25.7088419115781}
- 3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03} | 1 | 34.7 | {27.5587003901404}
- 4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94} | 1 | 33.4 | {31.1812237427816}
- 5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33} | 1 | 36.2 | {30.3696873085477}
- 6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21} | 1 | 28.7 | {29.5290259241882}
- 7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43} | 1 | 22.9 | {21.1576051716888}
- 8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15} | 1 | 27.1 | {17.6194200563055}
- 9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93} | 1 | 16.5 | {15.1366297774139}
-10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1} | 1 | 18.9 | {17.6528662199369}
-11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} | 1 | 15 | {17.2017487668181}
-12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27} | 1 | 18.9 | {19.4893860319992}
-13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71} | 1 | 21.7 | {23.2917226708039}
-14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26} | 1 | 20.4 | {22.8904812605193}
-15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26} | 1 | 18.2 | {18.2386754423677}
-16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47} | 1 | 19.9 | {23.28949550874}
-17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58} | 1 | 23.1 | {25.3288762085473}
-18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67} | 1 | 17.5 | {19.0203738118451}
-19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69} | 1 | 20.2 | {12.3162005347545}
-20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28} | 1 | 18.2 | {21.0902211848747}
+ id | x | grp_by_col | y | estimated_y
+----+-------------------------------------------------------------------------+------------+------+------------------
+ 1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98} | 1 | 24 | 23.973628645041
+ 2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14} | 1 | 21.6 | 21.6389086856109
+ 3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03} | 1 | 34.7 | 34.6766441639675
+ 4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94} | 1 | 33.4 | 33.4521871118756
+ 5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33} | 1 | 36.2 | 36.2899491706428
+ 6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21} | 1 | 28.7 | 28.6994076427827
+ 7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43} | 1 | 22.9 | 22.4882117113923
+ 8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15} | 1 | 27.1 | 26.5148927040405
+ 9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93} | 1 | 16.5 | 16.0669778867327
+ 10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1} | 1 | 18.9 | 17.4237448788601
+ 11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} | 1 | 15 | 14.5944028616784
+ 12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27} | 1 | 18.9 | 19.6071061560237
+ 13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71} | 1 | 21.7 | 21.7585638578804
+ 14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26} | 1 | 20.4 | 20.2832271533629
+ 15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26} | 1 | 18.2 | 18.3440540662206
+ 16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47} | 1 | 19.9 | 20.0246074554594
+ 17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58} | 1 | 23.1 | 23.1458505146148
+ 18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67} | 1 | 17.5 | 17.4602306566804
+ 19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69} | 1 | 20.2 | 20.1785296856357
+ 20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28} | 1 | 18.2 | 18.1810300625137
+(20 rows)
</pre>
Note that the results you get for all examples may vary with the platform you are using.
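Because these examples score the training data itself, a quick sanity check is to compare predicted and actual labels. A sketch for the classification case, relying on the estimated_COL_NAME naming convention described earlier (here estimated_class_text) and the tables created above:
<pre class="example">
SELECT AVG(CASE WHEN class_text = estimated_class_text
                THEN 1.0 ELSE 0.0 END) AS training_accuracy
FROM mlp_prediction
JOIN iris_data USING (id);
</pre>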
@@ -561,6 +644,10 @@ File mlp.sql_in documenting the training function
*/
+CREATE TYPE MADLIB_SCHEMA.mlp_step_result AS (
+ state DOUBLE PRECISION[],
+ loss DOUBLE PRECISION
+);
CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
coeff DOUBLE PRECISION[],
@@ -571,14 +658,22 @@ CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
-- create SQL functions for IGD optimizer
--------------------------------------------------------------------------
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_transition(
- state DOUBLE PRECISION[],
- start_vec DOUBLE PRECISION[],
- end_vec DOUBLE PRECISION[],
- previous_state DOUBLE PRECISION[],
- layer_sizes DOUBLE PRECISION[],
- stepsize DOUBLE PRECISION,
- activation INTEGER,
- is_classification INTEGER)
+ state DOUBLE PRECISION[],
+ ind_var DOUBLE PRECISION[],
+ dep_var DOUBLE PRECISION[],
+ previous_state DOUBLE PRECISION[],
+ layer_sizes DOUBLE PRECISION[],
+ learning_rate_init DOUBLE PRECISION,
+ activation INTEGER,
+ is_classification INTEGER,
+ weight DOUBLE PRECISION,
+ warm_start BOOLEAN,
+ warm_start_coeff DOUBLE PRECISION[],
+ n_tuples INTEGER,
+ lambda DOUBLE PRECISION,
+ x_means DOUBLE PRECISION[],
+ x_stds DOUBLE PRECISION[]
+ )
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
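For readers scanning this hunk, a brief annotation of the new transition-function arguments may help; this is a reading of the diff (the authoritative behavior lives in the C++ and Python sources changed by this commit), not generated documentation:
-- weight                         per-tuple weight (from the column named by the 'weights' argument)
-- warm_start, warm_start_coeff   reuse the coefficients of a previous training run
-- n_tuples                       number of rows, presumably used to normalize loss/regularization
-- lambda                         L2 regularization strength
-- x_means, x_stds                means and standard deviations used to standardize the inputs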
@@ -592,7 +687,7 @@ LANGUAGE C IMMUTABLE STRICT;
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_final(
state DOUBLE PRECISION[])
-RETURNS DOUBLE PRECISION[]
+RETURNS MADLIB_SCHEMA.mlp_step_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
@@ -601,16 +696,24 @@ LANGUAGE C IMMUTABLE STRICT;
* @brief Perform one iteration of backprop
*/
CREATE AGGREGATE MADLIB_SCHEMA.mlp_igd_step(
- /* start_vec*/ DOUBLE PRECISION[],
- /* end_vec */ DOUBLE PRECISION[],
- /* previous_state */ DOUBLE PRECISION[],
- /* layer_sizes */ DOUBLE PRECISION[],
- /* stepsize */ DOUBLE PRECISION,
- /* activation */ INTEGER,
- /* is_classification */ INTEGER )(
+ /* ind_var */ DOUBLE PRECISION[],
+ /* dep_var */ DOUBLE PRECISION[],
+ /* previous_state */ DOUBLE PRECISION[],
+ /* layer_sizes */ DOUBLE PRECISION[],
+ /* learning_rate_init */ DOUBLE PRECISION,
+ /* activation */ INTEGER,
+ /* is_classification */ INTEGER,
+ /* weight */ DOUBLE PRECISION,
+ /* warm_start */ BOOLEAN,
+ /* warm_start_coeff */ DOUBLE PRECISION[],
+ /* n_tuples */ INTEGER,
+ /* lambda */ DOUBLE PRECISION,
+ /* x_means */ DOUBLE PRECISION[],
+ /* x_stds */ DOUBLE PRECISION[]
+ )(
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.mlp_igd_transition,
- m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
+ m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
FINALFUNC=MADLIB_SCHEMA.mlp_igd_final,
INITCOND='{0,0,0,0,0,0,0,0}'
);
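The m4 guard in the aggregate above is easy to misread, so here is a sketch of its effect (an illustration, not generated output): when __POSTGRESQL__ is defined the prefunc line is dropped; otherwise (Greenplum/HAWQ) it is kept so that partial aggregate states from different segments can be merged.
-- PostgreSQL expansion (no prefunc):
--   CREATE AGGREGATE MADLIB_SCHEMA.mlp_igd_step(...) (
--       STYPE=DOUBLE PRECISION[],
--       SFUNC=MADLIB_SCHEMA.mlp_igd_transition,
--       FINALFUNC=MADLIB_SCHEMA.mlp_igd_final,
--       INITCOND='{0,0,0,0,0,0,0,0}'
--   );
-- Non-PostgreSQL expansion additionally includes:
--       prefunc=MADLIB_SCHEMA.mlp_igd_merge,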
@@ -631,13 +734,16 @@ LANGUAGE c IMMUTABLE STRICT;
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
- source_table VARCHAR,
- output_table VARCHAR,
- independent_varname VARCHAR,
- dependent_varname VARCHAR,
- hidden_layer_sizes INTEGER[],
- optimizer_params VARCHAR,
- activation VARCHAR
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR,
+ warm_start BOOLEAN,
+ verbose BOOLEAN
) RETURNS VOID AS $$
PythonFunctionBodyOnly(`convex', `mlp_igd')
mlp_igd.mlp(
@@ -649,19 +755,96 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
hidden_layer_sizes,
optimizer_params,
activation,
- True
+ True,
+ weights,
+ warm_start,
+ verbose
)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR,
+ warm_start BOOLEAN
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, NULL, NULL, NULL, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[]
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, NULL, NULL, NULL, FALSE, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+
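The cascade of overloads above lets callers omit trailing arguments. A usage sketch of the shortest form, where 'mlp_model_short' is a hypothetical destination table and the remaining names come from the documentation examples; the omitted parameters fall through the overloads to NULL/FALSE defaults:
-- Hypothetical short-form call; only the required arguments are supplied.
SELECT madlib.mlp_classification(
    'iris_data',        -- Source table
    'mlp_model_short',  -- Destination table (hypothetical name)
    'attributes',       -- Input features
    'class_text',       -- Label
    ARRAY[5]            -- Hidden layer sizes
);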
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
- source_table VARCHAR,
- output_table VARCHAR,
- independent_varname VARCHAR,
- dependent_varname VARCHAR,
- hidden_layer_sizes INTEGER[],
- optimizer_params VARCHAR,
- activation VARCHAR
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR,
+ warm_start BOOLEAN,
+ verbose BOOLEAN
) RETURNS VOID AS $$
PythonFunctionBodyOnly(`convex', `mlp_igd')
mlp_igd.mlp(
@@ -673,11 +856,83 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
hidden_layer_sizes,
optimizer_params,
activation,
- False
+ False,
+ weights,
+ warm_start,
+ verbose
)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR,
+ warm_start BOOLEAN
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, NULL, NULL, NULL, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[]
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, NULL, NULL, NULL, FALSE, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
model_table VARCHAR,
data_table VARCHAR,
@@ -700,9 +955,11 @@ CREATE FUNCTION MADLIB_SCHEMA.internal_predict_mlp(
coeff DOUBLE PRECISION[],
independent_varname DOUBLE PRECISION[],
is_classification DOUBLE PRECISION,
- activation_function DOUBLE PRECISION,
+ activation DOUBLE PRECISION,
layer_sizes DOUBLE PRECISION[],
- is_response INTEGER
+ is_response INTEGER,
+ x_means DOUBLE PRECISION[],
+ x_stds DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'