Posted to commits@madlib.apache.org by ri...@apache.org on 2017/08/14 17:37:21 UTC
[2/2] incubator-madlib git commit: MLP: Add multiple enhancements
MLP: Add multiple enhancements
JIRA: MADLIB-1134
This commit adds the following:
- Weights: Each tuple in the training data can be individually weighted.
- Warm start: Network weights can be initialized from the output of a
previous call.
- n_tries: Allows calling the train function multiple times to avoid
local minima.
- Learning rate policy: Allows the user to specify a policy to decay the
learning rate.
- Standardization: Inputs are standardized to zero mean and unit standard
deviation.
Closes #162
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ff1b0f88
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ff1b0f88
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ff1b0f88
Branch: refs/heads/master
Commit: ff1b0f883c7a178323670b83b14069e06bf1b808
Parents: 6f6f804
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Aug 14 09:50:25 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Aug 14 09:50:25 2017 -0700
----------------------------------------------------------------------
.gitignore | 1 +
doc/design/modules/neural-network.tex | 144 ++-
doc/literature.bib | 8 +-
doc/mainpage.dox.in | 3 +-
src/modules/convex/mlp_igd.cpp | 74 +-
src/modules/convex/task/l2.hpp | 3 +-
src/modules/convex/task/mlp.hpp | 259 ++----
src/modules/convex/type/model.hpp | 70 +-
src/modules/convex/type/state.hpp | 30 +-
src/modules/convex/type/tuple.hpp | 2 +-
src/ports/postgres/modules/convex/mlp.sql_in | 497 +++++++---
src/ports/postgres/modules/convex/mlp_igd.py_in | 923 ++++++++++++-------
.../postgres/modules/convex/test/mlp.sql_in | 94 +-
.../postgres/modules/utilities/utilities.py_in | 12 +
14 files changed, 1297 insertions(+), 823 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index abfccfa..00dc016 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
# Ignore build directory
/build*
+/build-docker*
# Ignore generated code files
*.so
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/design/modules/neural-network.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/neural-network.tex b/doc/design/modules/neural-network.tex
index 8802361..9f8110b 100644
--- a/doc/design/modules/neural-network.tex
+++ b/doc/design/modules/neural-network.tex
@@ -22,7 +22,7 @@
\chapter{Neural Network}
\begin{moduleinfo}
-\item[Authors] {Xixuan Feng}
+\item[Authors] {Xixuan Feng, Cooper Sloan}
\end{moduleinfo}
% Abstract. What is the problem we want to solve?
@@ -30,7 +30,8 @@ This module implements artificial neural network \cite{ann_wiki}.
\section{Multilayer Perceptron}
Multilayer perceptron is arguably the most popular model among many neural network models \cite{mlp_wiki}.
-Here, we learn the coefficients by minimizing a least square objective function (\cite{bertsekas1999nonlinear}, example 1.5.3).
+Here, we learn the coefficients by minimizing a least square objective function, or cross entropy (\cite{bertsekas1999nonlinear}, example 1.5.3).
+The parallel architecture is based on the paper by Zhiheng Huang \cite{mlp_parallel}.
% Background. Why can we solve the problem with gradient-based methods?
\subsection{Solving as a Convex Program}
@@ -46,41 +47,47 @@ For multilayer perceptron, we choose incremental gradient descent (IGD).
In the remaining part of this section, we will give a formal description of the derivation of objective function and its gradient.
\paragraph{Objective function.}
-We mostly follow the notations in example 1.5.3 from Bertsekas \cite{bertsekas1999nonlinear}, for a multilayer perceptron that has $N$ layers (stages), and the $k$th stage has $n_k$ activation units ($\phi : \mathbb{R} \to \mathbb{R}$), the objective function is given as
-\[f_{(y, z)}(u) = \frac{1}{2} \|h(u, y) - z\|_2^2,\]
-where $y \in \mathbb{R}^{n_0}$ is the input vector, $z \in \mathbb{R}^{n_N}$ is the output vector,
-\footnote{Of course, the objective function can be defined over a set of input-output vector pairs, which is simply given as the addition of the above $f$.}
+We mostly follow the notation in example 1.5.3 from Bertsekas \cite{bertsekas1999nonlinear}. For a multilayer perceptron that has $N$ layers (stages), where the $k^{th}$ stage has $n_k$ activation units ($\phi : \mathbb{R} \to \mathbb{R}$), the objective function for regression is given as
+\[f_{(x, y)}(u) = \frac{1}{2} \|h(u, x) - y\|_2^2,\]
+and for classification the objective function is given as
+\[f_{(x, y)}(u) = -\sum_i \left( y_i \log(h_i(u, x)) + (1 - y_i) \log(1 - h_i(u, x)) \right),\]
+where $x \in \mathbb{R}^{n_0}$ is the input vector, $y \in \mathbb{R}^{n_N}$ is the output vector (one hot encoded for classification),~\footnote{Of course, the objective function can be defined over a set of input-output vector pairs, which is simply given as the addition of the above $f$.}
and the coefficients are given as
-\[u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}\]
+\[u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\},\]
+and are initialized from a uniform distribution as follows:
+\[u_{k}^{sj} = \mathit{uniform}(-r, r),\]
+where $r$ is defined as
+\[r = \sqrt{\frac{6}{n_k+n_{k+1}}}.\]
+With regularization, an additional term enters the objective function, given as
+\[\sum_{u_k^{sj}} \frac{1}{2} \lambda \left(u_k^{sj}\right)^2.\]
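As a rough illustration of the two objective functions and the uniform weight initialization above, here is a minimal NumPy sketch; the function and variable names are illustrative and not taken from the MADlib sources:

    import numpy as np

    def regression_loss(h_ux, y):
        # f_{(x,y)}(u) = 1/2 * ||h(u,x) - y||_2^2
        return 0.5 * np.sum((h_ux - y) ** 2)

    def classification_loss(h_ux, y):
        # Cross entropy against a one-hot encoded target y.
        return -np.sum(y * np.log(h_ux) + (1 - y) * np.log(1 - h_ux))

    def init_layer(n_in, n_out):
        # u_k^{sj} ~ uniform(-r, r) with r = sqrt(6 / (n_k + n_{k+1})).
        # One extra row holds the coefficients of the bias unit o_k^0 = 1.
        r = np.sqrt(6.0 / (n_in + n_out))
        return np.random.uniform(-r, r, size=(n_in + 1, n_out))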
This still leaves $h : \mathbb{R}^{n_0} \to \mathbb{R}^{n_N}$ as an open item.
-Let $x_k \in \mathbb{R}^{n_k}, k = 1,...,N$ be the output vector of the $k$th layer. Then we define $h(u, y) = x_N$, based on setting $x_0 = y$ and the $j$th component of $x_k$ is given in an iterative fashion as
-\footnote{$x_k^0 \equiv 1$ is used to simplified the notations, and $x_k^0$ is not a component of $x_k$, for any $k = 0,...,N$.}
+Let $o_k \in \mathbb{R}^{n_k}, k = 1,...,N$ be the output vector of the $k^{th}$ layer. Then we define $h(u, x) = o_N$, based on setting $o_0 = x$, and the $j^{th}$ component of $o_k$ is given in an iterative fashion as~\footnote{$o_k^0 \equiv 1$ is used to simplify the notation, and $o_k^0$ is not a component of $o_k$, for any $k = 0,...,N$.}
\[\begin{alignedat}{5}
- x_k^j = \phi \left( \sum_{s=0}^{n_{k-1}} x_{k-1}^s u_{k-1}^{sj} \right), &\quad k = 1,...,N, \; j = 1,...,n_k
+ o_k^j = \phi \left( \sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj} \right), &\quad k = 1,...,N, \; j = 1,...,n_k
\end{alignedat}\]
\paragraph{Gradient of the End Layer.}
Let's first handle $u_{N-1}^{st}, s = 0,...,n_{N-1}, t = 1,...,n_N$.
-Let $z^t$ denote the $t$th component of $z \in \mathbb{R}^{n_N}$, and $h^t$ the $t$th component of output of $h$.
+Let $y^t$ denote the $t^{th}$ component of $y \in \mathbb{R}^{n_N}$, and $h^t$ the $t^{th}$ component of output of $h$.
\[\begin{aligned}
\frac{\partial f}{\partial u_{N-1}^{st}}
- &= \left( h^t(u, y) - z^t \right) \cdot \frac{\partial h^t(u, y)}{\partial u_{N-1}^{st}} \\
- &= \left( x_N^t - z^t \right) \cdot \frac{\partial x_N^t}{\partial u_{N-1}^{st}} \\
- &= \left( x_N^t - z^t \right) \cdot \frac{\partial \phi \left( \sum_{s=0}^{n_{N-1}} x_{N-1}^s u_{N-1}^{st} \right)}{\partial u_{N-1}^{st}} \\
- &= \left( x_N^t - z^t \right) \cdot \phi' \left( \sum_{s=0}^{n_{N-1}} x_{N-1}^s u_{N-1}^{st} \right) \cdot x_{N-1}^s \\
+ &= \left( h^t(u, x) - y^t \right) \cdot \frac{\partial h^t(u, x)}{\partial u_{N-1}^{st}} \\
+ &= \left( o_N^t - y^t \right) \cdot \frac{\partial o_N^t}{\partial u_{N-1}^{st}} \\
+ &= \left( o_N^t - y^t \right) \cdot \frac{\partial \phi \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right)}{\partial u_{N-1}^{st}} \\
+ &= \left( o_N^t - y^t \right) \cdot \phi' \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right) \cdot o_{N-1}^s \\
\end{aligned}\]
-To ease the notation, let the input vector of the $j$th activation unit of the $(k+1)$th layer be
-\[\mathit{net}_k^j =\sum_{s=0}^{n_{k-1}} x_{k-1}^s u_{k-1}^{sj},\]
-where $k = 1,...,N, \; j = 1,...,n_k$, and note that $x_k^j =\phi(\mathit{net}_k^j)$. Finally, the gradient
-\[\frac{\partial f}{\partial u_{N-1}^{st}} = \left( x_N^t - z^t \right) \cdot \phi' ( \mathit{net}_N^t ) \cdot x_{N-1}^s\]
-For any $s = 0,...,n_{N-1}, t =1,...,n_N$, we are given $z^t$, and $x_N^t, \mathit{net}_N^t, x_{N-1}^s$ can be computed by forward iterating the network layer by layer (also called the feed-forward pass). Therefore, we now know how to compute the coefficients for the end layer $u_{N-1}^{st}, s = 0,...,n_{N-1}, t =1,...,n_N$.
+To ease the notation, let the input vector of the $j^{th}$ activation unit of the $(k+1)^{th}$ layer be
+\[\mathit{net}_k^j =\sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj},\]
+where $k = 1,...,N, \; j = 1,...,n_k$, and note that $o_k^j =\phi(\mathit{net}_k^j)$. Finally, the gradient
+\[\frac{\partial f}{\partial u_{N-1}^{st}} = \left( o_N^t - y^t \right) \cdot \phi' ( \mathit{net}_N^t ) \cdot o_{N-1}^s\]
+For any $s = 0,...,n_{N-1}, t =1,...,n_N$, we are given $y^t$, and $o_N^t, \mathit{net}_N^t, o_{N-1}^s$ can be computed by forward iterating the network layer by layer (also called the feed-forward pass). Therefore, we now know how to compute the coefficients for the end layer $u_{N-1}^{st}, s = 0,...,n_{N-1}, t =1,...,n_N$.
\subsubsection{Backpropagation}
For inner (hidden) layers, it is more difficult to compute the partial derivative over the input of activation units (i.e., $\mathit{net}_k, k = 1,...,N-1$).
-That said, $\frac{\partial f}{\partial \mathit{net}_N^t} = (x_N^t - z^t) \phi'(\mathit{net}_N^t)$ is easy, where $t = 1,...,n_N$, but $\frac{\partial f}{\partial \mathit{net}_k^j}$ is hard, where $k = 1,...,N-1, j = 1,..,n_k$.
+That said, $\frac{\partial f}{\partial \mathit{net}_N^t} = (o_N^t - y^t) \phi'(\mathit{net}_N^t)$ is easy, where $t = 1,...,n_N$, but $\frac{\partial f}{\partial \mathit{net}_k^j}$ is hard, where $k = 1,...,N-1, j = 1,..,n_k$.
This hard-to-compute statistic is referred to as \textit{delta error}, and let $\delta_k^j = \frac{\partial f}{\partial \mathit{net}_k^j}$, where $k = 1,...,N-1, j = 1,..,n_k$.
If this is solved, the gradient can be easily computed as follow
-\[\frac{\partial f}{\partial u_{k-1}^{sj}} = \boxed{\frac{\partial f}{\partial \mathit{net}_k^j}} \cdot \frac{\partial \mathit{net}_k^j}{\partial u_{k-1}^{sj}} = \boxed{\delta_k^j} x_{k-1}^s,\]
+\[\frac{\partial f}{\partial u_{k-1}^{sj}} = \boxed{\frac{\partial f}{\partial \mathit{net}_k^j}} \cdot \frac{\partial \mathit{net}_k^j}{\partial u_{k-1}^{sj}} = \boxed{\delta_k^j} o_{k-1}^s,\]
where $k = 1,...,N-1, s = 0,...,n_{k-1}, j = 1,..,n_k$.
To solve this, we introduce the popular backpropagation below.
@@ -90,20 +97,20 @@ First,
\[
\delta_{k}^j
= \frac{\partial f}{\partial \mathit{net}_{k}^j}
- = \frac{\partial f}{\partial x_{k}^j} \cdot \frac{\partial x_{k}^j}{\partial \mathit{net}_{k}^j}
- = \frac{\partial f}{\partial x_{k}^j} \cdot \phi'(\mathit{net}_{k}^j)
+ = \frac{\partial f}{\partial o_{k}^j} \cdot \frac{\partial o_{k}^j}{\partial \mathit{net}_{k}^j}
+ = \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j)
\]
And here comes the only equation that is needed but the author, I (Aaron), do not understand but it looks reasonable and repeats in different online notes \cite{mlp_gradient_wisc},
\[\begin{alignedat}{5}
- \frac{\partial f}{\partial x_{k}^j} = \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial x_{k}^j} \right),
+ \frac{\partial f}{\partial o_{k}^j} = \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right),
&\quad k = 1,...,N-1, \: j = 1,...,n_{k}
\end{alignedat}\]
Assuming the above equation is true, we can solve delta error backward iteratively
\[\begin{aligned}
\delta_{k}^j
- &= \frac{\partial f}{\partial x_{k}^j} \cdot \phi'(\mathit{net}_{k}^j) \\
- &= \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial x_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
- &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot \frac{\partial \left( \sum_{s=0}^{n_{k}} x_{k}^s u_{k}^{st} \right) }{\partial x_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
+ &= \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j) \\
+ &= \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
+ &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot \frac{\partial \left( \sum_{s=0}^{n_{k}} o_{k}^s u_{k}^{st} \right) }{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
&= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot u_{k}^{jt} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
\end{aligned}\]
To sum up, we need the following equation for error back propagation
@@ -111,20 +118,20 @@ To sum up, we need the following equation for error back propagation
where $k = 1,...,N-1$, and $j = 1,...,n_{k}$.
\subsubsection{The $\mathit{Gradient}$ Function}
-\begin{algorithm}[mlp-gradient$(u, y, z)$] \label{alg:mlp-gradient}
+\begin{algorithm}[mlp-gradient$(u, x, y)$] \label{alg:mlp-gradient}
\alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
-start vector $y \in \mathbb{R}^{n_0}$,\\
-end vector $z \in \mathbb{R}^{n_N}$,\\
+start vector $x \in \mathbb{R}^{n_0}$,\\
+end vector $y \in \mathbb{R}^{n_N}$,\\
activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
\algoutput{Gradient value $\nabla f(u)$ that consists of components $\nabla f(u)_{k-1}^{sj} = \frac{\partial f}{\partial u_{k-1}^{sj}}$}
\begin{algorithmic}[1]
- \State $(\mathit{net}, x) \set$ \texttt{feed-forward}$(u, y, \phi)$
- \State $\delta_N \set$ \texttt{end-layer-delta-error}$(\mathit{net}, x, z, \phi')$
+ \State $(\mathit{net}, o) \set$ \texttt{feed-forward}$(u, x, \phi)$
+ \State $\delta_N \set$ \texttt{end-layer-delta-error}$(\mathit{net}, o, y, \phi')$
\State $\delta \set$ \texttt{error-back-propagation}$(\delta_N, \mathit{net}, u, \phi')$
\For{$k = 1,...,N$}
\For{$s = 0,...,n_{k-1}$}
\For{$j = 1,...,n_k$}
- \State $\nabla f(u)_{k-1}^{sj} \set \delta_k^j x_{k-1}^s$
+ \State $\nabla f(u)_{k-1}^{sj} \set \delta_k^j o_{k-1}^s$
\Comment{Can be put together with the computation of delta $\delta$}
\EndFor
\EndFor
@@ -138,46 +145,47 @@ Common examples of activation units are
\[\begin{alignedat}{3}
\phi(\xi) &= \frac{1}{1 + e^{-\xi}}, &\quad \text{ (logistic function),}\\
\phi(\xi) &= \frac{e^{\xi} - e^{-\xi}}{e^{\xi} + e^{-\xi}}, &\quad \text{ (hyperbolic tangent function)}\\
+\phi(\xi) &= \max(\xi, 0), &\quad \text{ (rectified linear function)}\\
\end{alignedat}\]
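For reference, the three activation units and the derivatives used later during backpropagation can be written as a small Python sketch (illustrative only; the module implements these in C++):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    # (activation, derivative) pairs, keyed by the names used in the module
    ACTIVATIONS = {
        'sigmoid': (sigmoid, lambda z: sigmoid(z) * (1.0 - sigmoid(z))),
        'tanh':    (np.tanh, lambda z: 1.0 - np.tanh(z) ** 2),
        'relu':    (lambda z: np.maximum(z, 0.0), lambda z: (z > 0).astype(float)),
    }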
-\begin{algorithm}[feed-forward$(u, y, \phi)$] \label{alg:feed-forward}
+\begin{algorithm}[feed-forward$(u, x, \phi)$] \label{alg:feed-forward}
\alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
-input vector $y \in \mathbb{R}^{n_0}$,\\
+input vector $x \in \mathbb{R}^{n_0}$,\\
activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
\algoutput{Input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
-output vectors $x = \{x_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$}
+output vectors $o = \{o_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$}
\begin{algorithmic}[1]
\For{$k = 0,...,N$}
- \State $x_k^0 \set 1$
+ \State $o_k^0 \set 1$
\EndFor
- \State $x_0 \set y$ \Comment{For all components $x_0^j, y^j, \; j = 1,...,n_0$}
+ \State $o_0 \set x$ \Comment{For all components $o_0^j, x^j, \; j = 1,...,n_0$}
\For{$k = 1,...,N$}
\For{$j = 1,...,n_k$}
\State $\mathit{net}_k^j \set 0$
\For{$s = 0,...,n_{k-1}$}
- \State $\mathit{net}_k^j \set \mathit{net}_k^j + x_{k-1}^s u_{k-1}^{sj}$
+ \State $\mathit{net}_k^j \set \mathit{net}_k^j + o_{k-1}^s u_{k-1}^{sj}$
\EndFor
- \State $x_k^j = \phi(\mathit{net}_k^j)$
+ \State $o_k^j = \phi(\mathit{net}_k^j)$ \Comment{The activation function for the final layer is the identity for regression and softmax for classification.}
\EndFor
\EndFor
- \State \Return $(\mathit{net}, x)$
+ \State \Return $(\mathit{net}, o)$
\end{algorithmic}
\end{algorithm}
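A compact NumPy version of the feed-forward pass above, assuming layers[k] has shape (n_k + 1, n_{k+1}) with row 0 holding the bias coefficients (a sketch under those assumptions, not the module's implementation):

    import numpy as np

    def feed_forward(layers, x, phi, is_classification):
        o = [np.concatenate(([1.0], x))]    # o_0 with the bias unit o_0^0 = 1
        net = [None]                        # net is 1-indexed by layer
        for k, u in enumerate(layers):
            net_k = u.T.dot(o[-1])
            net.append(net_k)
            if k < len(layers) - 1:
                o.append(np.concatenate(([1.0], phi(net_k))))
            elif is_classification:
                e = np.exp(net_k - net_k.max())   # numerically stable softmax
                o.append(e / e.sum())
            else:
                o.append(net_k)                   # identity output for regression
        return net, o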
-\begin{algorithm}[end-layer-delta-error$(\mathit{net}, x, z, \phi')$] \label{alg:end-layer-delta-error}
+\clearpage
+\begin{algorithm}[end-layer-delta-error$(\mathit{net}, o, y, \phi')$] \label{alg:end-layer-delta-error}
\alginput{Input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
-output vectors $x = \{x_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$,\\
-end vector $z \in \mathbb{R}^{n_N}$,\\
+output vectors $o = \{o_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$,\\
+end vector $y \in \mathbb{R}^{n_N}$,\\
derivative of activation unit $\phi' : \mathbb{R} \to \mathbb{R}$}
\algoutput{End layer delta $\delta_N = \{\delta_N^t \; | \; t = 1,...,n_N\}$}
\begin{algorithmic}[1]
\For{$t = 1,...,n_N$}
- \State $\delta_N^t \set (x_N^t - z^t) \phi'(\mathit{net}_N^t)$
+ \State $\delta_N^t \set (o_N^t - y^t)$ \Comment{This holds for the identity activation with mean square error loss, and for the softmax activation with cross entropy loss.}
\EndFor
\State \Return $\delta_N$
\end{algorithmic}
\end{algorithm}
-
\begin{algorithm}[error-back-propagation$(\delta_N, \mathit{net}, u, \phi')$] \label{alg:error-back-propagation}
\alginput{End layer delta $\delta_N = \{\delta_N^t \; | \; t = 1,...,n_N\}$,\\
input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
@@ -197,3 +205,45 @@ derivative of activation unit $\phi' : \mathbb{R} \to \mathbb{R}$}
\State \Return $\delta$
\end{algorithmic}
\end{algorithm}
+
+\begin{algorithm}[mlp-train-iteration$(X, Y, \eta)$] \label{alg:mlp-train-iteration}
+\alginput{
+start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
+end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
+learning rate $\eta$,\\}
+\algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
+\begin{algorithmic}[1]
+ \State \texttt{Randomly initialize u}
+ \For{$i = 1,...,m$}
+ \State $\nabla f(u) \set \texttt{mlp-gradient}(u,X_i,Y_i)$
+ \State $u \set u - \eta \left( \nabla f(u) + \lambda u \right)$
+ \EndFor
+ \State \Return $u$
+\end{algorithmic}
+\end{algorithm}
+
+\clearpage
+\begin{algorithm}[mlp-train-parallel$(X, Y, \eta, s, t)$] \label{alg:mlp-train-parallel}
+\alginput{
+start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
+end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
+learning rate $\eta$,\\
+segments $s$,\\
+iterations $t$,\\}
+\algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
+\begin{algorithmic}[1]
+ \State \texttt{Randomly initialize u}
+ \For{$j = 1,...,s$}
+ \State $X_j \set \texttt{subset-of-X}$
+ \State $Y_j \set \texttt{subset-of-Y}$
+ \EndFor
+ \For{$i = 1,...,t$}
+ \For{$j = 1,...,s$}
+ \State $u_j \set copy(u)$
+ \State $u_j \set \texttt{mlp-train-iteration}(X_j, Y_j, \eta)$
+ \EndFor
+ \State $u \set \texttt{weighted-avg}(u_{1...s})$
+ \EndFor
+ \State \Return $u$
+\end{algorithmic}
+\end{algorithm}
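The two training algorithms above can be sketched in Python as follows; gradient() stands in for mlp-gradient, and the weighted average is reduced to a plain mean here for brevity (both are illustrative assumptions):

    import numpy as np

    def mlp_train_iteration(layers, X, Y, eta, lam, gradient):
        # One IGD pass over (X, Y); the L2 term is not applied to the bias row.
        for x, y in zip(X, Y):
            grads = gradient(layers, x, y)
            for k in range(len(layers)):
                reg = lam * layers[k]
                reg[0, :] = 0.0
                layers[k] -= eta * (grads[k] + reg)
        return layers

    def mlp_train_parallel(layers, X_parts, Y_parts, eta, lam, iterations, gradient):
        # Each segment trains on its own shard; models are averaged every iteration.
        for _ in range(iterations):
            local = [mlp_train_iteration([u.copy() for u in layers], Xj, Yj,
                                         eta, lam, gradient)
                     for Xj, Yj in zip(X_parts, Y_parts)]
            layers = [np.mean([m[k] for m in local], axis=0)
                      for k in range(len(layers))]
        return layers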
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index 225622d..6784f5e 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -953,4 +953,10 @@ Applied Survival Analysis},
@online{bfs_wikipedia,
title = {Breadth-first search},
url={https://en.wikipedia.org/wiki/Breadth-first_search}
-}
\ No newline at end of file
+}
+
+@misc{mlp_parallel,
+ Url = {https://www.microsoft.com/en-us/research/publication/accelerating-recurrent-neural-network-training-via-two-stage-classes-and-parallelization/},
+ Title = {{Accelerating Recurrent Neural Network Training via Two Stage Classes and Parallelization}},
+ Author = {{Zhiheng Huang}}
+}
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index ccf58a8..e27e14a 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -183,7 +183,7 @@ Contains graph algorithms.
@defgroup grp_crf Conditional Random Field
@ingroup grp_super
- @defgroup grp_mlp Multilayer Perceptron
+ @defgroup grp_nn Neural Network
@ingroup grp_super
@defgroup grp_regml Regression Models
@@ -202,7 +202,6 @@ Contains graph algorithms.
@defgroup grp_robust Robust Variance
@}
-
@defgroup grp_svm Support Vector Machines
@ingroup grp_super
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/mlp_igd.cpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/mlp_igd.cpp b/src/modules/convex/mlp_igd.cpp
index 3647d5f..9e9e665 100644
--- a/src/modules/convex/mlp_igd.cpp
+++ b/src/modules/convex/mlp_igd.cpp
@@ -29,6 +29,7 @@
#include "mlp_igd.hpp"
#include "task/mlp.hpp"
+#include "task/l2.hpp"
#include "algo/igd.hpp"
#include "algo/loss.hpp"
@@ -51,6 +52,8 @@ typedef Loss<MLPIGDState<MutableArrayHandle<double> >, MLPIGDState<ArrayHandle<d
typedef MLP<MLPModel<MutableArrayHandle<double> >,MLPTuple> MLPTask;
+typedef MLPModel<MutableArrayHandle<double> > MLPModelType;
+
/**
* @brief Perform the multilayer perceptron transition step
*
@@ -63,6 +66,7 @@ mlp_igd_transition::run(AnyType &args) {
// For other tuples: args[0] holds the computation state until last tuple
MLPIGDState<MutableArrayHandle<double> > state = args[0];
+
// initilize the state if first tuple
if (state.algo.numRows == 0) {
if (!args[3].isNull()) {
@@ -74,20 +78,30 @@ mlp_igd_transition::run(AnyType &args) {
} else {
// configuration parameters
ArrayHandle<double> numbersOfUnits = args[4].getAs<ArrayHandle<double> >();
+ int numberOfStages = numbersOfUnits.size() - 1;
double stepsize = args[5].getAs<double>();
- state.allocate(*this, numbersOfUnits.size() - 1,
+ state.allocate(*this, numberOfStages,
reinterpret_cast<const double *>(numbersOfUnits.ptr()));
state.task.stepsize = stepsize;
- int activation = args[6].getAs<int>();
-
- int is_classification = args[7].getAs<int>();
- state.task.model.initialize(is_classification, activation);
+ const int activation = args[6].getAs<int>();
+ const int is_classification = args[7].getAs<int>();
+
+ const bool warm_start = args[9].getAs<bool>();
+ const int n_tuples = args[11].getAs<int>();
+ const double lambda = args[12].getAs<double>();
+ state.task.lambda = lambda;
+ MLPTask::lambda = lambda;
+ double is_classification_double = (double) is_classification;
+ double activation_double = (double) activation;
+ MappedColumnVector coeff = args[10].getAs<MappedColumnVector>();
+ state.task.model.rebind(&is_classification_double,&activation_double,
+ &coeff.data()[0], numberOfStages,
+ &numbersOfUnits[0]);
}
-
// resetting in either case
state.reset();
}
@@ -96,25 +110,23 @@ mlp_igd_transition::run(AnyType &args) {
const uint16_t N = state.task.numberOfStages;
const double *n = state.task.numbersOfUnits;
+ MappedColumnVector x_means = args[13].getAs<MappedColumnVector>();
+ MappedColumnVector x_stds = args[14].getAs<MappedColumnVector>();
// tuple
- MappedColumnVector indVar;
+ ColumnVector indVar;
MappedColumnVector depVar;
try {
- // an exception is raised in the backend if args[2] contains nulls
- MappedColumnVector x = args[1].getAs<MappedColumnVector>();
- // x is a const reference, we can only rebind to change its pointer
- indVar.rebind(x.memoryHandle(), x.size());
+ indVar = (args[1].getAs<MappedColumnVector>()-x_means).cwiseQuotient(x_stds);
MappedColumnVector y = args[2].getAs<MappedColumnVector>();
depVar.rebind(y.memoryHandle(), y.size());
-
} catch (const ArrayWithNullException &e) {
return args[0];
}
MLPTuple tuple;
- tuple.indVar.rebind(indVar.memoryHandle(), indVar.size());
+ tuple.indVar = indVar;
tuple.depVar.rebind(depVar.memoryHandle(), depVar.size());
+ tuple.weight = args[8].getAs<double>();
- // Now do the transition step
MLPIGDAlgorithm::transition(state, tuple);
MLPLossAlgorithm::transition(state, tuple);
state.algo.numRows ++;
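The standardization applied to indVar in the transition step above amounts to the following (a Python sketch; x_means and x_stds are computed once over the training data elsewhere in the module):

    import numpy as np

    def standardize_row(x, x_means, x_stds):
        # Shift by the per-feature mean and divide by the per-feature
        # standard deviation before feeding the row to the network.
        return (np.asarray(x, dtype=float) - x_means) / x_stds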
@@ -130,14 +142,12 @@ mlp_igd_merge::run(AnyType &args) {
MLPIGDState<MutableArrayHandle<double> > stateLeft = args[0];
MLPIGDState<ArrayHandle<double> > stateRight = args[1];
- // We first handle the trivial case where this function is called with one
- // of the states being the initial state
if (stateLeft.algo.numRows == 0) { return stateRight; }
else if (stateRight.algo.numRows == 0) { return stateLeft; }
- // Merge states together
MLPIGDAlgorithm::merge(stateLeft, stateRight);
MLPLossAlgorithm::merge(stateLeft, stateRight);
+
// The following numRows update, cannot be put above, because the model
// averaging depends on their original values
stateLeft.algo.numRows += stateRight.algo.numRows;
@@ -154,20 +164,17 @@ mlp_igd_final::run(AnyType &args) {
// a deep copy.
MLPIGDState<MutableArrayHandle<double> > state = args[0];
- // Aggregates that haven't seen any data just return Null.
if (state.algo.numRows == 0) { return Null(); }
- // finalizing
- MLPIGDAlgorithm::final(state);
-
- // Return the mean loss
+ L2<MLPModelType>::lambda = state.task.lambda;
state.algo.loss = state.algo.loss/static_cast<double>(state.algo.numRows);
+ state.algo.loss += L2<MLPModelType>::loss(state.task.model);
+ MLPIGDAlgorithm::final(state);
- // for stepsize tuning
- std::stringstream debug;
- debug << "loss: " << state.algo.loss;
- elog(INFO,"%s",debug.str().c_str());
- return state;
+ AnyType tuple;
+ tuple << state
+ << (double)state.algo.loss;
+ return tuple;
}
/**
@@ -191,10 +198,9 @@ internal_mlp_igd_result::run(AnyType &args) {
flattenU;
flattenU.rebind(&state.task.model.u[0](0, 0),
state.task.model.arraySize(state.task.numberOfStages,
- state.task.numbersOfUnits)-2); // -2 for is_classification and activation
+ state.task.numbersOfUnits));
double loss = state.algo.loss;
-
AnyType tuple;
tuple << flattenU
<< loss;
@@ -204,27 +210,25 @@ internal_mlp_igd_result::run(AnyType &args) {
AnyType
internal_predict_mlp::run(AnyType &args) {
MLPModel<MutableArrayHandle<double> > model;
- MappedColumnVector indVar;
+ ColumnVector indVar;
int is_response = args[5].getAs<int>();
+ MappedColumnVector x_means = args[6].getAs<MappedColumnVector>();
+ MappedColumnVector x_stds = args[7].getAs<MappedColumnVector>();
MappedColumnVector coeff = args[0].getAs<MappedColumnVector>();
MappedColumnVector layerSizes = args[4].getAs<MappedColumnVector>();
// Input layer doesn't count
size_t numberOfStages = layerSizes.size()-1;
- //#TODO this should be an int not a double
double is_classification = args[2].getAs<double>();
double activation = args[3].getAs<double>();
bool get_class = is_classification && is_response;
model.rebind(&is_classification,&activation,&coeff.data()[0],numberOfStages,&layerSizes.data()[0]);
try {
- MappedColumnVector x = args[1].getAs<MappedColumnVector>();
- // x is a const reference, we can only rebind to change its pointer
- indVar.rebind(x.memoryHandle(), x.size());
+ indVar = (args[1].getAs<MappedColumnVector>()-x_means).cwiseQuotient(x_stds);
} catch (const ArrayWithNullException &e) {
return args[0];
}
ColumnVector prediction = MLPTask::predict(model, indVar, get_class);
-
return prediction;
}
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/task/l2.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/task/l2.hpp b/src/modules/convex/task/l2.hpp
index a2e7f2f..308cfd9 100644
--- a/src/modules/convex/task/l2.hpp
+++ b/src/modules/convex/task/l2.hpp
@@ -84,7 +84,8 @@ double
L2<Model, Hessian>::loss(
const model_type &model) {
// 1/2 * lambda * || w ||^2
- return lambda * model.norm()*model.norm() / 2;
+ double norm = model.norm();
+ return lambda * norm*norm / 2;
}
} // namespace convex
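The L2 term computed here is 1/2 * lambda * ||w||^2, where the norm (see MLPModel::norm() later in this commit) skips the bias row of every layer; a short sketch of the same computation under that assumption:

    import numpy as np

    def l2_loss(layers, lam):
        # Sum of squared coefficients, excluding row 0 (the bias row) of each layer.
        norm_sq = sum(np.sum(u[1:] ** 2) for u in layers)
        return 0.5 * lam * norm_sq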
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/task/mlp.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/task/mlp.hpp b/src/modules/convex/task/mlp.hpp
index e66492b..0032b81 100644
--- a/src/modules/convex/task/mlp.hpp
+++ b/src/modules/convex/task/mlp.hpp
@@ -26,6 +26,8 @@
#ifndef MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
#define MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
+#include <dbconnector/dbconnector.hpp>
+
namespace madlib {
namespace modules {
@@ -46,24 +48,26 @@ public:
static void gradientInPlace(
model_type &model,
- const independent_variables_type &y,
- const dependent_variable_type &z,
+ const independent_variables_type &x,
+ const dependent_variable_type &y,
const double &stepsize);
static double loss(
const model_type &model,
- const independent_variables_type &y,
- const dependent_variable_type &z);
+ const independent_variables_type &x,
+ const dependent_variable_type &y);
static ColumnVector predict(
const model_type &model,
- const independent_variables_type &y,
+ const independent_variables_type &x,
const bool get_class);
const static int RELU = 0;
const static int SIGMOID = 1;
const static int TANH = 2;
+ static double lambda;
+private:
static double sigmoid(const double &xi) {
return 1. / (1. + std::exp(-xi));
}
@@ -76,9 +80,6 @@ public:
return std::tanh(xi);
}
-
-private:
-
static double sigmoidDerivative(const double &xi) {
double value = sigmoid(xi);
return value * (1. - value);
@@ -95,59 +96,39 @@ private:
static void feedForward(
const model_type &model,
- const independent_variables_type &y,
+ const independent_variables_type &x,
std::vector<ColumnVector> &net,
- std::vector<ColumnVector> &x);
-
- static void endLayerDeltaError(
- const std::vector<ColumnVector> &net,
- const std::vector<ColumnVector> &x,
- const dependent_variable_type &z,
- ColumnVector &delta_N);
+ std::vector<ColumnVector> &o);
- static void errorBackPropagation(
- const ColumnVector &delta_N,
+ static void backPropogate(
+ const ColumnVector &y_true,
+ const ColumnVector &y_estimated,
const std::vector<ColumnVector> &net,
const model_type &model,
std::vector<ColumnVector> &delta);
};
template <class Model, class Tuple>
+double MLP<Model, Tuple>::lambda = 0;
+
+template <class Model, class Tuple>
void
MLP<Model, Tuple>::gradientInPlace(
model_type &model,
- const independent_variables_type &y,
- const dependent_variable_type &z,
+ const independent_variables_type &x,
+ const dependent_variable_type &y_true,
const double &stepsize) {
- (void) model;
- (void) z;
- (void) y;
- (void) stepsize;
- std::vector<ColumnVector> net;
- std::vector<ColumnVector> x;
- std::vector<ColumnVector> delta;
- ColumnVector delta_N;
-
- feedForward(model, y, net, x);
- endLayerDeltaError(net, x, z, delta_N);
- errorBackPropagation(delta_N, net, model, delta);
-
uint16_t N = model.u.size(); // assuming nu. of layers >= 1
- uint16_t k, s, j;
+ uint16_t k;
+ std::vector<ColumnVector> net, o, delta;
- std::vector<uint16_t> n; n.clear(); //nu. of units in each layer
+ feedForward(model, x, net, o);
+ backPropogate(y_true, o.back(), net, model, delta);
- n.push_back(model.u[0].rows() - 1);
- for (k = 1; k <= N; k ++) {
- n.push_back(model.u[k-1].cols() - 1);
- }
-
- for (k=1; k <= N; k++){
- for (s=0; s <= n[k-1]; s++){
- for (j=1; j <= n[k]; j++){
- model.u[k-1](s,j) -= stepsize * (delta[k](j) * x[k-1](s));
- }
- }
+ for (k=0; k < N; k++){
+ Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k];
+ regularization.row(0).setZero(); // Do not update bias
+ model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
}
}
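The rewritten gradientInPlace reduces the triple loop to one matrix update per layer: the gradient for layer k is the outer product of that layer's bias-augmented input o[k] and its delta error, and the regularization term skips the bias row. A NumPy sketch of the same update (names are illustrative):

    import numpy as np

    def gradient_in_place(layers, o, delta, stepsize, lam):
        # o[k]: bias-augmented output of layer k; delta[k]: delta error of layer k+1.
        for k in range(len(layers)):
            reg = lam * layers[k]
            reg[0, :] = 0.0                 # do not regularize the bias row
            layers[k] -= stepsize * (np.outer(o[k], delta[k]) + reg)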
@@ -155,54 +136,40 @@ template <class Model, class Tuple>
double
MLP<Model, Tuple>::loss(
const model_type &model,
- const independent_variables_type &y,
- const dependent_variable_type &z) {
+ const independent_variables_type &x,
+ const dependent_variable_type &y_true) {
// Here we compute the loss. In the case of regression we use sum of square errors
// In the case of classification the loss term is cross entropy.
- std::vector<ColumnVector> net;
- std::vector<ColumnVector> x;
-
- feedForward(model, y, net, x);
- double loss = 0.;
- uint16_t j;
-
- for (j = 1; j < z.rows() + 1; j ++) {
- if(model.is_classification){
- // Cross entropy: RHS term is negative
- loss -= z(j-1)*std::log(x.back()(j)) + (1-z(j-1))*std::log(1-x.back()(j));
- }else{
- double diff = x.back()(j) - z(j-1);
- loss += diff * diff;
- }
+ std::vector<ColumnVector> net, o;
+ feedForward(model, x, net, o);
+ ColumnVector y_estimated = o.back();
+
+ if(model.is_classification){
+ double clip = 1.e-10;
+ y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip);
+ return - (y_true.array()*y_estimated.array().log()
+ + (-y_true.array()+1)*(-y_estimated.array()+1).log()).sum();
}
- if(!model.is_classification){
- loss /= 2.;
- }else{
- loss /= z.rows();
+ else{
+ return 0.5 * (y_estimated-y_true).squaredNorm();
}
- return loss;
}
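The clipping in the classification branch keeps the softmax outputs away from exact 0 and 1 so the logarithms stay finite; in Python terms (illustrative only):

    import numpy as np

    def cross_entropy_loss(y_true, y_estimated, clip=1.e-10):
        y_est = np.clip(y_estimated, clip, 1.0 - clip)
        return -np.sum(y_true * np.log(y_est) + (1.0 - y_true) * np.log(1.0 - y_est))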
template <class Model, class Tuple>
ColumnVector
MLP<Model, Tuple>::predict(
const model_type &model,
- const independent_variables_type &y,
- const bool get_class
- ) {
- (void) model;
- (void) y;
- std::vector<ColumnVector> net;
- std::vector<ColumnVector> x;
-
- feedForward(model, y, net, x);
- // Don't return the offset
- ColumnVector output = x.back().tail(x.back().size()-1);
- if(get_class){
+ const independent_variables_type &x,
+ const bool get_class) {
+ std::vector<ColumnVector> net, o;
+
+ feedForward(model, x, net, o);
+ ColumnVector output = o.back();
+ if(get_class){ // Return a length 1 array with the predicted index
int max_idx;
output.maxCoeff(&max_idx);
output.resize(1);
- output[0] = (double)max_idx;
+ output[0] = (double) max_idx;
}
return output;
}
@@ -212,113 +179,65 @@ template <class Model, class Tuple>
void
MLP<Model, Tuple>::feedForward(
const model_type &model,
- const independent_variables_type &y,
+ const independent_variables_type &x,
std::vector<ColumnVector> &net,
- std::vector<ColumnVector> &x){
- // meta data and x_k^0 = 1
- uint16_t k, j, s;
- uint16_t N = model.u.size(); // assuming >= 1
+ std::vector<ColumnVector> &o){
+ uint16_t k, N;
+ N = model.u.size(); // assuming >= 1
net.resize(N + 1);
- x.resize(N + 1);
-
- std::vector<uint16_t> n; n.clear();
- n.push_back(model.u[0].rows() - 1);
- x[0].resize(n[0] + 1);
- x[0](0) = 1.;
- for (k = 1; k <= N; k ++) {
- n.push_back(model.u[k-1].cols() - 1);
- net[k].resize(n[k] + 1);
- x[k].resize(n[k] + 1);
- // Bias
- x[k](0) = 1.;
- }
+ o.resize(N + 1);
+
+ double (*activation)(const double&);
+ if(model.activation==RELU)
+ activation = &relu;
+ else if(model.activation==SIGMOID)
+ activation = &sigmoid;
+ else
+ activation = &tanh;
- // y is a mapped parameter from DB, aligning with x here
- for (j = 1; j <= n[0]; j ++) { x[0](j) = y(j-1); }
+ o[0].resize(x.size()+1);
+ o[0] << 1.,x;
for (k = 1; k < N; k ++) {
- for (j = 1; j <= n[k]; j ++) {
- net[k](j) = 0.;
- for (s = 0; s <= n[k-1]; s ++) {
- net[k](j) += x[k-1](s) * model.u[k-1](s, j);
- }
- if(model.activation==RELU)
- x[k](j) = relu(net[k](j));
- else if(model.activation==SIGMOID)
- x[k](j) = sigmoid(net[k](j));
- else
- x[k](j) = tanh(net[k](j));
- }
+ net[k] = model.u[k-1].transpose() * o[k-1];
+ o[k] = ColumnVector(model.u[k-1].cols()+1);
+ o[k] << 1., net[k].unaryExpr(activation);
}
+ o[N] = model.u[N-1].transpose() * o[N-1];
- // output layer computation
- for (j = 1; j <= n[N]; j ++) {
- x[N](j) = 0.;
- for (s = 0; s <= n[N-1]; s ++) {
- x[N](j) += x[N-1](s) * model.u[N-1](s, j);
- }
- }
// Numerically stable calculation of softmax
- ColumnVector last_x = x[N].tail(n[N]);
if(model.is_classification){
- double max_x = last_x.maxCoeff();
- last_x = (last_x.array() - max_x).exp();
- last_x /= last_x.sum();
+ double max_x = o[N].maxCoeff();
+ o[N] = (o[N].array() - max_x).exp();
+ o[N] /= o[N].sum();
}
- x[N].tail(n[N]) = last_x;
}
template <class Model, class Tuple>
void
-MLP<Model, Tuple>::endLayerDeltaError(
- const std::vector<ColumnVector> &net,
- const std::vector<ColumnVector> &x,
- const dependent_variable_type &z,
- ColumnVector &delta_N) {
- //meta data
- uint16_t t;
- uint16_t N = x.size() - 1; // assuming >= 1
- uint16_t n_N = x[N].rows() - 1;
- delta_N.resize(n_N + 1);
-
- for (t = 1; t <= n_N; t ++) {
- delta_N(t) = (x[N](t) - z(t-1));
- }
-}
-
-template <class Model, class Tuple>
-void
-MLP<Model, Tuple>::errorBackPropagation(
- const ColumnVector &delta_N,
+MLP<Model, Tuple>::backPropogate(
+ const ColumnVector &y_true,
+ const ColumnVector &y_estimated,
const std::vector<ColumnVector> &net,
const model_type &model,
std::vector<ColumnVector> &delta) {
- // meta data
- uint16_t k, j, t;
- uint16_t N = model.u.size(); // assuming >= 1
- delta.resize(N + 1);
-
- std::vector<uint16_t> n; n.clear();
- n.push_back(model.u[0].rows() - 1);
- for (k = 1; k <= N; k ++) {
- n.push_back(model.u[k-1].cols() - 1);
- delta[k].resize(n[k]+1);
- }
- delta[N] = delta_N;
-
+ uint16_t k, N;
+ N = model.u.size(); // assuming >= 1
+ delta.resize(N);
+
+ double (*activationDerivative)(const double&);
+ if(model.activation==RELU)
+ activationDerivative = &reluDerivative;
+ else if(model.activation==SIGMOID)
+ activationDerivative = &sigmoidDerivative;
+ else
+ activationDerivative = &tanhDerivative;
+
+ delta.back() = y_estimated - y_true;
for (k = N - 1; k >= 1; k --) {
- for (j = 0; j <= n[k]; j ++) {
- delta[k](j) = 0.;
- for (t = 1; t <= n[k+1]; t ++) {
- delta[k](j) += delta[k+1](t) * model.u[k](j, t);
- }
- if(model.activation==RELU)
- delta[k](j) = delta[k](j) * reluDerivative(net[k](j));
- else if(model.activation==SIGMOID)
- delta[k](j) = delta[k](j) * sigmoidDerivative(net[k](j));
- else
- delta[k](j) = delta[k](j) * tanhDerivative(net[k](j));
- }
+ // Do not include the bias terms
+ delta[k-1] = model.u[k].bottomRows(model.u[k].rows()-1) * delta[k];
+ delta[k-1] = delta[k-1].array() * net[k].unaryExpr(activationDerivative).array();
}
}
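The vectorized backPropogate above walks the layers backwards, dropping each weight matrix's bias row before propagating the delta error; the same recursion in NumPy (indices follow the feed-forward sketch earlier, so net[k] is the input of layer k):

    import numpy as np

    def back_propagate(y_true, y_estimated, net, layers, phi_prime):
        N = len(layers)
        delta = [None] * N
        delta[N - 1] = y_estimated - y_true          # end-layer delta error
        for k in range(N - 1, 0, -1):
            # layers[k][1:] drops the bias row before propagating backwards.
            delta[k - 1] = layers[k][1:].dot(delta[k]) * phi_prime(net[k])
        return delta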
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/model.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/model.hpp b/src/modules/convex/type/model.hpp
index 9b68af8..679dab4 100644
--- a/src/modules/convex/type/model.hpp
+++ b/src/modules/convex/type/model.hpp
@@ -121,51 +121,9 @@ struct MLPModel {
const double *n = inNumbersOfUnits;
size_t k;
for (k = 1; k <= N; k ++) {
- size += (n[k-1] + 1) * (n[k] + 1);
- }
- return 1 + // is_classification
- 1 + // activation
- size; // weights (u)
- }
-
- /**
- * @brief Initialize the model randomly
- */
- void initialize(int is_classification_in, int activation_in) {
- is_classification = is_classification_in;
- activation = activation_in;
- // using madlib::dbconnector::$database::NativeRandomNumberGenerator
- NativeRandomNumberGenerator rng;
-
- // Scaling factor for weight initialization
- double epsilon = 0.0001;
-
-
- double base = rng.min();
- double span = rng.max() - base;
-
- uint16_t N = u.size(); // assuming nu. of layers >= 1
- uint16_t k, s, j;
-
- std::vector<uint16_t> n; n.clear(); //nu. of units in each layer
-
- n.push_back(u[0].rows() - 1);
- for (k = 1; k <= N; k ++) {
- n.push_back(u[k-1].cols() - 1);
- }
-
- for (k=1; k <= N; k++){
- for (s=0; s <= n[k-1]; s++){
- u[k-1](s,0)=1;
- for (j=1; j <= n[k]; j++){
- // Generate normal(0,epsilon) value using Box-Muller transform
- double u1 = (rng()-base)/span;
- double u2 = (rng()-base)/span;
- double z = std::sqrt(-2*std::log(u1))*std::cos(2*M_PI*u2);
- u[k-1](s,j) = epsilon*z;
- }
- }
+ size += (n[k-1] + 1) * (n[k]);
}
+ return size; // weights (u)
}
uint32_t rebind(const double *is_classification_in,
@@ -185,20 +143,38 @@ struct MLPModel {
for (k = 1; k <= N; k ++) {
u.push_back(Eigen::Map<Matrix >(
const_cast<double*>(data + sizeOfU),
- n[k-1] + 1, n[k] + 1));
- sizeOfU += (n[k-1] + 1) * (n[k] + 1);
+ n[k-1] + 1, n[k]));
+ sizeOfU += (n[k-1] + 1) * (n[k]);
}
return sizeOfU;
}
+ double norm() const {
+ double norm = 0.;
+ size_t k;
+ for (k = 0; k < u.size(); k ++) {
+ norm+=u[k].bottomRows(u[k].rows()-1).squaredNorm();
+ }
+ return std::sqrt(norm);
+ }
+
+ void setZero(){
+ size_t k;
+ for (k = 1; k <= u.size(); k ++) {
+ u[k-1].setZero();
+ }
+ }
+
/*
* Some operator wrappers for u.
*/
MLPModel &operator*=(const double &c) {
+ // Note that when scaling the model, you should
+ // not update the bias.
size_t k;
for (k = 1; k <= u.size(); k ++) {
- u[k-1] *= c;
+ u[k-1] *= c;
}
return *this;
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/state.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/state.hpp b/src/modules/convex/type/state.hpp
index 66f5023..2cb2643 100644
--- a/src/modules/convex/type/state.hpp
+++ b/src/modules/convex/type/state.hpp
@@ -629,6 +629,9 @@ public:
return 1 // numberOfStages = N
+ (inNumberOfStages + 1) // numbersOfUnits: size is (N + 1)
+ 1 // stepsize
+ + 1 // lambda
+ + 1 // is_classification
+ + 1 // activation
+ sizeOfModel // model
+ 1 // numRows
@@ -645,17 +648,16 @@ private:
* - 0: numberOfStages (number of stages (layers), design doc: N)
* - 1: numbersOfUnits (numbers of activation units, design doc: n_0,...,n_N)
* - N + 2: stepsize (step size of gradient steps)
- * - N + 3: is_classification (do classification)
- * - N + 4: activation (activation function)
- * - N + 5: coeff (coefficients, design doc: u)
+ * - N + 3: lambda (regularization term)
+ * - N + 4: is_classification (do classification)
+ * - N + 5: activation (activation function)
+ * - N + 6: coeff (coefficients, design doc: u)
*
* Intra-iteration components (updated in transition step):
* sizeOfModel = # of entries in u + 2, (\sum_1^N n_{k-1} n_k)
- * - N + 3 + sizeOfModel: numRows (number of rows processed in this iteration)
- * - N + 4 + sizeOfModel: loss (loss value, the sum of squared errors)
- * - N + 5 + sizeOfModel: is_classification (do classification)
- * - N + 6 + sizeOfModel: activation (activation function)
- * - N + 7 + sizeOfModel: coeff (volatile model for incrementally update)
+ * - N + 6 + sizeOfModel: coeff (volatile model for incrementally update)
+ * - N + 6 + 2*sizeOfModel: numRows (number of rows processed in this iteration)
+ * - N + 7 + 2*sizeOfModel: loss (loss value, the sum of squared errors)
*/
void rebind() {
task.numberOfStages.rebind(&mStorage[0]);
@@ -663,13 +665,14 @@ private:
task.numbersOfUnits =
reinterpret_cast<dimension_pointer_type>(&mStorage[1]);
task.stepsize.rebind(&mStorage[N + 2]);
- uint32_t sizeOfModel = task.model.rebind(&mStorage[N + 3],&mStorage[N + 4],&mStorage[N + 5],
+ task.lambda.rebind(&mStorage[N + 3]);
+ uint32_t sizeOfModel = task.model.rebind(&mStorage[N + 4],&mStorage[N + 5],&mStorage[N + 6],
task.numberOfStages, task.numbersOfUnits);
- algo.numRows.rebind(&mStorage[N + 5 + sizeOfModel]);
- algo.loss.rebind(&mStorage[N + 6 + sizeOfModel]);
- algo.incrModel.rebind(&mStorage[N + 3],&mStorage[N + 4],&mStorage[N + 7 + sizeOfModel],
+ algo.incrModel.rebind(&mStorage[N + 4],&mStorage[N + 5],&mStorage[N + 6 + sizeOfModel],
task.numberOfStages, task.numbersOfUnits);
+ algo.numRows.rebind(&mStorage[N + 6 + 2*sizeOfModel]);
+ algo.loss.rebind(&mStorage[N + 7 + 2*sizeOfModel]);
}
@@ -685,13 +688,14 @@ public:
dimension_type numberOfStages;
dimension_pointer_type numbersOfUnits;
numeric_type stepsize;
+ numeric_type lambda;
MLPModel<Handle> model;
} task;
struct AlgoState {
+ MLPModel<Handle> incrModel;
count_type numRows;
numeric_type loss;
- MLPModel<Handle> incrModel;
} algo;
};
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/modules/convex/type/tuple.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/tuple.hpp b/src/modules/convex/type/tuple.hpp
index 4b9c55e..824ed90 100644
--- a/src/modules/convex/type/tuple.hpp
+++ b/src/modules/convex/type/tuple.hpp
@@ -64,7 +64,7 @@ typedef ExampleTuple<MappedColumnVector, double> GLMTuple;
// madlib::modules::convex::MatrixIndex
typedef ExampleTuple<MatrixIndex, double> LMFTuple;
-typedef ExampleTuple<MappedColumnVector, MappedColumnVector> MLPTuple;
+typedef ExampleTuple<ColumnVector, MappedColumnVector> MLPTuple;
} // namespace convex
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ff1b0f88/src/ports/postgres/modules/convex/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index 400f892..6b9d828 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -29,23 +29,23 @@
m4_include(`SQLCommon.m4')
/**
-@addtogroup grp_mlp
+@addtogroup grp_nn
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#mlp_classification">Classification</a></li>
<li class="level1"><a href="#mlp_regression">Regression</a></li>
-<li class="level1"><a href="#optimization_params">Optimizer Parameters</a></li>
-<li class="level1"><a href="#predict">Prediction Functions/a></li>
+<li class="level1"><a href="#optimizer_params">Optimizer Parameters</a></li>
+<li class="level1"><a href="#predict">Prediction Functions</a></li>
<li class="level1"><a href="#example">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
-Multilayer Perceptron (MLP) is a model for regression and
-classification.
+Multilayer Perceptron (MLP) is a type of neural network that can be
+used for regression and classification.
-Also called "vanilla neural networks", they consist of several
+Also called "vanilla neural networks", MLPs consist of several
fully connected hidden layers with non-linear activation
functions. In the case of classification, the final layer of the
neural net has as many nodes as classes, and the output of the
@@ -67,7 +67,8 @@ mlp_classification(
dependent_varname,
hidden_layer_sizes,
optimizer_params,
- activation
+ activation,
+ weights
)
</pre>
\b Arguments
@@ -75,6 +76,7 @@ mlp_classification(
<DT>source_table</DT>
<DD>TEXT. Name of the table containing the training data.</DD>
+
<DT>output_table</DT>
<DD>TEXT. Name of the output table containing the model. Details of the output
tables are provided below.
@@ -83,19 +85,22 @@ mlp_classification(
<DT>independent_varname</DT>
<DD>TEXT. Expression list to evaluate for the
independent variables. An intercept variable should not be included as part
- of this expression. Please note that expression should be able to be cast
- to DOUBLE PRECISION[].
+    of this expression. <b>Please note that the expression should be encoded properly.</b>
+ All values are cast to DOUBLE PRECISION, so categorical variables should be
+ one-hot or dummy encoded. See <a href="group__grp__encode__categorical.html">here</a>
+ for more details.
</DD>
+
<DT>dependent_varname</DT>
<DD> TEXT. Name of the dependent variable column. For classification, supported types are:
text, varchar, character varying, char, character,
integer, smallint, bigint, and boolean. </DD>
- <DT>hidden_layer_sizes (optional)</DT>
- <DD>INTEGER[], default: ARRAY[].
+  <DT>hidden_layer_sizes</DT>
+  <DD>INTEGER[].
The number of neurons in each hidden layer. The length of this array will
- determine the number of hidden layers. Empty for no hidden layers.
+ determine the number of hidden layers. NULL for no hidden layers.
</DD>
@@ -111,6 +116,25 @@ mlp_classification(
'relu', and 'tanh'. The text can be any prefix of the three
strings; for e.g., activation='s' will use the sigmoid activation.
</DD>
+
+
+ <DT>weights (optional)</DT>
+ <DD>TEXT, default: NULL.
+ Weights for input rows. Column name which specifies the weight for each input row.
+ This weight will be incorporated into the update during SGD, and will not be used
+ for loss calculations. If not specified, weight for each row will default to 1.
+ Column should be a numeric type.
+ </DD>
+
+ <DT>warm_start (optional)</DT>
+ <DD>BOOLEAN, default: FALSE.
+  Initialize weights with the coefficients from the last call. If true, weights will
+  be initialized from output_table. Note that all parameters other than optimizer_params
+  and verbose must remain constant between calls that use warm_start.
+ </DD>
+
+ <DT>verbose (optional)</DT>
+ <DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training.</DD>
</DL>
<b>Output tables</b>
@@ -142,24 +166,28 @@ A summary table named \<output_table\>_summary is also created, which has the fo
<td>The source table.</td>
</tr>
<tr>
- <th>dependent_varname</th>
- <td>The dependent variable.</td>
- </tr>
- <tr>
<th>independent_varname</th>
<td>The independent variables.</td>
</tr>
<tr>
+ <th>dependent_varname</th>
+ <td>The dependent variable.</td>
+ </tr>
+ <tr>
<th>tolerance</th>
<td>The tolerance as given in optimizer_params.</td>
</tr>
<tr>
- <th>step_size</th>
- <td>The step size as given in optimizer_params.</td>
+ <th>learning_rate_init</th>
+ <td>The initial learning rate as given in optimizer_params.</td>
+ </tr>
+ <tr>
+ <th>learning_rate_policy</th>
+ <td>The learning rate policy as given in optimizer_params.</td>
</tr>
<tr>
<th>n_iterations</th>
- <td>The number of iterations run</td>
+ <td>The number of iterations run.</td>
</tr>
<tr>
<th>n_tries</th>
@@ -170,17 +198,29 @@ A summary table named \<output_table\>_summary is also created, which has the fo
<td>The number of units in each layer including the input and output layer.</td>
</tr>
<tr>
- <th>activation_function</th>
+ <th>activation</th>
<td>The activation function.</td>
</tr>
<tr>
<th>is_classification</th>
<td>True if the model was trained for classification, False if it was trained
- for regression</td>
+ for regression.</td>
</tr>
<tr>
<th>classes</th>
- <td>The classes which were trained against (empty for regression)</td>
+ <td>The classes which were trained against (empty for regression).</td>
+ </tr>
+ <tr>
+ <th>weights</th>
+ <td>The weight column used during training.</td>
+ </tr>
+ <tr>
+ <th>x_means</th>
+ <td>The mean for all input features (used for normalization).</td>
+ </tr>
+ <tr>
+ <th>x_stds</th>
+ <td>The standard deviation for all input features (used for normalization).</td>
</tr>
</table>
@@ -197,7 +237,9 @@ mlp_regression(source_table,
dependent_varname,
hidden_layer_sizes,
optimizer_params,
- activation
+ activation,
+ weights,
+ verbose
)
</pre>
@@ -205,7 +247,7 @@ mlp_regression(source_table,
Specifications for regression are largely the same as for classification. In the
model table, the loss will refer to mean square error instead of cross entropy. In the
-summary table, there is classes column. The following
+summary table, there is no classes column. The following
arguments have specifications which differ from mlp_classification:
<DL class="arglist">
<DT>dependent_varname</DT>
@@ -226,7 +268,7 @@ the parameter is ignored.
<pre class="syntax">
- 'step_size = <value>,
+ 'learning_rate_init = <value>,
n_iterations = <value>,
n_tries = <value>,
tolerance = <value>'
@@ -234,27 +276,57 @@ the parameter is ignored.
\b Optimizer Parameters
<DL class="arglist">
-<DT>step_size</dt>
-<DD>Default: [0.001].
+<DT>learning_rate_init</dt>
+<DD>Default: 0.001.
Also known as the learning rate. A small value is usually desirable to
ensure convergence, while a large value provides more room for progress during
training. Since the best value depends on the condition number of the data, in
practice one often tunes this parameter.
</DD>
+<DT>learning_rate_policy</dt>
+<DD>Default: constant.
+One of 'constant', 'exp', 'inv' or 'step' or any prefix of these.
+'constant': learning_rate = learning_rate_init
+'exp': learning_rate = learning_rate_init * gamma^(iter)
+'inv': learning_rate = learning_rate_init * (iter+1)^(-power)
+'step': learning_rate = learning_rate_init * gamma^(floor(iter/iterations_per_step))
+where iter is the current iteration of SGD (see the sketch after this parameter list).
+</DD>
+
+<DT>gamma</dt>
+<DD>Default: 0.1.
+Decay rate for learning rate when learning_rate_policy is 'exp' or 'step'.
+</DD>
+
+<DT>power</dt>
+<DD>Default: 0.5.
+Exponent for learning_rate_policy = 'inv'.
+</DD>
+
+<DT>iterations_per_step</dt>
+<DD>Default: 100.
+Number of iterations to run before decreasing the learning rate by
+a factor of gamma. Valid for learning rate policy = 'step'.
+</DD>
<DT>n_iterations</dt>
<DD>Default: [100]. The maximum number of iterations allowed.
</DD>
+
<DT>n_tries</dt>
<DD>Default: [1]. Number of times to retrain the network with randomly initialized
-weights
+weights.
+</DD>
+
+<DT>lambda</dt>
+<DD>Default: 0. The regularization coefficient for L2 regularization.
</DD>
<DT>tolerance</dt>
<DD>Default: 0.001. The criterion to end iterations. The training stops whenever
-<the difference between the training models of two consecutive iterations is
-<smaller than \e tolerance or the iteration number is larger than \e max_iter.
+the difference between the training models of two consecutive iterations is
+smaller than \e tolerance or the iteration number is larger than \e max_iter.
</DD>
</DL>
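For clarity, the four learning rate policies listed above reduce to the following schedule (a plain Python sketch; parameter names mirror the optimizer parameters, and `it` is the current SGD iteration):

    def learning_rate(policy, learning_rate_init, it,
                      gamma=0.1, power=0.5, iterations_per_step=100):
        if policy == 'constant':
            return learning_rate_init
        if policy == 'exp':
            return learning_rate_init * gamma ** it
        if policy == 'inv':
            return learning_rate_init * (it + 1) ** (-power)
        if policy == 'step':
            return learning_rate_init * gamma ** (it // iterations_per_step)
        raise ValueError("unknown learning_rate_policy: %s" % policy)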
@@ -293,19 +365,19 @@ table name is already in use, then an error is returned. Table contains:</DD>
<td>Gives the 'id' for each prediction, corresponding to each row from the data_table.</td>
</tr>
<tr>
- <th>estimated_<COL_NAME></th>
+ <th>estimated_COL_NAME</th>
<td>
(For pred_type='response') The estimated class
for classification or value for regression, where
- <COL_NAME> is the name of the column to be
- predicted from training data
+ COL_NAME is the name of the column to be
+ predicted from training data.
</td>
</tr>
<tr>
- <th>prob_<CLASS></th>
+ <th>prob_CLASS</th>
<td>
(For pred_type='prob' for classification) The
- probability of a given class <CLASS> as given by
+ probability of a given class CLASS as given by
softmax. There will be one column for each class
in the training data.
</td>
@@ -315,10 +387,10 @@ table name is already in use, then an error is returned. Table contains:</DD>
<DT>pred_type</DT>
<DD>TEXT.
-the type of output requested:
+The type of output requested:
'response' gives the actual prediction,
'prob' gives the probability of each class.
-for regression, only type='response' is defined.
+For regression, only type='response' is defined.
The name of the id column in the input table.</DD>
</DL>
</table>
@@ -363,30 +435,36 @@ The model will be written to mlp_model.
<pre class="example">
DROP TABLE IF EXISTS mlp_model;
DROP TABLE IF EXISTS mlp_model_summary;
+-- Set seed so results are reproducible
+SELECT setseed(0);
SELECT madlib.mlp_classification(
'iris_data', -- Source table
'mlp_model', -- Destination table
'attributes', -- Input features
'class_text', -- Label
ARRAY[5], -- Number of units per layer
- 'step_size=0.003,
- n_iterations=5000,
+ 'learning_rate_init=0.003,
+ n_iterations=500,
tolerance=0', -- Optimizer params
- 'tanh'); -- Activation function
+ 'tanh', -- Activation function
+ NULL, -- Default weight (1)
+ FALSE, -- No warm start
+ TRUE -- Verbose
+);
</pre>
-# View the result for the model.
<pre class="example">
-- Set extended display on for easier reading of output
\\x ON
--- Neural net Initialization is non-deterministic, so your results may vary
+-- Results may vary depending on platform
SELECT * FROM mlp_model;
</pre>
Result:
<pre class="result">
--[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-coeff | {1,1,1,1,1,0.136374930803,0.188739676875,0.662387810001,-1.03381622734,-0.469961067046,0.0614006983397,0.0811504589436,0.299008228258,-0.47391918521,-0.215098143699,0.10519213944,0.145844617525,0.511683525606,-0.800215552382,-0.36417142683,0.120751709056,0.167531106521,0.587074895969,-0.916946198095,-0.417055067449,0.0539541885146,0.0694359704131,0.262598585854,-0.419234805076,-0.189915344282,1,1,1,1,1,1,0.105645702152,1.46247470474,0.484457903226,0.965962824478,1.19361986431,0.419805760087,-0.105696503487,-1.46245956666,-0.484427811691,-0.965730981426,-1.19365280555,-0.419973628863}
-loss | 0.0184092375519
-num_iterations | 5000
+-[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coeff | {-0.172392477419,-0.0836446652758,-0.0162194484142,-0.647268294231,-0.504884325538,0.184825723596,0.351728174731,-0.601148967035,0.720999542651,0.26521898248,0.245760922013,0.264645322438,-0.349957739904,0.797653395667,0.725747963566,-0.344498001796,0.261481840947,0.329074383545,0.379503434339,-0.267398086353,-0.0238069072658,0.330239268187,-0.178736289201,-0.0563356339946,-0.0333791780453,0.262137386864,0.491390436498,-1.02635831573,-1.29541478382,0.246017274,-0.0623575215434,0.0826297373887,-0.671671189842,0.853494672576,1.21671423502,0.296424359217,0.15294606861}
+loss | 0.0136695756314
+num_iterations | 500
</pre>
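Training also writes a companion summary table, dropped at the top of this example as mlp_model_summary; it typically records the arguments the model was trained with. A minimal check (output omitted, since its exact columns are not reproduced in this example):
<pre class="example">
SELECT * FROM mlp_model_summary;
</pre>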
-# Next train a regression example. First create some test data. This dataset
contains housing prices data.
@@ -419,30 +497,36 @@ COPY lin_housing (x, grp_by_col, y) FROM STDIN NULL '?' DELIMITER '|';
<pre class="example">
DROP TABLE IF EXISTS mlp_regress;
DROP TABLE IF EXISTS mlp_regress_summary;
+SELECT setseed(0);
SELECT madlib.mlp_regression(
- 'lin_housing', -- Source table
- 'mlp_regress', -- Desination table
- 'x', -- Input features
- 'y', -- Dependent variable
- ARRAY[5,5], -- Number of units per layer
- 'step_size=0.000007,
- n_iterations=10000,
+ 'lin_housing', -- Source table
+ 'mlp_regress', -- Destination table
+ 'x', -- Input features
+ 'y', -- Dependent variable
+ ARRAY[25,25], -- Number of units per layer
+ 'learning_rate_init=0.001,
+ n_iterations=500,
+ lambda=0.001,
tolerance=0',
- 'relu');
+ 'relu',
+ NULL, -- Default weight (1)
+ FALSE, -- No warm start
+ TRUE -- Verbose
+);
</pre>
-# Check the results of the model
<pre class="example">
--- Set extended display on for easier reading of output
+-- Set extended display on for easier reading of output.
\\x ON
--- Neural net Initialization is non-deterministic, so your results may vary
+-- Results may vary depending on platform.
SELECT * FROM mlp_regress;
</pre>
Result:
<pre class="result">
--[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------
-coeff | {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.79506311399e-05,3.56715008915e-05,-6.09333559685e-05,0.000251228318768,-0.000224772841379,-3.71863030857e-05,-3.5757865148e-06,5.27936784854e-05,-2.48474166186e-05,6.19731184294e-05,3.07638968743e-05,6.8964698578e-06,0.000106016701083,-1.71484730318e-05,1.18691881812e-05,-0.000163975464208,0.000170026304906,3.11688265279e-05,0.000177050148787,-1.58265976603e-05,2.70144422657e-05,0.000112667883422,3.77575139073e-05,8.12474658795e-05,-7.90458917626e-05,0.000107566386158,-2.63771171506e-06,2.47996880915e-05,-0.00012642310887,0.000203827391081,0.000139315565565,4.86147243454e-05,-0.000176126471913,-6.47820782916e-05,-8.51592776447e-06,-6.60601176758e-05,2.91421874156e-05,6.3556873752e-05,0.000197557443129,0.000220531367259,0.000135036310289,0.000143735913975,-4.75034117786e-05,-0.000179547345838,-1.6919846786e-05,0.000162784312994,0.000268595819851,-0.000460066553287,8.69756071591e-05,-0.00311762727057,0.000126024763103,0.000205988242921
,0.003463432426,-0.00729789075286,0.00151625867549,-0.000890852767597,-0.00525016037249,0.0031043106659,0.00798041103839,-0.00552693050079,0.0232180415786,0.0230489850143,-0.0437890272341,0.0165765426407,-0.248554261758,-7.81336427846e-05,0.00558145591752,0.283465844585,-0.571699956182,0.133474351994,-0.0785181945605,-0.419269930709,0.249547772912,0.631761009875,-0.431305975666,1,1,1,1,1,1,0.0158747497572,-9.02809160806e-05,0.00015574347618,4.10805373863e-06,0.00121532434965,0.101790351335,0.0647558401493,-0.00013654998677,-9.92872075948e-06,-5.5319694394e-05,0.00519320756484,0.412736586036,0.0011998026977,-1.53688189815e-05,1.94817888201e-05,-4.63111489966e-05,7.24547899029e-05,0.00880394144485,5.45309822095e-05,-0.000140943219275,-7.96211486227e-05,-1.04337307472e-05,0.000161936762028,0.00136273797767,-4.54737243585e-05,-3.4083840736e-05,3.69286883662e-05,9.9047243188e-08,3.75014011824e-06,-9.45366086368e-08,1,1,1,1,1,1,6.67488547054,0.102754199001,0.41668912471,0.00886867296479,0
.00136206007228,-9.88642499013e-05}
-loss | 144.965776158
-num_iterations | 10000
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------
+coeff | {-0.135647108464,0.0315402969485,-0.117580589352,-0.23084537701,-0.10868726702...
+loss | 0.114125125042
+num_iterations | 500
</pre>
-# Now let's look at the prediction functions. In the following examples we will
use the training data set for prediction as well, which is not usual but serves to
@@ -458,8 +542,6 @@ SELECT madlib.mlp_predict(
'mlp_prediction', -- Output table for predictions
'response' -- Output classes, not probabilities
);
--# View results
-<pre class="example">
SELECT * FROM mlp_prediction JOIN iris_data USING (id);
</pre>
Result for the classification model:
@@ -487,7 +569,7 @@ Result for the classification model:
19 | Iris-versicolor | {6.6,2.9,4.6,1.3} | Iris-versicolor | 2
20 | Iris-versicolor | {5.2,2.7,3.9,1.4} | Iris-versicolor | 2
</pre>
-Prediction using the regression model:
+-# Prediction using the regression model:
<pre class="example">
DROP TABLE IF EXISTS mlp_regress_prediction;
SELECT madlib.mlp_predict(
@@ -498,34 +580,35 @@ SELECT madlib.mlp_predict(
'response' -- Output values, not probabilities
);
</pre>
--# View results
+View results
<pre class="example">
SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (id);
</pre>
Result for the regression model:
<pre class="result">
- id | x | grp_by_col | y | estimated_y
-----+-------------------------------------------------------------------------+------------+------+--------------------
- 1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98} | 1 | 24 | {23.2627062018087}
- 2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14} | 1 | 21.6 | {25.7088419115781}
- 3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03} | 1 | 34.7 | {27.5587003901404}
- 4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94} | 1 | 33.4 | {31.1812237427816}
- 5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33} | 1 | 36.2 | {30.3696873085477}
- 6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21} | 1 | 28.7 | {29.5290259241882}
- 7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43} | 1 | 22.9 | {21.1576051716888}
- 8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15} | 1 | 27.1 | {17.6194200563055}
- 9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93} | 1 | 16.5 | {15.1366297774139}
-10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1} | 1 | 18.9 | {17.6528662199369}
-11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} | 1 | 15 | {17.2017487668181}
-12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27} | 1 | 18.9 | {19.4893860319992}
-13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71} | 1 | 21.7 | {23.2917226708039}
-14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26} | 1 | 20.4 | {22.8904812605193}
-15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26} | 1 | 18.2 | {18.2386754423677}
-16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47} | 1 | 19.9 | {23.28949550874}
-17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58} | 1 | 23.1 | {25.3288762085473}
-18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67} | 1 | 17.5 | {19.0203738118451}
-19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69} | 1 | 20.2 | {12.3162005347545}
-20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28} | 1 | 18.2 | {21.0902211848747}
+ id | x | grp_by_col | y | estimated_y
+----+-------------------------------------------------------------------------+------------+------+------------------
+ 1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98} | 1 | 24 | 23.973628645041
+ 2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14} | 1 | 21.6 | 21.6389086856109
+ 3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03} | 1 | 34.7 | 34.6766441639675
+ 4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94} | 1 | 33.4 | 33.4521871118756
+ 5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33} | 1 | 36.2 | 36.2899491706428
+ 6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21} | 1 | 28.7 | 28.6994076427827
+ 7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43} | 1 | 22.9 | 22.4882117113923
+ 8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15} | 1 | 27.1 | 26.5148927040405
+ 9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93} | 1 | 16.5 | 16.0669778867327
+ 10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1} | 1 | 18.9 | 17.4237448788601
+ 11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} | 1 | 15 | 14.5944028616784
+ 12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27} | 1 | 18.9 | 19.6071061560237
+ 13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71} | 1 | 21.7 | 21.7585638578804
+ 14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26} | 1 | 20.4 | 20.2832271533629
+ 15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26} | 1 | 18.2 | 18.3440540662206
+ 16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47} | 1 | 19.9 | 20.0246074554594
+ 17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58} | 1 | 23.1 | 23.1458505146148
+ 18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67} | 1 | 17.5 | 17.4602306566804
+ 19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69} | 1 | 20.2 | 20.1785296856357
+ 20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28} | 1 | 18.2 | 18.1810300625137
+(20 rows)
</pre>
Note that the results you get for all examples may vary with the platform you are using.
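Because these examples score the training data itself, a quick sanity check is to compare predicted and actual labels. A sketch for the classification case, relying on the estimated_COL_NAME naming convention described earlier (here estimated_class_text) and the tables created above:
<pre class="example">
SELECT AVG(CASE WHEN class_text = estimated_class_text
                THEN 1.0 ELSE 0.0 END) AS training_accuracy
FROM mlp_prediction
JOIN iris_data USING (id);
</pre>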
@@ -561,6 +644,10 @@ File mlp.sql_in documenting the training function
*/
+CREATE TYPE MADLIB_SCHEMA.mlp_step_result AS (
+ state DOUBLE PRECISION[],
+ loss DOUBLE PRECISION
+);
CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
coeff DOUBLE PRECISION[],
@@ -571,14 +658,22 @@ CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
-- create SQL functions for IGD optimizer
--------------------------------------------------------------------------
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_transition(
- state DOUBLE PRECISION[],
- start_vec DOUBLE PRECISION[],
- end_vec DOUBLE PRECISION[],
- previous_state DOUBLE PRECISION[],
- layer_sizes DOUBLE PRECISION[],
- stepsize DOUBLE PRECISION,
- activation INTEGER,
- is_classification INTEGER)
+ state DOUBLE PRECISION[],
+ ind_var DOUBLE PRECISION[],
+ dep_var DOUBLE PRECISION[],
+ previous_state DOUBLE PRECISION[],
+ layer_sizes DOUBLE PRECISION[],
+ learning_rate_init DOUBLE PRECISION,
+ activation INTEGER,
+ is_classification INTEGER,
+ weight DOUBLE PRECISION,
+ warm_start BOOLEAN,
+ warm_start_coeff DOUBLE PRECISION[],
+ n_tuples INTEGER,
+ lambda DOUBLE PRECISION,
+ x_means DOUBLE PRECISION[],
+ x_stds DOUBLE PRECISION[]
+ )
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
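For readers scanning this hunk, a brief annotation of the new transition-function arguments may help; this is a reading of the diff (the authoritative behavior lives in the C++ and Python sources changed by this commit), not generated documentation:
-- weight                         per-tuple weight (from the column named by the 'weights' argument)
-- warm_start, warm_start_coeff   reuse the coefficients of a previous training run
-- n_tuples                       number of rows, presumably used to normalize loss/regularization
-- lambda                         L2 regularization strength
-- x_means, x_stds                means and standard deviations used to standardize the inputs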
@@ -592,7 +687,7 @@ LANGUAGE C IMMUTABLE STRICT;
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_final(
state DOUBLE PRECISION[])
-RETURNS DOUBLE PRECISION[]
+RETURNS MADLIB_SCHEMA.mlp_step_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
@@ -601,16 +696,24 @@ LANGUAGE C IMMUTABLE STRICT;
* @brief Perform one iteration of backprop
*/
CREATE AGGREGATE MADLIB_SCHEMA.mlp_igd_step(
- /* start_vec*/ DOUBLE PRECISION[],
- /* end_vec */ DOUBLE PRECISION[],
- /* previous_state */ DOUBLE PRECISION[],
- /* layer_sizes */ DOUBLE PRECISION[],
- /* stepsize */ DOUBLE PRECISION,
- /* activation */ INTEGER,
- /* is_classification */ INTEGER )(
+ /* ind_var */ DOUBLE PRECISION[],
+ /* dep_var */ DOUBLE PRECISION[],
+ /* previous_state */ DOUBLE PRECISION[],
+ /* layer_sizes */ DOUBLE PRECISION[],
+ /* learning_rate_init */ DOUBLE PRECISION,
+ /* activation */ INTEGER,
+ /* is_classification */ INTEGER,
+ /* weight */ DOUBLE PRECISION,
+ /* warm_start */ BOOLEAN,
+ /* warm_start_coeff */ DOUBLE PRECISION[],
+ /* n_tuples */ INTEGER,
+ /* lambda */ DOUBLE PRECISION,
+ /* x_means */ DOUBLE PRECISION[],
+ /* x_stds */ DOUBLE PRECISION[]
+ )(
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.mlp_igd_transition,
- m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
+ m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
FINALFUNC=MADLIB_SCHEMA.mlp_igd_final,
INITCOND='{0,0,0,0,0,0,0,0}'
);
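The m4 guard in the aggregate above is easy to misread, so here is a sketch of its effect (an illustration, not generated output): when __POSTGRESQL__ is defined the prefunc line is dropped; otherwise (Greenplum/HAWQ) it is kept so that partial aggregate states from different segments can be merged.
-- PostgreSQL expansion (no prefunc):
--   CREATE AGGREGATE MADLIB_SCHEMA.mlp_igd_step(...) (
--       STYPE=DOUBLE PRECISION[],
--       SFUNC=MADLIB_SCHEMA.mlp_igd_transition,
--       FINALFUNC=MADLIB_SCHEMA.mlp_igd_final,
--       INITCOND='{0,0,0,0,0,0,0,0}'
--   );
-- Non-PostgreSQL expansion additionally includes:
--       prefunc=MADLIB_SCHEMA.mlp_igd_merge,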
@@ -631,13 +734,16 @@ LANGUAGE c IMMUTABLE STRICT;
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
- source_table VARCHAR,
- output_table VARCHAR,
- independent_varname VARCHAR,
- dependent_varname VARCHAR,
- hidden_layer_sizes INTEGER[],
- optimizer_params VARCHAR,
- activation VARCHAR
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR,
+ warm_start BOOLEAN,
+ verbose BOOLEAN
) RETURNS VOID AS $$
PythonFunctionBodyOnly(`convex', `mlp_igd')
mlp_igd.mlp(
@@ -649,19 +755,96 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
hidden_layer_sizes,
optimizer_params,
activation,
- True
+ True,
+ weights,
+ warm_start,
+ verbose
)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR,
+ warm_start BOOLEAN
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, $6, NULL, NULL, NULL, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[]
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification($1, $2, $3, $4, $5, NULL, NULL, NULL, FALSE, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+
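The cascade of overloads above lets callers omit trailing arguments. A usage sketch of the shortest form, where 'mlp_model_short' is a hypothetical destination table and the remaining names come from the documentation examples; the omitted parameters fall through the overloads to NULL/FALSE defaults:
-- Hypothetical short-form call; only the required arguments are supplied.
SELECT madlib.mlp_classification(
    'iris_data',        -- Source table
    'mlp_model_short',  -- Destination table (hypothetical name)
    'attributes',       -- Input features
    'class_text',       -- Label
    ARRAY[5]            -- Hidden layer sizes
);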
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
- source_table VARCHAR,
- output_table VARCHAR,
- independent_varname VARCHAR,
- dependent_varname VARCHAR,
- hidden_layer_sizes INTEGER[],
- optimizer_params VARCHAR,
- activation VARCHAR
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR,
+ warm_start BOOLEAN,
+ verbose BOOLEAN
) RETURNS VOID AS $$
PythonFunctionBodyOnly(`convex', `mlp_igd')
mlp_igd.mlp(
@@ -673,11 +856,83 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
hidden_layer_sizes,
optimizer_params,
activation,
- False
+ False,
+ weights,
+ warm_start,
+ verbose
)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR,
+ warm_start BOOLEAN
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR,
+ weights VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, $6, NULL, NULL, NULL, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[]
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression($1, $2, $3, $4, $5, NULL, NULL, NULL, FALSE, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
model_table VARCHAR,
data_table VARCHAR,
@@ -700,9 +955,11 @@ CREATE FUNCTION MADLIB_SCHEMA.internal_predict_mlp(
coeff DOUBLE PRECISION[],
independent_varname DOUBLE PRECISION[],
is_classification DOUBLE PRECISION,
- activation_function DOUBLE PRECISION,
+ activation DOUBLE PRECISION,
layer_sizes DOUBLE PRECISION[],
- is_response INTEGER
+ is_response INTEGER,
+ x_means DOUBLE PRECISION[],
+ x_stds DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'