You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/07/20 22:32:11 UTC
[2/3] incubator-madlib git commit: New module: Multilayer Perceptron
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4fcb60ed/src/ports/postgres/modules/convex/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
new file mode 100644
index 0000000..400f892
--- /dev/null
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -0,0 +1,752 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file mlp.sql_in
+ *
+ * @brief SQL functions for multilayer perceptron
+ * @date June 2012
+ *
+ *
+ *//* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+
+/**
+@addtogroup grp_mlp
+
+<div class="toc"><b>Contents</b><ul>
+<li class="level1"><a href="#mlp_classification">Classification</a></li>
+<li class="level1"><a href="#mlp_regression">Regression</a></li>
+<li class="level1"><a href="#optimizer_params">Optimizer Parameters</a></li>
+<li class="level1"><a href="#predict">Prediction Functions</a></li>
+<li class="level1"><a href="#example">Examples</a></li>
+<li class="level1"><a href="#background">Technical Background</a></li>
+<li class="level1"><a href="#literature">Literature</a></li>
+<li class="level1"><a href="#related">Related Topics</a></li>
+</ul></div>
+
+Multilayer Perceptron (MLP) is a model for regression and
+classification.
+
+Also called "vanilla neural networks", they consist of several
+fully connected hidden layers with non-linear activation
+functions. In the case of classification, the final layer of the
+neural net has as many nodes as classes, and the output of the
+neural net can be interpreted as the probability of a given input
+feature belonging to a specific class.
+
+
+@brief Solves classification and regression problems with several
+fully connected layers and nonlinear activation functions.
+
+@anchor mlp_classification
+@par Classification Training Function
+The mlp classification training function has the following format:
+<pre class="syntax">
+mlp_classification(
+ source_table,
+ output_table,
+ independent_varname,
+ dependent_varname,
+ hidden_layer_sizes,
+ optimizer_params,
+ activation
+ )
+</pre>
+\b Arguments
+<DL class="arglist">
+ <DT>source_table</DT>
+ <DD>TEXT. Name of the table containing the training data.</DD>
+
+ <DT>output_table</DT>
+ <DD>TEXT. Name of the output table containing the model. Details of the output
+ tables are provided below.
+ </DD>
+
+ <DT>independent_varname</DT>
+ <DD>TEXT. Expression list to evaluate for the
+ independent variables. An intercept variable should not be included as part
+ of this expression. Please note that expression should be able to be cast
+ to DOUBLE PRECISION[].
+ </DD>
+
+ <DT>dependent_varname</DT>
+ <DD> TEXT. Name of the dependent variable column. For classification, supported types are:
+ text, varchar, character varying, char, character,
+ integer, smallint, bigint, and boolean. </DD>
+
+ <DT>hidden_layer_sizes (optional)</DT>
+ <DD>INTEGER[], default: ARRAY[].
+ The number of neurons in each hidden layer. The length of this array will
+ determine the number of hidden layers. Empty for no hidden layers.
+ </DD>
+
+
+ <DT>optimizer_params (optional)</DT>
+ <DD>TEXT, default: NULL.
+ Parameters for optimization in a comma-separated string
+ of key-value pairs. See the description below for details.
+ </DD>
+
+ <DT>activation (optional)</DT>
+ <DD>TEXT, default: 'sigmoid'.
+ Activation function. Currently three functions are supported: 'sigmoid' (default),
+ 'relu', and 'tanh'. The text can be any prefix of the three
+ strings; for e.g., activation='s' will use the sigmoid activation.
+ </DD>
+</DL>
+
+<b>Output tables</b>
+<br>
+ The model table produced by mlp contains the following columns:
+ <table class="output">
+ <tr>
+ <th>coeffs</th>
+ <td>FLOAT8[]. Flat array containing the weights of the neural net</td>
+ </tr>
+ <tr>
+ <th>n_iterations</th>
+ <td>INTEGER. Number of iterations completed by stochastic gradient descent
+ algorithm. The algorithm either converged in this number of iterations
+ or hit the maximum number specified in the optimization parameters. </td>
+ </tr>
+ <tr>
+ <th>loss</th>
+ <td>FLOAT8. The cross entropy over the training data.
+ See Technical Background section below for more details.</td>
+ </tr>
+ </table>
+
+
+A summary table named \<output_table\>_summary is also created, which has the following columns:
+ <table class="output">
+ <tr>
+ <th>source_table</th>
+ <td>The source table.</td>
+ </tr>
+ <tr>
+ <th>dependent_varname</th>
+ <td>The dependent variable.</td>
+ </tr>
+ <tr>
+ <th>independent_varname</th>
+ <td>The independent variables.</td>
+ </tr>
+ <tr>
+ <th>tolerance</th>
+ <td>The tolerance as given in optimizer_params.</td>
+ </tr>
+ <tr>
+ <th>step_size</th>
+ <td>The step size as given in optimizer_params.</td>
+ </tr>
+ <tr>
+ <th>n_iterations</th>
+ <td>The number of iterations run</td>
+ </tr>
+ <tr>
+ <th>n_tries</th>
+ <td>The number of tries as given in optimizer_params.</td>
+ </tr>
+ <tr>
+ <th>layer_sizes</th>
+ <td>The number of units in each layer including the input and output layer.</td>
+ </tr>
+ <tr>
+ <th>activation_function</th>
+ <td>The activation function.</td>
+ </tr>
+ <tr>
+ <th>is_classification</th>
+ <td>True if the model was trained for classification, False if it was trained
+ for regression</td>
+ </tr>
+ <tr>
+ <th>classes</th>
+ <td>The classes which were trained against (empty for regression)</td>
+ </tr>
+
+ </table>
+
+
+@anchor mlp_regression
+@par Regression Training Function
+The mlp regression training function has the following format:
+<pre class="syntax">
+mlp_regression(
+ source_table,
+ output_table,
+ independent_varname,
+ dependent_varname,
+ hidden_layer_sizes,
+ optimizer_params,
+ activation
+ )
+</pre>
+
+\b Arguments
+
+Specifications for regression are largely the same as for classification. In the
+model table, the loss will refer to mean square error instead of cross entropy. In the
+summary table, there is no classes column. The following
+arguments have specifications which differ from mlp_classification:
+<DL class="arglist">
+<DT>dependent_varname</DT>
+ <DD>TEXT. Name of the dependent variable column.
+ For regression, supported types are any numeric type, or an array
+ of numeric types (for multiple regression).
+ </DD>
+</DL>
+
+
+@anchor optimizer_params
+@par Optimizer Parameters
+Parameters in this section are supplied in the \e optimizer_params argument as a string
+containing a comma-delimited list of name-value pairs. All of these named
+parameters are optional, and their order does not matter. You must use the
+format "<param_name> = <value>" to specify the value of a parameter, otherwise
+the parameter is ignored.
+
+
+<pre class="syntax">
+ 'step_size = <value>,
+ n_iterations = <value>,
+ n_tries = <value>,
+ tolerance = <value>'
+</pre>
+\b Optimizer Parameters
+<DL class="arglist">
+
+<DT>step_size</dt>
+<DD>Default: [0.001].
+Also known as the learning rate. A small value is usually desirable to
+ensure convergence, while a large value provides more room for progress during
+training. Since the best value depends on the condition number of the data, in
+practice one often tunes this parameter.
+</DD>
+
+
+<DT>n_iterations</dt>
+<DD>Default: [100]. The maximum number of iterations allowed.
+</DD>
+<DT>n_tries</dt>
+<DD>Default: [1]. Number of times to retrain the network with randomly initialized
+weights
+</DD>
+
+<DT>tolerance</dt>
+<DD>Default: 0.001. The criterion to end iterations. The training stops whenever
+the difference between the training models of two consecutive iterations is
+smaller than \e tolerance or the iteration number is larger than \e n_iterations.
+</DD>
+
+</DL>
+
+@anchor predict
+@par Prediction Function
+Used to generate predictions given a previously trained model on novel data.
+The same syntax is used for classification and regression.
+<pre class="syntax">
+mlp_predict(model_table,
+ data_table,
+ id_col_name,
+ output_table,
+ pred_type)
+</pre>
+
+\b Arguments
+<DL class="arglist">
+ <DT>model_table</DT>
+ <DD>TEXT. Model table produced by the training function.</DD>
+
+ <DT>data_table</DT>
+ <DD>TEXT. Name of the table containing the data for prediction. This table is expected
+ to contain the same input features that were used during training. The table should
+ also contain id_col_name used for identifying each row.</DD>
+
+ <DT>id_col_name</DT>
+ <DD>TEXT. The name of the id column in the input table.</DD>
+
+ <DT>output_table</DT>
+ <DD>TEXT. Name of the table where output predictions are written. If this
+table name is already in use, then an error is returned. Table contains:</DD>
+ <table class="output">
+ <tr>
+ <th>id</th>
+ <td>Gives the 'id' for each prediction, corresponding to each row from the data_table.</td>
+ </tr>
+ <tr>
+ <th>estimated_<COL_NAME></th>
+ <td>
+ (For pred_type='response') The estimated class
+ for classification or value for regression, where
+ <COL_NAME> is the name of the column to be
+ predicted from training data
+ </td>
+ </tr>
+ <tr>
+ <th>prob_<CLASS></th>
+ <td>
+ (For pred_type='prob' for classification) The
+ probability of a given class <CLASS> as given by
+ softmax. There will be one column for each class
+ in the training data.
+ </td>
+ </tr>
+
+
+ <DT>pred_type</DT>
+ <DD>TEXT.
+
+The type of output requested:
+'response' gives the actual prediction,
+'prob' gives the probability of each class.
+For regression, only pred_type='response' is defined.</DD>
+</DL>
+</table>
+
+@anchor example
+@par Examples
+-# Create an input data set.
+<pre class="example">
+CREATE TABLE iris_data(
+ id integer,
+ attributes numeric[],
+ class_text varchar,
+ class integer
+);
+INSERT INTO iris_data VALUES
+(1,ARRAY[5.1,3.5,1.4,0.2],'Iris-setosa',1),
+(2,ARRAY[4.9,3.0,1.4,0.2],'Iris-setosa',1),
+(3,ARRAY[4.7,3.2,1.3,0.2],'Iris-setosa',1),
+(4,ARRAY[4.6,3.1,1.5,0.2],'Iris-setosa',1),
+(5,ARRAY[5.0,3.6,1.4,0.2],'Iris-setosa',1),
+(6,ARRAY[5.4,3.9,1.7,0.4],'Iris-setosa',1),
+(7,ARRAY[4.6,3.4,1.4,0.3],'Iris-setosa',1),
+(8,ARRAY[5.0,3.4,1.5,0.2],'Iris-setosa',1),
+(9,ARRAY[4.4,2.9,1.4,0.2],'Iris-setosa',1),
+(10,ARRAY[4.9,3.1,1.5,0.1],'Iris-setosa',1),
+(11,ARRAY[7.0,3.2,4.7,1.4],'Iris-versicolor',2),
+(12,ARRAY[6.4,3.2,4.5,1.5],'Iris-versicolor',2),
+(13,ARRAY[6.9,3.1,4.9,1.5],'Iris-versicolor',2),
+(14,ARRAY[5.5,2.3,4.0,1.3],'Iris-versicolor',2),
+(15,ARRAY[6.5,2.8,4.6,1.5],'Iris-versicolor',2),
+(16,ARRAY[5.7,2.8,4.5,1.3],'Iris-versicolor',2),
+(17,ARRAY[6.3,3.3,4.7,1.6],'Iris-versicolor',2),
+(18,ARRAY[4.9,2.4,3.3,1.0],'Iris-versicolor',2),
+(19,ARRAY[6.6,2.9,4.6,1.3],'Iris-versicolor',2),
+(20,ARRAY[5.2,2.7,3.9,1.4],'Iris-versicolor',2);
+</pre>
+-# Generate a multilayer perceptron with a single hidden layer of 5 units.
+Use the attributes column as the independent variables, and use the class
+column as the classification. Set the tolerance to 0 so that 5000
+iterations will be run. Use a hyperbolic tangent activation function.
+The model will be written to mlp_model.
+<pre class="example">
+DROP TABLE IF EXISTS mlp_model;
+DROP TABLE IF EXISTS mlp_model_summary;
+SELECT madlib.mlp_classification(
+ 'iris_data', -- Source table
+ 'mlp_model', -- Destination table
+ 'attributes', -- Input features
+ 'class_text', -- Label
+ ARRAY[5], -- Number of units per layer
+ 'step_size=0.003,
+ n_iterations=5000,
+ tolerance=0', -- Optimizer params
+ 'tanh'); -- Activation function
+</pre>
+-# View the result for the model.
+<pre class="example">
+-- Set extended display on for easier reading of output
+\\x ON
+-- Neural net Initialization is non-deterministic, so your results may vary
+SELECT * FROM mlp_model;
+</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coeff | {1,1,1,1,1,0.136374930803,0.188739676875,0.662387810001,-1.03381622734,-0.469961067046,0.0614006983397,0.0811504589436,0.299008228258,-0.47391918521,-0.215098143699,0.10519213944,0.145844617525,0.511683525606,-0.800215552382,-0.36417142683,0.120751709056,0.167531106521,0.587074895969,-0.916946198095,-0.417055067449,0.0539541885146,0.0694359704131,0.262598585854,-0.419234805076,-0.189915344282,1,1,1,1,1,1,0.105645702152,1.46247470474,0.484457903226,0.965962824478,1.19361986431,0.419805760087,-0.105696503487,-1.46245956666,-0.484427811691,-0.965730981426,-1.19365280555,-0.419973628863}
+loss | 0.0184092375519
+num_iterations | 5000
+</pre>
+-# Next train a regression example. First create some test data. This dataset
+contains housing prices data.
+<pre class="example">
+CREATE TABLE lin_housing (id serial, x float8[], grp_by_col int, y float8);
+COPY lin_housing (x, grp_by_col, y) FROM STDIN NULL '?' DELIMITER '|';
+{1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98}|1|24.00
+{1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14}|1|21.60
+{1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03}|1|34.70
+{1,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94}|1|33.40
+{1,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33}|1|36.20
+{1,0.02985,0.00,2.180,0,0.4580,6.4300,58.70,6.0622,3,222.0,18.70,394.12,5.21}|1|28.70
+{1,0.08829,12.50,7.870,0,0.5240,6.0120,66.60,5.5605,5,311.0,15.20,395.60,12.43}|1|22.90
+{1,0.14455,12.50,7.870,0,0.5240,6.1720,96.10,5.9505,5,311.0,15.20,396.90,19.15}|1|27.10
+{1,0.21124,12.50,7.870,0,0.5240,5.6310,100.00,6.0821,5,311.0,15.20,386.63,29.93}|1|16.50
+{1,0.17004,12.50,7.870,0,0.5240,6.0040,85.90,6.5921,5,311.0,15.20,386.71,17.10}|1|18.90
+{1,0.22489,12.50,7.870,0,0.5240,6.3770,94.30,6.3467,5,311.0,15.20,392.52,20.45}|1|15.00
+{1,0.11747,12.50,7.870,0,0.5240,6.0090,82.90,6.2267,5,311.0,15.20,396.90,13.27}|1|18.90
+{1,0.09378,12.50,7.870,0,0.5240,5.8890,39.00,5.4509,5,311.0,15.20,390.50,15.71}|1|21.70
+{1,0.62976,0.00,8.140,0,0.5380,5.9490,61.80,4.7075,4,307.0,21.00,396.90,8.26}|1|20.40
+{1,0.63796,0.00,8.140,0,0.5380,6.0960,84.50,4.4619,4,307.0,21.00,380.02,10.26}|1|18.20
+{1,0.62739,0.00,8.140,0,0.5380,5.8340,56.50,4.4986,4,307.0,21.00,395.62,8.47}|1|19.90
+{1,1.05393,0.00,8.140,0,0.5380,5.9350,29.30,4.4986,4,307.0,21.00,386.85,6.58}|1| 23.10
+{1,0.78420,0.00,8.140,0,0.5380,5.9900,81.70,4.2579,4,307.0,21.00,386.75,14.67}|1|17.50
+{1,0.80271,0.00,8.140,0,0.5380,5.4560,36.60,3.7965,4,307.0,21.00,288.99,11.69}|1|20.20
+{1,0.72580,0.00,8.140,0,0.5380,5.7270,69.50,3.7965,4,307.0,21.00,390.95,11.28}|1|18.20
+\\.
+</pre>
+-# Now train a regression model using a multilayer perceptron with two hidden layers of five units each.
+<pre class="example">
+DROP TABLE IF EXISTS mlp_regress;
+DROP TABLE IF EXISTS mlp_regress_summary;
+SELECT madlib.mlp_regression(
+ 'lin_housing', -- Source table
+    'mlp_regress',           -- Destination table
+ 'x', -- Input features
+ 'y', -- Dependent variable
+ ARRAY[5,5], -- Number of units per layer
+ 'step_size=0.000007,
+ n_iterations=10000,
+ tolerance=0',
+ 'relu');
+</pre>
+-# Check the results of the model
+<pre class="example">
+-- Set extended display on for easier reading of output
+\\x ON
+-- Neural net Initialization is non-deterministic, so your results may vary
+SELECT * FROM mlp_regress;
+</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]--+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------
+coeff | {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.79506311399e-05,3.56715008915e-05,-6.09333559685e-05,0.000251228318768,-0.000224772841379,-3.71863030857e-05,-3.5757865148e-06,5.27936784854e-05,-2.48474166186e-05,6.19731184294e-05,3.07638968743e-05,6.8964698578e-06,0.000106016701083,-1.71484730318e-05,1.18691881812e-05,-0.000163975464208,0.000170026304906,3.11688265279e-05,0.000177050148787,-1.58265976603e-05,2.70144422657e-05,0.000112667883422,3.77575139073e-05,8.12474658795e-05,-7.90458917626e-05,0.000107566386158,-2.63771171506e-06,2.47996880915e-05,-0.00012642310887,0.000203827391081,0.000139315565565,4.86147243454e-05,-0.000176126471913,-6.47820782916e-05,-8.51592776447e-06,-6.60601176758e-05,2.91421874156e-05,6.3556873752e-05,0.000197557443129,0.000220531367259,0.000135036310289,0.000143735913975,-4.75034117786e-05,-0.000179547345838,-1.6919846786e-05,0.000162784312994,0.000268595819851,-0.000460066553287,8.69756071591e-05,-0.00311762727057,0.000126024763103,0.000205988242921
,0.003463432426,-0.00729789075286,0.00151625867549,-0.000890852767597,-0.00525016037249,0.0031043106659,0.00798041103839,-0.00552693050079,0.0232180415786,0.0230489850143,-0.0437890272341,0.0165765426407,-0.248554261758,-7.81336427846e-05,0.00558145591752,0.283465844585,-0.571699956182,0.133474351994,-0.0785181945605,-0.419269930709,0.249547772912,0.631761009875,-0.431305975666,1,1,1,1,1,1,0.0158747497572,-9.02809160806e-05,0.00015574347618,4.10805373863e-06,0.00121532434965,0.101790351335,0.0647558401493,-0.00013654998677,-9.92872075948e-06,-5.5319694394e-05,0.00519320756484,0.412736586036,0.0011998026977,-1.53688189815e-05,1.94817888201e-05,-4.63111489966e-05,7.24547899029e-05,0.00880394144485,5.45309822095e-05,-0.000140943219275,-7.96211486227e-05,-1.04337307472e-05,0.000161936762028,0.00136273797767,-4.54737243585e-05,-3.4083840736e-05,3.69286883662e-05,9.9047243188e-08,3.75014011824e-06,-9.45366086368e-08,1,1,1,1,1,1,6.67488547054,0.102754199001,0.41668912471,0.00886867296479,0
.00136206007228,-9.88642499013e-05}
+loss | 144.965776158
+num_iterations | 10000
+</pre>
+-# Now let's look at the prediction functions. In the following examples we will
+use the training data set for prediction as well, which is not usual but serves to
+show the syntax. First we will test the classification example.
+The prediction is in the estimated_class_text column with the
+actual value in the class_text column.
+<pre class="example">
+DROP TABLE IF EXISTS mlp_prediction;
+SELECT madlib.mlp_predict(
+ 'mlp_model', -- Model table
+ 'iris_data', -- Test data table
+ 'id', -- Id column in test table
+ 'mlp_prediction', -- Output table for predictions
+ 'response' -- Output classes, not probabilities
+ );
+</pre>
+-# View results
+<pre class="example">
+SELECT * FROM mlp_prediction JOIN iris_data USING (id);
+</pre>
+Result for the classification model:
+<pre class="result">
+ id | estimated_class_text | attributes | class_text | class
+----+----------------------+-------------------+-----------------+-------
+ 1 | Iris-setosa | {5.1,3.5,1.4,0.2} | Iris-setosa | 1
+ 2 | Iris-setosa | {4.9,3.0,1.4,0.2} | Iris-setosa | 1
+ 3 | Iris-setosa | {4.7,3.2,1.3,0.2} | Iris-setosa | 1
+ 4 | Iris-setosa | {4.6,3.1,1.5,0.2} | Iris-setosa | 1
+ 5 | Iris-setosa | {5.0,3.6,1.4,0.2} | Iris-setosa | 1
+ 6 | Iris-setosa | {5.4,3.9,1.7,0.4} | Iris-setosa | 1
+ 7 | Iris-setosa | {4.6,3.4,1.4,0.3} | Iris-setosa | 1
+ 8 | Iris-setosa | {5.0,3.4,1.5,0.2} | Iris-setosa | 1
+ 9 | Iris-setosa | {4.4,2.9,1.4,0.2} | Iris-setosa | 1
+ 10 | Iris-setosa | {4.9,3.1,1.5,0.1} | Iris-setosa | 1
+ 11 | Iris-versicolor | {7.0,3.2,4.7,1.4} | Iris-versicolor | 2
+ 12 | Iris-versicolor | {6.4,3.2,4.5,1.5} | Iris-versicolor | 2
+ 13 | Iris-versicolor | {6.9,3.1,4.9,1.5} | Iris-versicolor | 2
+ 14 | Iris-versicolor | {5.5,2.3,4.0,1.3} | Iris-versicolor | 2
+ 15 | Iris-versicolor | {6.5,2.8,4.6,1.5} | Iris-versicolor | 2
+ 16 | Iris-versicolor | {5.7,2.8,4.5,1.3} | Iris-versicolor | 2
+ 17 | Iris-versicolor | {6.3,3.3,4.7,1.6} | Iris-versicolor | 2
+ 18 | Iris-versicolor | {4.9,2.4,3.3,1.0} | Iris-versicolor | 2
+ 19 | Iris-versicolor | {6.6,2.9,4.6,1.3} | Iris-versicolor | 2
+ 20 | Iris-versicolor | {5.2,2.7,3.9,1.4} | Iris-versicolor | 2
+</pre>
+Prediction using the regression model:
+<pre class="example">
+DROP TABLE IF EXISTS mlp_regress_prediction;
+SELECT madlib.mlp_predict(
+ 'mlp_regress', -- Model table
+ 'lin_housing', -- Test data table
+ 'id', -- Id column in test table
+ 'mlp_regress_prediction', -- Output table for predictions
+ 'response' -- Output values, not probabilities
+ );
+</pre>
+-# View results
+<pre class="example">
+SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (id);
+</pre>
+Result for the regression model:
+<pre class="result">
+ id | x | grp_by_col | y | estimated_y
+----+-------------------------------------------------------------------------+------------+------+--------------------
+ 1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98} | 1 | 24 | {23.2627062018087}
+ 2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14} | 1 | 21.6 | {25.7088419115781}
+ 3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03} | 1 | 34.7 | {27.5587003901404}
+ 4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94} | 1 | 33.4 | {31.1812237427816}
+ 5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33} | 1 | 36.2 | {30.3696873085477}
+ 6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21} | 1 | 28.7 | {29.5290259241882}
+ 7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43} | 1 | 22.9 | {21.1576051716888}
+ 8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15} | 1 | 27.1 | {17.6194200563055}
+ 9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93} | 1 | 16.5 | {15.1366297774139}
+10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1} | 1 | 18.9 | {17.6528662199369}
+11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} | 1 | 15 | {17.2017487668181}
+12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27} | 1 | 18.9 | {19.4893860319992}
+13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71} | 1 | 21.7 | {23.2917226708039}
+14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26} | 1 | 20.4 | {22.8904812605193}
+15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26} | 1 | 18.2 | {18.2386754423677}
+16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47} | 1 | 19.9 | {23.28949550874}
+17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58} | 1 | 23.1 | {25.3288762085473}
+18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67} | 1 | 17.5 | {19.0203738118451}
+19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69} | 1 | 20.2 | {12.3162005347545}
+20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28} | 1 | 18.2 | {21.0902211848747}
+</pre>
+Note that the results you get for all examples may vary with the platform you are using.
+
+@anchor background
+@par Technical Background
+
+To train a neural net, the respective loss function is minimized using stochastic gradient descent.
+In the case of classification, the loss function is cross entropy. For regression, mean square error
+is used. Weights in the neural net are updated via the backpropagation process, which uses dynamic
+programming to compute the partial derivative of each weight with respect to the overall loss. This
+partial derivative incorporates the respective activation function used, so this requires that the
+activation function be differentiable.
+
+For an overview of multilayer perceptrons, see website [1].
+
+For details on backpropagation, see the notes at [2].
+
+@anchor literature
+@literature
+
+@anchor mlp-lit-1
+[1] "Multilayer Perceptron." Wikipedia. Wikimedia Foundation,
+ 12 July 2017. Web. 12 July 2017.
+
+[2] Yu Hen Hu. "Lecture 11. MLP (III): Back-Propagation."
+ University of Wisconsin Madison: Computer-Aided Engineering. Web. 12 July 2017,
+ http://homepages.cae.wisc.edu/~ece539/videocourse/notes/pdf/lec%2011%20MLP%20(3)%20BP.pdf
+
+@anchor related
+@par Related Topics
+
+File mlp.sql_in documenting the training function
+
+*/
+
+
+CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
+ coeff DOUBLE PRECISION[],
+ loss DOUBLE PRECISION
+);
+
+--------------------------------------------------------------------------
+-- create SQL functions for IGD optimizer
+--------------------------------------------------------------------------
+CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_transition(
+ state DOUBLE PRECISION[],
+ start_vec DOUBLE PRECISION[],
+ end_vec DOUBLE PRECISION[],
+ previous_state DOUBLE PRECISION[],
+ layer_sizes DOUBLE PRECISION[],
+ stepsize DOUBLE PRECISION,
+ activation INTEGER,
+ is_classification INTEGER)
+RETURNS DOUBLE PRECISION[]
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE;
+
+CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_merge(
+ state1 DOUBLE PRECISION[],
+ state2 DOUBLE PRECISION[])
+RETURNS DOUBLE PRECISION[]
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_final(
+ state DOUBLE PRECISION[])
+RETURNS DOUBLE PRECISION[]
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+/**
+ * @internal
+ * @brief Perform one iteration of backprop
+ */
+CREATE AGGREGATE MADLIB_SCHEMA.mlp_igd_step(
+ /* start_vec*/ DOUBLE PRECISION[],
+ /* end_vec */ DOUBLE PRECISION[],
+ /* previous_state */ DOUBLE PRECISION[],
+ /* layer_sizes */ DOUBLE PRECISION[],
+ /* stepsize */ DOUBLE PRECISION,
+ /* activation */ INTEGER,
+ /* is_classification */ INTEGER )(
+ STYPE=DOUBLE PRECISION[],
+ SFUNC=MADLIB_SCHEMA.mlp_igd_transition,
+ m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
+ FINALFUNC=MADLIB_SCHEMA.mlp_igd_final,
+ INITCOND='{0,0,0,0,0,0,0,0}'
+);
+-------------------------------------------------------------------------
+
+CREATE FUNCTION MADLIB_SCHEMA.internal_mlp_igd_distance(
+ /*+ state1 */ DOUBLE PRECISION[],
+ /*+ state2 */ DOUBLE PRECISION[])
+RETURNS DOUBLE PRECISION AS
+'MODULE_PATHNAME'
+LANGUAGE c IMMUTABLE STRICT;
+
+CREATE FUNCTION MADLIB_SCHEMA.internal_mlp_igd_result(
+ /*+ state */ DOUBLE PRECISION[])
+RETURNS MADLIB_SCHEMA.mlp_result AS
+'MODULE_PATHNAME'
+LANGUAGE c IMMUTABLE STRICT;
+-------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR
+) RETURNS VOID AS $$
+ PythonFunctionBodyOnly(`convex', `mlp_igd')
+ mlp_igd.mlp(
+ schema_madlib,
+ source_table,
+ output_table,
+ independent_varname,
+ dependent_varname,
+ hidden_layer_sizes,
+ optimizer_params,
+ activation,
+ True
+ )
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ independent_varname VARCHAR,
+ dependent_varname VARCHAR,
+ hidden_layer_sizes INTEGER[],
+ optimizer_params VARCHAR,
+ activation VARCHAR
+) RETURNS VOID AS $$
+ PythonFunctionBodyOnly(`convex', `mlp_igd')
+ mlp_igd.mlp(
+ schema_madlib,
+ source_table,
+ output_table,
+ independent_varname,
+ dependent_varname,
+ hidden_layer_sizes,
+ optimizer_params,
+ activation,
+ False
+ )
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
+ model_table VARCHAR,
+ data_table VARCHAR,
+ id_col_name VARCHAR,
+ output_table VARCHAR,
+ pred_type VARCHAR
+) RETURNS VOID AS $$
+ PythonFunctionBodyOnly(`convex', `mlp_igd')
+ mlp_igd.mlp_predict(
+ schema_madlib,
+ model_table,
+ data_table,
+ id_col_name,
+ output_table,
+ pred_type)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE FUNCTION MADLIB_SCHEMA.internal_predict_mlp(
+ coeff DOUBLE PRECISION[],
+ independent_varname DOUBLE PRECISION[],
+ is_classification DOUBLE PRECISION,
+ activation_function DOUBLE PRECISION,
+ layer_sizes DOUBLE PRECISION[],
+ is_response INTEGER
+ )
+RETURNS DOUBLE PRECISION[]
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
+ message TEXT
+) RETURNS TEXT AS $$
+ PythonFunctionBodyOnly(`convex', `mlp_igd')
+ return mlp_igd.mlp_help(schema_madlib,message,True)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification()
+RETURNS TEXT AS $$
+ SELECT MADLIB_SCHEMA.mlp_classification(NULL::TEXT)
+$$ LANGUAGE SQL IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
+ message TEXT
+) RETURNS TEXT AS $$
+ PythonFunctionBodyOnly(`convex', `mlp_igd')
+ return mlp_igd.mlp_help(schema_madlib,message,False)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression()
+RETURNS TEXT AS $$
+ SELECT MADLIB_SCHEMA.mlp_regression(NULL::TEXT)
+$$ LANGUAGE SQL IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
+ message TEXT
+) RETURNS TEXT AS $$
+ PythonFunctionBodyOnly(`convex', `mlp_igd')
+ return mlp_igd.mlp_predict_help(schema_madlib,message)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict()
+RETURNS TEXT AS $$
+ SELECT MADLIB_SCHEMA.mlp_predict(NULL::TEXT)
+$$ LANGUAGE SQL IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4fcb60ed/src/ports/postgres/modules/convex/mlp_igd.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
new file mode 100644
index 0000000..6cea7b0
--- /dev/null
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -0,0 +1,752 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+@file mlp_igd.py_in
+
+@brief Multilayer perceptron using IGD: Driver functions
+
+@namespace mlp_igd
+"""
+import plpy
+
+from utilities.control import MinWarning
+from utilities.utilities import add_postfix
+from utilities.utilities import py_list_to_sql_string
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import _assert
+from utilities.utilities import unique_string
+from utilities.utilities import strip_end_quotes
+
+from utilities.validate_args import cols_in_tbl_valid
+from utilities.validate_args import input_tbl_valid
+from utilities.validate_args import is_var_valid
+from utilities.validate_args import output_tbl_valid
+from utilities.validate_args import get_expr_type
+from utilities.validate_args import array_col_has_same_dimension
+from utilities.validate_args import array_col_dimension
+
+
def mlp(schema_madlib, source_table, output_table, independent_varname,
        dependent_varname, hidden_layer_sizes,
        optimizer_param_str, activation, is_classification, **kwargs):
    """Train a multilayer perceptron by repeatedly invoking the IGD step
    aggregate in SQL, then materialize the model and summary tables.

    Creates <output_table> (coeff, loss, num_iterations) via
    _build_model_table, and <output_table>_summary with the training
    parameters.

    Args:
        @param schema_madlib        Schema where MADlib is installed
        @param source_table         Table/view holding the training data
        @param output_table         Name of the model table to create
        @param independent_varname  Column holding the feature array
        @param dependent_varname    Column/expression holding the label
        @param hidden_layer_sizes   List of hidden-layer unit counts
        @param optimizer_param_str  Comma-separated 'key=value' settings
        @param activation           Activation name (unique prefix allowed)
        @param is_classification    True: classification; False: regression

    Returns:
        None
    """
    with MinWarning('warning'):
        optimizer_params = _get_optimizer_params(optimizer_param_str or "")
        summary_table = add_postfix(output_table, "_summary")
        _validate_args(source_table, output_table, summary_table, independent_varname,
                       dependent_varname, hidden_layer_sizes,
                       optimizer_params, is_classification)

        current_iteration = 1
        prev_state = None
        tolerance = optimizer_params["tolerance"]
        n_iterations = optimizer_params["n_iterations"]
        step_size = optimizer_params["step_size"]
        # NOTE(review): n_tries is parsed and recorded in the summary table,
        # but the loop below performs a single training run -- confirm whether
        # random restarts are implemented elsewhere.
        n_tries = optimizer_params["n_tries"]
        activation_name = _get_activation_function_name(activation)
        activation_index = _get_activation_index(activation_name)
        num_input_nodes = array_col_dimension(
            source_table, independent_varname)
        num_output_nodes = 0
        classes = []
        dependent_type = get_expr_type(dependent_varname, source_table)
        # Keep the user's original expression for the summary table; the
        # variable below may be rewritten into a one-hot/array expression.
        original_dependent_varname = dependent_varname

        if is_classification:
            # Enumerate the distinct labels and build a one-hot encoding
            # expression of the form ARRAY[label=c1, label=c2, ...]::integer[]
            dependent_variable_sql = """
                SELECT DISTINCT {dependent_varname}
                FROM {source_table}
                """.format(dependent_varname=dependent_varname,
                           source_table=source_table)
            labels = plpy.execute(dependent_variable_sql)
            one_hot_dependent_varname = 'ARRAY['
            num_output_nodes = len(labels)
            for label_obj in labels:
                label = _format_label(label_obj[dependent_varname])
                classes.append(label)
                one_hot_dependent_varname += dependent_varname + \
                    "=" + str(label) + ","
            # Remove the last comma
            one_hot_dependent_varname = one_hot_dependent_varname[:-1]
            one_hot_dependent_varname += ']::integer[]'
            dependent_varname = one_hot_dependent_varname
        else:
            if "[]" not in dependent_type:
                # Scalar regression target: wrap in a one-element array so the
                # aggregate always receives an array input.
                dependent_varname = "ARRAY[" + dependent_varname + "]"
            num_output_nodes = array_col_dimension(
                source_table, dependent_varname)
        # Full network topology: input layer + hidden layers + output layer.
        layer_sizes = [num_input_nodes] + \
            hidden_layer_sizes + [num_output_nodes]

        while True:
            if prev_state:
                prev_state_str = py_list_to_sql_string(
                    prev_state, array_type="double precision")
            else:
                prev_state_str = "(NULL)::DOUBLE PRECISION[]"
            train_sql = """
                SELECT
                    {schema_madlib}.mlp_igd_step(
                        ({independent_varname})::DOUBLE PRECISION[],
                        ({dependent_varname})::DOUBLE PRECISION[],
                        {prev_state},
                        {layer_sizes},
                        ({step_size})::FLOAT8,
                        {activation},
                        {is_classification}) as curr_state
                FROM {source_table} AS _src
                """.format(schema_madlib=schema_madlib,
                           independent_varname=independent_varname,
                           dependent_varname=dependent_varname,
                           prev_state=prev_state_str,
                           # C++ uses double internally
                           layer_sizes=py_list_to_sql_string(layer_sizes,
                                                             array_type="double precision"),
                           step_size=step_size,
                           source_table=source_table,
                           activation=activation_index,
                           is_classification=int(is_classification))
            curr_state = plpy.execute(train_sql)[0]["curr_state"]
            dist_sql = """
                SELECT {schema_madlib}.internal_mlp_igd_distance(
                        {prev_state},
                        {curr_state}) as state_dist
                """.format(schema_madlib=schema_madlib,
                           prev_state=prev_state_str,
                           curr_state=py_list_to_sql_string(curr_state, "double precision"))
            state_dist = plpy.execute(dist_sql)[0]["state_dist"]
            # On the first pass prev_state is NULL, so state_dist is NULL and
            # the tolerance test cannot stop training before one iteration.
            if ((state_dist and state_dist < tolerance) or
                    current_iteration > n_iterations):
                break
            prev_state = curr_state
            current_iteration += 1
        # NOTE(review): the configured n_iterations (not the actual number of
        # iterations executed, current_iteration) is what gets recorded as
        # num_iterations in the model table -- confirm this is intended.
        _build_model_table(schema_madlib, output_table,
                           curr_state, n_iterations)
        layer_sizes_str = py_list_to_sql_string(
            layer_sizes, array_type="integer")
        # Strip the SQL quoting added by _format_label before storing classes.
        classes_str = py_list_to_sql_string(
            [strip_end_quotes(cl, "'") for cl in classes],
            array_type=dependent_type)
        summary_table_creation_query = """
            CREATE TABLE {summary_table}(
                source_table TEXT,
                independent_varname TEXT,
                dependent_varname TEXT,
                tolerance FLOAT,
                step_size FLOAT,
                n_iterations INTEGER,
                n_tries INTEGER,
                layer_sizes INTEGER[],
                activation_function TEXT,
                is_classification BOOLEAN,
                classes {dependent_type}[]
            )""".format(summary_table=summary_table,
                        dependent_type=dependent_type)

        summary_table_update_query = """
            INSERT INTO {summary_table} VALUES(
                '{source_table}',
                '{independent_varname}',
                '{original_dependent_varname}',
                {tolerance},
                {step_size},
                {n_iterations},
                {n_tries},
                {layer_sizes_str},
                '{activation_name}',
                {is_classification},
                {classes_str}
            )
            """.format(**locals())
        plpy.execute(summary_table_creation_query)
        plpy.execute(summary_table_update_query)
+# ----------------------------------------------------------------------
+
+
def _build_model_table(schema_madlib, output_table, final_state, n_iterations):
    """Materialize the model table from the final IGD state.

    Calls internal_mlp_igd_result to unpack the flat state array into the
    (coeff, loss) composite, and stores the iteration count alongside.

    @param schema_madlib  Schema where MADlib is installed
    @param output_table   Name of the model table to create
    @param final_state    Flat list of doubles: the final optimizer state
    @param n_iterations   Iteration count to record in the table
    """
    state_as_sql_array = py_list_to_sql_string(
        final_state, array_type="double precision")

    create_stmt = """
        CREATE TABLE {output_table} AS
        SELECT
            (result).coeff AS coeff,
            (result).loss AS loss,
            {n_iterations} AS num_iterations
            -- (result).num_rows_processed AS num_rows_processed,
            -- n_tuples_including_nulls - (result).num_rows_processed
        FROM (
            SELECT
                {schema_madlib}.internal_mlp_igd_result(
                    {final_state_str}
                ) AS result
        ) rel_state_subq
        """.format(output_table=output_table,
                   n_iterations=n_iterations,
                   schema_madlib=schema_madlib,
                   final_state_str=state_as_sql_array)
    plpy.execute(create_stmt)
+# ----------------------------------------------------------------------
+
+
+def _get_optimizer_params(param_str):
+ params_defaults = {
+ "step_size": (0.001, float),
+ "n_iterations": (100, int),
+ "n_tries": (1, int),
+ "tolerance": (0.001, float),
+ }
+ param_defaults = dict([(k, v[0]) for k, v in params_defaults.items()])
+ param_types = dict([(k, v[1]) for k, v in params_defaults.items()])
+
+ if not param_str:
+ return param_defaults
+
+ name_value = extract_keyvalue_params(param_str, param_types, param_defaults,
+ ignore_invalid=False)
+ return name_value
+# ----------------------------------------------------------------------
+
+
def _validate_args_classification(source_table, dependent_varname):
    """Check that a classification label column has a supported type.

    Accepts any array type, or a scalar integer/text/boolean type; raises
    via _assert otherwise.
    """
    label_type = get_expr_type(dependent_varname, source_table)
    scalar_types = ['integer', 'smallint', 'bigint',
                    'text', 'varchar', 'character varying', 'char',
                    'character', 'boolean']
    is_array = "[]" in label_type
    _assert(is_array or label_type in scalar_types,
            "Dependent variable column should refer to an "
            "integer, boolean, text, varchar, or character type.")
+# ----------------------------------------------------------------------
+
+
def _validate_args_regression(source_table, dependent_varname):
    """Check that a regression target is numeric or a numeric array.

    Array targets must additionally have a uniform length across all rows;
    raises via _assert on any violation.
    """
    target_type = get_expr_type(dependent_varname, source_table)
    numeric_types = ['integer', 'smallint', 'bigint',
                     'double precision', 'real']
    is_array = "[]" in target_type
    _assert(is_array or target_type in numeric_types,
            "Dependent variable column should refer to an array or numeric type")
    if is_array:
        _assert(array_col_has_same_dimension(source_table, dependent_varname),
                "Dependent variable column should refer to arrays of the same length")
+# ----------------------------------------------------------------------
+
+
def _validate_args(source_table, output_table, summary_table, independent_varname,
                   dependent_varname, hidden_layer_sizes,
                   optimizer_params, is_classification):
    """Validate all user-supplied MLP training arguments.

    Checks table existence/non-existence, column validity, hidden-layer
    sizes, optimizer parameter ranges, and the shape/type of the feature
    and label columns. Raises (via _assert / the validators) on failure.

    @param source_table        Input table; must exist
    @param output_table        Model table; must not exist
    @param summary_table       Summary table; must not exist
    @param independent_varname Feature array column in source_table
    @param dependent_varname   Label column/expression in source_table
    @param hidden_layer_sizes  List of positive ints, one per hidden layer
    @param optimizer_params    Parsed optimizer parameter dict
    @param is_classification   Selects label-type validation rules
    """
    input_tbl_valid(source_table, "MLP")
    output_tbl_valid(output_table, "MLP")
    output_tbl_valid(summary_table, "MLP")
    _assert(is_var_valid(source_table, independent_varname),
            "MLP error: invalid independent_varname "
            "('{independent_varname}') for source_table "
            "({source_table})!".format(independent_varname=independent_varname,
                                       source_table=source_table))

    _assert(is_var_valid(source_table, dependent_varname),
            "MLP error: invalid dependent_varname "
            "('{dependent_varname}') for source_table "
            "({source_table})!".format(dependent_varname=dependent_varname,
                                       source_table=source_table))
    _assert(hidden_layer_sizes is not None,
            "hidden_layer_sizes may not be null")
    _assert(isinstance(hidden_layer_sizes, list),
            "hidden_layer_sizes must be an array of integers")
    _assert(all(isinstance(value, int) for value in hidden_layer_sizes),
            "MLP error: Hidden layers sizes must be integers")
    # Fixed: was `value >= 0`, which allowed degenerate 0-unit layers while
    # the error message (correctly) demands strictly positive sizes.
    _assert(all(value > 0 for value in hidden_layer_sizes),
            "MLP error: Hidden layers sizes must be greater than 0.")
    _assert(optimizer_params["tolerance"] >= 0,
            "MLP error: Tolerance should be greater than or equal to 0.")
    _assert(optimizer_params["n_tries"] >= 1,
            "MLP error: Number of tries should be greater than or equal to 1")
    _assert(optimizer_params["n_iterations"] >= 1,
            "MLP error: Number of iterations should be greater than or equal to 1")
    _assert(optimizer_params["step_size"] > 0,
            "MLP error: Stepsize should be greater than 0.")
    # Features must be an array column with a uniform length across rows.
    _assert("[]" in get_expr_type(independent_varname, source_table),
            "Independent variable column should refer to an array")
    _assert(array_col_has_same_dimension(source_table, independent_varname),
            "Independent variable column should refer to arrays of the same length")

    if is_classification:
        _validate_args_classification(source_table, dependent_varname)
    else:
        _validate_args_regression(source_table, dependent_varname)
+# ----------------------------------------------------------------------
+
+
+def _get_activation_function_name(activation_function):
+ if not activation_function:
+ activation_function = 'sigmoid'
+ else:
+ # Add non-linear kernels below after implementing them.
+ supported_activation_function = ['sigmoid', 'tanh', 'relu']
+ try:
+ # allow user to specify a prefix substring of
+ # supported kernels. This works because the supported
+ # kernels have unique prefixes.
+ activation_function = next(x for x in supported_activation_function
+ if x.startswith(activation_function))
+ except StopIteration:
+ # next() returns a StopIteration if no element found
+ plpy.error("MLP Error: Invalid activation function: "
+ "{0}. Supported activation functions are ({1})"
+ .format(activation_function, ','.join(
+ sorted(supported_activation_function))))
+ return activation_function
+# ------------------------------------------------------------------------------
+
+
+def _get_activation_index(activation_name):
+ table = {"relu": 0, "sigmoid": 1, "tanh": 2}
+ return table[activation_name]
+
+
+def _format_label(label):
+ if isinstance(label, str):
+ return "'" + label + "'"
+ return label
+# -------------------------------------------------------------------------
+
+
def mlp_predict(schema_madlib, model_table, data_table,
                id_col_name, output_table,
                pred_type='response', **kwargs):
    """ Score new observations using a trained neural network

    Writes one row per input row into output_table: the id column plus
    either the predicted value/class (pred_type='response') or one
    probability column per class (pred_type='prob', classification only).

    @param schema_madlib Name of the schema where MADlib is installed
    @param model_table Name of learned model
    @param data_table Name of table/view containing the data
                      points to be scored
    @param id_col_name Name of column in source_table containing
                       (integer) identifier for data point
    @param output_table Name of table to store the results
    @param pred_type: str, The type of output required:
                      'response' gives the actual response values,
                      'prob' gives the probability of the classes.
                      For regression, only type='response' is defined.
    """
    # --- Validate the model table and its summary companion ---
    input_tbl_valid(model_table, 'MLP')
    cols_in_tbl_valid(model_table, ['coeff'], 'MLP')
    summary_table = add_postfix(model_table, "_summary")
    input_tbl_valid(summary_table, 'MLP')
    cols_in_tbl_valid(summary_table,
                      ['dependent_varname', 'independent_varname',
                       'activation_function',
                       'tolerance', 'step_size', 'n_iterations',
                       'n_tries', 'classes', 'layer_sizes', 'source_table'],
                      'MLP')

    # --- Load model coefficients and training-time metadata ---
    summary = plpy.execute("SELECT * FROM {0}".format(summary_table))[0]
    coeff = py_list_to_sql_string(plpy.execute(
        "SELECT * FROM {0}".format(model_table))[0]["coeff"])
    dependent_varname = summary['dependent_varname']
    independent_varname = summary['independent_varname']
    source_table = summary['source_table']
    activation_function = _get_activation_index(summary['activation_function'])
    layer_sizes = py_list_to_sql_string(
        summary['layer_sizes'], array_type="DOUBLE PRECISION")
    is_classification = int(summary["is_classification"])
    is_response = int(pred_type == 'response')

    pred_name = ('"prob_{0}"' if pred_type == "prob" else
                 '"estimated_{0}"').format(dependent_varname.replace('"', '').strip())

    # --- Validate the table being scored ---
    input_tbl_valid(data_table, 'MLP')

    _assert(is_var_valid(data_table, independent_varname),
            "MLP Error: independent_varname ('{0}') is invalid for data_table ({1})".
            format(independent_varname, data_table))
    _assert(id_col_name is not None, "MLP Error: id_col_name is NULL")
    _assert(is_var_valid(data_table, id_col_name),
            "MLP Error: id_col_name ('{0}') is invalid for {1}".
            format(id_col_name, data_table))
    output_tbl_valid(output_table, 'MLP')

    with MinWarning("warning"):
        header = "CREATE TABLE " + output_table + " AS "
        # Regression
        if not is_classification:
            dependent_type = get_expr_type(dependent_varname, source_table)
            unnest_if_not_array = ""
            # Return the same type as the user provided. Internally we always
            # use an array, but if they provided a scalar, unnest it for the
            # user
            if "[]" not in dependent_type:
                unnest_if_not_array = "UNNEST"
            sql = header + """
                SELECT {id_col_name},
                       {unnest_if_not_array}({schema_madlib}.internal_predict_mlp(
                            {coeff},
                            {independent_varname}::DOUBLE PRECISION[],
                            {is_classification},
                            {activation_function},
                            {layer_sizes},
                            {is_response}
                        )) as {pred_name}
                FROM {data_table}
                """
        else:
            summary_query = """
                SELECT classes FROM {0}
                """.format(summary_table)
            classes = plpy.execute(summary_query)[0]['classes']
            if pred_type == "response":
                # This join is to recover the class name from the summary table,
                # as prediction just returns an index
                sql = header + """
                    SELECT
                         q.{id_col_name}
                        ,(ARRAY{classes})[pred_idx[1]+1] as {pred_name}
                    FROM (
                        SELECT
                            {id_col_name},
                            {schema_madlib}.internal_predict_mlp(
                                {coeff}::DOUBLE PRECISION[],
                                {independent_varname}::DOUBLE PRECISION[],
                                {is_classification},
                                {activation_function},
                                {layer_sizes},
                                {is_response}
                            )
                            as pred_idx
                        FROM {data_table}
                    ) q
                    """
            else:
                # pred_type='prob': expand the returned probability array into
                # one "estimated_prob_<class>" column per training class.
                intermediate_col = unique_string()
                score_format = ',\n'.join([
                    'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'.
                    format(j=i + 1, c_str=str(c).strip(' "'),
                           interim=intermediate_col)
                    for i, c in enumerate(classes)])
                sql = header + """
                    SELECT
                        {id_col_name},
                        {score_format}
                    FROM (
                        SELECT {id_col_name},
                               {schema_madlib}.internal_predict_mlp(
                                   {coeff}::DOUBLE PRECISION[],
                                   {independent_varname}::DOUBLE PRECISION[],
                                   {is_classification},
                                   {activation_function},
                                   {layer_sizes},
                                   {is_response}
                               )::TEXT[]
                               AS {intermediate_col}
                        FROM {data_table}
                    ) q
                    """
        sql = sql.format(**locals())
        plpy.execute(sql)
+# ----------------------------------------------------------------------
+
+
def mlp_help(schema_madlib, message, is_classification):
    """Return help text for mlp_classification / mlp_regression.

    @param schema_madlib     Schema where MADlib is installed
    @param message           None/'' -> summary; 'usage'/'help'/'?' -> usage;
                             'example' -> worked example; else a fallback hint
    @param is_classification Selects the classification or regression text
    """
    method = 'mlp_classification' if is_classification else 'mlp_regression'
    int_types = ['integer', 'smallint', 'bigint']
    text_types = ['text', 'varchar', 'character varying', 'char', 'character']
    boolean_types = ['boolean']
    # The 33-space indent aligns continuation lines with the description
    # column of the usage text below.
    supported_types = " " * 33 + ", ".join(text_types) + "\n" +\
        " " * 33 + ", ".join(int_types + boolean_types)
    label_description_classification = "Name of a column which specifies label.\n" +\
        " " * 33 + "Supported types are:\n" + supported_types
    label_description_regression = "Dependent variable. May be an array for multiple\n" +\
        " " * 33 + "regression or the name of a column which is any\n" + " " * 33 +\
        "numeric type for single regression"
    label_description = label_description_classification if is_classification\
        else label_description_regression
    args = dict(schema_madlib=schema_madlib, method=method,
                label_description=label_description)

    summary = """
    ----------------------------------------------------------------
                            SUMMARY
    ----------------------------------------------------------------
    Multilayer Perceptron (MLP) is a model for regression and
    classification

    Also called "vanilla neural networks", they consist of several
    fully connected hidden layers with non-linear activation
    functions.

    For more details on function usage:
        SELECT {schema_madlib}.{method}('usage')

    For a small example on using the function:
        SELECT {schema_madlib}.{method}('example')""".format(**args)

    usage = """
    ---------------------------------------------------------------------------
                                    USAGE
    ---------------------------------------------------------------------------
    SELECT {schema_madlib}.{method}(
        source_table,         -- name of input table
        output_table,         -- name of output model table
        independent_varname,  -- name of independent variable
        dependent_varname,    -- {label_description}
        hidden_layer_sizes,   -- Array of integers indicating the
                                 number of hidden units per layer.
                                 Length equal to the number of hidden layers.
        optimizer_params,     -- optional, default NULL
                                 parameters for optimization in
                                 a comma-separated string of key-value pairs.

            step_size DOUBLE PRECISION,   -- Default: 0.001
                                             Learning rate
            n_iterations INTEGER,         -- Default: 100
                                             Number of iterations per try
            n_tries INTEGER,              -- Default: 1
                                             Total number of training cycles,
                                             with random initializations to avoid
                                             local minima.
            tolerance DOUBLE PRECISION,   -- Default: 0.001
                                             If the distance in loss between
                                             two iterations is less than the
                                             tolerance training will stop, even if
                                             n_iterations has not been reached

        activation            -- optional, default: 'sigmoid'.
                                 supported activations: 'relu', 'sigmoid',
                                 and 'tanh'
    );


    ---------------------------------------------------------------------------
                                    OUTPUT
    ---------------------------------------------------------------------------
    The model table produced by MLP contains the following columns:

    coeffs             -- Flat array containing the weights of the neural net

    loss               -- The total loss over the training data. Cross entropy
                          for classification and MSE for regression

    num_iterations     -- The total number of training iterations

    """.format(**args)

    # NOTE: this example text is deliberately NOT run through str.format --
    # the literal {...} array braces in the COPY data would break it.
    regression_example = """
    - Create input table

    CREATE TABLE lin_housing_wi (id serial, x float8[], grp_by_col int, y float8);
    COPY lin_housing_wi (x, grp_by_col, y) FROM STDIN NULL '?' DELIMITER '|';
    {1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98} | 1 | 24.00
    {1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14} | 1 | 21.60
    {1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03} | 1 | 34.70
    {1,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94} | 1 | 33.40
    {1,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33} | 1 | 36.20
    {1,0.02985,0.00,2.180,0,0.4580,6.4300,58.70,6.0622,3,222.0,18.70,394.12,5.21} | 1 | 28.70
    {1,0.08829,12.50,7.870,0,0.5240,6.0120,66.60,5.5605,5,311.0,15.20,395.60,12.43} | 1 | 22.90
    {1,0.14455,12.50,7.870,0,0.5240,6.1720,96.10,5.9505,5,311.0,15.20,396.90,19.15} | 1 | 27.10
    {1,0.21124,12.50,7.870,0,0.5240,5.6310,100.00,6.0821,5,311.0,15.20,386.63,29.93} | 1 | 16.50
    {1,0.17004,12.50,7.870,0,0.5240,6.0040,85.90,6.5921,5,311.0,15.20,386.71,17.10} | 1 | 18.90
    {1,0.22489,12.50,7.870,0,0.5240,6.3770,94.30,6.3467,5,311.0,15.20,392.52,20.45} | 1 | 15.00
    {1,0.11747,12.50,7.870,0,0.5240,6.0090,82.90,6.2267,5,311.0,15.20,396.90,13.27} | 1 | 18.90
    {1,0.09378,12.50,7.870,0,0.5240,5.8890,39.00,5.4509,5,311.0,15.20,390.50,15.71} | 1 | 21.70
    \.

    - Generate a multilayer perception with a two hidden layers of 5 units
    each. Use the x column as the independent variables, and use the class
    column as the classification. Set the tolerance to 0 so that 300
    iterations will be run. Use a sigmoid activation function.
    The model will be written to mlp_regress_result.

    SELECT mlp_regression(
        'lin_housing_wi',           -- Source table
        'mlp_regress_result',       -- Desination table
        'x',                        -- Independent variable
        'y',                        -- Dependent variable
        ARRAY[5,5],                 -- Number of hidden units per layer
        'step_size=0.007,
        n_iterations=300,
        tolerance=0',
        'sigmoid');                 -- Activation

    """

    classification_example = """
    -- Create input table

    CREATE TABLE iris_data(
        id integer,
        attributes numeric[],
        class_text varchar,
        class integer
    );

    INSERT INTO iris_data VALUES
    (1,ARRAY[5.1,3.5,1.4,0.2],'Iris-setosa',1),
    (2,ARRAY[4.9,3.0,1.4,0.2],'Iris-setosa',1),
    (3,ARRAY[4.7,3.2,1.3,0.2],'Iris-setosa',1),
    (4,ARRAY[4.6,3.1,1.5,0.2],'Iris-setosa',1),
    (5,ARRAY[5.0,3.6,1.4,0.2],'Iris-setosa',1),
    (6,ARRAY[5.4,3.9,1.7,0.4],'Iris-setosa',1),
    (7,ARRAY[4.6,3.4,1.4,0.3],'Iris-setosa',1),
    (8,ARRAY[5.0,3.4,1.5,0.2],'Iris-setosa',1),
    (9,ARRAY[4.4,2.9,1.4,0.2],'Iris-setosa',1),
    (10,ARRAY[4.9,3.1,1.5,0.1],'Iris-setosa',1),
    (11,ARRAY[7.0,3.2,4.7,1.4],'Iris-versicolor',2),
    (12,ARRAY[6.4,3.2,4.5,1.5],'Iris-versicolor',2),
    (13,ARRAY[6.9,3.1,4.9,1.5],'Iris-versicolor',2),
    (14,ARRAY[5.5,2.3,4.0,1.3],'Iris-versicolor',2),
    (15,ARRAY[6.5,2.8,4.6,1.5],'Iris-versicolor',2),
    (16,ARRAY[5.7,2.8,4.5,1.3],'Iris-versicolor',2),
    (17,ARRAY[6.3,3.3,4.7,1.6],'Iris-versicolor',2),
    (18,ARRAY[4.9,2.4,3.3,1.0],'Iris-versicolor',2),
    (19,ARRAY[6.6,2.9,4.6,1.3],'Iris-versicolor',2),
    (20,ARRAY[5.2,2.7,3.9,1.4],'Iris-versicolor',2);


    -- Generate a multilayer perception with a single hidden layer of 5 units.
    Use the attributes column as the independent variables, and use the class
    column as the classification. Set the tolerance to 0 so that 1000
    iterations will be run. Use a hyperbolic tangent activation function.
    The model will be written to mlp_result.

    SELECT {schema_madlib}.mlp_classification(
        'iris_data',      -- Source table
        'mlp_model',      -- Destination table
        'attributes',     -- Input features
        'class_text',     -- Label
        ARRAY[5],         -- Number of units per layer
        'step_size=0.003,
        n_iterations=5000,
        tolerance=0',     -- Optimizer params
        'tanh');          -- Activation function

    """.format(**args)
    example = classification_example if is_classification else regression_example
    # Dispatch on the requested message.
    if not message:
        return summary
    elif message.lower() in ('usage', 'help', '?'):
        return usage
    elif message.lower() == 'example':
        return example
    return """
        No such option. Use "SELECT {schema_madlib}.{method}()" for help.
    """.format(**args)
+
+
def mlp_predict_help(schema_madlib, message):
    """Return help text for mlp_predict.

    @param schema_madlib Schema where MADlib is installed
    @param message       None/'' -> summary; 'usage'/'help'/'?' -> usage;
                         'example' -> worked example; else a fallback hint
    """
    args = dict(schema_madlib=schema_madlib)

    summary = """
    ----------------------------------------------------------------
                            SUMMARY
    ----------------------------------------------------------------
    Multilayer Perceptron (MLP) is a model for regression and
    classification

    Also called "vanilla neural networks", they consist of several
    fully connected hidden layers with non-linear activation
    functions.

    For more details on function usage:
        SELECT {schema_madlib}.mlp_predict('usage')

    For a small example on using the function:
        SELECT {schema_madlib}.mlp_predict('example')""".format(**args)

    usage = """
    ---------------------------------------------------------------------------
                                    USAGE
    ---------------------------------------------------------------------------
    SELECT {schema_madlib}.mlp_predict(
        model_table,    -- name of model table
        data_table,     -- name of data table
        id_col_name,    -- id column for data table
        output_table,   -- name of output table
        pred_type       -- the type of output requested:
                        -- 'response' gives the actual prediction,
                        -- 'prob' gives the probability of each class.
                        -- for regression, only type='response' is defined.
    );


    ---------------------------------------------------------------------------
                                    OUTPUT
    ---------------------------------------------------------------------------
    The model table produced by mlp contains the following columns:

    id                      -- The provided id for the given input vector

    estimated_<COL_NAME>    -- (For pred_type='response') The estimated class
                               for classification or value for regression, where
                               <COL_NAME> is the name of the column to be
                               predicted from training data

    prob_<CLASS>            -- (For pred_type='prob' for classification) The
                               probability of a given class <CLASS> as given by
                               softmax. There will be one column for each class
                               in the training data.

    """.format(**args)

    example = """
    -- See {schema_madlib}.mlp_classification('example') for test
    -- and model tables

    -- Predict classes using
    SELECT {schema_madlib}.mlp_predict(
        'mlp_model',         -- Model table
        'iris_data',         -- Test data table
        'id',                -- Id column in test table
        'mlp_prediction',    -- Output table for predictions
        'response'           -- Output classes, not probabilities
    );

    SELECT * FROM mlp_prediction;

    WITH total_count AS (SELECT count(*) AS c FROM iris_data)
    SELECT count(*)/((SELECT c FROM total_count)::DOUBLE PRECISION)
    AS train_accuracy
    FROM
        (
            SELECT iris_data.class_text AS actual_label,
                   mlp_prediction.estimated_class_text AS predicted_label
            FROM mlp_prediction
            INNER JOIN iris_data ON iris_data.id=mlp_prediction.id
        ) q
    WHERE q.actual_label=q.predicted_label;
    """.format(**args)

    # Dispatch on the requested message.
    if not message:
        return summary
    elif message.lower() in ('usage', 'help', '?'):
        return usage
    elif message.lower() == 'example':
        return example
    return """
        No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help.
    """.format(**args)