You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2018/04/17 21:06:44 UTC
[4/6] madlib git commit: Docs: Update MLP, mini-batch documentation
http://git-wip-us.apache.org/repos/asf/madlib/blob/0f9f12f3/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in
index 44e3e26..4a08702 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in
@@ -30,24 +30,41 @@ m4_include(`SQLCommon.m4')
@addtogroup grp_minibatch_preprocessing
<div class="toc"><b>Contents</b><ul>
-<li class="level1"><a href="#todo">todo</a></li>
+<li class="level1"><a href="#minibatch_preprocessor">Mini-Batch Preprocessor</a></li>
+<li class="level1"><a href="#example">Examples</a></li>
+<li class="level1"><a href="#literature">Literature</a></li>
+<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
-MiniBatch Preprocessor is a utility function to pre process the input
-data for use with models that support mini-batching as an optimization
+The mini-batch preprocessor is a utility that prepares input
+data for use by models that support mini-batch as an optimization option.
+(This is currently
+only the case for <a href="group__grp__nn.html">Neural Networks</a>.)
+It is effectively a packing operation that builds
+arrays of dependent and independent variables from the source data table.
+
+The advantage of using mini-batching is that it can perform better than
+stochastic gradient descent (default MADlib optimizer) because it
+uses more than one training
+example at a time, typically resulting faster and smoother convergence [1].
@brief
+Utility that prepares input data for use by models that support
+mini-batch as an optimization option.
+
@anchor minibatch_preprocessor
-@par MiniBatch Preprocessor
+@par Mini-Batch Preprocessor
+The mini-batch preprocessor has the following format:
+
<pre class="syntax">
-minibatch_preprocessor(
- source_table
- output_table
- dependent_varname
- independent_varname
- buffer_size,
- one_hot_encode_int_dep_var
- )
+minibatch_preprocessor( source_table,
+ output_table,
+ dependent_varname,
+ independent_varname,
+ grouping_col,
+ buffer_size,
+ one_hot_encode_int_dep_var
+ )
</pre>
\b Arguments
@@ -57,8 +74,11 @@ minibatch_preprocessor(
</dd>
<dt>output_table</dt>
- <dd>TEXT. Name of the output table from the preprocessor which will be used
- as input to algorithms that support mini-batching.
+ <dd>TEXT. Name of the output table from the preprocessor which
+ will be used as input to algorithms that support mini-batching.
+ Note that the arrays packed into the output table are randomized
+ and normalized, so they will not match up in an obvious way with the
+ rows in the source table.
</dd>
<dt>dependent_varname</dt>
@@ -67,30 +87,62 @@ minibatch_preprocessor(
<dt>independent_varname</dt>
<dd>TEXT. Column name or expression list to evaluate for the independent
- variable. Will be cast to double when packing.
+ variable. Please note that independent variables
+ are cast to double precision by the preprocessor,
+ so categorical variables should be
+ one-hot or dummy encoded as appropriate.
+ See <a href="group__grp__encode__categorical.html">Encoding Categorical Variables</a>
+ for more details on this.
@note
- Supported expressions for independent variable
- ‘ARRAY[x1,x2,x3]’ , where x1,x2,x3 are columns in source table with scalar values
- ‘x1’, where x1 is a single column in source table, with value as an array, like ARRAY[1,2,3] or {1,2,3}
- We might already support expressions that evaluate to array but haven't tested it.
-
- Not supported
- ‘x1,x2,x3’, where x1,x2,x3 are columns in source table with scalar values
- ARRAY[x1,x2] where x1 is scalar and x2 is array
- ARRAY[x1,x2] where both x1 and x2 are arrays
- ARRAY[x1] where x1 is array
- </dd>
-
- <dt>buffer_size</dt>
- <dd>INTEGER. default: ???. Number of source input rows to pack into batch
+ Supported expressions for independent variables include:
+ - ‘ARRAY[x1,x2,x3]’, where x1, x2, and x3 are
+ columns in the source table containing scalar values.
+ - Single column in the source table containing
+ an array like ARRAY[1,2,3] or {1,2,3}.
+ @note
+ The following forms are not currently supported:
+ - ‘x1,x2,x3’, where x1,x2,x3 are columns in source table with scalar values
+ - ARRAY[x1,x2] where x1 is scalar and x2 is array
+ - ARRAY[x1,x2] where both x1 and x2 are arrays
+ - ARRAY[x1] where x1 is array
</dd>
<dt>grouping_col (optional)</dt>
<dd>TEXT, default: NULL.
An expression list used to group the input dataset into discrete groups,
- running one preprocessing step per group. Similar to the SQL GROUP BY clause.
- When this value is NULL, no grouping is used and a single preprocessing step
- is performed for the whole data set.
+ which runs the preprocessing separately for each group.
+ When this value is NULL, no grouping is used and a single preprocessor step
+ is run for the whole data set.
+ @note
+ If you plan to use grouping in model training, then you must set
+ up the groups in the preprocessor exactly as you want to use them
+ in training.
+ </dd>
+
+ <dt>buffer_size (optional)</dt>
+ <dd>INTEGER, default: computed. Buffer size is the
+ number of rows from the
+ source table that are packed into one row of the preprocessor
+ output table. The default value is computed considering size of
+ the source table, number of independent variables, number of groups,
+ and number of segments in the database cluster. For larger data sets,
+ the computed buffer size will typically be a value in the millions.
+ </dd>
+
+ <dt>one_hot_encode_int_dep_var (optional)</dt>
+ <dd> BOOLEAN. default: FALSE.
+ Flag to one-hot encode dependent variables that are
+ scalar integers. This parameter is ignored if the
+ dependent variable is not a scalar integer.
+
+@note The mini-batch preprocessor automatically encodes
+dependent variables that are boolean and character types such as text, char and
+varchar. However, scalar integers are a special case because they can be used
+in both classification and regression problems, so you must tell the mini-batch
+preprocessor whether you want to encode them or not. In the case that you have
+already encoded the dependent variable yourself, you can ignore this parameter.
+Also, if you want to encode float values for some reason, cast them to text
+first.
</dd>
<dt>one_hot_encode_int_dep_var (optional)</dt>
@@ -112,16 +164,18 @@ first.
<b>Output tables</b>
<br>
- The output table produced by MLP contains the following columns:
+ The output table produced by the mini-batch preprocessor contains the following columns:
<table class="output">
<tr>
- <th>id</th>
+ <th>__id__</th>
<td>INTEGER. Unique id for packed table.
</td>
</tr>
<tr>
<th>dependent_varname</th>
- <td>FLOAT8[]. Packed array of dependent variables.
+ <td>FLOAT8[]. Packed array of dependent variables. If the
+ dependent variable in the source table is categorical,
+ the preprocessor will one-hot encode it.
</td>
</tr>
<tr>
@@ -131,7 +185,7 @@ first.
</tr>
<tr>
<th>grouping_cols</th>
- <td>TEXT. Name of grouping columns
+ <td>TEXT. Name of grouping columns.
</td>
</tr>
</table>
@@ -140,15 +194,15 @@ A summary table named \<output_table\>_summary is also created, which has the fo
<table class="output">
<tr>
<th>source_table</th>
- <td>The source table.</td>
+ <td>Name of the source table.</td>
</tr>
<tr>
<th>output_table</th>
- <td>Output table name from preprocessor.</td>
+ <td>Name of output table generated by preprocessor.</td>
</tr>
<tr>
<th>dependent_varname</th>
- <td>Dependent variable from the input table.</td>
+ <td>Dependent variable from the source table.</td>
</tr>
<tr>
<th>independent_varname</th>
@@ -160,43 +214,330 @@ A summary table named \<output_table\>_summary is also created, which has the fo
</tr>
<tr>
<th>class_values</th>
- <td>Class values of the dependent variable (‘NULL’(as TEXT type) for non categorical vars, i,e., if dependent_vartype=”Categorical”)./td>
+ <td>Class values (i.e., levels) of the dependent
+ variable if categorical. If the dependent variable is not
+ categorical, this will be NULL./td>
</tr>
<tr>
<th>num_rows_processed</th>
- <td>The total number of rows that were used in the computation.</td>
+ <td>The total number of rows that were used in the
+ preprocessing operation.</td>
</tr>
<tr>
<th>num_missing_rows_skipped</th>
- <td>The total number of rows that were skipped because of NULL values in them.</td>
+ <td>The total number of rows that were skipped because of
+ NULL values in either the dependent or independent variables.</td>
</tr>
<tr>
<th>grouping_col</th>
- <td>NULL if no grouping_col was specified , and a comma separated
- list of grouping column names if not.</td>
+ <td>Comma separated list of grouping column names
+ if grouping is used. If no grouping, will be NULL.</td>
</tr>
</table>
-A standardization table named \<output_table\>_standardization is also created, that has the
-following columns:
+A standardization table named \<output_table\>_standardization
+is also created. This is needed by the models that will use the
+preprocessed data so is likely not of much interest to users.
+It has the following columns:
<table class="output">
<tr>
<th>grouping columns</th>
- <td>If grouping_col is specified during training, a column for each grouping column
+ <td>If 'grouping_col' is specified,
+ a column for each grouping column
is created.</td>
</tr>
<tr>
<th>mean</th>
- <td>Mean of independent vars by group</td>
+ <td>Mean of independent variables.</td>
</tr>
<tr>
<th>std</th>
- <td>Standard deviation of independent vars by group</td>
+ <td>Population standard deviation of
+ independent variables.</td>
</tr>
</table>
@anchor example
@par Examples
+-# Create an input data set based on the well known iris data set:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data;
+CREATE TABLE iris_data(
+ id serial,
+ attributes numeric[],
+ class_text varchar,
+ class integer,
+ state varchar
+);
+INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
+(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
+(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
+(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
+(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
+(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
+(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
+(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
+(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
+(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
+(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
+(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
+(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
+(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
+(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
+(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
+(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
+(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
+(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
+(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
+(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
+(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
+(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
+(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
+(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
+(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
+(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
+(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
+(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
+(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
+(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
+(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
+(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
+(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
+(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
+(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
+(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
+(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
+(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
+(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
+(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
+(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
+</pre>
+
+-# Run the preprocessor:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
+SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
+ 'iris_data_packed', -- Output table
+ 'class_text', -- Dependent variable
+ 'attributes' -- Independent variables
+ );
+</pre>
+For small datasets like in this example, buffer size is mainly
+determined by the number of segments in the database.
+This example is run on a Greenplum database with 2 segments,
+so there are 2 rows with a buffer size of 26.
+For PostgresSQL, there would be only one row with a buffer
+size of 52 since it is a single node database.
+For larger data sets, other factors go into
+computing buffers size besides number of segments.
+Also, note that the dependent variable has
+been one-hot encoded since it is categorical.
+Here is a sample of the packed output table:
+<pre class="example">
+\\x on
+SELECT * FROM iris_data_packed;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------+-------------------------------------
+__id__ | 0
+dependent_varname | {{1,0},{0,1},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{1,0},{0,1}}
+independent_varname | {{-0.767560815504508,0.806649237861967,-1.07515071152907,-1.18456909732025},{-0.0995580974152422,0.00385956572525086,1.03989986852812,1.17758048907675},...
+...
+-[ RECORD 2 ]-------+-------------------------------------
+__id__ | 1
+dependent_varname | {{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{0,1}}
+independent_varname | {{0.568444620674023,2.01083374606704,-1.28665576953479,-1.18456909732025},{-1.76956489263841,0.405254401793609,-1.21615408353289,-1.18456909732025},...
+...
+</pre>
+Review the output summary table:
+<pre class="example">
+SELECT * FROM iris_data_packed_summary;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]------------+------------------------------
+source_table | iris_data
+output_table | iris_data_packed
+dependent_varname | class_text
+independent_varname | attributes
+buffer_size | 26
+class_values | {Iris_setosa,Iris_versicolor}
+num_rows_processed | 52
+num_missing_rows_skipped | 0
+grouping_cols |
+</pre>
+Review the output standardization table:
+<pre class="example">
+SELECT * FROM iris_data_packed_standardization;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]------------------------------------------------------
+mean | {5.45961538462,2.99807692308,3.025,0.851923076923}
+std | {0.598799958695,0.498262513686,1.41840579525,0.550346179381}
+</pre>
+
+-# Generally the default buffer size will work well,
+but if you have occasion to change it:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
+SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
+ 'iris_data_packed', -- Output table
+ 'class_text', -- Dependent variable
+ 'attributes', -- Independent variables
+ NULL, -- Grouping
+ 10 -- Buffer size
+ );
+</pre>
+Review the output summary table:
+<pre class="example">
+SELECT * FROM iris_data_packed_summary;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]------------+------------------------------
+source_table | iris_data
+output_table | iris_data_packed
+dependent_varname | class_text
+independent_varname | attributes
+buffer_size | 10
+class_values | {Iris_setosa,Iris_versicolor}
+num_rows_processed | 52
+num_missing_rows_skipped | 0
+grouping_cols |
+</pre>
+
+-# Run the preprocessor with grouping by state:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
+SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
+ 'iris_data_packed', -- Output table
+ 'class_text', -- Dependent variable
+ 'attributes', -- Independent variables
+ 'state' -- Grouping
+ );
+</pre>
+Review the output table:
+<pre class="example">
+SELECT * FROM iris_data_packed ORDER BY state, __id__;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------+-------------------------------------
+__id__ | 0
+state | Alaska
+dependent_varname | {{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1}}
+independent_varname | {{0.306242850830503,-0.977074857057813,0.680489757142278 ...
+...
+-[ RECORD 2 ]-------+-------------------------------------
+__id__ | 1
+state | Alaska
+dependent_varname | {{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0}}
+independent_varname | {{1.10129640587123,-0.126074175104234,1.2524188915498 ...
+...
+-[ RECORD 3 ]-------+-------------------------------------
+__id__ | 2
+state | Alaska
+dependent_varname | {{1,0}}
+independent_varname | {{-0.647821415218373,1.15042684782613,-1.17827992968215 ...
+...
+-[ RECORD 4 ]-------+-------------------------------------
+__id__ | 0
+state | Tennessee
+dependent_varname | {{1,0},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{1,0},{0,1}}
+independent_varname | {{0.32912603663053,2.59625206429212,-1.12079945083087 ...
+...
+-[ RECORD 5 ]-------+-------------------------------------
+__id__ | 1
+state | Tennessee
+dependent_varname | {{0,1},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1}}
+independent_varname | {{0.865744574615085,-0.267261241912424,0.970244300719264 ...
+...
+</pre>
+Review the output summary table:
+<pre class="example">
+SELECT * FROM iris_data_packed_summary;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]------------+------------------------------
+source_table | iris_data
+output_table | iris_data_packed
+dependent_varname | class_text
+independent_varname | attributes
+buffer_size | 13
+class_values | {Iris_setosa,Iris_versicolor}
+num_rows_processed | 52
+num_missing_rows_skipped | 0
+grouping_cols | state
+</pre>
+Review the output standardization table:
+<pre class="example">
+SELECT * FROM iris_data_packed_standardization;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------------------------------------------------------------------
+state | Alaska
+mean | {5.40740740740741,2.95925925925926,2.94814814814815,0.833333333333333}
+std | {0.628888452645665,0.470034875978888,1.39877469405147,0.536103914747325}
+-[ RECORD 2 ]-------------------------------------------------------------------
+state | Tennessee
+mean | {5.516,3.04,3.108,0.872}
+std | {0.55905634778617,0.523832034148353,1.43469021046357,0.564637937088893}
+</pre>
+
+-# If the depedent variable is scalar integer,
+and you have not already encoded it, you can ask
+the preprocessor to encode it for you:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
+SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
+ 'iris_data_packed', -- Output table
+ 'class', -- Integer dependent variable
+ 'attributes', -- Independent variables
+ NULL, -- Grouping
+ NULL, -- Buffer size
+ TRUE -- Encode scalar int dependent variable
+ );
+</pre>
+Review the output summary table:
+<pre class="example">
+SELECT * FROM iris_data_packed_summary;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]------------+-----------------
+source_table | iris_data
+output_table | iris_data_packed
+dependent_varname | class
+independent_varname | attributes
+dependent_vartype | integer
+buffer_size | 26
+class_values | {1,2}
+num_rows_processed | 52
+num_missing_rows_skipped | 0
+grouping_cols |
+</pre>
+
+@anchor literature
+@literature
+
+[1] "Neural Networks for Machine Learning", Lectures 6a and 6b on mini-batch gradient descent,
+Geoffrey Hinton with Nitish Srivastava and Kevin Swersky,
+http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+
+@anchor related
+@par Related Topics
+
+minibatch_preprocessing.sql_in
+
+<a href="group__grp__nn.html"><b>Neural Networks</b></a>
+
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(