You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/12/28 22:52:03 UTC
[29/51] [abbrv] [partial] madlib-site git commit: Additional updates for 1.13 release

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6c103d3e/docs/v1.13/group__grp__elasticnet.html
----------------------------------------------------------------------
diff --git a/docs/v1.13/group__grp__elasticnet.html b/docs/v1.13/group__grp__elasticnet.html
new file mode 100644
index 0000000..f6b2c45
--- /dev/null
+++ b/docs/v1.13/group__grp__elasticnet.html
@@ -0,0 +1,755 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Elastic Net Regularization</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.13</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__elasticnet.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Elastic Net Regularization<div class="ingroups"><a class="el" href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" href="group__grp__regml.html">Regression Models</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#train">Training Function</a> </li>
+<li class="level1">
+<a href="#optimizer">Optimizer Parameters</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Functions</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#background">Technical Background</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>This module implements elastic net regularization [1] for linear and logistic regression. Regularization is a technique often used to prevent overfitting.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training Function</dt><dd>The training function has the following syntax: <pre class="syntax">
+elastic_net_train( tbl_source,
+                   tbl_result,
+                   col_dep_var,
+                   col_ind_var,
+                   regress_family,
+                   alpha,
+                   lambda_value,
+                   standardize,
+                   grouping_col,
+                   optimizer,
+                   optimizer_params,
+                   excluded,
+                   max_iter,
+                   tolerance
+                 )
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>tbl_source </dt>
+<dd><p class="startdd">TEXT. The name of the table containing the training data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>tbl_result </dt>
+<dd><p class="startdd">TEXT. Name of the output table containing output model. The output table produced by the <a class="el" href="elastic__net_8sql__in.html#a735038a5090c112505c740a90a203e83" title="Interface for elastic net. ">elastic_net_train()</a> function has the following columns: </p><table class="output">
+<tr>
+<th>regress_family </th><td>The regression type: 'gaussian' or 'binomial'.  </td></tr>
+<tr>
+<th>features </th><td>Array of features (independent variables) passed to the algorithm.  </td></tr>
+<tr>
+<th>features_selected </th><td>Array of features selected by the algorithm.  </td></tr>
+<tr>
+<th>coef_nonzero </th><td>Coefficients of the selected features.  </td></tr>
+<tr>
+<th>coef_all </th><td>Coefficients of all features, both selected and unselected.  </td></tr>
+<tr>
+<th>intercept </th><td>Intercept for the model.  </td></tr>
+<tr>
+<th>log_likelihood </th><td>Log of the likelihood value produced by the algorithm.  </td></tr>
+<tr>
+<th>standardize </th><td>BOOLEAN. If data has been normalized, will be set to TRUE.  </td></tr>
+<tr>
+<th>iteration_run </th><td>The number of iterations executed.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>col_dep_var </dt>
+<dd><p class="startdd">TEXT. An expression for the dependent variable.</p>
+<dl class="section note"><dt>Note</dt><dd>Both <em>col_dep_var</em> and <em>col_ind_var</em> can be valid PostgreSQL expressions. For example, <code>col_dep_var = 'log(y+1)'</code>, and <code>col_ind_var = 'array[exp(x[1]), x[2], 1/(1+x[3])]'</code>. In the binomial case, you can use a Boolean expression, for example, <code>col_dep_var = 'y &lt; 0'</code>.</dd></dl>
+</dd>
+<dt>col_ind_var </dt>
+<dd><p class="startdd">TEXT. An expression for the independent variables. Use <code>'*'</code> to specify all columns of <em>tbl_source</em> except those listed in the <em>excluded</em> string described below. If <em>col_dep_var</em> is a column name, it is automatically excluded from the independent variables. However, if <em>col_dep_var</em> is a valid PostgreSQL expression, any column names used within the expression are only excluded if they are explicitly listed in the <em>excluded</em> argument. Therefore, it is a good idea to add all column names involved in the dependent variable expression to the <em>excluded</em> string.</p>
+<p class="enddd"></p>
+</dd>
+<dt>regress_family </dt>
+<dd><p class="startdd">TEXT. For regression type, specify either 'gaussian' ('linear') or 'binomial' ('logistic').</p>
+<p class="enddd"></p>
+</dd>
+<dt>alpha </dt>
+<dd><p class="startdd">FLOAT8. Elastic net control parameter with a value in the range [0, 1]. A value of 1 means L1 regularization, and a value of 0 means L2 regularization.</p>
+<p class="enddd"></p>
+</dd>
+<dt>lambda_value </dt>
+<dd><p class="startdd">FLOAT8. Regularization parameter (must be positive).</p>
+<p class="enddd"></p>
+</dd>
+<dt>standardize (optional) </dt>
+<dd><p class="startdd">BOOLEAN, default: TRUE. Whether to normalize the data or not. Setting to TRUE usually yields better results and faster convergence.</p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_col (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. A single column or a list of comma-separated columns that divides the input data into discrete groups, resulting in one regression per group. When this value is NULL, no grouping is used and a single model is generated for all data.</p>
+<dl class="section note"><dt>Note</dt><dd>Expressions are not currently supported for 'grouping_col'.</dd></dl>
+</dd>
+<dt>optimizer (optional) </dt>
+<dd><p class="startdd">TEXT, default: 'fista'. Name of optimizer, either 'fista' or 'igd'. FISTA [2] is an algorithm with a fast global rate of convergence for solving linear inverse problems. Incremental gradient descent (IGD) is a stochastic approach to minimizing an objective function [4].</p>
+<p class="enddd"></p>
+</dd>
+<dt>optimizer_params (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. Optimizer parameters, delimited with commas. These parameters differ depending on the value of <em>optimizer</em> parameter. See the descriptions below for details.</p>
+<p class="enddd"></p>
+</dd>
+<dt>excluded (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. If the <em>col_ind_var</em> input is '*' then <em>excluded</em> can be provided as a comma-delimited list of column names that are to be excluded from the features. For example, <code>'col1, col2'</code>. If the <em>col_ind_var</em> is an array, <em>excluded</em> must be a list of the integer array positions to exclude, for example <code>'1,2'</code>. If this argument is NULL or an empty string, no columns are excluded.</p>
+<p class="enddd"></p>
+</dd>
+<dt>max_iter (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 1000. The maximum number of iterations allowed.</p>
+<p class="enddd"></p>
+</dd>
+<dt>tolerance </dt>
+<dd>FLOAT8, default: 1e-6. This is the criterion to stop iterating. Both the 'fista' and 'igd' optimizers compute the difference between the log likelihood of two consecutive iterations, and when the difference is smaller than <em>tolerance</em> or the iteration number is larger than <em>max_iter</em>, the computation stops. </dd>
+</dl>
+<p><a class="anchor" id="optimizer"></a></p><dl class="section user"><dt>Other Parameters</dt><dd></dd></dl>
+<p>For <em>optimizer_params</em>, there are several parameters that can be supplied in a string containing a comma-delimited list of name-value pairs . All of these named parameters are optional and use the format "&lt;param_name&gt; = &lt;value&gt;".</p>
+<p>The parameters described below are organized by category: warmup, cross validation and optimization.</p>
+<p><em><b>Warmup parameters</b></em> </p><pre class="syntax">
+  $$
+    warmup = &lt;value&gt;,
+    warmup_lambdas = &lt;value&gt;,
+    warmup_lambda_no = &lt;value&gt;,
+    warmup_tolerance = &lt;value&gt;
+  $$
+</pre><dl class="arglist">
+<dt>warmup </dt>
+<dd><p class="startdd">Default: FALSE. If <em>warmup</em> is TRUE, a series of strictly descending lambda values are used, which end with the lambda value that the user wants to calculate. A larger lambda gives a sparser solution, and the sparse solution is then used as the initial guess for the next lambda's solution, which can speed up the computation for the next lambda. For larger data sets, this can sometimes accelerate the whole computation and may in fact be faster than computation with only a single lambda value.</p>
+<p class="enddd"></p>
+</dd>
+<dt>warmup_lambdas </dt>
+<dd><p class="startdd">Default: NULL. Set of lambda values to use when <em>warmup</em> is TRUE. The default is NULL, which means that lambda values will be automatically generated.</p>
+<p class="enddd"></p>
+</dd>
+<dt>warmup_lambda_no </dt>
+<dd><p class="startdd">Default: 15. Number of lambda values used in <em>warm-up</em>. If <em>warmup_lambdas</em> is not NULL, this value is overridden by the number of provided lambda values.</p>
+<p class="enddd"></p>
+</dd>
+<dt>warmup_tolerance </dt>
+<dd>The value of tolerance used during warmup. The default value is the same as the <em>tolerance</em> argument described above. </dd>
+</dl>
+<p><em><b>Cross validation parameters</b></em> </p><dl class="section note"><dt>Note</dt><dd>Please note that for performance reasons, warmup is disabled whenever cross validation is used. Also, cross validation is not supported if grouping is used.</dd></dl>
+<pre class="syntax">
+  $$
+    n_folds = &lt;value&gt;,
+    validation_result = &lt;value&gt;,
+    lambda_value = &lt;value&gt;,
+    n_lambdas = &lt;value&gt;,
+    alpha = &lt;value&gt;
+  $$
+</pre><p>Hyperparameter optimization can be carried out using the built-in cross validation mechanism, which is activated by assigning a value greater than 1 to the parameter <em>n_folds</em>. Negative misclassification error is used for classification and negative root mean squared error is used for regression.</p>
+<p>The values of a parameter to cross validate should be provided in a list. For example, to regularize with the L1 norm and use a lambda value from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'. Note that the use of '{}' and '[]' are both valid here.</p>
+<dl class="arglist">
+<dt>n_folds </dt>
+<dd><p class="startdd">Default: 0. Number of folds (k). Must be at least 2 to activate cross validation. If a value of k &gt; 2 is specified, each fold is then used as a validation set once, while the other k - 1 folds form the training set. </p>
+<p class="enddd"></p>
+</dd>
+<dt>validation_result </dt>
+<dd><p class="startdd">Default: NULL. Name of the table to store the cross validation results, including the values of parameters and their averaged error values. The table is only created if the name is not NULL. </p>
+<p class="enddd"></p>
+</dd>
+<dt>lambda_value </dt>
+<dd><p class="startdd">Default: NULL. Set of regularization values to be used for cross validation. The default is NULL, which means that lambda values will be automatically generated.</p>
+<p class="enddd"></p>
+</dd>
+<dt>n_lambdas </dt>
+<dd><p class="startdd">Default: 15. Number of lambdas to cross validate over. If a list of lambda values is not provided in the <em>lambda_value</em> set above, this parameter can be used to autogenerate the set of lambdas. If the <em>lambda_value</em> set is not NULL, this value is overridden by the number of provided lambda values. </p>
+<dl class="section note"><dt>Note</dt><dd>If you want to cross validate over alpha only and not lambda, then set <em>lambda_value</em> to NULL and <em>n_lambdas</em> to 0. In this case, cross validation will be done on the set of <em>alpha</em> values specified in the next parameter. The lambda value used will be the one specified in the main function call at the top of this page.</dd></dl>
+</dd>
+<dt>alpha </dt>
+<dd>Elastic net control parameter. This is a list of values to apply cross validation on. (Note that alpha values are not autogenerated.) If not specified, the alpha value used will be the one specified in the main function call at the top of this page.  </dd>
+</dl>
+<p><em><b>Optimizer parameters</b></em></p>
+<p><b>FISTA</b> Parameters </p><pre class="syntax">
+  $$
+    max_stepsize = &lt;value&gt;,
+    eta = &lt;value&gt;,
+    use_active_set = &lt;value&gt;,
+    activeset_tolerance = &lt;value&gt;,
+    random_stepsize = &lt;value&gt;
+  $$
+</pre><dl class="arglist">
+<dt>max_stepsize </dt>
+<dd><p class="startdd">Default: 4.0. Initial backtracking step size. At each iteration, the algorithm first tries <em>stepsize = max_stepsize</em>, and if it does not work out, it then tries a smaller step size, <em>stepsize = stepsize/eta</em>, where <em>eta</em> must be larger than 1. At first glance, this seems to perform repeated iterations for even one step, but using a larger step size actually greatly increases the computation speed and minimizes the total number of iterations. A careful choice of <em>max_stepsize</em> can decrease the computation time by more than 10 times.</p>
+<p class="enddd"></p>
+</dd>
+<dt>eta </dt>
+<dd><p class="startdd">Default: 2.0 If stepsize does not work, <em>stepsize/<em>eta</em> is</em> tried. Must be greater than 1. </p>
+<p class="enddd"></p>
+</dd>
+<dt>use_active_set </dt>
+<dd><p class="startdd">Default: FALSE. If <em>use_active_set</em> is TRUE, an active-set method is used to speed up the computation. Considerable speedup is obtained by organizing the iterations around the active set of features&mdash;those with nonzero coefficients. After a complete cycle through all the variables, we iterate only on the active set until convergence. If another complete cycle does not change the active set, we are done. Otherwise, the process is repeated.</p>
+<p class="enddd"></p>
+</dd>
+<dt>activeset_tolerance </dt>
+<dd><p class="startdd">The value of tolerance used during active set calculation. The default value is the same as the <em>tolerance</em> argument described above. </p>
+<p class="enddd"></p>
+</dd>
+<dt>random_stepsize </dt>
+<dd>Default: FALSE. Whether to add some randomness to the step size. Sometimes, this can speed up the calculation. </dd>
+</dl>
+<p><b>IGD</b> parameters </p><pre class="syntax">
+  $$
+      stepsize = &lt;value&gt;,
+      step_decay = &lt;value&gt;,
+      threshold = &lt;value&gt;,
+      parallel = &lt;value&gt;
+  $$
+</pre> <dl class="arglist">
+<dt>stepsize </dt>
+<dd><p class="startdd">The default is 0.01.</p>
+<p class="enddd"></p>
+</dd>
+<dt>step_decay </dt>
+<dd><p class="startdd">The actual stepsize used for current step is (previous stepsize) / exp(step_decay). The default value is 0, which means that a constant stepsize is used in IGD.</p>
+<p class="enddd"></p>
+</dd>
+<dt>threshold </dt>
+<dd><p class="startdd">Default: 1e-10. When a coefficient is really small, set this coefficient to be 0.</p>
+<p>Due to the stochastic nature of SGD, we can only obtain very small values for the fitting coefficients. Therefore, <em>threshold</em> is needed at the end of the computation to screen out tiny values and hard-set them to zeros. This is accomplished as follows: (1) multiply each coefficient with the standard deviation of the corresponding feature; (2) compute the average of absolute values of re-scaled coefficients; (3) divide each rescaled coefficient with the average, and if the resulting absolute value is smaller than <em>threshold</em>, set the original coefficient to zero.</p>
+<p class="enddd"></p>
+</dd>
+<dt>parallel </dt>
+<dd><p class="startdd">Whether to run the computation on multiple segments. The default is TRUE.</p>
+<p class="enddd">SGD is a sequential algorithm in nature. When running in a distributed manner, each segment of the data runs its own SGD model and then the models are averaged to get a model for each iteration. This averaging might slow down the convergence speed, but it affords the ability to process large datasets on a cluster of machines. This algorithm, therefore, provides the <em>parallel</em> option to allow you to choose whether to do parallel computation.  </p>
+</dd>
+</dl>
+<p><a class="anchor" id="predict"></a></p><dl class="section user"><dt>Prediction Function</dt><dd></dd></dl>
+<h4>Per-Tuple Prediction</h4>
+<p>The prediction function returns a double value for the Gaussian family and a Boolean value for the Binomial family.</p>
+<p>The predict function has the following syntax (<a class="el" href="elastic__net_8sql__in.html#a96db4ff4ba3ea363fafbf6c036c19fae" title="Prediction for linear models use learned coefficients for a given example. ">elastic_net_gaussian_predict()</a> and <a class="el" href="elastic__net_8sql__in.html#aa78cde79f1f2caa7c5b38f933001d793" title="Prediction for logistic models use learned coefficients for a given example. ">elastic_net_binomial_predict()</a>): </p><pre class="syntax">
+elastic_net_&lt;family&gt;_predict(
+                     coefficients,
+                     intercept,
+                     ind_var
+                   )
+</pre><p><b>Arguments</b> </p><dl class="arglist">
+<dt>coefficients </dt>
+<dd>DOUBLE PRECISION[]. Fitting coefficients, usually <em>coef_all</em> or <em>coef_nonzero</em>. </dd>
+<dt>intercept </dt>
+<dd>DOUBLE PRECISION. Intercept for the model. </dd>
+<dt>ind_var </dt>
+<dd>DOUBLE PRECISION[]. Independent variables that correspond to coefficients. Use <em>features</em> column in <em>tbl_result</em> for <em>coef_all</em>, and <em>features_selected</em> for <em>coef_nonzero</em>. See the <a href="#additional_example">examples for this case below</a>. <dl class="section note"><dt>Note</dt><dd>Unexpected results or errors may be returned in the case that this argument <em>ind_var</em> is not specified properly. </dd></dl>
+</dd>
+</dl>
+<p>For the binomial family, there is a function (<a class="el" href="elastic__net_8sql__in.html#a308718fd5234bc1007b971a639aadf71" title="Compute the probability of belonging to the True class for a given observation. ">elastic_net_binomial_prob()</a>) that outputs the probability of the instance being TRUE: </p><pre class="syntax">
+elastic_net_binomial_prob(
+                     coefficients,
+                     intercept,
+                     ind_var
+                   )
+</pre><h4>Per-Table Prediction</h4>
+<p>Alternatively, you can use another prediction function that stores the prediction result in a table (<a class="el" href="elastic__net_8sql__in.html#a3578608204ac9b2d3442ff42977f632b" title="Prediction and put the result in a table can be used together with General-CV. ">elastic_net_predict()</a>). This is useful if you want to use elastic net together with the general cross validation function. </p><pre class="syntax">
+elastic_net_predict( tbl_model,
+                     tbl_new_sourcedata,
+                     col_id,
+                     tbl_predict
+                   )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>tbl_model </dt>
+<dd>TEXT. Name of the table containing the output from the training function. </dd>
+<dt>tbl_new_sourcedata </dt>
+<dd>TEXT. Name of the table containing the new source data. </dd>
+<dt>col_id </dt>
+<dd>TEXT. Unique ID associated with each row. </dd>
+<dt>tbl_predict </dt>
+<dd>TEXT. Name of table to store the prediction result.  </dd>
+</dl>
+<p>You do not need to specify whether the model is "linear" or "logistic" because this information is already included in the <em>tbl_model</em> table.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Display online help for the <a class="el" href="elastic__net_8sql__in.html#a735038a5090c112505c740a90a203e83" title="Interface for elastic net. ">elastic_net_train()</a> function: <pre class="example">
+SELECT madlib.elastic_net_train();
+</pre></li>
+<li>Create an input data set of house prices and features: <pre class="example">
+DROP TABLE IF EXISTS houses;
+CREATE TABLE houses ( id INT,
+                      tax INT,
+                      bedroom INT,
+                      bath FLOAT,
+                      price INT,
+                      size INT,
+                      lot INT,
+                      zipcode INT);
+INSERT INTO houses (id, tax, bedroom, bath, price, size, lot, zipcode) VALUES
+(1  ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100  , 94301),
+(2  , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000  , 94301),
+(3  ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500  , 94301),
+(4  ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500  , 94301),
+(5  , 1320 ,       3 ,    2 , 133000 , 1500 , 30000  , 94301),
+(6  , 1350 ,       2 ,    1 ,  90500 ,  820 , 25700  , 94301),
+(7  , 2790 ,       3 ,  2.5 , 260000 , 2130 , 25000  , 94301),
+(8  ,  680 ,       2 ,    1 , 142500 , 1170 , 22000  , 94301),
+(9  , 1840 ,       3 ,    2 , 160000 , 1500 , 19000  , 94301),
+(10 , 3680 ,       4 ,    2 , 240000 , 2790 , 20000  , 94301),
+(11 , 1660 ,       3 ,    1 ,  87000 , 1030 , 17500  , 94301),
+(12 , 1620 ,       3 ,    2 , 118600 , 1250 , 20000  , 94301),
+(13 , 3100 ,       3 ,    2 , 140000 , 1760 , 38000  , 94301),
+(14 , 2070 ,       2 ,    3 , 148000 , 1550 , 14000  , 94301),
+(15 ,  650 ,       3 ,  1.5 ,  65000 , 1450 , 12000  , 94301),
+(16 ,  770 ,       2 ,    2 ,  91000 , 1300 , 17500  , 76010),
+(17 , 1220 ,       3 ,    2 , 132300 , 1500 , 30000  , 76010),
+(18 , 1150 ,       2 ,    1 ,  91100 ,  820 , 25700  , 76010),
+(19 , 2690 ,       3 ,  2.5 , 260011 , 2130 , 25000  , 76010),
+(20 ,  780 ,       2 ,    1 , 141800 , 1170 , 22000  , 76010),
+(21 , 1910 ,       3 ,    2 , 160900 , 1500 , 19000  , 76010),
+(22 , 3600 ,       4 ,    2 , 239000 , 2790 , 20000  , 76010),
+(23 , 1600 ,       3 ,    1 ,  81010 , 1030 , 17500  , 76010),
+(24 , 1590 ,       3 ,    2 , 117910 , 1250 , 20000  , 76010),
+(25 , 3200 ,       3 ,    2 , 141100 , 1760 , 38000  , 76010),
+(26 , 2270 ,       2 ,    3 , 148011 , 1550 , 14000  , 76010),
+(27 ,  750 ,       3 ,  1.5 ,  66000 , 1450 , 12000  , 76010);
+</pre></li>
+<li>Train the model: <pre class="example">
+DROP TABLE IF EXISTS houses_en, houses_en_summary;
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en',               -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
+                               );
+</pre></li>
+<li>View the resulting model: <pre class="example">
+-- Turn on expanded display to make it easier to read results.
+\x on
+SELECT * FROM houses_en;
+</pre> Result: <pre class="result">
+-[ RECORD 1 ]-----+-------------------------------------------
+family            | gaussian
+features          | {tax,bath,size}
+features_selected | {tax,bath,size}
+coef_nonzero      | {22.785201806,10707.9664343,54.7959774173}
+coef_all          | {22.785201806,10707.9664343,54.7959774173}
+intercept         | -7798.71393905
+log_likelihood    | -512248641.971
+standardize       | t
+iteration_run     | 10000
+</pre></li>
+<li>Use the prediction function to evaluate residuals: <pre class="example">
+\x off
+SELECT id, price, predict, price - predict AS residual
+FROM (
+    SELECT
+        houses.*,
+        madlib.elastic_net_gaussian_predict(
+            m.coef_all,             -- Coefficients
+            m.intercept,            -- Intercept
+            ARRAY[tax,bath,size]    -- Features (corresponding to coefficients)
+            ) AS predict
+    FROM houses, houses_en m) s
+ORDER BY id;
+</pre> Result: <pre class="result">
+ id | price  |     predict      |     residual      
+----+--------+------------------+-------------------
+  1 |  50000 |  58545.391894031 |   -8545.391894031
+  2 |  85000 | 114804.077663003 |  -29804.077663003
+  3 |  22500 |  61448.835664388 |  -38948.835664388
+  4 |  90000 |  104675.17768007 |   -14675.17768007
+  5 | 133000 |  125887.70644358 |     7112.29355642
+  6 |  90500 |  78601.843595366 |   11898.156404634
+  7 | 260000 | 199257.358231079 |   60742.641768921
+  8 | 142500 |  82514.559377081 |   59985.440622919
+  9 | 160000 |  137735.93215082 |    22264.06784918
+ 10 | 240000 | 250347.627648647 |  -10347.627648647
+ 11 |  87000 |  97172.428263539 |  -10172.428263539
+ 12 | 118600 | 119024.150628605 | -424.150628604999
+ 13 | 140000 | 180692.127913358 |  -40692.127913358
+ 14 | 148000 | 156424.249824545 |   -8424.249824545
+ 15 |  65000 | 102527.938104575 |  -37527.938104575
+ 16 |  91000 |  102396.67273637 |   -11396.67273637
+ 17 | 132300 |  123609.20149988 |     8690.79850012
+ 18 |  91100 |  74044.833707966 |   17055.166292034
+ 19 | 260011 | 196978.853287379 |   63032.146712621
+ 20 | 141800 |  84793.064320781 |   57006.935679219
+ 21 | 160900 |  139330.88561141 |    21569.11438859
+ 22 | 239000 | 248524.823693687 | -9524.82369368701
+ 23 |  81010 |  95805.325297319 |  -14795.325297319
+ 24 | 117910 | 118340.599145495 | -430.599145494998
+ 25 | 141100 | 182970.632857058 |  -41870.632857058
+ 26 | 148011 | 160981.259711945 |  -12970.259711945
+ 27 |  66000 | 104806.443048275 |  -38806.443048275
+</pre></li>
+</ol>
+<h4>Example with Grouping</h4>
+<ol type="1">
+<li>Reuse the houses table above and train the model by grouping on zip code: <pre class="example">
+DROP TABLE IF EXISTS houses_en1, houses_en1_summary;
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en1',               -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 'zipcode',                 -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
+                               );
+</pre></li>
+<li>View the resulting model with a separate model for each group: <pre class="example">
+-- Turn on expanded display to make it easier to read results.
+\x on
+SELECT * FROM houses_en1;
+</pre> Result: <pre class="result">
+-[ RECORD 1 ]-----+--------------------------------------------
+zipcode           | 94301
+family            | gaussian
+features          | {tax,bath,size}
+features_selected | {tax,bath,size}
+coef_nonzero      | {27.0542096962,12351.5244083,47.5833289771}
+coef_all          | {27.0542096962,12351.5244083,47.5833289771}
+intercept         | -7191.19791597
+log_likelihood    | -519199964.967
+standardize       | t
+iteration_run     | 10000
+-[ RECORD 2 ]-----+--------------------------------------------
+zipcode           | 76010
+family            | gaussian
+features          | {tax,bath,size}
+features_selected | {tax,bath,size}
+coef_nonzero      | {15.6325953499,10166.6608469,57.8689916035}
+coef_all          | {15.6325953499,10166.6608469,57.8689916035}
+intercept         | 513.912201627
+log_likelihood    | -538806528.45
+standardize       | t
+iteration_run     | 10000
+</pre></li>
+<li>Use the prediction function to evaluate residuals: <pre class="example">
+\x off
+SELECT madlib.elastic_net_predict(
+                'houses_en1',             -- Model table
+                'houses',                 -- New source data table
+                'id',                     -- Unique ID associated with each row
+                'houses_en1_prediction'   -- Table to store prediction result
+              );
+SELECT  houses.id,
+        houses.price,
+        houses_en1_prediction.prediction,
+        houses.price - houses_en1_prediction.prediction AS residual
+FROM houses_en1_prediction, houses
+WHERE houses.id = houses_en1_prediction.id ORDER BY id;
+</pre></li>
+</ol>
+<p><a class="anchor" id="additional_example"></a></p><h4>Example where coef_nonzero is different from coef_all</h4>
+<ol type="1">
+<li>Reuse the <a href="#examples">houses</a> table above and train the model with alpha=1 (L1) and a large lambda value (30000). <pre class="example">
+DROP TABLE IF EXISTS houses_en2, houses_en2_summary;
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en2',              -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 1,                         -- Alpha value
+                                 30000,                     -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
+                               );
+</pre></li>
+<li>View the resulting model and see coef_nonzero is different from coef_all: <pre class="example">
+-- Turn on expanded display to make it easier to read results.
+\x on
+SELECT * FROM houses_en2;
+</pre> Result: <pre class="result">
+-[ RECORD 1 ]-----+--------------------------------
+family            | gaussian
+features          | {tax,bath,size}
+features_selected | {tax,size}
+coef_nonzero      | {6.94744249834,29.7137297658}
+coef_all          | {6.94744249834,0,29.7137297658}
+intercept         | 74445.7039382
+log_likelihood    | -1635348585.07
+standardize       | t
+iteration_run     | 151
+</pre></li>
+<li>We can still use the prediction function with <em>coef_all</em> to evaluate residuals: <pre class="example">
+\x off
+SELECT id, price, predict, price - predict AS residual
+FROM (
+    SELECT
+        houses.*,
+        madlib.elastic_net_gaussian_predict(
+            m.coef_all,                   -- All coefficients
+            m.intercept,                  -- Intercept
+            ARRAY[tax,bath,size]          -- All features
+            ) AS predict
+    FROM houses, houses_en2 m) s
+ORDER BY id;
+</pre></li>
+<li>We can speed up the prediction function with <em>coef_nonzero</em> to evaluate residuals. This requires the user to examine the <em>feature_selected</em> column in the result table to construct the correct set of independent variables to provide to the prediction function: <pre class="example">
+\x off
+SELECT id, price, predict, price - predict AS residual
+FROM (
+    SELECT
+        houses.*,
+        madlib.elastic_net_gaussian_predict(
+            m.coef_nonzero,               -- Non-zero coefficients
+            m.intercept,                  -- Intercept
+            ARRAY[tax,size]               -- Features corresponding to non-zero coefficients
+            ) AS predict
+    FROM houses, houses_en2 m) s
+ORDER BY id;
+</pre> The two queries above will result in same residuals: <pre class="result">
+ id | price  |     predict      |     residual
+----+--------+------------------+-------------------
+  1 |  50000 | 101424.266931887 | -51424.2669318866
+  2 |  85000 | 123636.877531235 |  -38636.877531235
+  3 |  22500 | 106081.206339915 | -83581.2063399148
+  4 |  90000 | 119117.827607296 | -29117.8276072958
+  5 | 133000 | 128186.922684709 |   4813.0773152912
+  6 |  90500 | 108190.009718915 |  -17690.009718915
+  7 | 260000 | 157119.312909723 |  102880.687090277
+  8 | 142500 | 113935.028663057 |  28564.9713369428
+  9 | 160000 | 131799.592783846 |  28200.4072161544
+ 10 | 240000 | 182913.598378673 |  57086.4016213268
+ 11 |  87000 | 116583.600144218 | -29583.6001442184
+ 12 | 118600 | 122842.722992761 |  -4242.7229927608
+ 13 | 140000 | 148278.940070862 | -8278.94007086201
+ 14 | 148000 | 134883.191046754 |  13116.8089532462
+ 15 |  65000 | 122046.449722531 |  -57046.449722531
+ 16 |  91000 | 118423.083357462 | -27423.0833574618
+ 17 | 132300 | 127492.178434875 |   4807.8215651252
+ 18 |  91100 | 106800.521219247 |  -15700.521219247
+ 19 | 260011 | 156424.568659889 |  103586.431340111
+ 20 | 141800 | 114629.772912891 |  27170.2270871088
+ 21 | 160900 | 132285.913758729 |  28614.0862412706
+ 22 | 239000 | 182357.802978806 |   56642.197021194
+ 23 |  81010 | 116166.753594318 |  -35156.753594318
+ 24 | 117910 | 122634.299717811 | -4724.29971781059
+ 25 | 141100 | 148973.684320696 | -7873.68432069599
+ 26 | 148011 | 136272.679546422 |  11738.3204535782
+ 27 |  66000 | 122741.193972365 |  -56741.193972365
+(27 rows)
+</pre></li>
+</ol>
+<h4>Example with Cross Validation</h4>
+<ol type="1">
+<li>Reuse the houses table above. Here we use 3-fold cross validation with 3 automatically generated lambda values and 3 specified alpha values. (This can take some time to run since elastic net is effectively being called 27 times for these combinations, then a 28th time for the whole dataset.) <pre class="example">
+DROP TABLE IF EXISTS houses_en3, houses_en3_summary, houses_en3_cv;
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en3',               -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 $$ n_folds = 3,            -- Cross validation parameters
+                                    validation_result=houses_en3_cv,
+                                    n_lambdas = 3,
+                                    alpha = {0, 0.1, 1}
+                                 $$,
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
+                               );
+SELECT * FROM houses_en3;
+</pre> <pre class="result">
+-[ RECORD 1 ]-----+--------------------------------------------
+family            | gaussian
+features          | {tax,bath,size}
+features_selected | {tax,bath,size}
+coef_nonzero      | {22.4584188479,11657.0739045,52.1624090811}
+coef_all          | {22.4584188479,11657.0739045,52.1624090811}
+intercept         | -5067.33396522
+log_likelihood    | -543193170.15
+standardize       | t
+iteration_run     | 10000
+</pre></li>
+<li>Details of the cross validation: <pre class="example">
+SELECT * FROM houses_en3_cv ORDER BY lambda_value DESC, alpha ASC;
+</pre> <pre class="result">
+ alpha | lambda_value |    mean_neg_loss   |   std_neg_loss
+-------+--------------+--------------------+-------------------
+   0.0 |     100000.0 | -1.617365261170+55 | 1.26711815498+55
+   0.0 |        100.0 |     -63555.0502789 |    3973.78527042
+   0.0 |          0.1 |     -37136.5397256 |    9022.78236248
+   0.1 |     100000.0 | -3.260479720340+53 | 9.10745448826+53
+   0.1 |        100.0 |     -63445.8310011 |    3965.83900962
+   0.1 |          0.1 |     -37192.0390897 |    9058.79757772
+   1.0 |     100000.0 |     -64569.8882099 |     4051.1856361
+   1.0 |        100.0 |     -38121.9154268 |    9332.65800111
+   1.0 |          0.1 |     -38117.5477067 |    9384.36765881
+(9 rows)
+</pre></li>
+</ol>
+<p><a class="anchor" id="notes"></a></p><dl class="section user"><dt>Note</dt><dd>It is <b>strongly</b> <b>recommended</b> that you run <code><a class="el" href="elastic__net_8sql__in.html#a735038a5090c112505c740a90a203e83" title="Interface for elastic net. ">elastic_net_train()</a></code> on a subset of the data with a limited <em>max_iter</em> before applying it to the full data set with a large <em>max_iter</em>. In the pre-run, you can adjust the parameters to get the best performance and then apply the best set of parameters to the whole data set.</dd></dl>
+<p><a class="anchor" id="background"></a></p><dl class="section user"><dt>Technical Background</dt><dd></dd></dl>
+<p>Elastic net regularization seeks to find a weight vector that, for any given training example set, minimizes: </p><p class="formulaDsp">
+\[\min_{w \in R^N} L(w) + \lambda \left(\frac{(1-\alpha)}{2} \|w\|_2^2 + \alpha \|w\|_1 \right)\]
+</p>
+<p> where \(L\) is the metric function that the user wants to minimize. Here \( \alpha \in [0,1] \) and \( lambda \geq 0 \). If \(alpha = 0\), we have the ridge regularization (known also as Tikhonov regularization), and if \(\alpha = 1\), we have the LASSO regularization.</p>
+<p>For the Gaussian response family (or linear model), we have </p><p class="formulaDsp">
+\[L(\vec{w}) = \frac{1}{2}\left[\frac{1}{M} \sum_{m=1}^M (w^{t} x_m + w_{0} - y_m)^2 \right] \]
+</p>
+<p>For the Binomial response family (or logistic model), we have </p><p class="formulaDsp">
+\[ L(\vec{w}) = \sum_{m=1}^M\left[y_m \log\left(1 + e^{-(w_0 + \vec{w}\cdot\vec{x}_m)}\right) + (1-y_m) \log\left(1 + e^{w_0 + \vec{w}\cdot\vec{x}_m}\right)\right]\ , \]
+</p>
+<p> where \(y_m \in {0,1}\).</p>
+<p>To get better convergence, one can rescale the value of each element of x </p><p class="formulaDsp">
+\[ x&#39; \leftarrow \frac{x - \bar{x}}{\sigma_x} \]
+</p>
+<p> and for Gaussian case we also let </p><p class="formulaDsp">
+\[y&#39; \leftarrow y - \bar{y} \]
+</p>
+<p> and then minimize with the regularization terms. At the end of the calculation, the orginal scales will be restored and an intercept term will be obtained at the same time as a by-product.</p>
+<p>Note that fitting after scaling is not equivalent to directly fitting.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] Elastic net regularization, <a href="http://en.wikipedia.org/wiki/Elastic_net_regularization">http://en.wikipedia.org/wiki/Elastic_net_regularization</a></p>
+<p>[2] Beck, A. and M. Teboulle (2009), A fast iterative shrinkage-thresholding algorithm for linear inverse problems. SIAM J. on Imaging Sciences 2(1), 183-202.</p>
+<p>[3] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for L1 Regularized Loss Minimization. Proceedings of the 26th International Conference on Machine Learning, Montreal, Canada, 2009.</p>
+<p>[4] Stochastic gradient descent, <a href="https://en.wikipedia.org/wiki/Stochastic_gradient_descent">https://en.wikipedia.org/wiki/Stochastic_gradient_descent</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="elastic__net_8sql__in.html" title="SQL functions for elastic net regularization. ">elastic_net.sql_in</a> documenting the SQL functions. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed Dec 27 2017 19:05:57 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6c103d3e/docs/v1.13/group__grp__encode__categorical.html
----------------------------------------------------------------------
diff --git a/docs/v1.13/group__grp__encode__categorical.html b/docs/v1.13/group__grp__encode__categorical.html
new file mode 100644
index 0000000..83f71dd
--- /dev/null
+++ b/docs/v1.13/group__grp__encode__categorical.html
@@ -0,0 +1,610 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Encoding Categorical Variables</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.13</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__encode__categorical.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Encoding Categorical Variables<div class="ingroups"><a class="el" href="group__grp__datatrans.html">Data Types and Transformations</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#categorical">Coding Systems for Categorical Variables</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+</ul>
+</div><p><a class="anchor" id="categorical"></a></p><dl class="section user"><dt>Coding Systems for Categorical Variables</dt><dd>Categorical variables [1] require special attention in regression analysis because, unlike dichotomous or continuous variables, they cannot be entered into the regression equation just as they are. For example, if you have a variable called race that is coded with 1=Hispanic, 2=Asian, 3=Black, 4=White, then entering race in your regression will look at the linear effect of the race variable, which is probably not what you intended. Instead, categorical variables like this need to be coded into a series of indicator variables which can then be entered into the regression model. There are a variety of coding systems that can be used for coding categorical variables, including one-hot, dummy, effects, orthogonal, and Helmert.</dd></dl>
+<p>We currently support one-hot and dummy coding techniques.</p>
+<p>Dummy coding is used when a researcher wants to compare other groups of the predictor variable with one specific group of the predictor variable. Often, the specific group to compare with is called the reference group.</p>
+<p>One-hot encoding is similar to dummy coding except it builds indicator (0/1) columns (cast as numeric) for each value of each category. Only one of these columns could take on the value 1 for each row (data point). There is no reference category for this function.</p>
+<pre class="syntax">
+encode_categorical_variables (
+        source_table,
+        output_table,
+        categorical_cols,
+        categorical_cols_to_exclude,    -- Optional
+        row_id,                         -- Optional
+        top,                            -- Optional
+        value_to_drop,                  -- Optional
+        encode_null,                    -- Optional
+        output_type,                    -- Optional
+        output_dictionary,              -- Optional
+        distributed_by                  -- Optional
+        )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">VARCHAR. Name of the table containing the source categorical data to encode.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">VARCHAR. Name of the result table.</p>
+<dl class="section note"><dt>Note</dt><dd>If there are index columns in the 'source_table' specified by the parameter 'row_id' (see below), then the output table will contain only the index columns 'row_id' and the encoded columns. If the parameter 'row_id' is not specified, then all columns from the 'source_table', with the exception of the original columns that have been encoded, will be included in the 'output_table'. </dd></dl>
+</dd>
+<dt>categorical_cols </dt>
+<dd><p class="startdd">VARCHAR. Comma-separated string of column names of categorical variables to encode. Can also be '*' meaning all columns are to be encoded, except the ones specified in 'categorical_cols_to_exclude' and 'row_id'. Please note that all Boolean, integer and text columns are considered categorical columns and will be encoded when ‘*’ is specified for this argument. </p>
+<p class="enddd"></p>
+</dd>
+<dt>categorical_cols_to_exclude (optional) </dt>
+<dd><p class="startdd">VARCHAR. Comma-separated string of column names to exclude from the categorical variables to encode. Applicable only if 'categorical_cols' = '*'. </p>
+<p class="enddd"></p>
+</dd>
+<dt>row_id (optional) </dt>
+<dd><p class="startdd">VARCHAR. Comma-separated column name(s) corresponding to the primary key(s) of the source table. This parameter determines the format of the 'output_table' as described above. If 'categorical_cols' = '*', these columns will be excluded from encoding (but will be included in the output table).</p>
+<dl class="section note"><dt>Note</dt><dd>If you want to see both the raw categorical variable and its encoded form in the output_table, then include the categorical variable in the 'row_id' parameter. However, this will not work if you specify '*' for the parameter 'categorical_cols', because in this case 'row_id' columns will not be encoded at all. </dd></dl>
+</dd>
+<dt>top (optional) </dt>
+<dd><p class="startdd">VARCHAR. default: NULL. If integer, encodes the top n values by frequency. If float in the range (0.0, 1.0), encodes the specified fraction of values by frequency (e.g., 0.1 means top 10%). Can be specified as a global for all categorical columns, or as a dictionary with separate 'top' values for each categorical variable. Set to NULL to encode all levels (values) for all categorical columns. </p>
+<p class="enddd"></p>
+</dd>
+<dt>value_to_drop (optional) </dt>
+<dd><p class="startdd">VARCHAR. Default: NULL.</p>
+<ul>
+<li>For dummy coding, indicate the desired value (reference) to drop for each categorical variable. Can be specified as a global for all categorical columns, or a comma-separated string containing items of the form 'name=value', where 'name' is the column name and 'value' is the reference value to be dropped.</li>
+<li>Set to NULL for one-hot encoding (default)</li>
+</ul>
+<dl class="section note"><dt>Note</dt><dd>If you specify both 'value_to_drop' and 'top' parameters, the 'value_to_drop' will be applied first (takes priority), then 'top' will be applied to the remaining values. </dd></dl>
+</dd>
+<dt>encode_null (optional) </dt>
+<dd><p class="startdd">BOOLEAN. default: FALSE. Whether NULL should be treated as one of the values of the categorical variable. If TRUE, then an indicator variable is created corresponding to the NULL value. If FALSE, then all encoded values for that variable will be set to 0. </p>
+<p class="enddd"></p>
+</dd>
+<dt>output_type (optional) </dt>
+<dd><p class="startdd">VARCHAR. default: 'column'. This parameter controls the output format of the indicator variables. If 'column', a column is created for each indicator variable. PostgreSQL limits the number of columns in a table. If the total number of indicator columns exceeds the limit, then make this parameter either 'array' to combine the indicator columns into an array or 'svec' to cast the array output to <em>'madlib.svec'</em> type.</p>
+<p>Since the array output for any single tuple would be sparse (only one non-zero entry for each categorical column), the 'svec' output would be most efficient for storage. The 'array' output is useful if the array is used for post-processing, including concatenating with other non-categorical features.</p>
+<p>The order of the array is the same as specified in 'categorical_cols'. A dictionary will be created when 'output_type' is 'array' or 'svec' to define an index into the array. The dictionary table will be given the name of the 'output_table' appended by '_dictionary'. </p>
+<p class="enddd"></p>
+</dd>
+<dt>output_dictionary (optional) </dt>
+<dd><p class="startdd">BOOLEAN. default: FALSE. This parameter is used to handle auto-generated column names that exceed the PostgreSQL limit of 63 bytes.</p>
+<ul>
+<li>If TRUE, column names will include numerical IDs and will create a dictionary table called 'output_table_dictionary' ('output_table' appended with '_dictionary').</li>
+<li>If FALSE, will auto-generate column names in the usual way unless the limit of 63 bytes will be exceeded. In this case, a dictionary output file will be created and a message given to the user. </li>
+</ul>
+<p class="enddd"></p>
+</dd>
+<dt>distributed_by (optional) </dt>
+<dd><p class="startdd">VARCHAR. default: NULL. Columns to use for the distribution policy of the output table. When NULL, either 'row_id' is used as distribution policy (when provided), or else the distribution policy of 'source_table' will be used. This argument does not apply to PostgreSQL platforms.</p>
+<ul>
+<li>NULL: By default, the distribution policy of the source_table will be used.</li>
+<li>Comma-separated column names: Column(s) to be used for the distribution key.</li>
+<li>RANDOMLY: Use random distribution policy (only if there does not exist a column named 'randomly').</li>
+</ul>
+<p class="enddd"></p>
+</dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Use a subset of the abalone dataset [2]: <pre class="example">
+DROP TABLE IF EXISTS abalone;
+CREATE TABLE abalone (
+    id serial,
+    sex character varying,
+    length double precision,
+    diameter double precision,
+    height double precision,
+    rings int
+);
+INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
+('M',    0.455,  0.365,  0.095,  15),
+('M',    0.35,   0.265,  0.09,   7),
+('F',    0.53,   0.42,   0.135,  9),
+('M',    0.44,   0.365,  0.125,  10),
+('I',    0.33,   0.255,  0.08,   7),
+('I',    0.425,  0.3,    0.095,  8),
+('F',    0.53,   0.415,  0.15,   20),
+('F',    0.545,  0.425,  0.125,  16),
+('M',    0.475,  0.37,   0.125,  9),
+(NULL,   0.55,   0.44,   0.15,   19),
+('F',    0.525,  0.38,   0.14,   14),
+('M',    0.43,   0.35,   0.11,   10),
+('M',    0.49,   0.38,   0.135,  11),
+('F',    0.535,  0.405,  0.145,  10),
+('F',    0.47,   0.355,  0.1,    10),
+('M',    0.5,    0.4,    0.13,   12),
+('I',    0.355,  0.28,   0.085,  7),
+('F',    0.44,   0.34,   0.1,    10),
+('M',    0.365,  0.295,  0.08,   7),
+(NULL,   0.45,   0.32,   0.1,    9);
+</pre></li>
+<li>Create new table with one-hot encoding. The column 'sex' is replaced by three columns encoding the values 'F', 'M' and 'I'. Null values are not encoded by default: <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        'sex'                        -- Categorical columns
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre> <pre class="result">
+  id | length | diameter | height | rings | sex_F | sex_I | sex_M
+----+--------+----------+--------+-------+-------+-------+-------
+  1 |  0.455 |    0.365 |  0.095 |    15 |     0 |     0 |     1
+  2 |   0.35 |    0.265 |   0.09 |     7 |     0 |     0 |     1
+  3 |   0.53 |     0.42 |  0.135 |     9 |     1 |     0 |     0
+  4 |   0.44 |    0.365 |  0.125 |    10 |     0 |     0 |     1
+  5 |   0.33 |    0.255 |   0.08 |     7 |     0 |     1 |     0
+  6 |  0.425 |      0.3 |  0.095 |     8 |     0 |     1 |     0
+  7 |   0.53 |    0.415 |   0.15 |    20 |     1 |     0 |     0
+  8 |  0.545 |    0.425 |  0.125 |    16 |     1 |     0 |     0
+  9 |  0.475 |     0.37 |  0.125 |     9 |     0 |     0 |     1
+ 10 |   0.55 |     0.44 |   0.15 |    19 |     0 |     0 |     0
+ 11 |  0.525 |     0.38 |   0.14 |    14 |     1 |     0 |     0
+ 12 |   0.43 |     0.35 |   0.11 |    10 |     0 |     0 |     1
+ 13 |   0.49 |     0.38 |  0.135 |    11 |     0 |     0 |     1
+ 14 |  0.535 |    0.405 |  0.145 |    10 |     1 |     0 |     0
+ 15 |   0.47 |    0.355 |    0.1 |    10 |     1 |     0 |     0
+ 16 |    0.5 |      0.4 |   0.13 |    12 |     0 |     0 |     1
+ 17 |  0.355 |     0.28 |  0.085 |     7 |     0 |     1 |     0
+ 18 |   0.44 |     0.34 |    0.1 |    10 |     1 |     0 |     0
+ 19 |  0.365 |    0.295 |   0.08 |     7 |     0 |     0 |     1
+ 20 |   0.45 |     0.32 |    0.1 |     9 |     0 |     0 |     0
+(20 rows)
+</pre></li>
+<li>Now include NULL values in encoding (note the additional column 'sex_NULL'): <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        'sex',                       -- Categorical columns
+        NULL,                        -- Categorical columns to exclude
+        NULL,                        -- Index columns
+        NULL,                        -- Top values
+        NULL,                        -- Value to drop for dummy encoding
+        TRUE                         -- Encode nulls
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre> <pre class="result">
+ id | length | diameter | height | rings | sex_F | sex_I | sex_M | sex_NULL
+----+--------+----------+--------+-------+-------+-------+-------+----------
+  1 |  0.455 |    0.365 |  0.095 |    15 |     0 |     0 |     1 |        0
+  2 |   0.35 |    0.265 |   0.09 |     7 |     0 |     0 |     1 |        0
+  3 |   0.53 |     0.42 |  0.135 |     9 |     1 |     0 |     0 |        0
+  4 |   0.44 |    0.365 |  0.125 |    10 |     0 |     0 |     1 |        0
+  5 |   0.33 |    0.255 |   0.08 |     7 |     0 |     1 |     0 |        0
+  6 |  0.425 |      0.3 |  0.095 |     8 |     0 |     1 |     0 |        0
+  7 |   0.53 |    0.415 |   0.15 |    20 |     1 |     0 |     0 |        0
+  8 |  0.545 |    0.425 |  0.125 |    16 |     1 |     0 |     0 |        0
+  9 |  0.475 |     0.37 |  0.125 |     9 |     0 |     0 |     1 |        0
+ 10 |   0.55 |     0.44 |   0.15 |    19 |     0 |     0 |     0 |        1
+ 11 |  0.525 |     0.38 |   0.14 |    14 |     1 |     0 |     0 |        0
+ 12 |   0.43 |     0.35 |   0.11 |    10 |     0 |     0 |     1 |        0
+ 13 |   0.49 |     0.38 |  0.135 |    11 |     0 |     0 |     1 |        0
+ 14 |  0.535 |    0.405 |  0.145 |    10 |     1 |     0 |     0 |        0
+ 15 |   0.47 |    0.355 |    0.1 |    10 |     1 |     0 |     0 |        0
+ 16 |    0.5 |      0.4 |   0.13 |    12 |     0 |     0 |     1 |        0
+ 17 |  0.355 |     0.28 |  0.085 |     7 |     0 |     1 |     0 |        0
+ 18 |   0.44 |     0.34 |    0.1 |    10 |     1 |     0 |     0 |        0
+ 19 |  0.365 |    0.295 |   0.08 |     7 |     0 |     0 |     1 |        0
+ 20 |   0.45 |     0.32 |    0.1 |     9 |     0 |     0 |     0 |        1
+(20 rows)
+</pre></li>
+<li>Encode all categorical variables in the source table. Also, specify the column 'id' as the index (primary key) which changes the output table to include only the index and the encoded variables: <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        '*',                         -- Categorical columns
+        NULL,                        -- Categorical columns to exclude
+        'id'                         -- Index columns
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre> <pre class="result">
+ id | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20
+----+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
+  1 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0
+  2 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  3 |     1 |     0 |     0 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  4 |     0 |     0 |     1 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  5 |     0 |     1 |     0 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  6 |     0 |     1 |     0 |       0 |       1 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  7 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        1
+  8 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |        0
+  9 |     0 |     0 |     1 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 10 |     0 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |        0
+ 11 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |        0
+ 12 |     0 |     0 |     1 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 13 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |        0
+ 14 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 15 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 16 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |        0
+ 17 |     0 |     1 |     0 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 18 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 19 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 20 |     0 |     0 |     0 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+(20 rows)
+</pre></li>
+<li>Now let's encode only the top values and group others into a miscellaneous bucket column. Top values can be global across all columns or specified by column. As an example of the latter, here are the top 2 'sex' values and the top 50% of 'rings' values: <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        '*',                         -- Categorical columns
+        NULL,                        -- Categorical columns to exclude
+        'id',                        -- Index columns
+        'sex=2, rings=0.5'           -- Top values
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre> <pre class="result">
+ id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__
+----+-------+-------+-------------+----------+---------+---------+---------------
+  1 |     1 |     0 |           0 |        0 |       0 |       0 |             1
+  2 |     1 |     0 |           0 |        0 |       1 |       0 |             0
+  3 |     0 |     1 |           0 |        0 |       0 |       1 |             0
+  4 |     1 |     0 |           0 |        1 |       0 |       0 |             0
+  5 |     0 |     0 |           1 |        0 |       1 |       0 |             0
+  6 |     0 |     0 |           1 |        0 |       0 |       0 |             1
+  7 |     0 |     1 |           0 |        0 |       0 |       0 |             1
+  8 |     0 |     1 |           0 |        0 |       0 |       0 |             1
+  9 |     1 |     0 |           0 |        0 |       0 |       1 |             0
+ 10 |     0 |     0 |           0 |        0 |       0 |       0 |             1
+ 11 |     0 |     1 |           0 |        0 |       0 |       0 |             1
+ 12 |     1 |     0 |           0 |        1 |       0 |       0 |             0
+ 13 |     1 |     0 |           0 |        0 |       0 |       0 |             1
+ 14 |     0 |     1 |           0 |        1 |       0 |       0 |             0
+ 15 |     0 |     1 |           0 |        1 |       0 |       0 |             0
+ 16 |     1 |     0 |           0 |        0 |       0 |       0 |             1
+ 17 |     0 |     0 |           1 |        0 |       1 |       0 |             0
+ 18 |     0 |     1 |           0 |        1 |       0 |       0 |             0
+ 19 |     1 |     0 |           0 |        0 |       1 |       0 |             0
+ 20 |     0 |     0 |           0 |        0 |       0 |       1 |             0
+(20 rows)
+</pre></li>
+<li>If you want to see both the raw categorical variable and its encoded form in the output_table, then include the categorical variable(s) in the index parameter. (Remember that this will not work if you specify '*' for the parameter 'categorical_cols', because in this case 'row_id' columns will not be encoded at all.) <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        'sex, rings',                -- Categorical columns
+        NULL,                        -- Categorical columns to exclude
+        'id, sex, rings'             -- Index columns
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre> <pre class="result">
+ id | sex | rings | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20
+----+-----+-------+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
+  1 | M   |    15 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0
+  2 | M   |     7 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  3 | F   |     9 |     1 |     0 |     0 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  4 | M   |    10 |     0 |     0 |     1 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  5 | I   |     7 |     0 |     1 |     0 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  6 | I   |     8 |     0 |     1 |     0 |       0 |       1 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  7 | F   |    20 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        1
+  8 | F   |    16 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |        0
+  9 | M   |     9 |     0 |     0 |     1 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 10 |     |    19 |     0 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |        0
+ 11 | F   |    14 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |        0
+ 12 | M   |    10 |     0 |     0 |     1 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 13 | M   |    11 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |        0
+ 14 | F   |    10 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 15 | F   |    10 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 16 | M   |    12 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |        0
+ 17 | I   |     7 |     0 |     1 |     0 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 18 | F   |    10 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 19 | M   |     7 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 20 |     |     9 |     0 |     0 |     0 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+(20 rows)
+</pre></li>
+<li>For dummy encoding, let's make the 'I' value from the 'sex' variable as the reference. Here we use the 'value_to_drop' parameter: <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        '*',                         -- Categorical columns
+        'rings',                     -- Categorical columns to exclude
+        'id',                        -- Index columns
+        NULL,                        -- Top value
+        'sex=I'                      -- Value to drop for dummy encoding
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre> <pre class="result">
+  id | sex_F | sex_M
+----+-------+-------
+  1 |     0 |     1
+  2 |     0 |     1
+  3 |     1 |     0
+  4 |     0 |     1
+  5 |     0 |     0
+  6 |     0 |     0
+  7 |     1 |     0
+  8 |     1 |     0
+  9 |     0 |     1
+ 10 |     0 |     0
+ 11 |     1 |     0
+ 12 |     0 |     1
+ 13 |     0 |     1
+ 14 |     1 |     0
+ 15 |     1 |     0
+ 16 |     0 |     1
+ 17 |     0 |     0
+ 18 |     1 |     0
+ 19 |     0 |     1
+ 20 |     0 |     0
+(20 rows)
+</pre></li>
+<li>Create an array output for the two categorical variables in the source table: <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        '*',                         -- Categorical columns
+        NULL,                        -- Categorical columns to exclude
+        'id',                        -- Index columns
+        NULL,                        -- Top values
+        NULL,                        -- Value to drop for dummy encoding
+        NULL,                        -- Encode nulls
+        'array'                      -- Array output type
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre> <pre class="result">
+ id |     __encoded_variables__
+----+-------------------------------
+  1 | {0,0,1,0,0,0,0,0,0,0,1,0,0,0}
+  2 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
+  3 | {1,0,0,0,0,1,0,0,0,0,0,0,0,0}
+  4 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
+  5 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
+  6 | {0,1,0,0,1,0,0,0,0,0,0,0,0,0}
+  7 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1}
+  8 | {1,0,0,0,0,0,0,0,0,0,0,1,0,0}
+  9 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0}
+ 10 | {0,0,0,0,0,0,0,0,0,0,0,0,1,0}
+ 11 | {1,0,0,0,0,0,0,0,0,1,0,0,0,0}
+ 12 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
+ 13 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0}
+ 14 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 15 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 16 | {0,0,1,0,0,0,0,0,1,0,0,0,0,0}
+ 17 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
+ 18 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 19 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
+ 20 | {0,0,0,0,0,1,0,0,0,0,0,0,0,0}
+(20 rows)
+</pre> View the dictionary table that gives the index into the array: <pre class="example">
+SELECT * FROM abalone_out_dictionary;
+</pre> <pre class="result">
+  encoded_column_name  | index | variable | value
+-----------------------+-------+----------+-------
+ __encoded_variables__ |     1 | sex      | F
+ __encoded_variables__ |     2 | sex      | I
+ __encoded_variables__ |     3 | sex      | M
+ __encoded_variables__ |     4 | rings    | 7
+ __encoded_variables__ |     5 | rings    | 8
+ __encoded_variables__ |     6 | rings    | 9
+ __encoded_variables__ |     7 | rings    | 10
+ __encoded_variables__ |     8 | rings    | 11
+ __encoded_variables__ |     9 | rings    | 12
+ __encoded_variables__ |    10 | rings    | 14
+ __encoded_variables__ |    11 | rings    | 15
+ __encoded_variables__ |    12 | rings    | 16
+ __encoded_variables__ |    13 | rings    | 19
+ __encoded_variables__ |    14 | rings    | 20
+(14 rows)
+</pre></li>
+<li>Create a dictionary output: <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        '*',                         -- Categorical columns
+        NULL,                        -- Categorical columns to exclude
+        'id',                        -- Index columns
+        NULL,                        -- Top values
+        NULL,                        -- Value to drop for dummy encoding
+        NULL,                        -- Encode nulls
+        NULL,                        -- Output type
+        TRUE                         -- Dictionary output
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre> <pre class="result">
+ id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11
+----+-------+-------+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------
+  1 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |        0 |        0
+  2 |     0 |     0 |     1 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  3 |     1 |     0 |     0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  4 |     0 |     0 |     1 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  5 |     0 |     1 |     0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  6 |     0 |     1 |     0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  7 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        1
+  8 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |        0 |        0
+  9 |     0 |     0 |     1 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 10 |     0 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        1 |        0
+ 11 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |        0 |        0
+ 12 |     0 |     0 |     1 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 13 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |        0 |        0
+ 14 |     1 |     0 |     0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 15 |     1 |     0 |     0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 16 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |        0 |        0
+ 17 |     0 |     1 |     0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 18 |     1 |     0 |     0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 19 |     0 |     0 |     1 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 20 |     0 |     0 |     0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+(20 rows)
+</pre> View the dictionary table that defines the numerical columns in the output table: <pre class="example">
+SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name;
+</pre> <pre class="result">
+ encoded_column_name | index | variable | value
+---------------------+-------+----------+-------
+ "rings_1"           |     1 | rings    | 7
+ "rings_10"          |    10 | rings    | 19
+ "rings_11"          |    11 | rings    | 20
+ "rings_2"           |     2 | rings    | 8
+ "rings_3"           |     3 | rings    | 9
+ "rings_4"           |     4 | rings    | 10
+ "rings_5"           |     5 | rings    | 11
+ "rings_6"           |     6 | rings    | 12
+ "rings_7"           |     7 | rings    | 14
+ "rings_8"           |     8 | rings    | 15
+ "rings_9"           |     9 | rings    | 16
+ "sex_1"             |     1 | sex      | F
+ "sex_2"             |     2 | sex      | I
+ "sex_3"             |     3 | sex      | M
+(14 rows)
+</pre></li>
+<li>We can chose from various distribution policies, for examply RANDOMLY: <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        '*',                         -- Categorical columns
+        NULL,                        -- Categorical columns to exclude
+        'id',                        -- Index columns
+        NULL,                        -- Top values
+        NULL,                        -- Value to drop for dummy encoding
+        NULL,                        -- Encode nulls
+        NULL,                        -- Output type
+        NULL,                        -- Dictionary output
+        'RANDOMLY'                   -- Distribution policy
+        );
+</pre></li>
+<li>If you have a reason to encode FLOAT variables, you can cast them in the following way within the function call: <pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        'height::TEXT'               -- Categorical columns
+        );
+</pre></li>
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
+<p><a class="anchor" id="svm-lit-1"></a>[1] <a href="https://en.wikipedia.org/wiki/Categorical_variable">https://en.wikipedia.org/wiki/Categorical_variable</a></p>
+<p>[2] <a href="https://archive.ics.uci.edu/ml/datasets/Abalone">https://archive.ics.uci.edu/ml/datasets/Abalone</a> </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed Dec 27 2017 19:05:57 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>