You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/12/28 22:51:53 UTC

[19/51] [abbrv] [partial] madlib-site git commit: Additional updates for 1.13 release

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6c103d3e/docs/v1.13/group__grp__pred.html
----------------------------------------------------------------------
diff --git a/docs/v1.13/group__grp__pred.html b/docs/v1.13/group__grp__pred.html
new file mode 100644
index 0000000..0e5e48c
--- /dev/null
+++ b/docs/v1.13/group__grp__pred.html
@@ -0,0 +1,365 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Prediction Metrics</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.13</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__pred.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Prediction Metrics<div class="ingroups"><a class="el" href="group__grp__mdl.html">Model Selection</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#list">List of Prediction Metric Functions</a> </li>
+<li>
+<a href="#specs">Function Specific Details</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>This module provides a set of metrics to evaluate the quality of predictions of a model. A typical function will take a set of "prediction" and "observation" values and use them to calculate the desired metric, unless noted otherwise. Grouping is supported for all functions (except confusion matrix).</p>
+<p><a class="anchor" id="list"></a></p><dl class="section user"><dt>Prediction Metrics Functions</dt><dd><table class="output">
+<tr>
+<th>mean_abs_error(table_in, table_out, prediction_col, observed_col, grouping_cols)</th><td>Mean absolute error  </td></tr>
+<tr>
+<th>mean_abs_perc_error(table_in, table_out, prediction_col, observed_col, grouping_cols)</th><td>Mean absolute percentage error  </td></tr>
+<tr>
+<th>mean_perc_error(table_in, table_out, prediction_col, observed_col, grouping_cols)</th><td>Mean percentage error  </td></tr>
+<tr>
+<th>mean_squared_error(table_in, table_out, prediction_col, observed_col, grouping_cols)</th><td>Mean squared error </td></tr>
+<tr>
+<th>r2_score(table_in, table_out, prediction_col, observed_col, grouping_cols)</th><td>R-squared  </td></tr>
+<tr>
+<th>adjusted_r2_score(table_in, table_out, prediction_col, observed_col, num_predictors, training_size, grouping_cols)</th><td>Adjusted R-squared  </td></tr>
+<tr>
+<th>binary_classifier(table_in, table_out, prediction_col, observed_col, grouping_cols)</th><td>Collection of prediction metrics related to binary classification </td></tr>
+<tr>
+<th>area_under_roc(table_in, table_out, prediction_col, observed_col, grouping_cols)</th><td>Area under the ROC curve (in binary classification)  </td></tr>
+<tr>
+<th>confusion_matrix(table_in, table_out, prediction_col, observed_col, grouping_cols)</th><td>Confusion matrix for a multi-class classifier  </td></tr>
+</table>
+</dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>table_in </dt>
+<dd>TEXT. Name of the input table. </dd>
+<dt>table_out </dt>
+<dd>TEXT. Name of the output table. For consistency, a table is created for all metric outputs even when grouping is not used, which may mean there is only a single value in the output table in some cases.  </dd>
+<dt>prediction_col </dt>
+<dd>TEXT. Name of the column of predicted values from input table. </dd>
+<dt>observed_col </dt>
+<dd>TEXT. Name of the column of observed values from input table. </dd>
+<dt>num_predictors (for adjusted R-squared score only) </dt>
+<dd>INTEGER. The number of parameters in the predicting model, not counting the constant term. </dd>
+<dt>training_size (for adjusted R-squared score only) </dt>
+<dd>INTEGER. The number of rows used for training, excluding any NULL rows. </dd>
+<dt>grouping_cols (optional) </dt>
+<dd>TEXT, default: NULL. Name of the column of grouping values from input table. </dd>
+</dl>
+<p><a class="anchor" id="specs"></a></p><dl class="section user"><dt>Function Specific Details</dt><dd></dd></dl>
+<p><b>R-squared Score</b></p>
+<p>This function returns the coefficient of determination (R2) between the predicted and observed values. An R2 of 1 indicates that the regression line perfectly fits the data, while an R2 of 0 indicates that the line does not fit the data at all. Negative values of R2 may occur when fitting non-linear functions to data. Please refer to reference <a href="#r2">[1]</a> for more details.</p>
+<p><b>Adjusted R-squared Score</b></p>
+<p>This function returns the adjusted R2 score in addition to the R-squared score described above. Adjusted R2 score is used to counter the problem of the R2 automatically increasing when extra explanatory variables are added to the model. It takes two additional parameters describing the degrees of freedom of the model (num_predictors) and the size of the training set over which it was developed (training_size):</p>
+<ul>
+<li>num_predictors: Indicates the number of parameters the model has other than the constant term. For example, if it is set to '3' the model may take the following form as an example: 7 + 5x + 39y + 0.91z.</li>
+<li>training_size: Indicates the number of rows in the training set (excluding any NULL rows).</li>
+</ul>
+<p>Neither of these arguments can be deduced from the predicted values and the test data alone which is why they are explicit inputs. Please refer to reference <a href="#r2">[1]</a> for more details.</p>
+<p><a class="anchor" id="bc"></a><b>Binary Classification</b></p>
+<p>This function returns an output table with a number of metrics commonly used in binary classification.</p>
+<p>The definitions of the various metrics are as follows:</p>
+<ul>
+<li>\(\textit{tp}\) is the count of correctly-classified positives.</li>
+<li>\(\textit{tn}\) is the count of correctly-classified negatives.</li>
+<li>\(\textit{fp}\) is the count of misclassified negatives.</li>
+<li>\(\textit{fn}\) is the count of misclassified positives.</li>
+<li>\(\textit{tpr}=\textit{tp}/(\textit{tp}+\textit{fn})\).</li>
+<li>\(\textit{tnr}=\textit{tn}/(\textit{fp}+\textit{tn})\).</li>
+<li>\(\textit{ppv}=\textit{tp}/(\textit{tp}+\textit{fp})\).</li>
+<li>\(\textit{npv}=\textit{tn}/(\textit{tn}+\textit{fn})\).</li>
+<li>\(\textit{fpr}=\textit{fp}/(\textit{fp}+\textit{tn})\).</li>
+<li>\(\textit{fdr}=1-\textit{ppv}\).</li>
+<li>\(\textit{fnr}=\textit{fn}/(\textit{fn}+\textit{tp})\).</li>
+<li>\(\textit{acc}=(\textit{tp}+\textit{tn})/(\textit{tp}+\textit{tn}+\textit{fp} +\textit{fn})\).</li>
+<li>\(\textit{f1}=2*\textit{tp}/(2*\textit{tp}+\textit{fp}+\textit{fn})\).</li>
+</ul>
+<p><b>Area Under ROC Curve</b></p>
+<p>This function returns the area under the Receiver Operating Characteristic curve for binary classification (the AUC). The ROC curve is the curve relating the classifier's TPR and FPR metrics. (See <a href="#bc">Binary Classification</a> above for a definition of these metrics). Please refer to reference <a href="#aoc">[2]</a> for more details. Note that the binary classification function can be used to obtain the data (TPR and FPR values) required for drawing the ROC curve.</p>
+<dl class="section note"><dt>Note</dt><dd>For 'binary_classifier' and 'area_under_roc' functions:<ul>
+<li>The 'observed_col' column is assumed to be a numeric column with two values: 0 and 1, or a Boolean column. For the purposes of the metric calculation, 0 is considered to be negative and 1 to be positive.</li>
+<li>The 'prediction_col' column is expected to contain numeric values corresponding to likelihood/probability. A larger value corresponds to greater certainty that the observed value will be '1', and a lower value corresponds to a greater certainty that it will be '0'.</li>
+</ul>
+</dd></dl>
+<p><b>Confusion Matrix</b></p>
+<p>This function returns the confusion matrix of a multi-class classification. Each column of the matrix represents the instances in a predicted class while each row represents the instances in an actual class. This allows more detailed analysis than mere proportion of correct guesses (accuracy). Please refer to the reference <a href="#cm">[3]</a> for more details. Please note that grouping is not supported for the confusion matrix.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Create the sample data: <pre class="example">
+DROP TABLE IF EXISTS test_set;
+CREATE TABLE test_set(
+                  pred FLOAT8,
+                  obs FLOAT8
+                );
+INSERT INTO test_set VALUES
+  (37.5,53.1), (12.3,34.2), (74.2,65.4), (91.1,82.1);
+</pre></li>
+<li>Run the Mean Absolute Error function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.mean_abs_error( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ mean_abs_error
+&#160;----------------
+         13.825
+</pre></li>
+<li>Run the Mean Absolute Percentage Error function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.mean_abs_perc_error( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ mean_abs_perc_error
+&#160;---------------------
+   0.294578793636013
+</pre></li>
+<li>Run the Mean Percentage Error function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.mean_perc_error( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ mean_perc_error
+&#160;-------------------
+   -0.17248930032771
+</pre></li>
+<li>Run the Mean Squared Error function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.mean_squared_error( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ mean_squared_error
+&#160;--------------------
+   220.3525
+</pre></li>
+<li>Run the R2 Score function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.r2_score( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ r2_score
+&#160;------------------------
+   0.27992908844337695865
+</pre></li>
+<li>Run the Adjusted R2 Score function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.adjusted_r2_score( 'test_set', 'table_out', 'pred', 'obs', 3, 100);
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+       r2_score      | adjusted_r2_score 
+&#160;--------------------+------------------
+   0.279929088443375 | 0.257426872457231
+</pre></li>
+<li>Create the sample data for binary classifier metrics: <pre class="example">
+DROP TABLE IF EXISTS test_set;
+CREATE TABLE test_set AS
+    SELECT ((a*8)::integer)/8.0 pred,
+        ((a*0.5+random()*0.5)&gt;0.5) obs
+    FROM (select random() as a from generate_series(1,100)) x;
+</pre></li>
+<li>Run the Binary Classifier metrics function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.binary_classifier( 'test_set', 'table_out', 'pred', 'obs');
+</pre></li>
+<li>View the True Positive Rate and the False Positive Rate: <pre class="example">
+SELECT threshold, tpr, fpr FROM table_out ORDER BY threshold;
+</pre> Result (your results for this and other functions below will look different due to the presence of the random function in sample data generator): <pre class="result">
+       threshold        |          tpr           |          fpr
+------------------------+------------------------+------------------------
+ 0.00000000000000000000 | 1.00000000000000000000 | 1.00000000000000000000
+ 0.12500000000000000000 | 1.00000000000000000000 | 0.94915254237288135593
+ 0.25000000000000000000 | 0.92682926829268292683 | 0.64406779661016949153
+ 0.37500000000000000000 | 0.80487804878048780488 | 0.47457627118644067797
+ 0.50000000000000000000 | 0.70731707317073170732 | 0.35593220338983050847
+ 0.62500000000000000000 | 0.63414634146341463415 | 0.25423728813559322034
+ 0.75000000000000000000 | 0.48780487804878048780 | 0.06779661016949152542
+ 0.87500000000000000000 | 0.29268292682926829268 | 0.03389830508474576271
+ 1.00000000000000000000 | 0.12195121951219512195 | 0.00000000000000000000
+</pre></li>
+<li>View all metrics at a given threshold value: <pre class="example">
+-- Set extended display on for easier reading of output
+\x on
+SELECT * FROM table_out WHERE threshold=0.5;
+</pre> Result <pre class="result">
+-[ RECORD 1 ]---------------------
+threshold | 0.50000000000000000000
+tp        | 29
+fp        | 21
+fn        | 12
+tn        | 38
+tpr       | 0.70731707317073170732
+tnr       | 0.64406779661016949153
+ppv       | 0.58000000000000000000
+npv       | 0.76000000000000000000
+fpr       | 0.35593220338983050847
+fdr       | 0.42000000000000000000
+fnr       | 0.29268292682926829268
+acc       | 0.67000000000000000000
+f1        | 0.63736263736263736264
+</pre></li>
+<li>Run the Area Under ROC curve function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.area_under_roc( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ area_under_roc
+&#160;---------------------------------------------
+0.77428689541132699462698842496899545266640
+</pre></li>
+<li>Create the sample data for confusion matrix. <pre class="example">
+DROP TABLE IF EXISTS test_set;
+CREATE TABLE test_set AS
+    SELECT (x+y)%5+1 AS pred,
+        (x*y)%5 AS obs
+    FROM generate_series(1,5) x,
+        generate_series(1,5) y;
+</pre></li>
+<li>Run the confusion matrix function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.confusion_matrix( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out ORDER BY class;
+</pre> Result <pre class="result">
+ class | confusion_arr
+-------+---------------
+     0 | {0,1,2,2,2,2}
+     1 | {0,2,0,1,1,0}
+     2 | {0,0,0,2,2,0}
+     3 | {0,0,2,0,0,2}
+     4 | {0,2,1,0,0,1}
+     5 | {0,0,0,0,0,0}
+</pre></li>
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
+<p><a class="anchor" id="r2"></a> [1] <a href="https://en.wikipedia.org/wiki/Coefficient_of_determination">https://en.wikipedia.org/wiki/Coefficient_of_determination</a></p>
+<p><a class="anchor" id="aoc"></a> [2] <a href="https://en.wikipedia.org/wiki/Receiver_operating_characteristic">https://en.wikipedia.org/wiki/Receiver_operating_characteristic</a></p>
+<p><a class="anchor" id="cm"></a> [3] <a href="https://en.wikipedia.org/wiki/Confusion_matrix">https://en.wikipedia.org/wiki/Confusion_matrix</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="pred__metrics_8sql__in.html" title="A collection of summary statistics to gauge model accuracy based on predicted values vs...">pred_metrics.sql_in</a> for list of functions and usage. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed Dec 27 2017 19:05:57 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6c103d3e/docs/v1.13/group__grp__prob.html
----------------------------------------------------------------------
diff --git a/docs/v1.13/group__grp__prob.html b/docs/v1.13/group__grp__prob.html
new file mode 100644
index 0000000..97dbccf
--- /dev/null
+++ b/docs/v1.13/group__grp__prob.html
@@ -0,0 +1,164 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Probability Functions</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.13</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__prob.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Probability Functions<div class="ingroups"><a class="el" href="group__grp__stats.html">Statistics</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#syntax">Function Syntax</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>The Probability Functions module provides cumulative distribution, density/mass, and quantile functions for a wide range of probability distributions.</p>
+<p>Unless otherwise documented, all of these functions are wrappers around functionality provided by the boost C++ library [1, “<a href="http://www.boost.org/doc/libs/1_49_0/libs/math/doc/sf_and_dist/html/math_toolkit/dist.html">Statistical Distributions and Functions</a>”].</p>
+<p>For convenience, all cumulative distribution and density/mass functions (CDFs and PDF/PMFs in short) are defined over the range of all floating-point numbers including infinity. Inputs that are <code>NULL</code> or <code>NaN</code> (not a number) will always produce a <code>NULL</code> or <code>NaN</code> result, respectively. Inputs that are plus or minus infinity will return the respective limits.</p>
+<p>A quantile function for a probability distribution with CDF \( F \) takes a probability argument \( p \in [0,1] \) and returns the value \( x \) so that \( F(x) = p \), provided such an \( x \) exists and it is unique. If it does not, the result will be \( \sup \{ x \in D \mid F(x) \leq p \} \) (interpreted as 0 if the supremum is over an empty set) if \( p &lt; 0.5 \), and \( \inf \{ x \in D \mid F(x) \geq p \} \) if \( p \geq 0.5 \). Here \( D \) denotes the domain of the distribution, which is the set of reals \( \mathbb R \) for continuous and the set of nonnegative integers \( \mathbb N_0 \) for discrete distributions.</p>
+<p>Intuitively, the formulas in the previous paragraph deal with the following special cases. The 0-quantile will always be the “left end” of the support, and the 1-quantile will be the “right end” of the support of the distribution. For discrete distributions, most values of \( p \in [0,1] \) do not admit an \( x \) with \( F(x) = p \). Instead, there is an \( x \in \mathbb N_0 \) so that \( F(x) &lt; p &lt; F(x + 1) \). The above formulas mean that the value returned as \( p \)-quantile is \( x \) if \( p &lt; 0.5 \), and it is \( x + 1 \) if \( p \geq 0.5 \). (As a special case, in order to ensure that quantiles are always within the support, the \( p \)-quantile will be 0 if \( p &lt; F(0) \)).</p>
+<p>The rationale for choosing this behavior is that \(p\)-quantiles for \( p &lt; 0.5 \) are typically requested when interested in the value \( x \) such that with confidence level <b>at least</b> \( 1 - p \) a random variable will be \( &gt; x \) (or equivalently, with probability <b>at most</b> \( p \), it will be \( \leq x \)). Likewise, \(p\)-quantiles for \( p \geq 0.5 \) are typically requested when interested in the value \( x \) such that with confidence level <b>at least</b> \( p \) a random variable will be \( \leq x \). See also [1, “<a href="http://www.boost.org/doc/libs/1_46_1/libs/math/doc/sf_and_dist/html/math_toolkit/policy/pol_tutorial/understand_dis_quant.html">Understanding Quantiles of Discrete Distributions</a>”].</p>
+<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Function Syntax</dt><dd></dd></dl>
+<p>Cumulative distribution functions:</p>
+<pre class="syntax"><em>distribution</em>_cdf(<em>random variate</em>[, <em>parameter1</em> [, <em>parameter2</em> [, <em>parameter3</em>] ] ])</pre><p>Probability density/mass functions: </p><pre class="syntax"><em>distribution</em>_{pdf|pmf}(<em>random variate</em>[, <em>parameter1</em> [, <em>parameter2</em> [, <em>parameter3</em>] ] ])</pre><p>Quantile functions: </p><pre class="syntax"><em>distribution</em>_quantile(<em>probability</em>[, <em>parameter1</em> [, <em>parameter2</em> [, <em>parameter3</em>] ] ])</pre><p>For concrete function signatures, see <a class="el" href="prob_8sql__in.html">prob.sql_in</a>.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
+<pre class="example">
+SELECT madlib.normal_cdf(0);
+</pre><p> Result: </p><pre class="result">
+ normal_cdf
+&#160;-----------
+        0.5
+</pre> <pre class="example">
+SELECT madlib.normal_quantile(0.5, 0, 1);
+</pre><p> Result: </p><pre class="result">
+ normal_quantile
+&#160;----------------
+               0
+(1 row)
+</pre><p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] John Maddock, Paul A. Bristow, Hubert Holin, Xiaogang Zhang, Bruno Lalande, Johan Råde, Gautam Sewani and Thijs van den Berg: <em>Boost Math Toolkit</em>, Version 1.49, available at: <a href="http://www.boost.org/doc/libs/1_49_0/libs/math/doc/sf_and_dist/html/index.html">http://www.boost.org/doc/libs/1_49_0/libs/math/doc/sf_and_dist/html/index.html</a></p>
+<dl class="section user"><dt>Related Topics</dt><dd><a class="anchor" id="related"></a>File <a class="el" href="prob_8sql__in.html" title="SQL functions for evaluating probability functions. ">prob.sql_in</a> documenting the SQL functions. </dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed Dec 27 2017 19:05:57 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6c103d3e/docs/v1.13/group__grp__random__forest.html
----------------------------------------------------------------------
diff --git a/docs/v1.13/group__grp__random__forest.html b/docs/v1.13/group__grp__random__forest.html
new file mode 100644
index 0000000..4e61b7a
--- /dev/null
+++ b/docs/v1.13/group__grp__random__forest.html
@@ -0,0 +1,765 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Random Forest</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.13</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__random__forest.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Random Forest<div class="ingroups"><a class="el" href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" href="group__grp__tree.html">Tree Methods</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#train">Training Function</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Function</a> </li>
+<li class="level1">
+<a href="#get_tree">Display Function</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>Random forests build an ensemble of classifiers, each of which is a tree model constructed using bootstrapped samples from the input data. The results of these models are then combined to yield a single prediction, which, at the expense of some loss in interpretation, have been found to be highly accurate.</p>
+<p>Please also refer to the decision tree user documentation for information relevant to the implementation of random forests in MADlib.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training Function</dt><dd>Random Forest training function has the following format: <pre class="syntax">
+forest_train(training_table_name,
+             output_table_name,
+             id_col_name,
+             dependent_variable,
+             list_of_features,
+             list_of_features_to_exclude,
+             grouping_cols,
+             num_trees,
+             num_random_features,
+             importance,
+             num_permutations,
+             max_tree_depth,
+             min_split,
+             min_bucket,
+             num_splits,
+             null_handling_params,
+             verbose,
+             sample_ratio
+             )
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>training_table_name </dt>
+<dd><p class="startdd">text. Name of the table containing the training data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table_name </dt>
+<dd><p class="startdd">text. Name of the generated table containing the model.</p>
+<p>The model table produced by the training function contains the following columns:</p>
+<table class="output">
+<tr>
+<th>gid </th><td>integer. group id that uniquely identifies a set of grouping column values.  </td></tr>
+<tr>
+<th>sample_id </th><td>integer. The id of the bootstrap sample that this tree is a part of.  </td></tr>
+<tr>
+<th>tree </th><td>bytea8. Trained tree model stored in binary format.  </td></tr>
+</table>
+<p>A summary table named <em>&lt;model_table&gt;_summary</em> is also created at the same time, which contains the following columns: </p><table class="output">
+<tr>
+<th>method </th><td><p class="starttd">'forest_train' </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>is_classification </th><td><p class="starttd">boolean. True if it is a classification model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>source_table </th><td><p class="starttd">text. Data source table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>model_table </th><td><p class="starttd">text. Model table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>id_col_name </th><td><p class="starttd">text. The ID column name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_varname </th><td><p class="starttd">text. Dependent variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_varname </th><td><p class="starttd">text. Independent variables </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_features </th><td><p class="starttd">text. Categorical feature names. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>con_features </th><td><p class="starttd">text. Continuous feature names. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>grouping_col </th><td><p class="starttd">int. Names of grouping columns. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_trees </th><td><p class="starttd">int. Number of trees grown by the model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_random_features </th><td><p class="starttd">int. Number of features randomly selected for each split. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>max_tree_depth </th><td><p class="starttd">int. Maximum depth of any tree in the random forest model_table. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>min_split </th><td><p class="starttd">int. Minimum number of observations in a node for it to be split. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>min_bucket </th><td><p class="starttd">int. Minimum number of observations in any terminal node. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_splits </th><td><p class="starttd">int. Number of buckets for continuous variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>verbose </th><td><p class="starttd">boolean. Whether or not to display debug info. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>importance </th><td><p class="starttd">boolean. Whether or not to calculate variable importance. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_permutations </th><td><p class="starttd">int. Number of times feature values are permuted while calculating variable importance. The default value is 1. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_all_groups </th><td><p class="starttd">int. Number of groups during forest training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_failed_groups </th><td><p class="starttd">int. Number of failed groups during forest training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_processed </th><td><p class="starttd">bigint. Total numbers of rows processed in all groups. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_skipped </th><td><p class="starttd">bigint. Total numbers of rows skipped in all groups due to missing values or failures. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_levels </th><td><p class="starttd">text. For classification, the distinct levels of the dependent variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_type </th><td>text. The type of dependent variable.  </td></tr>
+</table>
+<p>A group table named <em> &lt;model_table&gt;_group</em> is created, which has the following columns: </p><table class="output">
+<tr>
+<th>gid </th><td><p class="starttd">integer. Group id that uniquely identifies a set of grouping column values. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>&lt;...&gt; </th><td><p class="starttd">Same type as in the training data table. Grouping columns, if provided in input. This could be multiple columns depending on the <code>grouping_cols</code> input. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>success </th><td><p class="starttd">boolean. Indicator of the success of the group. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_levels_in_text </th><td><p class="starttd">text[]. Ordered levels of categorical variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_n_levels </th><td><p class="starttd">integer[]. Number of levels for each categorical variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>oob_error </th><td><p class="starttd">double precision. Out-of-bag error for the random forest model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_var_importance </th><td><p class="starttd">double precision[]. Variable importance for categorical features. The order corresponds to the order of the variables as found in cat_features in <em> &lt;model_table&gt;_summary</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>con_var_importance </th><td><p class="starttd">double precision[]. Variable importance for continuous features. The order corresponds to the order of the variables as found in con_features in <em> &lt;model_table&gt;_summary</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>id_col_name </dt>
+<dd><p class="startdd">text. Name of the column containing id information in the training data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_variable </dt>
+<dd><p class="startdd">text. Name of the column that contains the output for training. Boolean, integer and text are considered classification outputs, while float values are considered regression outputs.</p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of column names or expressions to use as predictors. Can also be a '*' implying all columns are to be used as predictors (except for the ones included in the next argument that lists exclusions). The types of the features can be mixed - boolean, integer, and text columns are considered categorical and double precision columns are considered continuous. Categorical variables are not encoded and used as is for the training.</p>
+<p>Array columns can also be included in the list, where the array is expanded to treat each element of the array as a feature.</p>
+<p>It is important to note that not every combination of the levels of a categorical variable is checked when evaluating a split. The levels of the non-integer categorical variable are ordered by the entropy of the variable in predicting the response. The split at each node is evaluated between these ordered levels. Integer categorical variables, however, are simply ordered by their value. </p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features_to_exclude </dt>
+<dd><p class="startdd">text. Comma-separated string of column names to exclude from the predictors list. If the <em>dependent_variable</em> argument is an expression (including cast of a column name), then this list should include the columns that are included in the <em>dependent_variable</em> expression, otherwise those columns will be included in the features (resulting in meaningless trees).</p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_cols (optional) </dt>
+<dd><p class="startdd">text, default: NULL. Comma-separated list of column names to group the data by. This will lead to creating multiple random forests, one for each group.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_trees (optional) </dt>
+<dd><p class="startdd">integer, default: 100. Maximum number of trees to grow in the Random Forest model. Actual number of trees grown may be slightly different.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_random_features (optional) </dt>
+<dd><p class="startdd">integer, default: sqrt(n) if classification tree, otherwise n/3. Number of features to randomly select at each split.</p>
+<p class="enddd"></p>
+</dd>
+<dt>importance (optional) </dt>
+<dd><p class="startdd">boolean, default: true. Whether or not to calculate variable importance. If set to true, variable importance for categorical and continuous features will be output in the group table <em>&lt;model_table&gt;_group</em> described above. Will increase run time when variable importance is turned on. </p>
+<p class="enddd"></p>
+</dd>
+<dt>num_permutations (optional) </dt>
+<dd><p class="startdd">integer, default: 1. Number of times to permute each feature value while calculating variable importance.</p>
+<dl class="section note"><dt>Note</dt><dd>Variable importance for a feature is computed by permuting the variable with random values and computing the drop in predictive accuracy (using OOB samples). Setting this greater than 1 performs an average over multiple importance calculation. This increases the total run time and in most cases the default value of 1 is sufficient to compute the importance. </dd></dl>
+</dd>
+<dt>max_tree_depth (optional) </dt>
+<dd><p class="startdd">integer, default: 7. Maximum depth of any node of a tree, with the root node counted as depth 0. A deeper tree can lead to better prediction but will also result in longer processing time and higher memory usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_split (optional) </dt>
+<dd><p class="startdd">integer, default: 20. Minimum number of observations that must exist in a node for a split to be attempted.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_bucket (optional) </dt>
+<dd><p class="startdd">integer, default: min_split/3. Minimum number of observations in any terminal node. If only one of min_bucket or min_split is specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_splits (optional) </dt>
+<dd><p class="startdd">integer, default: 20. Continuous-valued features are binned into discrete quantiles to compute split boundaries. This global parameter is used to compute the resolution of splits for continuous features. Higher number of bins will lead to better prediction, but will also result in longer processing time and higher memory usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>null_handling_params </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of key-value pairs controlling the behavior of various features handling missing values. </p><table class="output">
+<tr>
+<th>max_surrogates </th><td>Default: 0. Number of surrogates to store for each node. One of the approaches of handling NULLs is to use surrogate splits for each node. A surrogate variable is another predictor variable that is associated (correlated) with the primary split variable. The surrogate variable comes into use when the primary predictor value is NULL.  </td></tr>
+<tr>
+<th>null_as_special_cat </th><td><p class="starttd">Default: FALSE. Whether to treat NULL as a special categorical value.</p>
+<p class="endtd">If this is set to TRUE, NULL values are considered a categorical value and placed at the end of the ordering of categorical levels. Placing it at the end ensures that NULL is never used as a value to split a node on. This parameter is ignored for continuous-valued features.   </p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd><p class="startdd">boolean, default: FALSE. Provides verbose output of the results of training.</p>
+<p class="enddd"></p>
+</dd>
+<dt>sample_ratio (optional) </dt>
+<dd>double precision, in the range of (0, 1], default: 1. If sample_ratio is less than 1, a bootstrap sample size smaller than the data table is expected to be used for training each tree in the forest. A ratio that is close to 0 may result in trees with only the root node. This allows users to experiment with the function in a speedy fashion. </dd>
+</dl>
+<dl class="section note"><dt>Note</dt><dd>The main parameters that affect memory usage are: depth of tree (‘max_tree_depth’), number of features, number of values per categorical feature, and number of bins for continuous features (‘num_splits’). If you are hitting memory limits, consider reducing one or more of these parameters.</dd></dl>
+<p><a class="anchor" id="predict"></a></p><dl class="section user"><dt>Prediction Function</dt><dd>The prediction function is provided to estimate the conditional mean given a new predictor. It has the following syntax: <pre class="syntax">
+forest_predict(random_forest_model,
+               new_data_table,
+               output_table,
+               type)
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>forest_model </dt>
+<dd><p class="startdd">text. Name of the table containing the Random Forest model.</p>
+<p class="enddd"></p>
+</dd>
+<dt>new_data_table </dt>
+<dd><p class="startdd">text. Name of the table containing prediction data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">text. Name of the table to output prediction results to.</p>
+<p class="enddd"></p>
+</dd>
+<dt>type </dt>
+<dd>text, optional, default: 'response'. For regression models, the output is always the predicted value of the dependent variable. For classification models, the <em>type</em> variable can be 'response', giving the classification prediction as output, or 'prob', giving the class probabilities as output. For each value of the dependent variable, a column with the probabilities is added to the output table.  </dd>
+</dl>
+<p><a class="anchor" id="get_tree"></a></p><dl class="section user"><dt>Display Function</dt><dd>The 'get_tree' function is provided to output a graph representation of a single tree of the random forest. The output can either be in the popular 'dot' format that can be visualized using various programs including those in the GraphViz package, or in a simple text format. The details of the text format is outputted with the tree. <pre class="syntax">
+get_tree(forest_model_table,
+         gid,
+         sample_id,
+         dot_format,
+         verbose)
+</pre></dd></dl>
+<p>An additional display function is provided to output the surrogate splits chosen for each internal node. </p><pre class="syntax">
+get_tree_surr(forest_model_table,
+              gid,
+              sample_id)
+</pre><p>The output contains the list of surrogate splits for each internal node of a tree. The nodes are sorted in ascending order by id. This is equivalent to viewing the tree in a breadth-first manner. For each surrogate, the output gives the surrogate split (variable and threshold) and also provides the number of rows that were common between the primary split and the surrogate split. Finally, the number of rows present in the majority branch of the primary split is also presented. Only surrogates that perform better than this majority branch are used. When the primary variable has a NULL value the surrogate variables are used in order to compute the split for that node. If all surrogates variables are NULL, then the majority branch is used to compute the split for a tuple.</p>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>forest_model_table </dt>
+<dd><p class="startdd">text. Name of the table containing the Random Forest model.</p>
+<p class="enddd"></p>
+</dd>
+<dt>gid </dt>
+<dd><p class="startdd">integer. Id of the group that this tree is a part of.</p>
+<p class="enddd"></p>
+</dd>
+<dt>sample_id </dt>
+<dd><p class="startdd">integer. Id of the bootstrap sample that this tree is a part of.</p>
+<p class="enddd"></p>
+</dd>
+<dt>dot_format (optional) </dt>
+<dd><p class="startdd">boolean, default = TRUE. Output can either be in a dot format or a text format. If TRUE, the result is in the dot format, else output is in text format.</p>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd>boolean, default = FALSE. If true, the dot format output will contain additional information (impurity, sample size, number of weighted rows for each response variable, classification or prediction if the tree was pruned at this level) </dd>
+</dl>
+<p>The output is always returned as a 'TEXT'. For the dot format, the output can be redirected to a file on the client side and then rendered using visualization programs.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd><b>Note:</b> The output results may vary due the random nature of random forests.</dd></dl>
+<p><b>Random Forest Classification Example</b></p>
+<ol type="1">
+<li>Prepare input data: <pre class="example">
+DROP TABLE IF EXISTS dt_golf;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    windy text,
+    class text
+);
+</pre> <pre class="example">
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
+(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
+(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
+(3, 'overcast', 83, 78, 'false', 'Play'),
+(4, 'rain', 70, 96, 'false', 'Play'),
+(5, 'rain', 68, 80, 'false', 'Play'),
+(6, 'rain', 65, 70, 'true', 'Don''t Play'),
+(7, 'overcast', 64, 65, 'true', 'Play'),
+(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
+(9, 'sunny', 69, 70, 'false', 'Play'),
+(10, 'rain', 75, 80, 'false', 'Play'),
+(11, 'sunny', 75, 70, 'true', 'Play'),
+(12, 'overcast', 72, 90, 'true', 'Play'),
+(13, 'overcast', 81, 75, 'false', 'Play'),
+(14, 'rain', 71, 80, 'true', 'Don''t Play');
+</pre></li>
+<li>Run the random forest training function and view summary output: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('dt_golf',         -- source table
+                           'train_output',    -- output model table
+                           'id',              -- id column
+                           'class',           -- response
+                           '"OUTLOOK", temperature, humidity, windy',   -- features
+                           NULL,              -- exclude columns
+                           NULL,              -- grouping columns
+                           20::integer,       -- number of trees
+                           2::integer,        -- number of random features
+                           TRUE::boolean,     -- variable importance
+                           1::integer,        -- num_permutations
+                           8::integer,        -- max depth
+                           3::integer,        -- min split
+                           1::integer,        -- min bucket
+                           10::integer        -- number of splits per continuous variable
+                           );
+\x on
+SELECT * FROM train_output_summary;
+</pre> Result: <pre class="result">
+-[ RECORD 1 ]---------+-----------------------------------------------
+method                | forest_train
+is_classification     | t
+source_table          | dt_golf
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | class
+independent_varnames  | "OUTLOOK",windy,temperature,humidity
+cat_features          | "OUTLOOK",windy
+con_features          | temperature,humidity
+grouping_cols         |
+num_trees             | 20
+num_random_features   | 2
+max_tree_depth        | 8
+min_split             | 3
+min_bucket            | 1
+num_splits            | 10
+verbose               | f
+importance            | t
+num_permutations      | 1
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 14
+total_rows_skipped    | 0
+dependent_var_levels  | "Don't Play","Play"
+dependent_var_type    | text
+independent_var_types | text, text, double precision, double precision
+</pre> View the group table output: <pre class="example">
+SELECT * FROM train_output_group;
+</pre> Result: <pre class="result">
+-[ RECORD 1 ]------+----------------------------------------
+gid                | 1
+success            | t
+cat_n_levels       | {3,2}
+cat_levels_in_text | {overcast,rain,sunny,false,true}
+oob_error          | 0.50000000000000000000
+cat_var_importance | {-0.206309523809524,-0.234345238095238}
+con_var_importance | {-0.308690476190476,-0.272678571428571}
+</pre></li>
+<li>Obtain a dot format display of a single tree within the forest: <pre class="example">
+\x off
+SELECT madlib.get_tree('train_output',1,2);
+</pre> Result: <pre class="result">
+ digraph "Classification tree for dt_golf" {
+ "0" [label="humidity &lt;= 75", shape=ellipse];
+ "0" -&gt; "1"[label="yes"];
+ "1" [label="\"Play"",shape=box];
+ "0" -&gt; "2"[label="no"];
+ "2" [label="humidity &lt;= 80", shape=ellipse];
+ "2" -&gt; "5"[label="yes"];
+ "5" [label=""Don't Play"",shape=box];
+ "2" -&gt; "6"[label="no"];
+ "6" [label=""OUTLOOK" in {overcast,rain}", shape=ellipse];
+ "6" -&gt; "13"[label="yes"];
+ "13" [label=""Play"",shape=box];
+ "6" -&gt; "14"[label="no"];
+ "14" [label=""Don't Play"",shape=box];
+ } //---end of digraph---------
+</pre></li>
+<li>Obtain a text display of the tree: <pre class="example">
+SELECT madlib.get_tree('train_output',1,2,FALSE);
+</pre> Result: <pre class="result">
+&#160;-------------------------------------
+&#160;- Each node represented by 'id' inside ().
+&#160;- Leaf nodes have a * while internal nodes have the split condition at the end.
+&#160;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
+&#160;- For each split the first indented child (2i+1) is the 'True' node and
+second indented child (2i+2) is the 'False' node.
+&#160;- Number of (weighted) rows for each response variable inside [].
+&#160;- Order of values = ['"Don\'t Play"', '"Play"']
+&#160;-------------------------------------
+ (0)[ 4 10]  humidity &lt;= 75
+    (1)[0 7]  * --&gt; "Play"
+    (2)[4 3]  humidity &lt;= 80
+       (5)[3 1]  * --&gt; "Don't Play"
+       (6)[1 2]  "OUTLOOK" in {overcast,rain}
+          (13)[0 2]  * --&gt; "Play"
+          (14)[1 0]  * --&gt; "Don't Play"
+&#160;-------------------------------------
+</pre></li>
+<li>Predict output categories for the same data as was used for input: <pre class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.forest_predict('train_output',
+                             'dt_golf',
+                             'prediction_results',
+                             'response');
+\x off
+SELECT id, estimated_class, class
+FROM prediction_results JOIN dt_golf USING (id)
+ORDER BY id;
+</pre> Result: <pre class="result">
+  id | estimated_class |   class
+----+-----------------+------------
+  1 | Don't Play      | Don't Play
+  2 | Don't Play      | Don't Play
+  3 | Play            | Play
+  4 | Play            | Play
+  5 | Play            | Play
+  6 | Don't Play      | Don't Play
+  7 | Play            | Play
+  8 | Don't Play      | Don't Play
+  9 | Play            | Play
+ 10 | Play            | Play
+ 11 | Play            | Play
+ 12 | Play            | Play
+ 13 | Play            | Play
+ 14 | Don't Play      | Don't Play
+(14 rows)
+</pre></li>
+<li>Predict probabilities of output categories for the same data: <pre class="example">
+DROP TABLE IF EXISTS prediction_prob;
+SELECT madlib.forest_predict('train_output',
+                             'dt_golf',
+                             'prediction_prob',
+                             'prob');
+\x off
+SELECT id, "estimated_prob_Play", class
+FROM prediction_prob JOIN dt_golf USING (id)
+ORDER BY id;
+</pre> Result: <pre class="result">
+ id | estimated_prob_Play |   class
+----+---------------------+------------
+  1 |                0.05 | Don't Play
+  2 |                0.15 | Don't Play
+  3 |                0.95 | Play
+  4 |                0.65 | Play
+  5 |                0.75 | Play
+  6 |                 0.4 | Don't Play
+  7 |                 0.7 | Play
+  8 |                 0.1 | Don't Play
+  9 |                 0.9 | Play
+ 10 |                0.85 | Play
+ 11 |                 0.8 | Play
+ 12 |                 0.7 | Play
+ 13 |                   1 | Play
+ 14 |                 0.4 | Don't Play
+(14 rows)
+</pre></li>
+</ol>
+<p><b>Random Forest Regression Example</b></p>
+<ol type="1">
+<li>Prepare input data: <pre class="example">
+DROP TABLE IF EXISTS mt_cars;
+CREATE TABLE mt_cars (
+    id integer NOT NULL,
+    mpg double precision,
+    cyl integer,
+    disp double precision,
+    hp integer,
+    drat double precision,
+    wt double precision,
+    qsec double precision,
+    vs integer,
+    am integer,
+    gear integer,
+    carb integer
+);
+</pre> <pre class="example">
+INSERT INTO mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) VALUES
+(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
+(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
+(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
+(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
+(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
+(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
+(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
+(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
+(9,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1),
+(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
+(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
+(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
+(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
+(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
+(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
+(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
+(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
+(18,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2),
+(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
+(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
+(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
+(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
+(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
+(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
+(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
+(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
+(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
+(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
+(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
+(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
+(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
+(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
+</pre></li>
+<li>Run the random forest training function: <pre class="example">
+DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary;
+SELECT madlib.forest_train('mt_cars',
+                           'mt_cars_output',
+                           'id',
+                           'mpg',
+                           '*',
+                           'id, hp, drat, am, gear, carb',  -- exclude columns
+                           'am',
+                           10::integer,
+                           2::integer,
+                           TRUE::boolean,
+                           1,
+                           10,
+                           8,
+                           3,
+                           10
+                           );
+\x on
+SELECT * FROM mt_cars_output_summary;
+SELECT * FROM mt_cars_output_group;
+\x off
+</pre></li>
+<li>Display a single tree of the random forest in dot format: <pre class="example">
+SELECT madlib.get_tree('mt_cars_output',1,1);
+</pre> Result: <pre class="result">
+digraph "Regression tree for mt_cars" {
+"0" [label="28.8444",shape=box];
+} //---end of digraph---------
+</pre></li>
+<li>Predict regression output for the same data and compare with original: <pre class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.forest_predict('mt_cars_output',
+                             'mt_cars',
+                             'prediction_results',
+                             'response');
+SELECT am, id, estimated_mpg, mpg
+FROM prediction_results JOIN mt_cars USING (id)
+ORDER BY am, id;
+</pre> Result: <pre class="result">
+ am | id |  estimated_mpg   | mpg
+----+----+------------------+------
+  0 |  1 |  15.893525974026 | 18.7
+  0 |  3 | 21.5238492063492 | 24.4
+  0 |  5 | 20.0175396825397 | 17.8
+  0 |  6 | 14.8406818181818 | 16.4
+  0 |  8 | 14.8406818181818 | 17.3
+  0 |  9 | 20.0496825396825 | 21.4
+  0 | 10 | 14.4012272727273 | 15.2
+  0 | 11 | 20.0175396825397 | 18.1
+  0 | 13 | 15.0162878787879 | 14.3
+  0 | 14 | 21.5238492063492 | 22.8
+  0 | 16 | 20.0175396825397 | 19.2
+  0 | 18 | 15.4787532467532 | 15.2
+  0 | 19 | 14.4272987012987 | 10.4
+  0 | 21 | 14.4272987012987 | 10.4
+  0 | 23 | 14.8667532467532 | 14.7
+  0 | 25 | 21.5238492063492 | 21.5
+  0 | 27 |  15.281525974026 | 15.5
+  0 | 29 | 15.0162878787879 | 13.3
+  0 | 30 |  15.281525974026 | 19.2
+  1 |  2 | 20.6527393162393 |   21
+  1 |  4 | 20.6527393162393 |   21
+  1 |  7 | 22.7707393162393 | 22.8
+  1 | 12 | 27.0888266178266 | 32.4
+  1 | 15 | 28.2478650793651 | 30.4
+  1 | 17 | 28.2478650793651 | 33.9
+  1 | 20 | 28.2478650793651 | 27.3
+  1 | 22 | 23.8401984126984 |   26
+  1 | 24 | 26.9748650793651 | 30.4
+  1 | 26 | 20.6527393162393 | 15.8
+  1 | 28 | 20.6527393162393 |   15
+  1 | 31 | 20.6527393162393 | 19.7
+  1 | 32 | 22.7707393162393 | 21.4
+</pre></li>
+</ol>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="random__forest_8sql__in.html">random_forest.sql_in</a> documenting the training function</p>
+<p><a class="el" href="group__grp__decision__tree.html">Decision Tree</a></p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed Dec 27 2017 19:05:57 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6c103d3e/docs/v1.13/group__grp__regml.html
----------------------------------------------------------------------
diff --git a/docs/v1.13/group__grp__regml.html b/docs/v1.13/group__grp__regml.html
new file mode 100644
index 0000000..c0488ee
--- /dev/null
+++ b/docs/v1.13/group__grp__regml.html
@@ -0,0 +1,166 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Regression Models</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.13</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__regml.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Regression Models<div class="ingroups"><a class="el" href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
+<p>A collection of methods for modeling conditional expectation of a response variable. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__clustered__errors"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__clustered__errors.html">Clustered Variance</a></td></tr>
+<tr class="memdesc:group__grp__clustered__errors"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates clustered variance for linear, logistic, and multinomial logistic regression models, and Cox proportional hazards models. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__cox__prop__hazards"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__cox__prop__hazards.html">Cox-Proportional Hazards Regression</a></td></tr>
+<tr class="memdesc:group__grp__cox__prop__hazards"><td class="mdescLeft">&#160;</td><td class="mdescRight">Models the relationship between one or more independent predictor variables and the amount of time before an event occurs. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__elasticnet"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__elasticnet.html">Elastic Net Regularization</a></td></tr>
+<tr class="memdesc:group__grp__elasticnet"><td class="mdescLeft">&#160;</td><td class="mdescRight">Generates a regularized regression model for variable selection in linear and logistic regression problems, combining the L1 and L2 penalties of the lasso and ridge methods. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__glm"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__glm.html">Generalized Linear Models</a></td></tr>
+<tr class="memdesc:group__grp__glm"><td class="mdescLeft">&#160;</td><td class="mdescRight">Estimate generalized linear model (GLM). GLM is a flexible generalization of ordinary linear regression that allows for response variables that have error distribution models other than a normal distribution. The GLM generalizes linear regression by allowing the linear model to be related to the response variable via a link function and by allowing the magnitude of the variance of each measurement to be a function of its predicted value. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__linreg"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__linreg.html">Linear Regression</a></td></tr>
+<tr class="memdesc:group__grp__linreg"><td class="mdescLeft">&#160;</td><td class="mdescRight">Also called Ordinary Least Squares Regression, models linear relationship between a dependent variable and one or more independent variables. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__logreg"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__logreg.html">Logistic Regression</a></td></tr>
+<tr class="memdesc:group__grp__logreg"><td class="mdescLeft">&#160;</td><td class="mdescRight">Models the relationship between one or more predictor variables and a binary categorical dependent variable by predicting the probability of the dependent variable using a logistic function. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__marginal"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__marginal.html">Marginal Effects</a></td></tr>
+<tr class="memdesc:group__grp__marginal"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates marginal effects for the coefficients in regression problems. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__multinom"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__multinom.html">Multinomial Regression</a></td></tr>
+<tr class="memdesc:group__grp__multinom"><td class="mdescLeft">&#160;</td><td class="mdescRight">Multinomial regression models the conditional distribution of a multinomial response variable using a linear combination of predictors. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__ordinal"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__ordinal.html">Ordinal Regression</a></td></tr>
+<tr class="memdesc:group__grp__ordinal"><td class="mdescLeft">&#160;</td><td class="mdescRight">Regression to model data with an ordinal response variable. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__robust"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__robust.html">Robust Variance</a></td></tr>
+<tr class="memdesc:group__grp__robust"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates Huber-White variance estimates for linear, logistic, and multinomial regression models, and for Cox proportional hazards models. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed Dec 27 2017 19:05:57 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6c103d3e/docs/v1.13/group__grp__regml.js
----------------------------------------------------------------------
diff --git a/docs/v1.13/group__grp__regml.js b/docs/v1.13/group__grp__regml.js
new file mode 100644
index 0000000..76e3c69
--- /dev/null
+++ b/docs/v1.13/group__grp__regml.js
@@ -0,0 +1,13 @@
+var group__grp__regml =
+[
+    [ "Clustered Variance", "group__grp__clustered__errors.html", null ],
+    [ "Cox-Proportional Hazards Regression", "group__grp__cox__prop__hazards.html", null ],
+    [ "Elastic Net Regularization", "group__grp__elasticnet.html", null ],
+    [ "Generalized Linear Models", "group__grp__glm.html", null ],
+    [ "Linear Regression", "group__grp__linreg.html", null ],
+    [ "Logistic Regression", "group__grp__logreg.html", null ],
+    [ "Marginal Effects", "group__grp__marginal.html", null ],
+    [ "Multinomial Regression", "group__grp__multinom.html", null ],
+    [ "Ordinal Regression", "group__grp__ordinal.html", null ],
+    [ "Robust Variance", "group__grp__robust.html", null ]
+];
\ No newline at end of file