You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/03/30 02:58:46 UTC
[06/51] [partial] incubator-madlib-site git commit: Add all files from old site (madlib.net)

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/7258f51a/docs/latest/group__grp__rf.html
----------------------------------------------------------------------
diff --git a/docs/latest/group__grp__rf.html b/docs/latest/group__grp__rf.html
new file mode 100644
index 0000000..12dba0e
--- /dev/null
+++ b/docs/latest/group__grp__rf.html
@@ -0,0 +1,544 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Random Forest (old implementation)</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script src="../mathjax/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="navtree_hack.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'auto');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.8</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__rf.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Random Forest (old implementation)<div class="ingroups"><a class="el" href="group__grp__deprecated.html">Deprecated Modules</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#train">Training Function</a> </li>
+<li>
+<a href="#classify">Classification Function</a> </li>
+<li>
+<a href="#score">Scoring Function</a> </li>
+<li>
+<a href="#display">Display Function</a> </li>
+<li>
+<a href="#clean">Cleaning Function</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><dl class="section warning"><dt>Warning</dt><dd><em> This is an old implementation of random forests. For a newer implementation, please see <a class="el" href="group__grp__random__forest.html">Random Forest</a></em></dd></dl>
+<p>A random forest (RF) is an ensemble classifier that consists of many decision trees and outputs the class that is voted by the majority of the individual trees.</p>
+<p>It has the following well-known advantages:</p><ul>
+<li>Overall, RF produces better accuracy.</li>
+<li>It can be very efficient for large data sets. Trees of an RF can be trained in parallel.</li>
+<li>It can handle thousands of input attributes without attribute deletion.</li>
+</ul>
+<p>This module provides an implementation of the random forest algorithm described in [1].</p>
+<p>The implementation supports:</p><ul>
+<li>Building random forests</li>
+<li>Multiple split criteria, including: Information Gain, Gini Coefficient, and Gain Ratio</li>
+<li>Random forest Classification/Scoring</li>
+<li>Random forest Display</li>
+<li>Continuous and Discrete features</li>
+<li>Equal frequency discretization for continuous features</li>
+<li>Missing value handling</li>
+<li>Sampling with replacement</li>
+</ul>
+<dl class="section user"><dt>Input</dt><dd></dd></dl>
+<p>The <b>data to classify</b> is expected to be of the same form as <b>training data</b>, except that it does not need a class column.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training Function</dt><dd></dd></dl>
+<p>Run the training algorithm on the source data. </p><pre class="syntax">
+rf_train( split_criterion,
+          training_table_name,
+          result_rf_table_name,
+          num_trees,
+          features_per_node,
+          sampling_percentage,
+          continuous_feature_names,
+          feature_col_names,
+          id_col_name,
+          class_col_name,
+          how2handle_missing_value,
+          max_tree_depth,
+          node_prune_threshold,
+          node_split_threshold,
+          verbosity
+        )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>split_criterion </dt>
+<dd><p class="startdd">The name of the split criterion that should be used for tree construction. The valid values are ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL. Information gain(infogain) and gini index(gini) are biased toward multivalued attributes. Gain ratio(gainratio) adjusts for this bias. However, it tends to prefer unbalanced splits in which one partition is much smaller than the others.</p>
+<p class="enddd"></p>
+</dd>
+<dt>training_table_name </dt>
+<dd><p class="startdd">The name of the table/view with the training data. It can't be NULL and must exist.</p>
+<p>The <b>training data</b> is expected to be of the following form: </p><pre>{TABLE|VIEW} <em>trainingSource</em> (
+    ...
+    <em>id</em> INT|BIGINT,
+    <em>feature1</em> SUPPORTED_DATA_TYPE,
+    <em>feature2</em> SUPPORTED_DATA_TYPE,
+    <em>feature3</em> SUPPORTED_DATA_TYPE,
+    ....................
+    <em>featureN</em> SUPPORTED_DATA_TYPE,
+    <em>class</em>    SUPPORTED_DATA_TYPE,
+    ...
+)</pre><p>SUPPORTED_DATA_TYPE can be any of the following: SMALLINT, INT, BIGINT, FLOAT8, REAL, DECIMAL, INET, CIDR, MACADDR, BOOLEAN, CHAR, VARCHAR, TEXT, "char", DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL. </p>
+<p class="enddd"></p>
+</dd>
+<dt>result_rf_table_name </dt>
+<dd><p class="startdd">The name of the table where the resulting trees are stored. It can't be NULL and must not exist.</p>
+<p class="enddd">The output table stores an abstract object (representing the model) used for further classification. The table has the following columns: </p><table  class="output">
+<tr>
+<th>id</th><td></td></tr>
+<tr>
+<th>tree_location</th><td></td></tr>
+<tr>
+<th>feature</th><td></td></tr>
+<tr>
+<th>probability</th><td></td></tr>
+<tr>
+<th>ebp_coeff</th><td></td></tr>
+<tr>
+<th>maxclass</th><td></td></tr>
+<tr>
+<th>split_gain</th><td></td></tr>
+<tr>
+<th>live</th><td></td></tr>
+<tr>
+<th>cat_size</th><td></td></tr>
+<tr>
+<th>parent_id</th><td></td></tr>
+<tr>
+<th>lmc_nid</th><td></td></tr>
+<tr>
+<th>lmc_fval</th><td></td></tr>
+<tr>
+<th>is_feature_cont</th><td></td></tr>
+<tr>
+<th>split_value</th><td></td></tr>
+<tr>
+<th>tid</th><td></td></tr>
+<tr>
+<th>dp_ids </th><td></td></tr>
+</table>
+</dd>
+<dt>num_trees </dt>
+<dd>The number of trees to be trained. If it's NULL, 10 will be used.  </dd>
+<dt>features_per_node </dt>
+<dd>The number of features to be considered when finding a best split. If it's NULL, sqrt(p), where p is the number of features, will be used. </dd>
+<dt>sampling_percentage </dt>
+<dd>The percentage of records sampled to train a tree. If it's NULL, 0.632 bootstrap will be used. </dd><dt>continuous_feature_names </dt><dd>A comma-separated list of the names of the features whose values are continuous. NULL means there are no continuous features. </dd>
+<dt>feature_col_names </dt>
+<dd>A comma-separated list of names of the table columns, each of which defines a feature. NULL means all the columns except the ID and Class columns will be treated as features.  </dd>
+<dt>id_col_name </dt>
+<dd>The name of the column containing id of each record. It can't be NULL. </dd>
+<dt>class_col_name </dt>
+<dd>The name of the column containing correct class of each record. It can't be NULL. </dd>
+<dt>how2handle_missing_value </dt>
+<dd>The way to handle missing values. The valid values are 'explicit' and 'ignore'. It can't be NULL. </dd>
+<dt>max_tree_depth</dt>
+<dd>The maximum tree depth. It can't be NULL. </dd>
+<dt>node_prune_threshold </dt>
+<dd>The minimum percentage of the number of records required in a child node. It can't be NULL. Its value must be in the range [0.0, 1.0]. This threshold only applies to the non-root nodes. Therefore, if the ratio (p) of the sampled training set size of a tree (the number of rows) to the total training set size is less than or equal to the value of this parameter, then the tree only has one node (the root node). If its value is 1, then p is necessarily less than or equal to it, so the tree only has one node (the root node). If its value is 0, then no nodes will be pruned by this parameter. </dd>
+<dt>node_split_threshold </dt>
+<dd>The minimum percentage of the number of records required in a node in order for a further split to be possible. It can't be NULL. Its value must be in the range [0.0, 1.0]. If the ratio (p) of the sampled training set size of a tree (the number of rows) to the total training set size is less than the value of this parameter, then the root node will be a leaf, so the trained tree only has one node. If p is equal to the value of this parameter, then the trained tree only has two levels, since only the root node will grow. If its value is 0, then trees can grow extensively. </dd>
+<dt>verbosity </dt>
+<dd>Greater than 0 means this function runs in verbose mode. It can't be NULL. </dd>
+</dl>
+<p><a class="anchor" id="classify"></a></p><dl class="section user"><dt>Classification Function</dt><dd></dd></dl>
+<p>The classification function creates the result_table with the classification results. </p><pre class="syntax">
+rf_classify( rf_table_name,
+             classification_table_name,
+             result_table_name)
+</pre><p><a class="anchor" id="score"></a></p><dl class="section user"><dt>Scoring Function</dt><dd></dd></dl>
+<p>The scoring function gives a ratio of correctly classified items in the validation data set. </p><pre class="syntax">
+rf_score( rf_table_name,
+          validation_table_name,
+          verbosity)
+</pre><p><a class="anchor" id="display"></a></p><dl class="section user"><dt>Display Function</dt><dd></dd></dl>
+<p>The display tree function displays the trained trees in a human-readable format. </p><pre class="syntax">
+rf_display( rf_table_name
+          )
+</pre><p><a class="anchor" id="clean"></a></p><dl class="section user"><dt>Cleaning Function</dt><dd></dd></dl>
+<p>The clean tree function cleans up the learned model and metadata. </p><pre class="syntax">
+rf_clean( rf_table_name
+        )
+</pre><p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Prepare an input table. <pre class="example">
+SELECT * FROM golf_data ORDER BY id;
+</pre> Result: <pre class="result">
+ id | outlook  | temperature | humidity | windy  |    class
+&#160;---+----------+-------------+----------+--------+--------------
+  1 | sunny    |          85 |       85 |  false |  Do not Play
+  2 | sunny    |          80 |       90 |  true  |  Do not Play
+  3 | overcast |          83 |       78 |  false |  Play
+  4 | rain     |          70 |       96 |  false |  Play
+  5 | rain     |          68 |       80 |  false |  Play
+  6 | rain     |          65 |       70 |  true  |  Do not Play
+  7 | overcast |          64 |       65 |  true  |  Play
+  8 | sunny    |          72 |       95 |  false |  Do not Play
+  9 | sunny    |          69 |       70 |  false |  Play
+ 10 | rain     |          75 |       80 |  false |  Play
+ 11 | sunny    |          75 |       70 |  true  |  Play
+ 12 | overcast |          72 |       90 |  true  |  Play
+ 13 | overcast |          81 |       75 |  false |  Play
+ 14 | rain     |          71 |       80 |  true  |  Do not Play
+(14 rows)
+</pre></li>
+<li>Train the random forest. <pre class="example">
+SELECT * FROM madlib.rf_clean('trained_tree_infogain');
+SELECT * FROM madlib.rf_train(
+    'infogain',
+    'golf_data',
+    'trained_tree_infogain',
+    10,
+    NULL,
+    0.632,
+    'temperature,humidity',
+    'outlook,temperature,humidity,windy',
+    'id',
+    'class',
+    'explicit',
+    10,
+    0.0,
+    0.0,
+    0);
+</pre> Result: <pre class="result">
+ training_time  | num_of_samples | num_trees | features_per_node | num_tree_nodes | max_tree_depth | split_criterion |    acs_time     |    acc_time     |    olap_time    |   update_time   |    best_time
+&#160;---------------+--------------+-----------+-------------------+----------------+----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------
+ 00:00:03.60498 |           14 |        10 |                 3 |             71 |              6 | infogain        | 00:00:00.154991 | 00:00:00.404411 | 00:00:00.736876 | 00:00:00.374084 | 00:00:01.722658
+(1 row)
+</pre></li>
+<li>Check the table records that hold the random forest. <pre class="example">
+SELECT * FROM trained_tree_infogain ORDER BY tid, id;
+</pre> Result: <pre class="result">
+ id | tree_location | feature |    probability    | ebp_coeff | maxclass |     split_gain     | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids
+&#160;---+---------------+---------+-------------------+-----------+----------+--------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+--------
+  1 | {0}           |       3 | 0.777777777777778 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      24 |        1 | f               |             |   1 |
+ 24 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |         1 |         |          | f               |             |   1 | {3}
+ 25 | {0,2}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         1 |         |          | f               |             |   1 | {3}
+ 26 | {0,3}         |       2 | 0.666666666666667 |         1 |        1 |  0.444444444444444 |    0 |        3 |         1 |      42 |        1 | t               |          70 |   1 | {3}
+ 42 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        26 |         |          | f               |             |   1 |
+ 43 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        2 |        26 |         |          | f               |             |   1 |
+  2 | {0}           |       2 | 0.555555555555556 |         1 |        1 |   0.17636684303351 |    0 |        9 |         0 |      11 |        1 | t               |          65 |   2 |
+ 11 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         2 |         |          | f               |             |   2 |
+ 12 | {0,2}         |       4 | 0.714285714285714 |         1 |        1 |  0.217687074829932 |    0 |        7 |         2 |      44 |        1 | f               |             |   2 |
+ 44 | {0,2,1}       |       3 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |        12 |      57 |        1 | f               |             |   2 | {4}
+ 45 | {0,2,2}       |       3 |                 1 |         1 |        1 |                  0 |    0 |        4 |        12 |         |          | f               |             |   2 | {4}
+ 57 | {0,2,1,1}     |       2 |                 1 |         1 |        2 |                  0 |    0 |        1 |        44 |         |          | t               |          78 |   2 | {4,3}
+ 58 | {0,2,1,2}     |       2 |                 1 |         1 |        2 |                  0 |    0 |        1 |        44 |         |          | t               |          96 |   2 | {4,3}
+ 59 | {0,2,1,3}     |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        44 |         |          | t               |          85 |   2 | {4,3}
+  3 | {0}           |       2 | 0.777777777777778 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      27 |        1 | t               |          80 |   3 |
+ 27 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        6 |         3 |         |          | f               |             |   3 |
+ 28 | {0,2}         |       2 | 0.666666666666667 |         1 |        1 |  0.444444444444444 |    0 |        3 |         3 |      46 |        1 | t               |          90 |   3 |
+ 46 | {0,2,1}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        2 |        28 |         |          | f               |             |   3 |
+ 47 | {0,2,2}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        28 |         |          | f               |             |   3 |
+  4 | {0}           |       4 | 0.888888888888889 |         1 |        2 | 0.0493827160493827 |    0 |        9 |         0 |      13 |        1 | f               |             |   4 |
+ 13 | {0,1}         |       3 |                 1 |         1 |        2 |                  0 |    0 |        6 |         4 |         |          | f               |             |   4 | {4}
+ 14 | {0,2}         |       3 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         4 |      48 |        1 | f               |             |   4 | {4}
+ 48 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        14 |         |          | t               |          90 |   4 | {4,3}
+ 49 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        14 |         |          | t               |          80 |   4 | {4,3}
+  5 | {0}           |       2 | 0.888888888888889 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      29 |        1 | t               |          90 |   5 |
+ 29 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        8 |         5 |         |          | f               |             |   5 |
+ 30 | {0,2}         |       3 |                 1 |         1 |        1 |                  0 |    0 |        1 |         5 |         |          | f               |             |   5 |
+  6 | {0}           |       3 | 0.555555555555556 |         1 |        2 |  0.345679012345679 |    0 |        9 |         0 |      15 |        1 | f               |             |   6 |
+ 15 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        3 |         6 |         |          | f               |             |   6 | {3}
+ 16 | {0,2}         |       4 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         6 |      51 |        1 | f               |             |   6 | {3}
+ 17 | {0,3}         |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |         6 |         |          | f               |             |   6 | {3}
+ 51 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        16 |         |          | t               |          96 |   6 | {3,4}
+ 52 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        16 |         |          | t               |          70 |   6 | {3,4}
+  7 | {0}           |       4 | 0.666666666666667 |         1 |        2 |  0.253968253968254 |    0 |        9 |         0 |      31 |        1 | f               |             |   7 |
+ 31 | {0,1}         |       2 | 0.857142857142857 |         1 |        2 |  0.102040816326531 |    0 |        7 |         7 |      36 |        1 | t               |          80 |   7 | {4}
+ 32 | {0,2}         |       3 |                 1 |         1 |        1 |                  0 |    0 |        2 |         7 |         |          | f               |             |   7 | {4}
+ 36 | {0,1,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        5 |        31 |         |          | f               |             |   7 |
+ 37 | {0,1,2}       |       2 |               0.5 |         1 |        2 |                0.5 |    0 |        2 |        31 |      60 |        1 | t               |          95 |   7 |
+ 60 | {0,1,2,1}     |       4 |                 1 |         1 |        1 |                  0 |    0 |        1 |        37 |         |          | f               |             |   7 |
+ 61 | {0,1,2,2}     |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        37 |         |          | f               |             |   7 |
+  8 | {0}           |       3 | 0.777777777777778 |         1 |        2 | 0.0864197530864197 |    0 |        9 |         0 |      18 |        1 | f               |             |   8 |
+ 18 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |         8 |         |          | f               |             |   8 | {3}
+ 19 | {0,2}         |       4 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         8 |      38 |        1 | f               |             |   8 | {3}
+ 20 | {0,3}         |       2 |               0.5 |         1 |        2 |                0.5 |    0 |        2 |         8 |      53 |        1 | t               |          70 |   8 | {3}
+ 38 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        19 |         |          | t               |          80 |   8 | {3,4}
+ 39 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        19 |         |          | t               |          80 |   8 | {3,4}
+ 53 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        20 |         |          | f               |             |   8 |
+ 54 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        1 |        20 |         |          | f               |             |   8 |
+  9 | {0}           |       3 | 0.555555555555556 |         1 |        2 |  0.327160493827161 |    0 |        9 |         0 |      33 |        1 | f               |             |   9 |
+ 33 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         9 |         |          | f               |             |   9 | {3}
+ 34 | {0,2}         |       4 |              0.75 |         1 |        2 |              0.375 |    0 |        4 |         9 |      55 |        1 | f               |             |   9 | {3}
+ 35 | {0,3}         |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |         9 |         |          | f               |             |   9 | {3}
+ 55 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        3 |        34 |         |          | t               |          96 |   9 | {3,4}
+ 56 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        34 |         |          | t               |          70 |   9 | {3,4}
+ 10 | {0}           |       3 | 0.666666666666667 |         1 |        2 |  0.277777777777778 |    0 |        9 |         0 |      21 |        1 | f               |             |  10 |
+ 21 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        10 |         |          | f               |             |  10 | {3}
+ 22 | {0,2}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |        10 |         |          | f               |             |  10 | {3}
+ 23 | {0,3}         |       2 |              0.75 |         1 |        1 |              0.375 |    0 |        4 |        10 |      40 |        1 | t               |          70 |  10 | {3}
+ 40 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        23 |         |          | f               |             |  10 |
+ 41 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |        23 |         |          | f               |             |  10 |
+(60 rows)
+</pre></li>
+<li>Display the random forest in a human-readable format. <pre class="example">
+SELECT * FROM madlib.rf_display( 'trained_tree_infogain'
+                               );
+</pre> Result: <pre class="result">
+                                             rf_display
+&#160;----------------------------------------------------------------------------------------------------
+&#160;
+ Tree 1
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)
+         outlook:  = overcast  : class( Play)   num_elements(4)  predict_prob(1)
+         outlook:  = rain  : class( Play)   num_elements(2)  predict_prob(1)
+         outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(0.666666666666667)
+             humidity:  &lt;= 70  : class( Play)   num_elements(1)  predict_prob(1)
+             humidity:  &gt; 70  : class( Do not Play)   num_elements(2)  predict_prob(1)
+&#160;
+ Tree 2
+     Root Node  : class( Do not Play)   num_elements(9)  predict_prob(0.555555555555556)
+         humidity:  &lt;= 65  : class( Play)   num_elements(2)  predict_prob(1)
+         humidity:  &gt; 65  : class( Do not Play)   num_elements(7)  predict_prob(0.714285714285714)
+             windy:  =  false  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)
+                 outlook:  = overcast  : class( Play)   num_elements(1)  predict_prob(1)
+                 outlook:  = rain  : class( Play)   num_elements(1)  predict_prob(1)
+                 outlook:  = sunny  : class( Do not Play)   num_elements(1)  predict_prob(1)
+             windy:  =  true  : class( Do not Play)   num_elements(4)  predict_prob(1)
+&#160;
+ Tree 3
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)
+         humidity:  &lt;= 80  : class( Play)   num_elements(6)  predict_prob(1)
+         humidity:  &gt; 80  : class( Do not Play)   num_elements(3)  predict_prob(0.666666666666667)
+             humidity:  &lt;= 90  : class( Do not Play)   num_elements(2)  predict_prob(1)
+             humidity:  &gt; 90  : class( Play)   num_elements(1)  predict_prob(1)
+&#160;
+ Tree 4
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.888888888888889)
+         windy:  =  false  : class( Play)   num_elements(6)  predict_prob(1)
+         windy:  =  true  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)
+             outlook:  = overcast  : class( Play)   num_elements(2)  predict_prob(1)
+             outlook:  = rain  : class( Do not Play)   num_elements(1)  predict_prob(1)
+&#160;
+ Tree 5
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.888888888888889)
+         humidity:  &lt;= 90  : class( Play)   num_elements(8)  predict_prob(1)
+         humidity:  &gt; 90  : class( Do not Play)   num_elements(1)  predict_prob(1)
+&#160;
+ Tree 6
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.555555555555556)
+         outlook:  = overcast  : class( Play)   num_elements(3)  predict_prob(1)
+         outlook:  = rain  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)
+             windy:  =  false  : class( Play)   num_elements(2)  predict_prob(1)
+             windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)
+         outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(1)
+&#160;
+ Tree 7
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.666666666666667)
+         windy:  =  false  : class( Play)   num_elements(7)  predict_prob(0.857142857142857)
+             humidity:  &lt;= 80  : class( Play)   num_elements(5)  predict_prob(1)
+             humidity:  &gt; 80  : class( Play)   num_elements(2)  predict_prob(0.5)
+                 humidity:  &lt;= 95  : class( Do not Play)   num_elements(1)  predict_prob(1)
+                 humidity:  &gt; 95  : class( Play)   num_elements(1)  predict_prob(1)
+         windy:  =  true  : class( Do not Play)   num_elements(2)  predict_prob(1)
+&#160;
+ Tree 8
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)
+         outlook:  = overcast  : class( Play)   num_elements(4)  predict_prob(1)
+         outlook:  = rain  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)
+             windy:  =  false  : class( Play)   num_elements(2)  predict_prob(1)
+             windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)
+         outlook:  = sunny  : class( Play)   num_elements(2)  predict_prob(0.5)
+             humidity:  &lt;= 70  : class( Play)   num_elements(1)  predict_prob(1)
+             humidity:  &gt; 70  : class( Do not Play)   num_elements(1)  predict_prob(1)
+&#160;
+ Tree 9
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.555555555555556)
+         outlook:  = overcast  : class( Play)   num_elements(2)  predict_prob(1)
+         outlook:  = rain  : class( Play)   num_elements(4)  predict_prob(0.75)
+             windy:  =  false  : class( Play)   num_elements(3)  predict_prob(1)
+             windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)
+         outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(1)
+&#160;
+ Tree 10
+     Root Node  : class( Play)   num_elements(9)  predict_prob(0.666666666666667)
+         outlook:  = overcast  : class( Play)   num_elements(1)  predict_prob(1)
+         outlook:  = rain  : class( Play)   num_elements(4)  predict_prob(1)
+         outlook:  = sunny  : class( Do not Play)   num_elements(4)  predict_prob(0.75)
+             humidity:  &lt;= 70  : class( Play)   num_elements(1)  predict_prob(1)
+             humidity:  &gt; 70  : class( Do not Play)   num_elements(3)  predict_prob(1)
+(10 rows)
+</pre></li>
+<li>Classify data with the learned model. <pre class="example">
+SELECT * FROM madlib.rf_classify( 'trained_tree_infogain',
+                                  'golf_data',
+                                  'classification_result'
+                                );
+</pre> Result: <pre class="result">
+ input_set_size | classification_time
+&#160;---------------+---------------------
+             14 | 00:00:02.215017
+(1 row)
+</pre></li>
+<li>Check the classification results. <pre class="example">
+SELECT t.id, t.outlook, t.temperature, t.humidity, t.windy, c.class
+FROM classification_result c, golf_data t
+WHERE t.id=c.id ORDER BY id;
+</pre> Result: <pre class="result">
+ id | outlook  | temperature | humidity | windy  |    class
+&#160;---+----------+-------------+----------+--------+--------------
+  1 | sunny    |          85 |       85 |  false |  Do not Play
+  2 | sunny    |          80 |       90 |  true  |  Do not Play
+  3 | overcast |          83 |       78 |  false |  Play
+  4 | rain     |          70 |       96 |  false |  Play
+  5 | rain     |          68 |       80 |  false |  Play
+  6 | rain     |          65 |       70 |  true  |  Do not Play
+  7 | overcast |          64 |       65 |  true  |  Play
+  8 | sunny    |          72 |       95 |  false |  Do not Play
+  9 | sunny    |          69 |       70 |  false |  Play
+ 10 | rain     |          75 |       80 |  false |  Play
+ 11 | sunny    |          75 |       70 |  true  |  Do not Play
+ 12 | overcast |          72 |       90 |  true  |  Play
+ 13 | overcast |          81 |       75 |  false |  Play
+ 14 | rain     |          71 |       80 |  true  |  Do not Play
+(14 rows)
+</pre></li>
+<li>Score the data against a validation set. <pre class="example">
+SELECT * FROM madlib.rf_score( 'trained_tree_infogain',
+                               'golf_data',
+                               0
+                             );
+</pre> Result: <pre class="result">
+     rf_score
+&#160;------------------
+ 0.928571428571429
+(1 row)
+</pre></li>
+<li>Clean up the random forest and other auxiliary information: <pre class="example">
+SELECT madlib.rf_clean( 'trained_tree_infogain'
+                      );
+</pre> Result: <pre class="result">
+ rf_clean
+&#160;---------
+ t
+(1 row)
+</pre></li>
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] <a href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm">http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm</a></p>
+<p>[2] <a href="http://en.wikipedia.org/wiki/Discretization_of_continuous_features">http://en.wikipedia.org/wiki/Discretization_of_continuous_features</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd>File <a class="el" href="rf_8sql__in.html" title="random forest APIs and main control logic written in PL/PGSQL ">rf.sql_in</a> documenting the SQL functions. </dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Jul 27 2015 20:37:46 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/7258f51a/docs/latest/group__grp__robust.html
----------------------------------------------------------------------
diff --git a/docs/latest/group__grp__robust.html b/docs/latest/group__grp__robust.html
new file mode 100644
index 0000000..acd0283
--- /dev/null
+++ b/docs/latest/group__grp__robust.html
@@ -0,0 +1,441 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Robust Variance</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script src="../mathjax/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="navtree_hack.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'auto');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.8</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__robust.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Robust Variance<div class="ingroups"><a class="el" href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" href="group__grp__regml.html">Regression Models</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li class="level1">
+<a href="#train_linregr">Robust Linear Regression Training Function</a> </li>
+<li class="level1">
+<a href="#train_logregr">Robust Logistic Regression Training Function</a> </li>
+<li class="level1">
+<a href="#train_mlogregr">Robust Multinomial Logistic Regression Training Function</a> </li>
+<li class="level1">
+<a href="#robust_variance_coxph">Robust Variance Function For Cox Proportional Hazards</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#background">Technical Background</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>The functions in this module calculate robust variance (Huber-White estimates) for linear regression, logistic regression, multinomial logistic regression, and Cox proportional hazards. They are useful in calculating variances in a dataset with potentially noisy outliers. The Huber-White estimator implemented here is identical to the "HC0" sandwich operator in the R module "sandwich".</p>
+<p>The interfaces for robust linear, logistic, and multinomial logistic regression are similar. Each regression type has its own training function. The regression results are saved in an output table with small differences, depending on the regression type.</p>
+<dl class="section warning"><dt>Warning</dt><dd>Please note that the interface for Cox proportional hazards, unlike the interface of other regression methods, accepts an output model table produced by the <a class="el" href="cox__prop__hazards_8sql__in.html#a737450bbfe0f10204b0074a9d45b0cef">coxph_train()</a> function.</dd></dl>
+<p><a class="anchor" id="train_linregr"></a></p><dl class="section user"><dt>Robust Linear Regression Training Function</dt><dd></dd></dl>
+<p>The <a class="el" href="robust_8sql__in.html#a390473d2fd45e268f0fc13ca971b49b4">robust_variance_linregr()</a> function has the following syntax: </p><pre class="syntax">
+robust_variance_linregr( source_table,
+                         out_table,
+                         dependent_varname,
+                         independent_varname,
+                         grouping_cols
+                       )
+</pre> <dl class="arglist">
+<dt>source_table </dt>
+<dd>VARCHAR. The name of the table containing the training data. </dd>
+<dt>out_table </dt>
+<dd><p class="startdd">VARCHAR. Name of the generated table containing the output model. The output table contains the following columns. </p><table  class="output">
+<tr>
+<th>coef </th><td>DOUBLE PRECISION[]. Vector of the coefficients of the regression.  </td></tr>
+<tr>
+<th>std_err </th><td>DOUBLE PRECISION[]. Vector of the standard error of the coefficients.  </td></tr>
+<tr>
+<th>t_stats </th><td>DOUBLE PRECISION[]. Vector of the t-stats of the coefficients.  </td></tr>
+<tr>
+<th>p_values </th><td>DOUBLE PRECISION[]. Vector of the p-values of the coefficients.  </td></tr>
+</table>
+<p class="enddd">A summary table named &lt;out_table&gt;_summary is also created, which is the same as the summary table created by linregr_train function. Please refer to the documentation for linear regression for details.  </p>
+</dd>
+<dt>dependent_varname </dt>
+<dd>VARCHAR. The name of the column containing the dependent variable. </dd>
+<dt>independent_varname </dt>
+<dd>VARCHAR. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant 1 term in the independent variable list.  </dd>
+<dt>grouping_cols (optional) </dt>
+<dd>VARCHAR, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no grouping is used and a single result model is generated. Default value: NULL.  </dd>
+</dl>
+<p><a class="anchor" id="train_logregr"></a></p><dl class="section user"><dt>Robust Logistic Regression Training Function</dt><dd></dd></dl>
+<p>The <a class="el" href="robust_8sql__in.html#abc20ec2c5e74f268e7727c33a4bb9054">robust_variance_logregr()</a> function has the following syntax: </p><pre class="syntax">
+robust_variance_logregr( source_table,
+                         out_table,
+                         dependent_varname,
+                         independent_varname,
+                         grouping_cols,
+                         max_iter,
+                         optimizer,
+                         tolerance,
+                         verbose_mode
+                       )
+</pre> <dl class="arglist">
+<dt>source_table </dt>
+<dd>VARCHAR. The name of the table containing the training data. </dd>
+<dt>out_table </dt>
+<dd><p class="startdd">VARCHAR. Name of the generated table containing the output model. The output table has the following columns: </p><table  class="output">
+<tr>
+<th>coef </th><td>Vector of the coefficients of the regression.  </td></tr>
+<tr>
+<th>std_err </th><td>Vector of the standard error of the coefficients.  </td></tr>
+<tr>
+<th>z_stats </th><td>Vector of the z-stats of the coefficients.  </td></tr>
+<tr>
+<th>p_values </th><td>Vector of the p-values of the coefficients.  </td></tr>
+</table>
+<p class="enddd">A summary table named &lt;out_table&gt;_summary is also created, which is the same as the summary table created by logregr_train function. Please refer to the documentation for logistic regression for details.  </p>
+</dd>
+<dt>dependent_varname </dt>
+<dd>VARCHAR. The name of the column containing the dependent variable. </dd>
+<dt>independent_varname </dt>
+<dd>VARCHAR. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant 1 term in the independent variable list. </dd>
+<dt>grouping_cols (optional) </dt>
+<dd>VARCHAR, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no grouping is used and a single result model is generated.  </dd>
+<dt>max_iter (optional) </dt>
+<dd>INTEGER, default: 20. The maximum number of iterations that are allowed. </dd>
+<dt>optimizer (optional) </dt>
+<dd>VARCHAR, default: 'fista'. Name of optimizer, either 'fista' or 'igd'. </dd>
+<dt>tolerance (optional) </dt>
+<dd>DOUBLE PRECISION, default: 1e-6. The criteria to end iterations. Both the 'fista' and 'igd' optimizers compute the average difference between the coefficients of two consecutive iterations, and when the difference is smaller than tolerance or the iteration number is larger than max_iter, the computation stops.  </dd>
+<dt>verbose_mode (optional) </dt>
+<dd>BOOLEAN, default: FALSE. Whether the regression fit should print any warning messages.  </dd>
+</dl>
+<p><a class="anchor" id="train_mlogregr"></a></p><dl class="section user"><dt>Robust Multinomial Logistic Regression Training Function</dt><dd></dd></dl>
+<p>The <a class="el" href="robust_8sql__in.html#a1f27c072a4ef885a55825f75d12b3bd8">robust_variance_mlogregr()</a> function has the following syntax: </p><pre class="syntax">
+robust_variance_mlogregr( source_table,
+                          out_table,
+                          dependent_varname,
+                          independent_varname,
+                          ref_category,
+                          grouping_cols,
+                          optimizer_params,
+                          verbose_mode
+                        )
+</pre> <dl class="arglist">
+<dt>source_table </dt>
+<dd>VARCHAR. The name of the table containing training data, properly qualified. </dd>
+<dt>out_table </dt>
+<dd><p class="startdd">VARCHAR. The name of the table where the regression model will be stored. The output table has the following columns: </p><table  class="output">
+<tr>
+<th>category </th><td>The category.  </td></tr>
+<tr>
+<th>ref_category </th><td>The reference category used for modeling.  </td></tr>
+<tr>
+<th>coef </th><td>Vector of the coefficients of the regression.  </td></tr>
+<tr>
+<th>std_err </th><td>Vector of the standard error of the coefficients.  </td></tr>
+<tr>
+<th>z_stats </th><td>Vector of the z-stats of the coefficients.  </td></tr>
+<tr>
+<th>p_values </th><td>Vector of the p-values of the coefficients.  </td></tr>
+</table>
+<p class="enddd">A summary table named &lt;out_table&gt;_summary is also created, which is the same as the summary table created by mlogregr_train function. Please refer to the documentation for multinomial logistic regression for details.  </p>
+</dd>
+<dt>dependent_varname </dt>
+<dd>VARCHAR. The name of the column containing the dependent variable. </dd>
+<dt>independent_varname </dt>
+<dd>VARCHAR. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant 1 term in the independent variable list. The <em>independent_varname</em> can be the name of a column that contains an array of numeric values. It can also be a string with the format 'ARRAY[1, x1, x2, x3]', where <em>x1</em>, <em>x2</em> and <em>x3</em> are each column names. </dd>
+<dt>ref_category (optional) </dt>
+<dd>INTEGER, default: 0. The reference category. </dd>
+<dt>grouping_cols (optional) </dt>
+<dd>VARCHAR, default: NULL. <em>Not currently implemented. Any non-NULL value is ignored.</em> An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no grouping is used and a single result model is generated. </dd>
+<dt>optimizer_params (optional) </dt>
+<dd>TEXT, default: NULL, which uses the default values of optimizer parameters: max_iter=20, optimizer='newton', tolerance=1e-4. It should be a string that contains pairs of 'key=value' separated by commas. </dd>
+<dt>verbose_mode (optional) </dt>
+<dd>BOOLEAN, default: FALSE. <em>Not currently implemented.</em> TRUE if the regression fit should print warning messages. </dd>
+</dl>
+<p><a class="anchor" id="robust_variance_coxph"></a></p><dl class="section user"><dt>Robust Variance Function For Cox Proportional Hazards</dt><dd></dd></dl>
+<p>The <a class="el" href="clustered__variance__coxph_8sql__in.html#abaeae5d6cd30db4b06a49d24d714812e">robust_variance_coxph()</a> function has the following syntax: </p><pre class="syntax">
+robust_variance_coxph(model_table, output_table)
+</pre><p><b>Arguments</b> </p><dl class="arglist">
+<dt>model_table </dt>
+<dd>TEXT. The name of the model table, which is exactly the same as the 'output_table' parameter of the <a class="el" href="cox__prop__hazards_8sql__in.html#a737450bbfe0f10204b0074a9d45b0cef" title="Compute cox-regression coefficients and diagnostic statistics. ">coxph_train()</a> function. </dd>
+<dt>output_table </dt>
+<dd>TEXT. The name of the table where the output is saved. It has the following columns: <table  class="output">
+<tr>
+<th>coef </th><td>FLOAT8[]. Vector of the coefficients.  </td></tr>
+<tr>
+<th>loglikelihood </th><td>FLOAT8. Log-likelihood value of the MLE estimate.  </td></tr>
+<tr>
+<th>std_err </th><td>FLOAT8[]. Vector of the standard error of the coefficients.  </td></tr>
+<tr>
+<th>robust_se </th><td>FLOAT8[]. Vector of the robust standard errors of the coefficients.  </td></tr>
+<tr>
+<th>robust_z </th><td>FLOAT8[]. Vector of the robust z-stats of the coefficients.  </td></tr>
+<tr>
+<th>robust_p </th><td>FLOAT8[]. Vector of the robust p-values of the coefficients.  </td></tr>
+<tr>
+<th>hessian </th><td>FLOAT8[]. The Hessian matrix.  </td></tr>
+</table>
+</dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
+<p><b> Logistic Regression Example </b></p><ol type="1">
+<li>View online help for the logistic regression training function. <pre class="example">
+SELECT madlib.robust_variance_logregr();
+</pre></li>
+<li>Create the training data table. <pre class="example">
+DROP TABLE IF EXISTS patients;
+CREATE TABLE patients (id INTEGER NOT NULL, second_attack INTEGER,
+    treatment INTEGER, trait_anxiety INTEGER);
+COPY patients FROM STDIN WITH DELIMITER '|';
+  1 |             1 |         1 |            70
+  3 |             1 |         1 |            50
+  5 |             1 |         0 |            40
+  7 |             1 |         0 |            75
+  9 |             1 |         0 |            70
+ 11 |             0 |         1 |            65
+ 13 |             0 |         1 |            45
+ 15 |             0 |         1 |            40
+ 17 |             0 |         0 |            55
+ 19 |             0 |         0 |            50
+  2 |             1 |         1 |            80
+  4 |             1 |         0 |            60
+  6 |             1 |         0 |            65
+  8 |             1 |         0 |            80
+ 10 |             1 |         0 |            60
+ 12 |             0 |         1 |            50
+ 14 |             0 |         1 |            35
+ 16 |             0 |         1 |            50
+ 18 |             0 |         0 |            45
+ 20 |             0 |         0 |            60
+\.
+</pre></li>
+<li>Run the logistic regression training function and compute the robust logistic variance of the regression: <pre class="example">
+DROP TABLE IF EXISTS patients_logregr;
+SELECT madlib.robust_variance_logregr( 'patients',
+                                       'patients_logregr',
+                                       'second_attack',
+                                       'ARRAY[1, treatment, trait_anxiety]'
+                                     );
+</pre></li>
+<li>View the regression results. <pre class="example">
+\x on
+Expanded display is on.
+SELECT * FROM patients_logregr;
+</pre> Result: <pre class="result">
+&#160;-[ RECORD 1 ]-------------------------------------------------------
+ coef     | {-6.36346994178179,-1.02410605239327,0.119044916668605}
+ std_err  | {3.45872062333648,1.1716192578234,0.0534328864185018}
+ z_stats  | {-1.83983346294192,-0.874094587943036,2.22793348156809}
+ p_values | {0.0657926909738889,0.382066744585541,0.0258849510757339}
+</pre> Alternatively, unnest the arrays in the results for easier reading of output. <pre class="example">
+\x off
+SELECT unnest(array['intercept', 'treatment', 'trait_anxiety' ]) as attribute,
+       unnest(coef) as coefficient,
+       unnest(std_err) as standard_error,
+       unnest(z_stats) as z_stat,
+       unnest(p_values) as pvalue
+FROM patients_logregr;
+</pre></li>
+</ol>
+<p><b> Cox Proportional Hazards Example </b></p><ol type="1">
+<li>View online help for the robust Cox proportional hazards training method. <pre class="example">
+SELECT madlib.robust_variance_coxph();
+</pre></li>
+<li>Create an input data set. <pre class="example">
+DROP TABLE IF EXISTS sample_data;
+CREATE TABLE sample_data (
+    id INTEGER NOT NULL,
+    grp DOUBLE PRECISION,
+    wbc DOUBLE PRECISION,
+    timedeath INTEGER,
+    status BOOLEAN
+);
+COPY sample_data FROM STDIN DELIMITER '|';
+  0 |   0 | 1.45 |        35 | t
+  1 |   0 | 1.47 |        34 | t
+  3 |   0 |  2.2 |        32 | t
+  4 |   0 | 1.78 |        25 | t
+  5 |   0 | 2.57 |        23 | t
+  6 |   0 | 2.32 |        22 | t
+  7 |   0 | 2.01 |        20 | t
+  8 |   0 | 2.05 |        19 | t
+  9 |   0 | 2.16 |        17 | t
+ 10 |   0 |  3.6 |        16 | t
+ 11 |   1 |  2.3 |        15 | t
+ 12 |   0 | 2.88 |        13 | t
+ 13 |   1 |  1.5 |        12 | t
+ 14 |   0 |  2.6 |        11 | t
+ 15 |   0 |  2.7 |        10 | t
+ 16 |   0 |  2.8 |         9 | t
+ 17 |   1 | 2.32 |         8 | t
+ 18 |   0 | 4.43 |         7 | t
+ 19 |   0 | 2.31 |         6 | t
+ 20 |   1 | 3.49 |         5 | t
+ 21 |   1 | 2.42 |         4 | t
+ 22 |   1 | 4.01 |         3 | t
+ 23 |   1 | 4.91 |         2 | t
+ 24 |   1 |    5 |         1 | t
+\.
+</pre></li>
+<li>Run the Cox regression function. <pre class="example">
+SELECT madlib.coxph_train( 'sample_data',
+                           'sample_cox',
+                           'timedeath',
+                           'ARRAY[grp,wbc]',
+                           'status'
+                         );
+</pre></li>
+<li>Run the Robust Cox regression function. <pre class="example">
+SELECT madlib.robust_variance_coxph( 'sample_cox',
+                                     'sample_robust_cox'
+                                   );
+</pre></li>
+<li>View the results of the robust Cox regression. <pre class="example">
+\x on
+SELECT * FROM sample_robust_cox;
+</pre> Results: <pre class="result">
+-[ RECORD 1 ]-+----------------------------------------------------------------------------
+coef          | {2.54407073265105,1.67172094780081}
+loglikelihood | -37.8532498733452
+std_err       | {0.677180599295459,0.387195514577754}
+robust_se     | {0.621095581073685,0.274773521439328}
+robust_z      | {4.09610180811965,6.08399579058399}
+robust_p      | {4.2016521208424e-05,1.17223683104729e-09}
+hessian       | {{2.78043065745405,-2.25848560642669},{-2.25848560642669,8.50472838284265}}
+</pre></li>
+</ol>
+<p><a class="anchor" id="background"></a></p><dl class="section user"><dt>Technical Background</dt><dd></dd></dl>
+<p>When doing regression analysis, we are sometimes interested in the variance of the computed coefficients \( \boldsymbol c \). While the built-in regression functions provide variance estimates, we may prefer a <em>robust</em> variance estimate.</p>
+<p>The robust variance calculation can be expressed in a sandwich formation, which is the form </p><p class="formulaDsp">
+\[ S( \boldsymbol c) = B( \boldsymbol c) M( \boldsymbol c) B( \boldsymbol c) \]
+</p>
+<p> where \( B( \boldsymbol c)\) and \( M( \boldsymbol c)\) are matrices. The \( B( \boldsymbol c) \) matrix, also known as the bread, is relatively straightforward, and can be computed as </p><p class="formulaDsp">
+\[ B( \boldsymbol c) = n\left(\sum_i^n -H(y_i, x_i, \boldsymbol c) \right)^{-1} \]
+</p>
+<p> where \( H \) is the hessian matrix.</p>
+<p>The \( M( \boldsymbol c)\) matrix has several variations, each with different robustness properties. The form implemented here is the Huber-White sandwich operator, which takes the form </p><p class="formulaDsp">
+\[ M_{H} =\frac{1}{n} \sum_i^n \psi(y_i,x_i, \boldsymbol c)^T \psi(y_i,x_i, \boldsymbol c). \]
+</p>
+<p>The above method for calculating robust variance (Huber-White estimates) is implemented for linear regression, logistic regression, and multinomial logistic regression. It is useful in calculating variances in a dataset with potentially noisy outliers. The Huber-White estimator implemented here is identical to the "HC0" sandwich operator in the R module "sandwich".</p>
+<p>When multinomial logistic regression is computed before the multinomial robust regression, it uses a default reference category of zero and the regression coefficients are included in the output table. The regression coefficients in the output are in the same order as the multinomial logistic regression function, which is described below. For a problem with \( K \) independent variables \( (1, ..., K) \) and \( J \) categories \( (0, ..., J-1) \), let \( {m_{k,j}} \) denote the coefficient for independent variable \( k \) and category \( j \). The output is \( {m_{k_1, j_0}, m_{k_1, j_1} \ldots m_{k_1, j_{J-1}}, m_{k_2, j_0}, m_{k_2, j_1} \ldots m_{k_K, j_{J-1}}} \). The order is NOT CONSISTENT with the multinomial regression marginal effect calculation with function <em>marginal_mlogregr</em>. This is deliberate because the interfaces of all multinomial regressions (robust, clustered, ...) will be moved to match that used in marginal.</p>
+<p>The robust variance of Cox proportional hazards is more complex because the coefficients are trained by maximizing a partial log-likelihood. Therefore, one cannot directly use the formula for \( M( \boldsymbol c) \) as in the Huber-White robust estimator. Extra terms are needed. See [4] for details.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] vce(cluster) function in STATA: <a href="http://www.stata.com/help.cgi?vce_option">http://www.stata.com/help.cgi?vce_option</a></p>
+<p>[2] clustered estimators in R: <a href="http://people.su.se/~ma/clustering.pdf">http://people.su.se/~ma/clustering.pdf</a></p>
+<p>[3] Achim Zeileis: Object-oriented Computation of Sandwich Estimators. Research Report Series / Department of Statistics and Mathematics, 37. Department of Statistics and Mathematics, WU Vienna University of Economics and Business, Vienna. <a href="http://cran.r-project.org/web/packages/sandwich/vignettes/sandwich-OOP.pdf">http://cran.r-project.org/web/packages/sandwich/vignettes/sandwich-OOP.pdf</a></p>
+<p>[4] D. Y. Lin and L. J. Wei, <em>The Robust Inference for the Cox Proportional Hazards Model</em>, Journal of the American Statistical Association, Vol. 84, No. 408, p.1074 (1989).</p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd>File <a class="el" href="robust_8sql__in.html" title="SQL functions for robust variance linear and logistic regression. ">robust.sql_in</a> documenting the SQL functions. File <a class="el" href="robust__variance__coxph_8sql__in.html" title="SQL functions for robust cox proportional hazards regression. ">robust_variance_coxph.sql_in</a> documenting more of the SQL functions.</dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Jul 27 2015 20:37:45 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/7258f51a/docs/latest/group__grp__sample.html
----------------------------------------------------------------------
diff --git a/docs/latest/group__grp__sample.html b/docs/latest/group__grp__sample.html
new file mode 100644
index 0000000..5e26715
--- /dev/null
+++ b/docs/latest/group__grp__sample.html
@@ -0,0 +1,151 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Random Sampling</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script src="../mathjax/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="navtree_hack.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'auto');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.8</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__sample.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Random Sampling<div class="ingroups"><a class="el" href="group__grp__early__stage.html">Early Stage Development</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#syntax">Functions</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method is still in early stage development. There may be some issues that will be addressed in a future version. Interface and implementation are subject to change. </em></dd></dl>
+<p>The random sampling module consists of useful utility functions for sampling operations. These functions can be used while implementing new algorithms.</p>
+<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Functions</dt><dd></dd></dl>
+<p>Sample a single row according to weights. </p><pre class="syntax">
+weighted_sample( value,
+                 weight
+               )
+</pre><p><b>Arguments</b> </p><dl class="arglist">
+<dt>value </dt>
+<dd>BIGINT or FLOAT8[]. Value of row. Uniqueness is not enforced. If a value occurs multiple times, the probability of sampling this value is proportional to the sum of its weights.  </dd>
+<dt>weight </dt>
+<dd>FLOAT8. Weight for row. A negative value here is treated as zero weight.  </dd>
+</dl>
+<p>Refer to the file for documentation on each of the utility functions.</p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl>
+<dl class="section see"><dt>See also</dt><dd>File <a class="el" href="sample_8sql__in.html" title="SQL functions for random sampling. ">sample.sql_in</a> documenting the SQL functions. </dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Jul 27 2015 20:37:45 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/7258f51a/docs/latest/group__grp__sketches.html
----------------------------------------------------------------------
diff --git a/docs/latest/group__grp__sketches.html b/docs/latest/group__grp__sketches.html
new file mode 100644
index 0000000..4365737
--- /dev/null
+++ b/docs/latest/group__grp__sketches.html
@@ -0,0 +1,161 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
+<title>MADlib: Cardinality Estimators</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script src="../mathjax/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="navtree_hack.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'auto');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.8</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__sketches.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Cardinality Estimators<div class="ingroups"><a class="el" href="group__grp__early__stage.html">Early Stage Development</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
+<dl class="section warning"><dt>Warning</dt><dd><em> These MADlib methods are still in early stage development. There may be some issues that will be addressed in future versions. Interface and implementation are subject to change. </em></dd></dl>
+<p>Sketches (sometimes called "synopsis data structures") are small randomized in-memory data structures that capture statistical properties of a large set of values (e.g., a column of a table). Sketches can be formed in a single pass of the data, and used to approximate a variety of descriptive statistics.</p>
+<p>We implement sketches as SQL User-Defined Aggregates (UDAs). Because they are single-pass, small-space and parallelized, a single query can use many sketches to gather summary statistics on many columns of a table efficiently.</p>
+<p>This module currently implements user-defined aggregates based on three main sketch methods:</p><ul>
+<li><em>Count-Min (CM)</em> sketches, which can be used to approximate a number of descriptive statistics including<ul>
+<li><code>COUNT(*)</code> of rows whose column value matches a given value in a set</li>
+<li><code>COUNT(*)</code> of rows whose column value falls in a range (*)</li>
+<li>order statistics including <em>median</em> and <em>centiles</em> (*)</li>
+<li><em>histograms</em>: both <em>equi-width</em> and <em>equi-depth</em> (*)</li>
+</ul>
+</li>
+<li><em>Flajolet-Martin (FM)</em> sketches for approximating <code>COUNT(DISTINCT)</code>.</li>
+<li><em>Most Frequent Value (MFV)</em> sketches, which output the most frequently-occurring values in a column, along with their associated counts.</li>
+</ul>
+<p><em>Note:</em> Features marked with a star (*) only work for discrete types that can be cast to int8.</p>
+<p>The sketch methods consist of a number of SQL UDAs (user-defined aggregates) and UDFs (user-defined functions), to be used directly in SQL queries. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__countmin"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__countmin.html">CountMin (Cormode-Muthukrishnan)</a></td></tr>
+<tr class="memdesc:group__grp__countmin"><td class="mdescLeft">&#160;</td><td class="mdescRight">Implements Cormode-Muthukrishnan <em>CountMin</em> sketches on integer values as a user-defined aggregate. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__fmsketch"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__fmsketch.html">FM (Flajolet-Martin)</a></td></tr>
+<tr class="memdesc:group__grp__fmsketch"><td class="mdescLeft">&#160;</td><td class="mdescRight">Implements Flajolet-Martin's distinct count estimation as a user-defined aggregate. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__mfvsketch"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__mfvsketch.html">MFV (Most Frequent Values)</a></td></tr>
+<tr class="memdesc:group__grp__mfvsketch"><td class="mdescLeft">&#160;</td><td class="mdescRight">Implements the most frequent values variant of the CountMin sketch as a user-defined aggregate. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Jul 27 2015 20:37:45 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/7258f51a/docs/latest/group__grp__sketches.js
----------------------------------------------------------------------
diff --git a/docs/latest/group__grp__sketches.js b/docs/latest/group__grp__sketches.js
new file mode 100644
index 0000000..1e443dd
--- /dev/null
+++ b/docs/latest/group__grp__sketches.js
@@ -0,0 +1,6 @@
+var group__grp__sketches =
+[
+    [ "CountMin (Cormode-Muthukrishnan)", "group__grp__countmin.html", null ],
+    [ "FM (Flajolet-Martin)", "group__grp__fmsketch.html", null ],
+    [ "MFV (Most Frequent Values)", "group__grp__mfvsketch.html", null ]
+];
\ No newline at end of file