You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/01/10 01:07:21 UTC

incubator-madlib git commit: DT and RF: Adds verbose option for the dot output format.

Repository: incubator-madlib
Updated Branches:
  refs/heads/master c56b20910 -> 02f4602a5


DT and RF: Adds verbose option for the dot output format.

JIRA: MADLIB-1051

Closes #86


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/02f4602a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/02f4602a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/02f4602a

Branch: refs/heads/master
Commit: 02f4602a5554491c1a6d96654d34da29e5275254
Parents: c56b209
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Mon Jan 9 17:04:54 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Mon Jan 9 17:04:54 2017 -0800

----------------------------------------------------------------------
 src/modules/recursive_partitioning/DT_impl.hpp  |  70 ++++++-----
 src/modules/recursive_partitioning/DT_proto.hpp |  10 +-
 .../recursive_partitioning/decision_tree.cpp    |   9 +-
 .../recursive_partitioning/decision_tree.py_in  |   6 +-
 .../recursive_partitioning/decision_tree.sql_in | 116 ++++++++++++++-----
 .../recursive_partitioning/random_forest.py_in  |   6 +-
 .../recursive_partitioning/random_forest.sql_in |  42 +++++--
 7 files changed, 179 insertions(+), 80 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/DT_impl.hpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp
index 702ac2c..f622f94 100644
--- a/src/modules/recursive_partitioning/DT_impl.hpp
+++ b/src/modules/recursive_partitioning/DT_impl.hpp
@@ -1092,7 +1092,8 @@ string
 DecisionTree<Container>::displayLeafNode(
             Index id,
             ArrayHandle<text*> &dep_levels,
-            const std::string & id_prefix){
+            const std::string & id_prefix,
+            bool verbose){
     std::stringstream predict_str;
     if (static_cast<bool>(is_regression)){
         predict_str << predict_response(id);
@@ -1105,14 +1106,14 @@ DecisionTree<Container>::displayLeafNode(
     std::stringstream display_str;
     display_str << "\"" << id_prefix << id << "\" [label=\"" << predict_str.str();
 
-    // // uncomment below if distribution of rows is required in leaf node
-    // display_str << "\\n[";
-    // if (is_regression)
-    //      display_str << statCount(predictions.row(id)) << ", "
-    //                  << statPredict(predictions.row(id));
-    // else
-    //     display_str << predictions.row(id);
-    // display_str << "]";
+    if(verbose){
+        display_str << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = ";
+        if (is_regression)
+            display_str << statPredict(predictions.row(id));
+        else{
+            display_str << "[" << predictions.row(id).head(n_y_labels)<< "]";
+        }
+    }
     display_str << "\",shape=box]" << ";";
     return display_str.str();
 }
@@ -1130,7 +1131,9 @@ DecisionTree<Container>::displayInternalNode(
             ArrayHandle<text*> &con_features_str,
             ArrayHandle<text*> &cat_levels_text,
             ArrayHandle<int> &cat_n_levels,
-            const std::string & id_prefix
+            ArrayHandle<text*> &dep_levels,
+            const std::string & id_prefix,
+            bool verbose
             ){
 
     string feature_name;
@@ -1149,16 +1152,26 @@ DecisionTree<Container>::displayInternalNode(
 
     std::stringstream display_str;
     display_str << "\"" << id_prefix << id << "\" [label=\"" << label_str.str();
-    // // uncomment below if distribution of rows is required in internal node
-    // display_str << "\\n[";
-    // if (is_regression)
-    //      display_str << statCount(predictions.row(id)) << ", "
-    //                  << statPredict(predictions.row(id));
-    // else
-    //     display_str << predictions.row(id);
-    // display_str << "]";
-    display_str <<"\", shape=ellipse]" << ";";
-   return display_str.str();
+    if(verbose){
+
+        display_str << "\\n impurity = "<< impurity(predictions.row(id)) << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = ";
+        if (is_regression)
+            display_str << statPredict(predictions.row(id));
+        else{
+            display_str << "[" << predictions.row(id).head(n_y_labels)<< "]";
+        }
+        std::stringstream predict_str;
+        if (static_cast<bool>(is_regression)){
+            predict_str << predict_response(id);
+        }
+        else{
+            std::string dep_value = get_text(dep_levels, static_cast<int>(predict_response(id)));
+            predict_str << escape_quotes(dep_value);
+        }
+        display_str << "\\n class = " << predict_str.str();
+    }
+    display_str << "\", shape=ellipse]" << ";";
+    return display_str.str();
 }
 // -------------------------------------------------------------------------
 
@@ -1174,11 +1187,12 @@ DecisionTree<Container>::display(
         ArrayHandle<text*> &cat_levels_text,
         ArrayHandle<int> &cat_n_levels,
         ArrayHandle<text*> &dependent_levels,
-        const std::string &id_prefix) {
+        const std::string &id_prefix,
+        bool verbose) {
 
     std::stringstream display_string;
     if (feature_indices(0) == FINISHED_LEAF){
-        display_string << displayLeafNode(0, dependent_levels, id_prefix)
+        display_string << displayLeafNode(0, dependent_levels, id_prefix, verbose)
                        << std::endl;
     }
     else{
@@ -1189,7 +1203,7 @@ DecisionTree<Container>::display(
 
                 display_string << displayInternalNode(
                         index, cat_features_str, con_features_str,
-                        cat_levels_text, cat_n_levels, id_prefix) << std::endl;
+                        cat_levels_text, cat_n_levels, dependent_levels, id_prefix, verbose) << std::endl;
 
                 // Display the children
                 Index tc = trueChild(index);
@@ -1203,7 +1217,7 @@ DecisionTree<Container>::display(
                     if (feature_indices(tc) == IN_PROCESS_LEAF ||
                         feature_indices(tc) == FINISHED_LEAF)
                         display_string
-                            << displayLeafNode(tc, dependent_levels, id_prefix)
+                            << displayLeafNode(tc, dependent_levels, id_prefix, verbose)
                             << std::endl;
                 }
 
@@ -1218,7 +1232,7 @@ DecisionTree<Container>::display(
                     if (feature_indices(fc) == IN_PROCESS_LEAF ||
                         feature_indices(fc) == FINISHED_LEAF)
                         display_string
-                            << displayLeafNode(fc, dependent_levels, id_prefix)
+                            << displayLeafNode(fc, dependent_levels, id_prefix, verbose)
                             << std::endl;
                 }
             }
@@ -1664,7 +1678,7 @@ TreeAccumulator<Container, DTree>::operator<<(const surr_tuple_type& inTuple) {
                             updateSurrStats(true,
                                             is_primary_true == is_surrogate_true,
                                             row_index,
-                                            col_index, 
+                                            col_index,
                                             dup_count);
                         }
                     }
@@ -1731,7 +1745,7 @@ TreeAccumulator<Container, DTree>::updateNodeStats(bool is_regression,
                                                   const double weight) {
     ColumnVector stats(stats_per_split);
     stats.fill(0);
-    int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1; 
+    int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
     if (is_regression){
         double w_response = weight * response;
         stats << weight, w_response, w_response * response, n_rows;
@@ -1758,7 +1772,7 @@ TreeAccumulator<Container, DTree>::updateStats(bool is_regression,
                                                const double weight) {
     ColumnVector stats(stats_per_split);
     stats.fill(0);
-    int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1; 
+    int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
     if (is_regression){
         double w_response = weight * response;
         stats << weight, w_response, w_response * response, n_rows;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/DT_proto.hpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/DT_proto.hpp b/src/modules/recursive_partitioning/DT_proto.hpp
index 2f5e211..a2881a5 100644
--- a/src/modules/recursive_partitioning/DT_proto.hpp
+++ b/src/modules/recursive_partitioning/DT_proto.hpp
@@ -123,15 +123,17 @@ public:
     }
 
     uint16_t recomputeTreeDepth() const;
-    string displayLeafNode(Index id, ArrayHandle<text*> &dep_levels, const std::string & id_prefix);
+    string displayLeafNode(Index id, ArrayHandle<text*> &dep_levels, const std::string & id_prefix, bool verbose);
     string displayInternalNode(Index id,
                                ArrayHandle<text*> &cat_features_str,
                                ArrayHandle<text*> &con_features_str,
                                ArrayHandle<text*> &cat_levels_text,
                                ArrayHandle<int> &cat_n_levels,
-                               const std::string & id_prefix);
+                               ArrayHandle<text*> &dep_levels,
+                               const std::string & id_prefix,
+                               bool verbose);
     string display(ArrayHandle<text*>&, ArrayHandle<text*>&, ArrayHandle<text*>&,
-                   ArrayHandle<int>&, ArrayHandle<text*>&, const std::string&);
+                   ArrayHandle<int>&, ArrayHandle<text*>&, const std::string&, bool verbose);
     string getCatLabels(Index, Index, Index, ArrayHandle<text*> &,
                         ArrayHandle<int> &);
     string print_split(bool, bool, Index, double,
@@ -234,7 +236,7 @@ public:
                         MappedColumnVector,  // continuous feature values
                         MappedIntegerVector, // levels for each categorical feature
                         MappedMatrix,        // split values for each continuous feature
-                        int                  // duplicated count for each tuple 
+                        int                  // duplicated count for each tuple
                                              //   (used in random forest)
                        > surr_tuple_type;
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/decision_tree.cpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp
index 8f74c07..7a8ec95 100644
--- a/src/modules/recursive_partitioning/decision_tree.cpp
+++ b/src/modules/recursive_partitioning/decision_tree.cpp
@@ -166,7 +166,7 @@ compute_leaf_stats_transition::run(AnyType & args){
                      static_cast<uint16_t>(con_features.size()),
                      static_cast<uint32_t>(cat_levels.sum()),
                      static_cast<uint16_t>(dt.tree_depth),
-                     stats_per_split, 
+                     stats_per_split,
                      weights_as_rows
                     );
         // compute cumulative sum of the levels of the categorical variables
@@ -408,10 +408,11 @@ display_decision_tree::run(AnyType &args) {
     ArrayHandle<int> cat_n_levels = args[4].getAs<ArrayHandle<int> >();
     ArrayHandle<text*> dependent_var_levels = args[5].getAs<ArrayHandle<text*> >();
     std::string id_prefix = args[6].getAs<std::string>();
+    bool verbose = args[7].getAs<bool>();
 
     string tree_str = dt.display(cat_feature_names, con_feature_names,
                                  cat_levels_text, cat_n_levels,
-                                 dependent_var_levels, id_prefix);
+                                 dependent_var_levels, id_prefix, verbose);
     return tree_str;
 }
 
@@ -893,9 +894,9 @@ void fill_one_row(MutableNativeMatrix &frame, Tree &dt, int me, int i,
 /*
  * PivotalR: randomForest
  * Convert to R's randomForest format for getTree(..) function
- * 
+ *
  */
-AnyType 
+AnyType
 convert_to_random_forest_format::run(AnyType &args) {
     Tree dt = args[0].getAs<ByteString>();
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index ccba636..40f4b7e 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -1791,7 +1791,7 @@ def _get_display_header(table_name, dep_levels, is_regression, dot_format=True):
 #------------------------------------------------------------------------------
 
 
-def tree_display(schema_madlib, model_table, dot_format=True,
+def tree_display(schema_madlib, model_table, dot_format=True, verbose=False,
                  disp_surr=False, **kwargs):
 
     if dot_format:
@@ -1862,9 +1862,9 @@ def tree_display(schema_madlib, model_table, dot_format=True,
                     return_str_list.append('\t label="{0}"'.format(group_name.replace('"', '\\"')))
                     sql = """
                             SELECT {0}._display_decision_tree(
-                                        $1, $2, $3, $4, $5, $6, '{1}'
+                                        $1, $2, $3, $4, $5, $6, '{1}', {2}
                                     ) as display_tree
-                          """.format(schema_madlib, "g" + str(index) + "_")
+                          """.format(schema_madlib, "g" + str(index) + "_", verbose)
                 else:
                     if group_name:
                         return_str_list.append("--- Tree for {0} ---".format(group_name))

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index a2538cf..d6b7b5a 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -412,7 +412,7 @@ be visualized using various programs including those in the GraphViz package, or
 in a simple text format. The details of the text format are output with the
 tree.
 <pre class="syntax">
-tree_display(tree_model, dot_format)
+tree_display(tree_model, dot_format, verbosity)
 </pre>
 
 An additional display function is provided to output the surrogate splits chosen
@@ -439,6 +439,9 @@ are NULL, then the majority branch is used to compute the split for a tuple.
     <DT>dot_format</DT>
     <DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text
     format. If TRUE, the result is in the dot format, else output is in text format</DD>
+    <DT>verbosity</DT>
+    <DD>BOOLEAN, default = FALSE. If true, the dot format output will contain
+    additional information (impurity, sample size, etc.)</DD>
 </DL>
 
 The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -487,20 +490,20 @@ CREATE TABLE dt_golf (
 </pre>
 <pre class="example">
 COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DELIMITER '|';
-1|sunny|85|85|'false'|'Don''t Play'
-2|sunny|80|90|'true'|'Don''t Play'
+1|sunny|85|85|'false'|'Don't Play'
+2|sunny|80|90|'true'|'Don't Play'
 3|overcast|83|78|'false'|'Play'
 4|rain|70|96|'false'|'Play'
 5|rain|68|80|'false'|'Play'
-6|rain|65|70|'true'|'Don''t Play'
+6|rain|65|70|'true'|'Don't Play'
 7|overcast|64|65|'true'|'Play'
-8|sunny|72|95|'false'|'Don''t Play'
+8|sunny|72|95|'false'|'Don't Play'
 9|sunny|69|70|'false'|'Play'
 10|rain|75|80|'false'|'Play'
 11|sunny|75|70|'true'|'Play'
 12|overcast|72|90|'true'|'Play'
 13|overcast|81|75|'false'|'Play'
-14|rain|71|80|'true'|'Don''t Play'
+14|rain|71|80|'true'|'Don't Play'
 \\.
 </pre>
 
@@ -532,28 +535,27 @@ SELECT * FROM prediction_results;
 </pre>
 Result:
 <pre class="result">
- id | estimated_class
 &nbsp;----+-----------------
-  1 | Don't Play
-  2 | Don't Play
-  3 | Play
-  4 | Play
-  5 | Play
-  6 | Don't Play
-  7 | Play
-  8 | Don't Play
-  9 | Play
- 10 | Play
- 11 | Play
- 12 | Play
- 13 | Play
- 14 | Don't Play
+  1 | 'Don't Play'
+  2 | 'Don't Play'
+  3 | 'Play'
+  4 | 'Play'
+  5 | 'Play'
+  6 | 'Don't Play'
+  7 | 'Play'
+  8 | 'Don't Play'
+  9 | 'Play'
+ 10 | 'Play'
+ 11 | 'Play'
+ 12 | 'Play'
+ 13 | 'Play'
+ 14 | 'Don't Play'
 (14 rows)
 </pre>
 
 -# Obtain a dot format display of the tree
 <pre class="example">
-SELECT madlib.tree_display('train_output');
+SELECT madlib.tree_display('train_output', TRUE);
 </pre>
 Result:
 <pre class="result">
@@ -585,6 +587,40 @@ digraph "Classification tree for dt_golf" {
 &nbsp;} //---end of digraph---------
 </pre>
 
+-# Obtain a dot format display of the tree with additional info
+<pre class="example">
+SELECT madlib.tree_display('train_output', TRUE, TRUE);
+</pre>
+Result:
+<pre class="result">
+digraph "Classification tree for dt_golf" {
+         subgraph "cluster0"{
+         label=""
+"g0_0" [label="\"OUTLOOK\" in {overcast}\\n impurity = 0.459184\\n samples = 14\\n value = [5 9]\\n class = \"'Play'\"", shape=ellipse];
+"g0_0" -> "g0_1"[label="yes"];
+"g0_1" [label="\"'Play'\"\\n samples = 4\\n value = [0 4]",shape=box];
+"g0_0" -> "g0_2"[label="no"];
+"g0_2" [label="temperature <= 75\\n impurity = 0.5\\n samples = 10\\n value = [5 5]\\n class = \"'Don't Play'\"", shape=ellipse];
+"g0_2" -> "g0_5"[label="yes"];
+"g0_2" -> "g0_6"[label="no"];
+"g0_6" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
+"g0_5" [label="temperature <= 65\\n impurity = 0.46875\\n samples = 8\\n value = [3 5]\\n class = \"'Play'\"", shape=ellipse];
+"g0_5" -> "g0_11"[label="yes"];
+"g0_11" [label="\"'Don't Play'\"\\n samples = 1\\n value = [1 0]",shape=box];
+"g0_5" -> "g0_12"[label="no"];
+"g0_12" [label="temperature <= 70\\n impurity = 0.408163\\n samples = 7\\n value = [2 5]\\n class = \"'Play'\"", shape=ellipse];
+"g0_12" -> "g0_25"[label="yes"];
+"g0_25" [label="\"'Play'\"\\n samples = 3\\n value = [0 3]",shape=box];
+"g0_12" -> "g0_26"[label="no"];
+"g0_26" [label="temperature <= 72\\n impurity = 0.5\\n samples = 4\\n value = [2 2]\\n class = \"'Don't Play'\"", shape=ellipse];
+"g0_26" -> "g0_53"[label="yes"];
+"g0_53" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
+"g0_26" -> "g0_54"[label="no"];
+"g0_54" [label="\"'Play'\"\\n samples = 2\\n value = [0 2]",shape=box];
+&nbsp;&nbsp;&nbsp;} //--- end of subgraph------------
+&nbsp;} //---end of digraph---------
+</pre>
+
 -# Obtain a text display of the tree
 <pre class="example">
 SELECT madlib.tree_display('train_output', FALSE);
@@ -1312,7 +1348,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display(
 ) RETURNS VARCHAR AS $$
 PythonFunctionBodyOnly(recursive_partitioning, decision_tree, tree_display)
     return decision_tree.tree_display(schema_madlib, model_table, dot_format=False,
-                                      disp_surr=True)
+                                      verbose=False, disp_surr=True)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
@@ -1353,15 +1389,24 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
   */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
     model_table    TEXT,
-    dot_format     BOOLEAN
+    dot_format     BOOLEAN,
+    verbose        BOOLEAN
 ) RETURNS VARCHAR AS $$
 PythonFunction(recursive_partitioning, decision_tree, tree_display)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
+    model_table    TEXT,
+    dot_format     BOOLEAN
+) RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.tree_display($1, $2, FALSE);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
-    model_table             TEXT
+    model_table    TEXT
 ) RETURNS VARCHAR AS $$
     SELECT MADLIB_SCHEMA.tree_display($1, TRUE);
 $$ LANGUAGE SQL VOLATILE
@@ -1381,10 +1426,13 @@ tree.
 ------------------------------------------------------------
 SELECT MADLIB_SCHEMA.tree_display(
     tree_model,             -- TEXT. Name of the table containing the decision tree model
-    dot_format              -- BOOLEAN. (OPTIONAL, Default = TRUE)
+    dot_format,             -- BOOLEAN. (OPTIONAL, Default = TRUE)
                             -- Tree can be output either in a dot format or a text
                             --   format. If TRUE, the result is in the dot format,
                             --   else output is in text format
+    verbose                 -- BOOLEAN. (OPTIONAL, Default = FALSE)
+                            -- If TRUE, the dot format output will contain additional
+                            -- information
     )
 ------------------------------------------------------------
 The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -1403,12 +1451,26 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree(
   cat_levels_in_text TEXT[],
   cat_n_levels       INTEGER[],
   dependent_levels   TEXT[],
-  id_prefix          TEXT
+  id_prefix          TEXT,
+  verbose            BOOLEAN
 )  RETURNS TEXT
 AS 'MODULE_PATHNAME', 'display_decision_tree'
 LANGUAGE C STRICT IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree(
+  tree               MADLIB_SCHEMA.bytea8,
+  cat_features       TEXT[],
+  con_features       TEXT[],
+  cat_levels_in_text TEXT[],
+  cat_n_levels       INTEGER[],
+  dependent_levels   TEXT[],
+  id_prefix          TEXT
+)  RETURNS TEXT AS $$
+    SELECT MADLIB_SCHEMA._display_decision_tree($1, $2, $3, $4, $5, $6, $7, FALSE);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree_surrogate(
   tree               MADLIB_SCHEMA.bytea8,
   cat_features       TEXT[],

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 43ad64e..e006a34 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -809,7 +809,7 @@ def get_tree_surr(schema_madlib, model_table, gid, sample_id, **kwargs):
 
 
 def get_tree(schema_madlib, model_table, gid, sample_id,
-             dot_format=True, disp_surr=False, **kwargs):
+             dot_format=True, verbose=False, disp_surr=False, **kwargs):
     """Random forest tree display function"""
 
     _validate_get_tree(model_table, gid, sample_id)
@@ -885,9 +885,9 @@ def get_tree(schema_madlib, model_table, gid, sample_id,
             if dot_format:
                 sql_display = """
                         SELECT {0}._display_decision_tree(
-                            $1, $2, $3, $4, $5, $6, '{1}'
+                            $1, $2, $3, $4, $5, $6, '{1}', {2}
                             ) as display_tree
-                        """.format(schema_madlib, "")
+                        """.format(schema_madlib, "", verbose)
             else:
                 sql_display = """
                         SELECT {0}._display_text_decision_tree(

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
index e463225..0cc6534 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -406,7 +406,8 @@ text format is outputted with the tree.
 get_tree(forest_model_table,
          gid,
          sample_id,
-         dot_format)
+         dot_format,
+         verbose)
 </pre>
 
 An additional display function is provided to output the surrogate splits chosen
@@ -439,9 +440,13 @@ are NULL, then the majority branch is used to compute the split for a tuple.
     <DT>sample_id</DT>
     <DD>integer. Id of the bootstrap sample that this tree is a part of.</DD>
 
-    <DT>dot_format</DT>
+    <DT>dot_format (optional)</DT>
     <DD>boolean, default = TRUE. Output can either be in a dot format or a text
     format. If TRUE, the result is in the dot format, else output is in text format.</DD>
+
+    <DT>verbose (optional)</DT>
+    <DD>boolean, default = FALSE. If true, the dot format output will contain
+    additional information (impurity, sample size, etc.)</DD>
 </DL>
 
 The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -978,12 +983,14 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
   *@param gid Group id of the tree to display
   *@param sample_id Sample id of the tree to display
   *@dot_format TRUE if dot format, FALSE for text format
+  *@verbose TRUE if the dot format output will contain additional information
   */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
     "model_table"    TEXT,
     "gid"    INTEGER,
     "sample_id"    INTEGER,
-    "dot_format"  BOOLEAN
+    "dot_format"  BOOLEAN,
+    "verbose"        BOOLEAN
 ) RETURNS VARCHAR AS $$
 PythonFunction(recursive_partitioning, random_forest, get_tree)
 $$ LANGUAGE plpythonu VOLATILE
@@ -992,9 +999,19 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
     "model_table"     TEXT,
     "gid"  INTEGER,
+    "sample_id"   INTEGER,
+    "dot_format"  BOOLEAN
+) RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, $4, FALSE::BOOLEAN);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
+    "model_table"     TEXT,
+    "gid"  INTEGER,
     "sample_id"   INTEGER
 ) RETURNS VARCHAR AS $$
-    SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN);
+    SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
@@ -1012,13 +1029,16 @@ tree.
                         USAGE
 ------------------------------------------------------------
 SELECT MADLIB_SCHEMA.get_tree(
-    forest_model,            -- TEXT. Name of the table containing the random forest model
-    gid,                -- INTEGER. Group id of the tree to be displayed
-    sample_id,               -- INTEGER. Sample of the tree to be displayed
-    dot_format        -- BOOLEAN. (OPTIONAL, Default = TRUE)
-                             -- Output can either be in a dot format or a text
-                             --   format. If TRUE, the result is in the dot format,
-                             --   else output is in text format
+    forest_model,           -- TEXT. Name of the table containing the random forest model
+    gid,                    -- INTEGER. Group id of the tree to be displayed
+    sample_id,              -- INTEGER. Sample id of the tree to be displayed
+    dot_format,             -- BOOLEAN. (OPTIONAL, Default = TRUE)
+                            -- Output can either be in a dot format or a text
+                            --   format. If TRUE, the result is in the dot format,
+                            --   else output is in text format
+    verbose                 -- BOOLEAN. (OPTIONAL, Default = FALSE)
+                            -- If TRUE, the dot format output will contain additional
+                            -- information
     )
 ------------------------------------------------------------
 The output is always returned as a 'TEXT'. For the dot format, the output can be