You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/01/10 01:07:21 UTC
incubator-madlib git commit: DT and RF: Adds verbose option for the
dot output format.
Repository: incubator-madlib
Updated Branches:
refs/heads/master c56b20910 -> 02f4602a5
DT and RF: Adds verbose option for the dot output format.
JIRA: MADLIB-1051
Closes #86
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/02f4602a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/02f4602a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/02f4602a
Branch: refs/heads/master
Commit: 02f4602a5554491c1a6d96654d34da29e5275254
Parents: c56b209
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Mon Jan 9 17:04:54 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Mon Jan 9 17:04:54 2017 -0800
----------------------------------------------------------------------
src/modules/recursive_partitioning/DT_impl.hpp | 70 ++++++-----
src/modules/recursive_partitioning/DT_proto.hpp | 10 +-
.../recursive_partitioning/decision_tree.cpp | 9 +-
.../recursive_partitioning/decision_tree.py_in | 6 +-
.../recursive_partitioning/decision_tree.sql_in | 116 ++++++++++++++-----
.../recursive_partitioning/random_forest.py_in | 6 +-
.../recursive_partitioning/random_forest.sql_in | 42 +++++--
7 files changed, 179 insertions(+), 80 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/DT_impl.hpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp
index 702ac2c..f622f94 100644
--- a/src/modules/recursive_partitioning/DT_impl.hpp
+++ b/src/modules/recursive_partitioning/DT_impl.hpp
@@ -1092,7 +1092,8 @@ string
DecisionTree<Container>::displayLeafNode(
Index id,
ArrayHandle<text*> &dep_levels,
- const std::string & id_prefix){
+ const std::string & id_prefix,
+ bool verbose){
std::stringstream predict_str;
if (static_cast<bool>(is_regression)){
predict_str << predict_response(id);
@@ -1105,14 +1106,14 @@ DecisionTree<Container>::displayLeafNode(
std::stringstream display_str;
display_str << "\"" << id_prefix << id << "\" [label=\"" << predict_str.str();
- // // uncomment below if distribution of rows is required in leaf node
- // display_str << "\\n[";
- // if (is_regression)
- // display_str << statCount(predictions.row(id)) << ", "
- // << statPredict(predictions.row(id));
- // else
- // display_str << predictions.row(id);
- // display_str << "]";
+ if(verbose){
+ display_str << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = ";
+ if (is_regression)
+ display_str << statPredict(predictions.row(id));
+ else{
+ display_str << "[" << predictions.row(id).head(n_y_labels)<< "]";
+ }
+ }
display_str << "\",shape=box]" << ";";
return display_str.str();
}
@@ -1130,7 +1131,9 @@ DecisionTree<Container>::displayInternalNode(
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels,
- const std::string & id_prefix
+ ArrayHandle<text*> &dep_levels,
+ const std::string & id_prefix,
+ bool verbose
){
string feature_name;
@@ -1149,16 +1152,26 @@ DecisionTree<Container>::displayInternalNode(
std::stringstream display_str;
display_str << "\"" << id_prefix << id << "\" [label=\"" << label_str.str();
- // // uncomment below if distribution of rows is required in internal node
- // display_str << "\\n[";
- // if (is_regression)
- // display_str << statCount(predictions.row(id)) << ", "
- // << statPredict(predictions.row(id));
- // else
- // display_str << predictions.row(id);
- // display_str << "]";
- display_str <<"\", shape=ellipse]" << ";";
- return display_str.str();
+ if(verbose){
+
+ display_str << "\\n impurity = "<< impurity(predictions.row(id)) << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = ";
+ if (is_regression)
+ display_str << statPredict(predictions.row(id));
+ else{
+ display_str << "[" << predictions.row(id).head(n_y_labels)<< "]";
+ }
+ std::stringstream predict_str;
+ if (static_cast<bool>(is_regression)){
+ predict_str << predict_response(id);
+ }
+ else{
+ std::string dep_value = get_text(dep_levels, static_cast<int>(predict_response(id)));
+ predict_str << escape_quotes(dep_value);
+ }
+ display_str << "\\n class = " << predict_str.str();
+ }
+ display_str << "\", shape=ellipse]" << ";";
+ return display_str.str();
}
// -------------------------------------------------------------------------
@@ -1174,11 +1187,12 @@ DecisionTree<Container>::display(
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels,
ArrayHandle<text*> &dependent_levels,
- const std::string &id_prefix) {
+ const std::string &id_prefix,
+ bool verbose) {
std::stringstream display_string;
if (feature_indices(0) == FINISHED_LEAF){
- display_string << displayLeafNode(0, dependent_levels, id_prefix)
+ display_string << displayLeafNode(0, dependent_levels, id_prefix, verbose)
<< std::endl;
}
else{
@@ -1189,7 +1203,7 @@ DecisionTree<Container>::display(
display_string << displayInternalNode(
index, cat_features_str, con_features_str,
- cat_levels_text, cat_n_levels, id_prefix) << std::endl;
+ cat_levels_text, cat_n_levels, dependent_levels, id_prefix, verbose) << std::endl;
// Display the children
Index tc = trueChild(index);
@@ -1203,7 +1217,7 @@ DecisionTree<Container>::display(
if (feature_indices(tc) == IN_PROCESS_LEAF ||
feature_indices(tc) == FINISHED_LEAF)
display_string
- << displayLeafNode(tc, dependent_levels, id_prefix)
+ << displayLeafNode(tc, dependent_levels, id_prefix, verbose)
<< std::endl;
}
@@ -1218,7 +1232,7 @@ DecisionTree<Container>::display(
if (feature_indices(fc) == IN_PROCESS_LEAF ||
feature_indices(fc) == FINISHED_LEAF)
display_string
- << displayLeafNode(fc, dependent_levels, id_prefix)
+ << displayLeafNode(fc, dependent_levels, id_prefix, verbose)
<< std::endl;
}
}
@@ -1664,7 +1678,7 @@ TreeAccumulator<Container, DTree>::operator<<(const surr_tuple_type& inTuple) {
updateSurrStats(true,
is_primary_true == is_surrogate_true,
row_index,
- col_index,
+ col_index,
dup_count);
}
}
@@ -1731,7 +1745,7 @@ TreeAccumulator<Container, DTree>::updateNodeStats(bool is_regression,
const double weight) {
ColumnVector stats(stats_per_split);
stats.fill(0);
- int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
+ int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
if (is_regression){
double w_response = weight * response;
stats << weight, w_response, w_response * response, n_rows;
@@ -1758,7 +1772,7 @@ TreeAccumulator<Container, DTree>::updateStats(bool is_regression,
const double weight) {
ColumnVector stats(stats_per_split);
stats.fill(0);
- int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
+ int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
if (is_regression){
double w_response = weight * response;
stats << weight, w_response, w_response * response, n_rows;
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/DT_proto.hpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/DT_proto.hpp b/src/modules/recursive_partitioning/DT_proto.hpp
index 2f5e211..a2881a5 100644
--- a/src/modules/recursive_partitioning/DT_proto.hpp
+++ b/src/modules/recursive_partitioning/DT_proto.hpp
@@ -123,15 +123,17 @@ public:
}
uint16_t recomputeTreeDepth() const;
- string displayLeafNode(Index id, ArrayHandle<text*> &dep_levels, const std::string & id_prefix);
+ string displayLeafNode(Index id, ArrayHandle<text*> &dep_levels, const std::string & id_prefix, bool verbose);
string displayInternalNode(Index id,
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels,
- const std::string & id_prefix);
+ ArrayHandle<text*> &dep_levels,
+ const std::string & id_prefix,
+ bool verbose);
string display(ArrayHandle<text*>&, ArrayHandle<text*>&, ArrayHandle<text*>&,
- ArrayHandle<int>&, ArrayHandle<text*>&, const std::string&);
+ ArrayHandle<int>&, ArrayHandle<text*>&, const std::string&, bool verbose);
string getCatLabels(Index, Index, Index, ArrayHandle<text*> &,
ArrayHandle<int> &);
string print_split(bool, bool, Index, double,
@@ -234,7 +236,7 @@ public:
MappedColumnVector, // continuous feature values
MappedIntegerVector, // levels for each categorical feature
MappedMatrix, // split values for each continuous feature
- int // duplicated count for each tuple
+ int // duplicated count for each tuple
// (used in random forest)
> surr_tuple_type;
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/decision_tree.cpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp
index 8f74c07..7a8ec95 100644
--- a/src/modules/recursive_partitioning/decision_tree.cpp
+++ b/src/modules/recursive_partitioning/decision_tree.cpp
@@ -166,7 +166,7 @@ compute_leaf_stats_transition::run(AnyType & args){
static_cast<uint16_t>(con_features.size()),
static_cast<uint32_t>(cat_levels.sum()),
static_cast<uint16_t>(dt.tree_depth),
- stats_per_split,
+ stats_per_split,
weights_as_rows
);
// compute cumulative sum of the levels of the categorical variables
@@ -408,10 +408,11 @@ display_decision_tree::run(AnyType &args) {
ArrayHandle<int> cat_n_levels = args[4].getAs<ArrayHandle<int> >();
ArrayHandle<text*> dependent_var_levels = args[5].getAs<ArrayHandle<text*> >();
std::string id_prefix = args[6].getAs<std::string>();
+ bool verbose = args[7].getAs<bool>();
string tree_str = dt.display(cat_feature_names, con_feature_names,
cat_levels_text, cat_n_levels,
- dependent_var_levels, id_prefix);
+ dependent_var_levels, id_prefix, verbose);
return tree_str;
}
@@ -893,9 +894,9 @@ void fill_one_row(MutableNativeMatrix &frame, Tree &dt, int me, int i,
/*
* PivotalR: randomForest
* Convert to R's randomForest format for getTree(..) function
- *
+ *
*/
-AnyType
+AnyType
convert_to_random_forest_format::run(AnyType &args) {
Tree dt = args[0].getAs<ByteString>();
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index ccba636..40f4b7e 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -1791,7 +1791,7 @@ def _get_display_header(table_name, dep_levels, is_regression, dot_format=True):
#------------------------------------------------------------------------------
-def tree_display(schema_madlib, model_table, dot_format=True,
+def tree_display(schema_madlib, model_table, dot_format=True, verbose=False,
disp_surr=False, **kwargs):
if dot_format:
@@ -1862,9 +1862,9 @@ def tree_display(schema_madlib, model_table, dot_format=True,
return_str_list.append('\t label="{0}"'.format(group_name.replace('"', '\\"')))
sql = """
SELECT {0}._display_decision_tree(
- $1, $2, $3, $4, $5, $6, '{1}'
+ $1, $2, $3, $4, $5, $6, '{1}', {2}
) as display_tree
- """.format(schema_madlib, "g" + str(index) + "_")
+ """.format(schema_madlib, "g" + str(index) + "_", verbose)
else:
if group_name:
return_str_list.append("--- Tree for {0} ---".format(group_name))
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index a2538cf..d6b7b5a 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -412,7 +412,7 @@ be visualized using various programs including those in the GraphViz package, or
in a simple text format. The details of the text format are outputted with the
tree.
<pre class="syntax">
-tree_display(tree_model, dot_format)
+tree_display(tree_model, dot_format, verbosity)
</pre>
An additional display function is provided to output the surrogate splits chosen
@@ -439,6 +439,9 @@ are NULL, then the majority branch is used to compute the split for a tuple.
<DT>dot_format</DT>
<DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text
format. If TRUE, the result is in the dot format, else output is in text format</DD>
+ <DT>verbosity</DT>
+ <DD>BOOLEAN, default = FALSE. If true, the dot format output will contain
+ additional information (impurity, sample size, etc.)</DD>
</DL>
The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -487,20 +490,20 @@ CREATE TABLE dt_golf (
</pre>
<pre class="example">
COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DELIMITER '|';
-1|sunny|85|85|'false'|'Don''t Play'
-2|sunny|80|90|'true'|'Don''t Play'
+1|sunny|85|85|'false'|'Don't Play'
+2|sunny|80|90|'true'|'Don't Play'
3|overcast|83|78|'false'|'Play'
4|rain|70|96|'false'|'Play'
5|rain|68|80|'false'|'Play'
-6|rain|65|70|'true'|'Don''t Play'
+6|rain|65|70|'true'|'Don't Play'
7|overcast|64|65|'true'|'Play'
-8|sunny|72|95|'false'|'Don''t Play'
+8|sunny|72|95|'false'|'Don't Play'
9|sunny|69|70|'false'|'Play'
10|rain|75|80|'false'|'Play'
11|sunny|75|70|'true'|'Play'
12|overcast|72|90|'true'|'Play'
13|overcast|81|75|'false'|'Play'
-14|rain|71|80|'true'|'Don''t Play'
+14|rain|71|80|'true'|'Don't Play'
\\.
</pre>
@@ -532,28 +535,27 @@ SELECT * FROM prediction_results;
</pre>
Result:
<pre class="result">
- id | estimated_class
----+-----------------
- 1 | Don't Play
- 2 | Don't Play
- 3 | Play
- 4 | Play
- 5 | Play
- 6 | Don't Play
- 7 | Play
- 8 | Don't Play
- 9 | Play
- 10 | Play
- 11 | Play
- 12 | Play
- 13 | Play
- 14 | Don't Play
+ 1 | 'Don't Play'
+ 2 | 'Don't Play'
+ 3 | 'Play'
+ 4 | 'Play'
+ 5 | 'Play'
+ 6 | 'Don't Play'
+ 7 | 'Play'
+ 8 | 'Don't Play'
+ 9 | 'Play'
+ 10 | 'Play'
+ 11 | 'Play'
+ 12 | 'Play'
+ 13 | 'Play'
+ 14 | 'Don't Play'
(14 rows)
</pre>
-# Obtain a dot format display of the tree
<pre class="example">
-SELECT madlib.tree_display('train_output');
+SELECT madlib.tree_display('train_output', TRUE);
</pre>
Result:
<pre class="result">
@@ -585,6 +587,40 @@ digraph "Classification tree for dt_golf" {
} //---end of digraph---------
</pre>
+-# Obtain a dot format display of the tree with additional info
+<pre class="example">
+SELECT madlib.tree_display('train_output', TRUE, TRUE);
+</pre>
+Result:
+<pre class="result">
+digraph "Classification tree for dt_golf" {
+ subgraph "cluster0"{
+ label=""
+"g0_0" [label="\"OUTLOOK\" in {overcast}\\n impurity = 0.459184\\n samples = 14\\n value = [5 9]\\n class = \"'Play'\"", shape=ellipse];
+"g0_0" -> "g0_1"[label="yes"];
+"g0_1" [label="\"'Play'\"\\n samples = 4\\n value = [0 4]",shape=box];
+"g0_0" -> "g0_2"[label="no"];
+"g0_2" [label="temperature <= 75\\n impurity = 0.5\\n samples = 10\\n value = [5 5]\\n class = \"'Don't Play'\"", shape=ellipse];
+"g0_2" -> "g0_5"[label="yes"];
+"g0_2" -> "g0_6"[label="no"];
+"g0_6" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
+"g0_5" [label="temperature <= 65\\n impurity = 0.46875\\n samples = 8\\n value = [3 5]\\n class = \"'Play'\"", shape=ellipse];
+"g0_5" -> "g0_11"[label="yes"];
+"g0_11" [label="\"'Don't Play'\"\\n samples = 1\\n value = [1 0]",shape=box];
+"g0_5" -> "g0_12"[label="no"];
+"g0_12" [label="temperature <= 70\\n impurity = 0.408163\\n samples = 7\\n value = [2 5]\\n class = \"'Play'\"", shape=ellipse];
+"g0_12" -> "g0_25"[label="yes"];
+"g0_25" [label="\"'Play'\"\\n samples = 3\\n value = [0 3]",shape=box];
+"g0_12" -> "g0_26"[label="no"];
+"g0_26" [label="temperature <= 72\\n impurity = 0.5\\n samples = 4\\n value = [2 2]\\n class = \"'Don't Play'\"", shape=ellipse];
+"g0_26" -> "g0_53"[label="yes"];
+"g0_53" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
+"g0_26" -> "g0_54"[label="no"];
+"g0_54" [label="\"'Play'\"\\n samples = 2\\n value = [0 2]",shape=box];
+ } //--- end of subgraph------------
+ } //---end of digraph---------
+</pre>
+
-# Obtain a text display of the tree
<pre class="example">
SELECT madlib.tree_display('train_output', FALSE);
@@ -1312,7 +1348,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display(
) RETURNS VARCHAR AS $$
PythonFunctionBodyOnly(recursive_partitioning, decision_tree, tree_display)
return decision_tree.tree_display(schema_madlib, model_table, dot_format=False,
- disp_surr=True)
+ verbose=False, disp_surr=True)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
@@ -1353,15 +1389,24 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
model_table TEXT,
- dot_format BOOLEAN
+ dot_format BOOLEAN,
+ verbose BOOLEAN
) RETURNS VARCHAR AS $$
PythonFunction(recursive_partitioning, decision_tree, tree_display)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
+ model_table TEXT,
+ dot_format BOOLEAN
+) RETURNS VARCHAR AS $$
+ SELECT MADLIB_SCHEMA.tree_display($1, $2, FALSE);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
- model_table TEXT
+ model_table TEXT
) RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.tree_display($1, TRUE);
$$ LANGUAGE SQL VOLATILE
@@ -1381,10 +1426,13 @@ tree.
------------------------------------------------------------
SELECT MADLIB_SCHEMA.tree_display(
tree_model, -- TEXT. Name of the table containing the decision tree model
- dot_format -- BOOLEAN. (OPTIONAL, Default = TRUE)
+ dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE)
-- Tree can be outputted either in a dot format or a text
-- format. If TRUE, the result is in the dot format,
-- else output is in text format
+ verbose -- BOOLEAN. (OPTIONAL, Default = FALSE)
+ -- If TRUE, the dot format output will contain additional
+ -- information
)
------------------------------------------------------------
The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -1403,12 +1451,26 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree(
cat_levels_in_text TEXT[],
cat_n_levels INTEGER[],
dependent_levels TEXT[],
- id_prefix TEXT
+ id_prefix TEXT,
+ verbose BOOLEAN
) RETURNS TEXT
AS 'MODULE_PATHNAME', 'display_decision_tree'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree(
+ tree MADLIB_SCHEMA.bytea8,
+ cat_features TEXT[],
+ con_features TEXT[],
+ cat_levels_in_text TEXT[],
+ cat_n_levels INTEGER[],
+ dependent_levels TEXT[],
+ id_prefix TEXT
+) RETURNS TEXT AS $$
+ SELECT MADLIB_SCHEMA._display_decision_tree($1, $2, $3, $4, $5, $6, $7, FALSE);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree_surrogate(
tree MADLIB_SCHEMA.bytea8,
cat_features TEXT[],
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 43ad64e..e006a34 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -809,7 +809,7 @@ def get_tree_surr(schema_madlib, model_table, gid, sample_id, **kwargs):
def get_tree(schema_madlib, model_table, gid, sample_id,
- dot_format=True, disp_surr=False, **kwargs):
+ dot_format=True, verbose=False, disp_surr=False, **kwargs):
"""Random forest tree display function"""
_validate_get_tree(model_table, gid, sample_id)
@@ -885,9 +885,9 @@ def get_tree(schema_madlib, model_table, gid, sample_id,
if dot_format:
sql_display = """
SELECT {0}._display_decision_tree(
- $1, $2, $3, $4, $5, $6, '{1}'
+ $1, $2, $3, $4, $5, $6, '{1}', {2}
) as display_tree
- """.format(schema_madlib, "")
+ """.format(schema_madlib, "", verbose)
else:
sql_display = """
SELECT {0}._display_text_decision_tree(
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
index e463225..0cc6534 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -406,7 +406,8 @@ text format is outputted with the tree.
get_tree(forest_model_table,
gid,
sample_id,
- dot_format)
+ dot_format,
+ verbose)
</pre>
An additional display function is provided to output the surrogate splits chosen
@@ -439,9 +440,13 @@ are NULL, then the majority branch is used to compute the split for a tuple.
<DT>sample_id</DT>
<DD>integer. Id of the bootstrap sample that this tree is a part of.</DD>
- <DT>dot_format</DT>
+ <DT>dot_format (optional)</DT>
<DD>boolean, default = TRUE. Output can either be in a dot format or a text
format. If TRUE, the result is in the dot format, else output is in text format.</DD>
+
+ <DT>verbose (optional)</DT>
+ <DD>boolean, default = FALSE. If true, the dot format output will contain
+ additional information (impurity, sample size, etc.)</DD>
</DL>
The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -978,12 +983,14 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
*@param gid Group id of the tree to display
*@param sample_id Sample id of the tree to display
*@dot_format TRUE if dot format, FALSE for text format
+ *@verbose TRUE if the dot format output will contain additional information
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER,
- "dot_format" BOOLEAN
+ "dot_format" BOOLEAN,
+ "verbose" BOOLEAN
) RETURNS VARCHAR AS $$
PythonFunction(recursive_partitioning, random_forest, get_tree)
$$ LANGUAGE plpythonu VOLATILE
@@ -992,9 +999,19 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
"model_table" TEXT,
"gid" INTEGER,
+ "sample_id" INTEGER,
+ "dot_format" BOOLEAN
+) RETURNS VARCHAR AS $$
+ SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, $4, FALSE::BOOLEAN);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
+ "model_table" TEXT,
+ "gid" INTEGER,
"sample_id" INTEGER
) RETURNS VARCHAR AS $$
- SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN);
+ SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
@@ -1012,13 +1029,16 @@ tree.
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.get_tree(
- forest_model, -- TEXT. Name of the table containing the random forest model
- gid, -- INTEGER. Group id of the tree to be displayed
- sample_id, -- INTEGER. Sample of the tree to be displayed
- dot_format -- BOOLEAN. (OPTIONAL, Default = TRUE)
- -- Output can either be in a dot format or a text
- -- format. If TRUE, the result is in the dot format,
- -- else output is in text format
+ forest_model, -- TEXT. Name of the table containing the random forest model
+ gid, -- INTEGER. Group id of the tree to be displayed
+ sample_id, -- INTEGER. Sample of the tree to be displayed
+ dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE)
+ -- Output can either be in a dot format or a text
+ -- format. If TRUE, the result is in the dot format,
+ -- else output is in text format
+ verbose -- BOOLEAN. (OPTIONAL, Default = FALSE)
+ -- If TRUE, the dot format output will contain additional
+ -- information
)
------------------------------------------------------------
The output is always returned as a 'TEXT'. For the dot format, the output can be