Posted to commits@madlib.apache.org by ri...@apache.org on 2017/08/03 03:53:06 UTC
incubator-madlib git commit: DT: Add option to treat NULL as category
Repository: incubator-madlib
Updated Branches:
refs/heads/master a1f980336 -> a2f474007
DT: Add option to treat NULL as category
This commit adds an option to treat NULL as a level in the categorical
feature. The level is added as a string (instead of a NULL value) to
ensure MADlib arrays don't have NULLs in them during the binning
procedure.
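For illustration, a minimal Python sketch of the idea (not MADlib code; the helper below is hypothetical): NULL is mapped to a proxy string and appended after the regular levels, so the arrays used during binning never contain NULLs.

    NULL_PROXY = "__NULL__"

    def levels_with_null_as_category(values):
        # Collect distinct non-NULL levels, then append the proxy for
        # NULL at the end of the ordering so it is never used to split
        # a node on.
        levels = sorted({v for v in values if v is not None})
        if any(v is None for v in values):
            levels.append(NULL_PROXY)
        return levels

    print(levels_with_null_as_category(['US', None, 'US', 'IN']))
    # ['IN', 'US', '__NULL__']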
Closes #156
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/a2f47400
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/a2f47400
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/a2f47400
Branch: refs/heads/master
Commit: a2f47400705da784f9a727d8b926da34c84465a2
Parents: a1f9803
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue Jul 18 14:04:37 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Aug 2 20:52:27 2017 -0700
----------------------------------------------------------------------
src/modules/recursive_partitioning/DT_impl.hpp | 2 +-
.../recursive_partitioning/decision_tree.py_in | 187 +++++++++++++------
.../recursive_partitioning/decision_tree.sql_in | 108 ++++++++++-
.../recursive_partitioning/random_forest.py_in | 38 ++--
.../recursive_partitioning/random_forest.sql_in | 96 ++++++----
.../test/decision_tree.sql_in | 2 +-
6 files changed, 309 insertions(+), 124 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/modules/recursive_partitioning/DT_impl.hpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp
index 27bc647..808e48c 100644
--- a/src/modules/recursive_partitioning/DT_impl.hpp
+++ b/src/modules/recursive_partitioning/DT_impl.hpp
@@ -1181,7 +1181,7 @@ DecisionTree<Container>::displayInternalNode(
label_str << escape_quotes(feature_name) << " <= " << feature_thresholds(id);
} else {
feature_name = get_text(cat_features_str, feature_indices(id));
- label_str << escape_quotes(feature_name) << " = ";
+ label_str << escape_quotes(feature_name) << " <= ";
// Text for all categoricals is stored in a flat array (cat_levels_text);
// find the appropriate index for this node
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 7b724c1..d82936a 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -246,11 +246,13 @@ SELECT {schema_madlib}.tree_train(
'cp' - complexity parameter with default=0.01,
'n_folds' - number of cross-validation folds
with default value of 0 (= no cross-validation)
- surrogate_params, -- A comma-separated text containing
- key=value pairs of parameters for computing surrogate splits.
+ null_handling_params, -- A comma-separated text containing
+ key=value pairs of parameters for handling NULL values.
Parameters accepted:
'max_surrogates' - Maximum number of surrogates to
compute for each split
+ 'null_as_category' - Boolean to indicate if
+ NULL should be treated as a special category
verbose -- Boolean, whether to print more info, default is False
);
@@ -291,6 +293,9 @@ The output summary table ('output_table_summary') has the following columns:
input_cp -- The complexity parameter (cp) used for pruning the
trained tree(s) (before cross-validation is run)
independent_var_types -- The types of independent variables, comma-separated
+ k -- Number of folds (NULL if not using cross-validation)
+ null_proxy -- String used as replacement for NULL values
+ (NULL if null_as_category = False)
"""
elif message.lower().strip() in ['example', 'examples']:
@@ -371,7 +376,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
con_features, n_bins, boolean_cats,
min_split, min_bucket, weights,
max_depth, grp_key_to_cp, compute_cp_list=False,
- max_n_surr=0, **kwargs):
+ max_n_surr=0, null_proxy=None, **kwargs):
"""
Args:
grp_key_to_cp : Dictionary, mapping from group key to the cp value
@@ -419,7 +424,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
bins = _get_bins(schema_madlib, training_table_name, cat_features,
ordered_cat_features, con_features, n_bins,
dep_var_str, boolean_cats, n_rows, is_classification,
- dep_n_levels, filter_dep)
+ dep_n_levels, filter_dep, null_proxy)
# some features may be dropped if they have only one value
cat_features = bins['cat_features']
@@ -446,7 +451,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
boolean_cats, grouping_cols,
grouping_array_str, n_rows,
is_classification, dep_n_levels,
- filter_dep)
+ filter_dep, null_proxy)
cat_features = bins['cat_features']
# 3b) Load each group's tree state in memory and set to the initial tree
@@ -507,7 +512,8 @@ def _build_tree(schema_madlib, is_classification, split_criterion,
cat_features, ordered_cat_features,
boolean_cats, con_features, grouping_cols,
weights, max_depth, min_split, min_bucket, n_bins,
- cp_table, max_n_surr=0, msg_level="warning", k=0, **kwargs):
+ cp_table, max_n_surr=0, null_as_category=False,
+ msg_level="warning", k=0, **kwargs):
compute_cp_list = False
if grouping_cols:
@@ -545,7 +551,7 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
list_of_features_to_exclude, split_criterion,
grouping_cols, weights, max_depth,
min_split, min_bucket, n_bins, pruning_params,
- surrogate_params, verbose_mode, **kwargs):
+ null_handling_params, verbose_mode, **kwargs):
""" Decision tree main training function
Args:
@@ -589,10 +595,18 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
pruning_param_dict = _extract_pruning_params(pruning_params)
cp = pruning_param_dict['cp']
n_folds = pruning_param_dict['n_folds']
- surrogate_param_dict = extract_keyvalue_params(surrogate_params,
- dict(max_surrogates=int),
- dict(max_surrogates=0))
- max_n_surr = surrogate_param_dict['max_surrogates']
+
+ # null handling parameters: max_n_surr and null_as_category
+ null_handling_dict = extract_keyvalue_params(
+ null_handling_params,
+ dict(max_surrogates=int, null_as_category=bool),
+ dict(max_surrogates=0, null_as_category=False))
+ max_n_surr = null_handling_dict['max_surrogates']
+ null_as_category = null_handling_dict['null_as_category']
+ null_proxy = "__NULL__" if null_as_category else None
+ if null_as_category:
+ # can't have two ways of handling tuples with NULL values
+ max_n_surr = 0
_assert(max_n_surr >= 0,
"Maximum number of surrogates ({0}) should be non-negative".
format(max_n_surr))
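For reference, a rough sketch of how the key=value string could be parsed (a simplified stand-in for MADlib's extract_keyvalue_params; the parsing details here are assumptions, only the defaults and the mutual-exclusion rule come from the code above):

    def parse_null_handling(null_handling_params):
        # Accepts e.g. 'max_surrogates=2, null_as_category=true'
        parsed = dict(max_surrogates=0, null_as_category=False)  # defaults
        if null_handling_params:
            for pair in null_handling_params.split(','):
                key, _, value = pair.partition('=')
                key, value = key.strip(), value.strip()
                if key == 'max_surrogates':
                    parsed[key] = int(value)
                elif key == 'null_as_category':
                    parsed[key] = value.lower() == 'true'
        # The two NULL-handling mechanisms are mutually exclusive, so
        # null_as_category forces max_surrogates back to 0.
        if parsed['null_as_category']:
            parsed['max_surrogates'] = 0
        return parsed

    print(parse_null_handling('max_surrogates=2, null_as_category=true'))
    # {'max_surrogates': 0, 'null_as_category': True}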
@@ -650,7 +664,8 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name,
id_col_name, dependent_variable,
is_classification, n_all_rows, n_rows, dep_list, cp,
all_cols_types, grouping_cols=None,
- use_existing_tables=False, running_cv=False, k=0, **kwargs):
+ use_existing_tables=False, running_cv=False, k=0,
+ null_proxy=None, **kwargs):
if not grouping_cols:
_create_result_table(schema_madlib, tree_states[0],
bins['cat_origin'], bins['cat_n'], cat_features,
@@ -667,8 +682,8 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name,
schema_madlib, split_criterion, training_table_name,
output_table_name, id_col_name, cat_features, con_features,
dependent_variable, failed_groups, is_classification, n_all_rows,
- n_rows, dep_list, all_cols_types, cp, grouping_cols, 1, use_existing_tables,
- running_cv, k)
+ n_rows, dep_list, all_cols_types, cp, grouping_cols, 1,
+ use_existing_tables, running_cv, k, null_proxy)
# -------------------------------------------------------------------------
@@ -715,7 +730,8 @@ def _is_dep_categorical(training_table_name, dependent_variable):
def _get_bins(schema_madlib, training_table_name,
cat_features, ordered_cat_features,
con_features, n_bins, dependent_variable, boolean_cats, n_rows,
- is_classification, dep_n_levels, filter_null):
+ is_classification, dep_n_levels, filter_null_dep,
+ null_proxy=None):
""" Compute the bins of all features
@param training_table_name Data source table
@@ -760,7 +776,7 @@ def _get_bins(schema_madlib, training_table_name,
FROM (
SELECT *, random() AS rand
FROM {training_table_name}
- WHERE {filter_null}
+ WHERE {filter_null_dep}
AND not {schema_madlib}.array_contains_null({con_features_str})
) subq
WHERE rand <= {percentage}
@@ -808,10 +824,24 @@ def _get_bins(schema_madlib, training_table_name,
# For regressions
order_fun = "AVG({0})".format(dependent_variable)
- # Note that 'sql_cat_levels' goes through two levels of formatting
+ # Note that 'sql_cat_levels' goes through two levels of string formatting
# Try to obtain all the levels in one scan of the table.
# () are needed when casting the categorical variables because
# they can be expressions.
+
+ filter_str = filter_null_dep + " AND {col} IS NOT NULL"
+
+ if null_proxy is None:
+ union_null_proxy = ""
+ else:
+ union_null_proxy = "UNION " + """
+ SELECT '{null_proxy}'::text as levels,
+ 'Infinity'::double precision as dep_avg
+ FROM {training_table_name}
+ WHERE {{col}} IS NULL
+ GROUP BY {{col}}
+ """.format(**locals())
+
sql_cat_levels = """
SELECT
'{{col_name}}'::text AS colname,
@@ -825,22 +855,30 @@ def _get_bins(schema_madlib, training_table_name,
({{col}})::text AS levels,
{{order_fun}} AS dep_avg
FROM {training_table_name}
- WHERE {filter_null}
- AND {{col}} is not NULL
+ WHERE {filter_str}
GROUP BY {{col}}
+ {union_null_proxy}
) s
) s1
WHERE array_upper(levels, 1) > 1
""".format(training_table_name=training_table_name,
- filter_null=filter_null)
+ filter_str=filter_str,
+ union_null_proxy=union_null_proxy)
+
+ all_col_expressions = {}
+ for col in cat_features:
+ if col in boolean_cats:
+ all_col_expressions[col] = ("(CASE WHEN " + col +
+ " THEN 'True' ELSE 'False' END)")
+ else:
+ all_col_expressions[col] = col
sql_all_cats = ' UNION '.join(
sql_cat_levels.format(
- col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
- if col in boolean_cats else col,
- col_name=col,
- order_fun=col if col in ordered_cat_features else order_fun)
- for col in cat_features)
+ col=expr,
+ col_name=col_name,
+ order_fun=expr if col_name in ordered_cat_features else order_fun)
+ for col_name, expr in all_col_expressions.items())
all_levels = plpy.execute(sql_all_cats)
if len(all_levels) != len(cat_features):
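The doubled braces in 'sql_cat_levels' are what make the two formatting passes work; a toy Python sketch (table, column, and filter names are made up):

    # First pass fills table-level values; '{{col}}' survives as '{col}'.
    template = ("SELECT ({{col}})::text AS levels "
                "FROM {training_table_name} "
                "WHERE {filter_str} GROUP BY {{col}}")
    stage1 = template.format(
        training_table_name='src_table',
        filter_str="dep IS NOT NULL AND {col} IS NOT NULL")
    # Second, per-column pass fills '{col}' for each categorical feature.
    print(stage1.format(col='country'))
    # SELECT (country)::text AS levels FROM src_table
    #   WHERE dep IS NOT NULL AND country IS NOT NULL GROUP BY country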
@@ -925,7 +963,7 @@ def _get_bins_grps(
schema_madlib, training_table_name, cat_features, ordered_cat_features,
con_features, n_bins, dependent_variable, boolean_cats,
grouping_cols, grouping_array_str, n_rows, is_classification,
- dep_n_levels, filter_null):
+ dep_n_levels, filter_null_dep, null_proxy=None):
""" Compute the bins for all features in each group
@brief Similar to _get_bins except that this is for multiple groups.
@@ -974,14 +1012,14 @@ def _get_bins_grps(
(
SELECT *
FROM {training_table_name}
- WHERE {filter_null}
+ WHERE {filter_null_dep}
AND not {schema_madlib}.array_contains_null({con_features_str})
) src
JOIN
(
SELECT {grouping_cols}, count(*) AS {grp_size_str}
FROM {training_table_name}
- WHERE {filter_null}
+ WHERE {filter_null_dep}
AND not {schema_madlib}.array_contains_null({con_features_str})
GROUP BY {grouping_cols}
) grp_info
@@ -1024,6 +1062,18 @@ def _get_bins_grps(
else:
order_fun = "avg({0})".format(dependent_variable)
+ filter_str = filter_null_dep + " AND {col} IS NOT NULL"
+ if null_proxy is None:
+ union_null_proxy = ""
+ else:
+ union_null_proxy = "UNION " + """
+ SELECT '{null_proxy}'::text as levels,
+ 'Infinity'::double precision as dep_avg
+ FROM {training_table_name}
+ WHERE {{col}} IS NULL
+ GROUP BY {{col}}
+ """.format(**locals())
+
sql_cat_levels = """
SELECT
colname::text,
@@ -1040,8 +1090,7 @@ def _get_bins_grps(
({{col}})::text as levels,
{{order_fun}} as dep_avg
FROM {training_table_name}
- WHERE {filter_null}
- AND {{col}} is not NULL
+ WHERE {filter_str}
GROUP BY {{col}}, {grouping_cols}
) s
GROUP BY grp_key
@@ -1049,13 +1098,24 @@ def _get_bins_grps(
where array_upper(levels, 1) > 1
""".format(**locals())
+ all_col_expressions = {}
+ for col in cat_features:
+ if col in boolean_cats:
+ all_col_expressions[col] = ("(CASE WHEN " + col +
+ " THEN 'True' ELSE 'False' END)")
+ else:
+ if null_proxy is not None:
+ all_col_expressions[col] = ("COALESCE({0}, {1})".
+ format(col, null_proxy))
+ else:
+ all_col_expressions[col] = col
+
sql_all_cats = ' UNION ALL '.join(
sql_cat_levels.format(
- col=("(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
- if col in boolean_cats else col),
- col_name=col,
- order_fun=col if col in ordered_cat_features else order_fun)
- for col in cat_features)
+ col=expr,
+ col_name=col_name,
+ order_fun=expr if col_name in ordered_cat_features else order_fun)
+ for col_name, expr in all_col_expressions.items())
all_levels = list(plpy.execute(sql_all_cats))
all_levels.sort(key=itemgetter('grp_key'))
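A short sketch of the per-column expression construction above (toy inputs; the function name is illustrative, and it assumes the quoted-proxy form of the COALESCE call):

    def cat_expression(col, boolean_cats, null_proxy=None):
        # Booleans become CASE expressions; with a null_proxy set, other
        # categoricals are wrapped in COALESCE so NULL rows land in the
        # proxy level instead of being filtered out.
        if col in boolean_cats:
            return "(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
        if null_proxy is not None:
            return "COALESCE({0}, '{1}')".format(col, null_proxy)
        return col

    print(cat_expression('country', set(), null_proxy='__NULL__'))
    # COALESCE(country, '__NULL__')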
@@ -1100,9 +1160,16 @@ def _get_bins_grps(
def get_feature_str(schema_madlib, boolean_cats,
cat_features, con_features,
- levels_str, n_levels_str):
+ levels_str, n_levels_str,
+ null_proxy=None):
if len(cat_features) > 0:
- unique_val = unique_string()
+ # null_val is the replacement for NULL in a categorical feature. If a
+ # null_proxy is set then the proxy is used to assign NULL as a valid
+ # category. If no proxy is available then NULL is replaced with a unique
+ # value. In a later step, the categorical levels are mapped to integers
+ # (1 to N). The unique value will be mapped to -1 indicating an
+ # unknown/missing value in the underlying layers.
+ null_val = unique_string() if null_proxy is None else null_proxy
cat_features_cast = []
for col in cat_features:
if col in boolean_cats:
@@ -1110,8 +1177,7 @@ def get_feature_str(schema_madlib, boolean_cats,
"(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)::text")
else:
cat_features_cast.append(
- "(coalesce(" + col + "::text,'{0}')".format(unique_val) +
- ")::text")
+ "(coalesce({0}::text, '{1}'))::text".format(col, null_val))
cat_features_str = ("{0}._map_catlevel_to_int(array[" +
", ".join(cat_features_cast) + "], {1}, {2})"
@@ -1132,7 +1198,8 @@ def get_feature_str(schema_madlib, boolean_cats,
def _one_step(schema_madlib, training_table_name, cat_features,
con_features, boolean_cats, bins, n_bins, tree_state, weights,
dep_var, min_split, min_bucket, max_depth, filter_null,
- dep_n_levels, subsample, n_random_features, max_n_surr=0):
+ dep_n_levels, subsample, n_random_features,
+ max_n_surr=0, null_proxy=None):
""" One step of tree training
@param tree_state A big double precision array that contains
@@ -1147,7 +1214,8 @@ def _one_step(schema_madlib, training_table_name, cat_features,
boolean_cats,
cat_features,
con_features,
- "$3", "$2")
+ "$3", "$2",
+ null_proxy)
# The arguments of the aggregate (in the same order):
# 1. current tree state, madlib.bytea8
@@ -1229,7 +1297,7 @@ def _one_step_for_grps(
con_features, boolean_cats, bins, n_bins, tree_states, weights,
grouping_cols, grouping_array_str, dep_var, min_split, min_bucket,
max_depth, filter_null, dep_n_levels, subsample, n_random_features,
- max_n_surr=0):
+ max_n_surr=0, null_proxy=None):
""" One step of trees training with grouping support
"""
# The function _map_catlevel_to_int maps a categorical variable value to its
@@ -1247,7 +1315,7 @@ def _one_step_for_grps(
cat_features_str, con_features_str = get_feature_str(
schema_madlib, boolean_cats, cat_features, con_features,
- cat_levels_in_text, cat_n_levels)
+ cat_levels_in_text, cat_n_levels, null_proxy)
train_apply_func = """
{schema_madlib}._dt_apply(
@@ -1530,7 +1598,8 @@ def _create_summary_table(
cat_features, con_features, dependent_variable,
num_failed_groups, is_classification, n_all_rows, n_rows,
dep_list, all_cols_types, cp, grouping_cols=None, n_groups=1,
- use_existing_tables=False, running_cv=False, k=0):
+ use_existing_tables=False, running_cv=False, k=0, null_proxy=None):
+
# dependent variables
features = ', '.join(cat_features + con_features)
if dep_list:
@@ -1553,14 +1622,15 @@ def _create_summary_table(
cp_str = py_list_to_sql_string(cp, 'double precision')
else:
cp_str = str(cp) + "::double precision"
+ fold = k if running_cv else "NULL"
- fold = ", " + str(k) + " as k" if running_cv else ""
if use_existing_tables:
- # plpy.execute("truncate " + output_table_name + "_summary")
- header = "insert into " + add_postfix(output_table_name, "_summary") + " "
+ header = "INSERT INTO {0} ".format(
+ add_postfix(output_table_name, "_summary"))
else:
- header = "create table " + add_postfix(output_table_name, "_summary") + " as "
-
+ header = "CREATE TABLE {0} AS ".format(
+ add_postfix(output_table_name, "_summary"))
+ null_proxy_str = "NULL" if null_proxy is None else null_proxy
sql = header + """
SELECT
'tree_train'::text AS method,
@@ -1580,8 +1650,10 @@ def _create_summary_table(
{dep_list_str}::text AS dependent_var_levels,
'{dep_type}'::text AS dependent_var_type,
{cp_str} AS input_cp,
- '{indep_type}'::text AS independent_var_types
- {fold}
+ '{indep_type}'::text AS independent_var_types,
+ {fold}::integer AS k,
+ '{null_proxy_str}'::text AS null_proxy
+
""".format(**locals())
plpy.execute(sql)
# ------------------------------------------------------------
@@ -1683,6 +1755,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
dep_levels = summary_elements["dependent_var_levels"]
is_classification = summary_elements["is_classification"]
dep_type = summary_elements['dependent_var_type']
+ null_proxy = summary_elements['null_proxy']
# find which columns are of type boolean
boolean_cats = set([key for key, value in get_cols_and_types(source)
@@ -1690,7 +1763,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
cat_features_str, con_features_str = get_feature_str(
schema_madlib, boolean_cats, cat_features, con_features,
- "m.cat_levels_in_text", "m.cat_n_levels")
+ "m.cat_levels_in_text", "m.cat_n_levels", null_proxy)
if use_existing_tables and table_exists(output):
plpy.execute("truncate " + output)
@@ -2258,7 +2331,8 @@ def _tree_train_using_bins(
cat_features, con_features, boolean_cats, n_bins, weights,
dep_var_str, min_split, min_bucket, max_depth, filter_dep,
dep_n_levels, is_classification, split_criterion,
- subsample=False, n_random_features=1, max_n_surr=0, **kwargs):
+ subsample=False, n_random_features=1, max_n_surr=0, null_proxy=None,
+ **kwargs):
"""Trains a tree without grouping columns"""
# Iterations for training the tree
tree_state = plpy.execute(
@@ -2284,7 +2358,7 @@ def _tree_train_using_bins(
cat_features, con_features, boolean_cats, bins,
n_bins, tree_state, weights, dep_var_str,
min_split, min_bucket, max_depth, filter_dep,
- dep_n_levels, subsample, n_random_features, max_n_surr)
+ dep_n_levels, subsample, n_random_features, max_n_surr, null_proxy)
plpy.notice("Completed training of level {0}".format(tree_depth))
return tree_state
@@ -2296,7 +2370,8 @@ def _tree_train_grps_using_bins(
boolean_cats, n_bins, weights, grouping_cols, grouping_array_str, dep_var_str,
min_split, min_bucket, max_depth, filter_dep, dep_n_levels,
is_classification, split_criterion, subsample=False,
- n_random_features=1, tree_terminated=None, max_n_surr=0, **kwargs):
+ n_random_features=1, tree_terminated=None, max_n_surr=0, null_proxy=None,
+ **kwargs):
"""Trains a tree with grouping columns included """
# Iterations for training the tree
@@ -2340,7 +2415,7 @@ def _tree_train_grps_using_bins(
tree_states, weights, grouping_cols,
grouping_array_str, dep_var_str, min_split, min_bucket,
max_depth, filter_dep, dep_n_levels, subsample,
- n_random_features, max_n_surr)
+ n_random_features, max_n_surr, null_proxy)
level += 1
plpy.notice("Finished training for level " + str(level))
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index e8f37f8..8290dcf 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -54,7 +54,7 @@ tree_train(
min_bucket,
num_splits,
pruning_params,
- surrogate_params,
+ null_handling_params,
verbosity
)
</pre>
@@ -186,16 +186,27 @@ tree_train(
</table>
</DD>
- <DT>surrogate_params (optional)</DT>
+ <DT>null_handling_params (optional)</DT>
<DD>TEXT. Comma-separated string of key-value pairs controlling the behavior
- of surrogate splits for each node. A surrogate variable is another predictor
- variable that is associated (correlated) with the primary predictor variable
- for a split. The surrogate variable comes into use when the primary predictior
- value is NULL. This parameter currently accepts one argument:
+ of various features for handling missing values.
<table class='output'>
<tr>
<th>max_surrogates</th>
<td>Default: 0. Number of surrogates to store for each node.</td>
+ One approach to handling NULLs is to use surrogate splits for each
+ node. A surrogate variable is another predictor variable that is associated
+ (correlated) with the primary split variable. The surrogate
+ variable comes into use when the primary predictor value is NULL.
+ </tr>
+ <tr>
+ <th>null_as_category</th>
+ <td>Default: FALSE. Whether to treat NULL as a special categorical value.
+
+ If this is set to TRUE, NULL is treated as a categorical value and
+ placed at the end of the ordering of categorical levels. Placing it at the
+ end ensures that NULL is never used as a value to split a node on. This
+ parameter is ignored for continuous-valued features.
+ </td>
</tr>
</table>
</DD>
@@ -901,6 +912,85 @@ Result:
(32 rows)
</pre>
+<h4>NULL handling example</h4>
+
+- Create a toy example to illustrate null-as-category handling
+<pre class='example'>
+drop table if exists null_handling_example;
+create table null_handling_example (
+id integer,
+country text,
+city text,
+weather text,
+response text
+);
+
+insert into null_handling_example values
+(1,null,null,null,'a'),
+(2,'US',null,null,'b'),
+(3,'US','NY',null,'c'),
+(4,'US','NY','rainy','d');
+
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('null_handling_example', -- source table
+ 'train_output', -- output model table
+ 'id', -- id column
+ 'response', -- dependent variable
+ 'country, city, weather', -- features
+ NULL, -- features to exclude
+ 'gini', -- split criterion
+ NULL::text, -- no grouping
+ NULL::text, -- no weights
+ 4, -- max depth
+ 1, -- min split
+ 1, -- min bucket
+ 10, -- number of splits
+ NULL, -- pruning parameters
+ 'null_as_category=true' -- null handling
+ );
+SELECT cat_levels_in_text, cat_n_levels FROM train_output;
+</pre>
+<pre class='result'>
+ cat_levels_in_text | cat_n_levels
+------------------------------------------+--------------
+ {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2}
+</pre>
+
+- Predict for previously unseen data by assuming NULL as the default value
+<pre class='example'>
+drop table if exists table_test;
+create table table_test (
+id integer,
+country text,
+city text,
+weather text,
+expected_response text
+);
+insert into table_test values
+(1,'IN','MUM','cloudy','a'),
+(2,'US','HOU','humid','b'),
+(3,'US','NY','sunny','c'),
+(4,'US','NY','rainy','d');
+
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',
+ 'table_test',
+ 'prediction_results',
+ 'response');
+SELECT s.id, expected_response, estimated_response
+FROM prediction_results p, table_test s
+WHERE s.id = p.id ORDER BY id;
+</pre>
+<pre class='result'>
+ id | expected_response | estimated_response
+----+-------------------+--------------------
+ 1 | a | a
+ 2 | b | b
+ 3 | c | c
+ 4 | d | d
+(4 rows)
+</pre>
+
@anchor literature
@literature
[1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; Stone, C. J. (1984). Classification and regression trees. Monterey, CA: Wadsworth & Brooks/Cole Advanced Books & Software.
@@ -995,7 +1085,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
min_bucket INTEGER,
n_bins INTEGER,
pruning_params TEXT,
- surrogate_params TEXT,
+ null_handling_params TEXT,
verbose_mode BOOLEAN
) RETURNS VOID AS $$
PythonFunctionBodyOnly(recursive_partitioning, decision_tree)
@@ -1004,7 +1094,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
id_col_name, dependent_variable, list_of_features,
list_of_features_to_exclude, split_criterion, grouping_cols,
weights, max_depth, min_split, min_bucket, n_bins, pruning_params,
- surrogate_params, verbose_mode)
+ null_handling_params, verbose_mode)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
------------------------------------------------------------
@@ -1602,7 +1692,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
min_bucket INTEGER,
n_bins INTEGER,
pruning_params TEXT,
- surrogate_params TEXT
+ null_handling_params TEXT
) RETURNS VOID AS $$
-- verbose = false
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 05c029e..bc4bd97 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -237,7 +237,7 @@ def forest_train(
grouping_cols, num_trees, num_random_features,
importance, num_permutations, max_tree_depth,
min_split, min_bucket, num_bins,
- surrogate_params, verbose=False, sample_ratio=None, **kwargs):
+ null_handling_params, verbose=False, sample_ratio=None, **kwargs):
""" Random forest main training function
Args:
@@ -283,15 +283,19 @@ def forest_train(
num_bins = 100 if num_bins is None else num_bins
sample_ratio = 1 if sample_ratio is None else sample_ratio
- surrogate_param_dict = extract_keyvalue_params(
- surrogate_params,
- dict(max_surrogates=int), # type of variable
- dict(max_surrogates=0)) # default value of variable
- max_n_surr = surrogate_param_dict['max_surrogates']
+ null_handling_dict = extract_keyvalue_params(
+ null_handling_params,
+ dict(max_surrogates=int, null_as_category=bool),
+ dict(max_surrogates=0, null_as_category=False))
+ max_n_surr = null_handling_dict['max_surrogates']
+ null_as_category = null_handling_dict['null_as_category']
+ null_proxy = "__NULL__" if null_as_category else None
+ if null_as_category:
+ # can't have two ways of handling tuples with NULL values
+ max_n_surr = 0
_assert(max_n_surr >= 0,
"Maximum number of surrogates ({0}) should be non-negative".
format(max_n_surr))
-
##################################################################
# validate arguments
_forest_validate_args(training_table_name, output_table_name, id_col_name,
@@ -383,7 +387,7 @@ def forest_train(
cat_features, ordered_cat_features,
con_features, num_bins, dep,
boolean_cats, n_rows, is_classification,
- dep_n_levels, filter_null)
+ dep_n_levels, filter_null, null_proxy)
# some features may be dropped because they have only one value
cat_features = bins['cat_features']
bins['grp_key_cat'] = ['']
@@ -421,7 +425,7 @@ def forest_train(
con_features, num_bins, dep,
boolean_cats, grouping_cols,
grouping_array_str, n_rows,
- is_classification, dep_n_levels, filter_null)
+ is_classification, dep_n_levels, filter_null, null_proxy)
cat_features = bins['cat_features']
# a table for converting cat_features to integers
@@ -564,7 +568,7 @@ def forest_train(
boolean_cats, num_bins, 'poisson_count', dep, min_split,
min_bucket, max_tree_depth, filter_null, dep_n_levels,
is_classification, split_criterion, True,
- num_random_features, max_n_surr)
+ num_random_features, max_n_surr, null_proxy)
tree_states = [dict(tree_state=tree_state['tree_state'],
grp_key='')]
@@ -578,7 +582,7 @@ def forest_train(
max_tree_depth, filter_null, dep_n_levels,
is_classification, split_criterion, True,
num_random_features, tree_terminated=tree_terminated,
- max_n_surr=max_n_surr)
+ max_n_surr=max_n_surr, null_proxy=null_proxy)
# If a tree for a group is terminated (not finished properly),
# then we do not need to compute other trees, and can just
@@ -603,7 +607,7 @@ def forest_train(
sample_id, id_col_name, cat_features, con_features,
boolean_cats, grouping_cols, grp_key_to_grp_cols, dep,
num_permutations, is_classification, importance,
- num_bins, filter_null)
+ num_bins, filter_null, null_proxy)
###################################################################
# evaluating and summarizing random forest
@@ -695,6 +699,7 @@ def forest_predict(schema_madlib, model, source, output, pred_type='response',
dep_levels = summary_elements["dependent_var_levels"]
is_classification = summary_elements["is_classification"]
dep_type = summary_elements['dependent_var_type']
+ null_proxy = summary_elements['null_proxy']
# pred_type='prob' is allowed only for classification
_assert(is_classification or pred_type == 'response',
@@ -706,7 +711,7 @@ def forest_predict(schema_madlib, model, source, output, pred_type='response',
cat_features_str, con_features_str = get_feature_str(
schema_madlib, boolean_cats, cat_features, con_features,
- "cat_levels_in_text", "cat_n_levels")
+ "cat_levels_in_text", "cat_n_levels", null_proxy)
pred_name = ('"prob_{0}"' if pred_type == "prob" else
'"estimated_{0}"').format(dep_varname.replace('"', '').strip())
@@ -929,12 +934,12 @@ def _calculate_oob_prediction(
schema_madlib, model_table, cat_features_info_table, con_splits_table,
oob_prediction_table, oob_view, sample_id, id_col_name, cat_features,
con_features, boolean_cats, grouping_cols, grp_key_to_grp_cols, dep,
- num_permutations, is_classification, importance, num_bins, filter_null):
+ num_permutations, is_classification, importance, num_bins, filter_null, null_proxy=None):
"""Calculate predication for out-of-bag sample"""
cat_features_str, con_features_str = get_feature_str(
schema_madlib, boolean_cats, cat_features, con_features,
- "cat_levels_in_text", "cat_n_levels")
+ "cat_levels_in_text", "cat_n_levels", null_proxy)
join_str = "," if grouping_cols is None else "JOIN"
using_str = "" if grouping_cols is None else "USING (" + grouping_cols + ")"
@@ -1247,7 +1252,8 @@ def _create_summary_table(**kwargs):
{n_rows_skipped}::integer AS total_rows_skipped,
{dep_list_str}::text AS dependent_var_levels,
'{dep_type}'::text AS dependent_var_type,
- '{indep_type}'::text AS independent_var_types
+ '{indep_type}'::text AS independent_var_types,
+ '{null_proxy}'::text AS null_proxy
""".format(**kwargs)
plpy.notice("sql_create_summary_table:\n" + sql_create_summary_table)
plpy.execute(sql_create_summary_table)
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
index f263cf9..d9ae9bf 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -34,7 +34,7 @@ constructed using bootstrapped samples from the input data. The results of these
models are then combined to yield a single prediction, which, at the
expense of some loss in interpretation, has been found to be highly accurate.
-Please also refer to the decision tree user documentation for
+Please also refer to the decision tree user documentation for
information relevant to the implementation of random forests in MADlib.
@anchor train
@@ -56,7 +56,7 @@ forest_train(training_table_name,
min_split,
min_bucket,
num_splits,
- surrogate_params,
+ null_handling_params,
verbose,
sample_ratio
)
@@ -314,7 +314,7 @@ forest_train(training_table_name,
<DT>importance (optional)</DT>
<DD>boolean, default: true. Whether or not to calculate variable importance.
If set to true, variable importance for categorical and continuous features
- will be output in the group table <em>\<model_table\>_group</em> described
+ will be output in the group table <em>\<model_table\>_group</em> described
above. Will increase run time when variable importance is turned on. </DD>
<DT>num_permutations (optional)</DT>
@@ -331,7 +331,7 @@ forest_train(training_table_name,
<DT>max_tree_depth (optional)</DT>
<DD>integer, default: 7. Maximum depth of any node of a tree,
with the root node counted as depth 0. A deeper tree can
- lead to better prediction but will also result in
+ lead to better prediction but will also result in
longer processing time and higher memory usage.</DD>
<DT>min_split (optional)</DT>
@@ -350,13 +350,27 @@ forest_train(training_table_name,
Higher number of bins will lead to better prediction,
but will also result in longer processing time and higher memory usage.</DD>
- <DT>surrogate_params (optional)</DT>
- <DD>text, Comma-separated string of key-value pairs controlling the behavior
- of surrogate splits for each node in a tree.
+ <DT>null_handling_params (optional)</DT>
+ <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior
+ of various features for handling missing values.
<table class='output'>
<tr>
<th>max_surrogates</th>
<td>Default: 0. Number of surrogates to store for each node.</td>
+ One approach to handling NULLs is to use surrogate splits for each
+ node. A surrogate variable is another predictor variable that is associated
+ (correlated) with the primary split variable. The surrogate
+ variable comes into use when the primary predictor value is NULL.
+ </tr>
+ <tr>
+ <th>null_as_category</th>
+ <td>Default: FALSE. Whether to treat NULL as a special categorical value.
+
+ If this is set to TRUE, NULL is treated as a categorical
+ value and placed at the end of the ordering of categorical levels. Placing it
+ at the end ensures that NULL is never used as a value to split a node on.
+ This parameter is ignored for continuous-valued features.
+ </td>
</tr>
</table>
</DD>
@@ -371,10 +385,10 @@ forest_train(training_table_name,
is close to 0 may result in trees with only the root node.
This allows users to experiment with the function in a speedy fashion.</DD>
</DL>
- @note The main parameters that affect memory usage are: depth of
- tree (‘max_tree_depth’), number of features, number of values per
- categorical feature, and number of bins for continuous features (‘num_splits’).
- If you are hitting memory limits, consider reducing one or
+ @note The main parameters that affect memory usage are: depth of
+ tree (‘max_tree_depth’), number of features, number of values per
+ categorical feature, and number of bins for continuous features (‘num_splits’).
+ If you are hitting memory limits, consider reducing one or
more of these parameters.
@anchor predict
@@ -460,8 +474,8 @@ are NULL, then the majority branch is used to compute the split for a tuple.
<DT>verbose (optional)</DT>
<DD>boolean, default = FALSE. If true, the dot format output will contain
- additional information (impurity, sample size, number of weighted rows for
- each response variable, classification or prediction if the tree was
+ additional information (impurity, sample size, number of weighted rows for
+ each response variable, classification or prediction if the tree was
pruned at this level)</DD>
</DL>
@@ -539,7 +553,7 @@ dependent_varname | class
independent_varnames | "OUTLOOK",windy,temperature,humidity
cat_features | "OUTLOOK",windy
con_features | temperature,humidity
-grouping_cols |
+grouping_cols |
num_trees | 20
num_random_features | 2
max_tree_depth | 8
@@ -581,21 +595,21 @@ SELECT madlib.get_tree('train_output',1,2);
</pre>
Result:
<pre class="result">
- digraph "Classification tree for dt_golf" {
- "0" [label="humidity <= 75", shape=ellipse];
- "0" -> "1"[label="yes"];
- "1" [label="\"Play\"",shape=box];
- "0" -> "2"[label="no"];
- "2" [label="humidity <= 80", shape=ellipse];
- "2" -> "5"[label="yes"];
- "5" [label="\"Don't Play\"",shape=box];
- "2" -> "6"[label="no"];
+ digraph "Classification tree for dt_golf" {
+ "0" [label="humidity <= 75", shape=ellipse];
+ "0" -> "1"[label="yes"];
+ "1" [label="\"Play\"",shape=box];
+ "0" -> "2"[label="no"];
+ "2" [label="humidity <= 80", shape=ellipse];
+ "2" -> "5"[label="yes"];
+ "5" [label="\"Don't Play\"",shape=box];
+ "2" -> "6"[label="no"];
"6" [label="\"OUTLOOK\" in {overcast,rain}", shape=ellipse];
- "6" -> "13"[label="yes"];
- "13" [label="\"Play\"",shape=box];
- "6" -> "14"[label="no"];
- "14" [label="\"Don't Play\"",shape=box];
- } //---end of digraph---------
+ "6" -> "13"[label="yes"];
+ "13" [label="\"Play\"",shape=box];
+ "6" -> "14"[label="no"];
+ "14" [label="\"Don't Play\"",shape=box];
+ } //---end of digraph---------
</pre>
-# Obtain a text display of the tree:
@@ -613,13 +627,13 @@ second indented child (2i+2) is the 'False' node.
- Number of (weighted) rows for each response variable inside [].
- Order of values = ['"Don\'t Play"', '"Play"']
-------------------------------------
- (0)[ 4 10] humidity <= 75
- (1)[0 7] * --> "Play"
- (2)[4 3] humidity <= 80
- (5)[3 1] * --> "Don't Play"
- (6)[1 2] "OUTLOOK" in {overcast,rain}
- (13)[0 2] * --> "Play"
- (14)[1 0] * --> "Don't Play"
+ (0)[ 4 10] humidity <= 75
+ (1)[0 7] * --> "Play"
+ (2)[4 3] humidity <= 80
+ (5)[3 1] * --> "Don't Play"
+ (6)[1 2] "OUTLOOK" in {overcast,rain}
+ (13)[0 2] * --> "Play"
+ (14)[1 0] * --> "Don't Play"
-------------------------------------
</pre>
@@ -637,7 +651,7 @@ ORDER BY id;
</pre>
Result:
<pre class="result">
- id | estimated_class | class
+ id | estimated_class | class
----+-----------------+------------
1 | Don't Play | Don't Play
2 | Don't Play | Don't Play
@@ -670,7 +684,7 @@ ORDER BY id;
</pre>
Result:
<pre class="result">
- id | estimated_prob_Play | class
+ id | estimated_prob_Play | class
----+---------------------+------------
1 | 0.05 | Don't Play
2 | 0.15 | Don't Play
@@ -917,7 +931,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
min_split INTEGER,
min_bucket INTEGER,
num_splits INTEGER,
- surrogate_params TEXT,
+ null_handling_params TEXT,
verbose BOOLEAN,
sample_ratio DOUBLE PRECISION
) RETURNS VOID AS $$
@@ -939,7 +953,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
min_split,
min_bucket,
num_splits,
- surrogate_params,
+ null_handling_params,
verbose,
sample_ratio
)
@@ -1179,7 +1193,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
min_split INTEGER,
min_bucket INTEGER,
num_splits INTEGER,
- surrogate_params TEXT,
+ null_handling_params TEXT,
verbose BOOLEAN
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
@@ -1204,7 +1218,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
min_split integer,
min_bucket integer,
num_splits integer,
- surrogate_params TEXT
+ null_handling_params TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, $15, $16, FALSE::BOOLEAN);
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index b135108..93b9a52 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -344,7 +344,7 @@ SELECT tree_train('dt_golf'::text, -- source table
2::integer, -- min bucket
8::integer, -- number of bins per continuous variable
'cp=0.01, n_folds=5',
- 'max_surrogates=2'
+ 'max_surrogates=2, null_as_category=True'
);
SELECT _print_decision_tree(tree) from train_output;