You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/05/10 22:56:42 UTC
incubator-madlib git commit: DT/RF: Allow expressions in feature list
Repository: incubator-madlib
Updated Branches:
refs/heads/master 8e2778a39 -> 18b8486ca
DT/RF: Allow expressions in feature list
JIRA: MADLIB-1087
Changes:
- Add numeric as a continuous type
- Get data type of features from an expression instead of the table
  column names
- Update to allow expressions in the feature list
Closes #129
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/18b8486c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/18b8486c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/18b8486c
Branch: refs/heads/master
Commit: 18b8486ca651a218ea4b84c4b2876e30cd189e33
Parents: 8e2778a
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue May 2 12:39:52 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed May 10 15:56:14 2017 -0700
----------------------------------------------------------------------
.../recursive_partitioning/decision_tree.py_in | 37 ++++++++------------
.../recursive_partitioning/random_forest.py_in | 36 ++++++++++---------
.../test/random_forest.sql_in | 33 ++++++++---------
.../modules/utilities/validate_args.py_in | 2 +-
4 files changed, 52 insertions(+), 56 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/18b8486c/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index dbf7db7..d3ca9b2 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -137,13 +137,6 @@ def _get_features_to_use(schema_madlib, training_table_name,
# ------------------------------------------------------------
-def _dict_get_quoted(input_dict, col_name):
- """Return value from dict where key could be quoted or unquoted name"""
- return input_dict.get(
- col_name, input_dict.get(unquote_ident(col_name)))
-# -------------------------------------------------------------------------
-
-
def _classify_features(feature_to_type, features):
""" Returns
1) an array of categorical features (all casted to string)
@@ -157,17 +150,16 @@ def _classify_features(feature_to_type, features):
cat_types = int_types + text_types + boolean_types
ordered_cat_types = int_types
- cat_features = [c for c in features
- if _dict_get_quoted(feature_to_type, c) in cat_types]
- ordered_cat_features = [c for c in features if _dict_get_quoted(
- feature_to_type, c) in ordered_cat_types]
+ cat_features = [c for c in features if feature_to_type[c] in cat_types]
+ ordered_cat_features = [c for c in features
+ if feature_to_type[c] in ordered_cat_types]
cat_features_set = set(cat_features)
# continuous types - 'real' is cast to 'double precision' for uniformity
- con_types = ['real', 'float8', 'double precision']
+ con_types = ['real', 'float8', 'double precision', 'numeric']
con_features = [c for c in features
if (c not in cat_features_set and
- _dict_get_quoted(feature_to_type, c) in con_types)]
+ feature_to_type[c] in con_types)]
# In order to be able to form an array, all categorical variables
# will be cast into TEXT type, but GPDB cannot cast a boolean
@@ -175,7 +167,7 @@ def _classify_features(feature_to_type, features):
# need special treatment: cast them into integers before casting
# into text.
boolean_cats = [c for c in features
- if _dict_get_quoted(feature_to_type, c) in boolean_types]
+ if feature_to_type[c] in boolean_types]
return cat_features, ordered_cat_features, con_features, boolean_cats
# ------------------------------------------------------------
@@ -481,13 +473,12 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None):
else:
qualifier_str = ''
- all_cols_types = dict(get_cols_and_types(table_name))
grouping_cols_list = [col.strip() for col in grouping_cols.split(',')]
- grouping_cols_and_types = [(col, _dict_get_quoted(all_cols_types, col))
- for col in grouping_cols_list]
+ grouping_cols_and_types = [(c, get_expr_type(c, table_name))
+ for c in grouping_cols_list]
grouping_array_str = 'array_to_string(array[' + \
','.join("(case when " + col + " then 'True' else 'False' end)::text"
- if col_type == 'boolean' else '(' + qualifier_str + col + ')::text'
+ if col_type.lower() == 'boolean' else '(' + qualifier_str + col + ')::text'
for col, col_type in grouping_cols_and_types) + "]::text[], ',')"
return grouping_array_str
# ------------------------------------------------------------------------------
@@ -519,7 +510,9 @@ def _build_tree(schema_madlib, is_classification, split_criterion,
with MinWarning(msg_level):
plpy.notice("Building tree for cross validation")
tree_states, bins, dep_list, n_rows = _get_tree_states(**locals())
- all_cols_types = dict(get_cols_and_types(training_table_name))
+ all_cols_types = dict([(f, get_expr_type(f, training_table_name))
+ for f in cat_features + con_features])
+
n_all_rows = plpy.execute("select count(*) from " + training_table_name
)[0]['count']
cp = grp_key_to_cp.values()[0]
@@ -604,7 +597,8 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
"Decision tree error: No feature is selected for the model.")
# 2)
- all_cols_types = dict(get_cols_and_types(training_table_name))
+ all_cols_types = dict([(f, get_expr_type(f, training_table_name))
+ for f in features])
cat_features, ordered_cat_features, con_features, boolean_cats = \
_classify_features(all_cols_types, features)
# get all rows
@@ -1529,8 +1523,7 @@ def _create_summary_table(
"$dep_list$")
else:
dep_list_str = "NULL"
- indep_type = ', '.join(_dict_get_quoted(all_cols_types, col)
- for col in cat_features + con_features)
+ indep_type = ', '.join(all_cols_types[c] for c in cat_features + con_features)
dep_type = _get_dep_type(training_table_name, dependent_variable)
cat_features_str = ','.join(cat_features)
con_features_str = ','.join(con_features)
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/18b8486c/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 1226591..4b6f2d6 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -13,16 +13,17 @@ from math import sqrt
from utilities.control import MinWarning
from utilities.control import EnableOptimizer
from utilities.control import EnableHashagg
-from utilities.validate_args import get_cols_and_types
-from utilities.validate_args import is_var_valid
-from utilities.validate_args import input_tbl_valid
-from utilities.validate_args import output_tbl_valid
-from utilities.validate_args import cols_in_tbl_valid
from utilities.utilities import _assert
from utilities.utilities import unique_string
from utilities.utilities import add_postfix
from utilities.utilities import split_quoted_delimited_str
from utilities.utilities import extract_keyvalue_params
+from utilities.validate_args import get_cols_and_types
+from utilities.validate_args import is_var_valid
+from utilities.validate_args import input_tbl_valid
+from utilities.validate_args import output_tbl_valid
+from utilities.validate_args import cols_in_tbl_valid
+from utilities.validate_args import get_expr_type
from decision_tree import _tree_train_using_bins
from decision_tree import _tree_train_grps_using_bins
@@ -34,7 +35,6 @@ from decision_tree import _is_dep_categorical
from decision_tree import _get_n_and_deplist
from decision_tree import _classify_features
from decision_tree import _get_filter_str
-from decision_tree import _dict_get_quoted
from decision_tree import _get_display_header
from decision_tree import get_feature_str
# ------------------------------------------------------------
@@ -272,8 +272,7 @@ def forest_train(
with EnableHashagg(False):
# we disable hashagg since large number of groups could
# result in excessive memory usage.
- ##################################################################
- #### set default values
+ # set default values
if grouping_cols is not None and grouping_cols.strip() == '':
grouping_cols = None
num_trees = 100 if num_trees is None else num_trees
@@ -328,7 +327,8 @@ def forest_train(
"Random forest error: Number of features to be selected "
"is more than the actual number of features.")
- all_cols_types = dict(get_cols_and_types(training_table_name))
+ all_cols_types = dict([(f, get_expr_type(f, training_table_name))
+ for f in features])
cat_features, ordered_cat_features, con_features, boolean_cats = \
_classify_features(all_cols_types, features)
@@ -349,10 +349,10 @@ def forest_train(
dep_col_str = ("CASE WHEN " + dependent_variable +
" THEN 'True' ELSE 'False' END") if is_bool else dependent_variable
dep = ("(CASE " +
- "\n ".join([
- "WHEN ({dep_col})::text = $${c}$$ THEN {i}".format(
- dep_col=dep_col_str, c=c, i=i)
- for i, c in enumerate(dep_list)]) +
+ "\n ".
+ join(["WHEN ({dep_col})::text = $${c}$$ THEN {i}".
+ format(dep_col=dep_col_str, c=c, i=i)
+ for i, c in enumerate(dep_list)]) +
"\nEND)")
dep_n_levels = len(dep_list)
else:
@@ -388,12 +388,12 @@ def forest_train(
bins['grp_key_cat'] = ['']
else:
grouping_cols_list = [col.strip() for col in grouping_cols.split(',')]
- grouping_cols_and_types = [(col, _dict_get_quoted(all_cols_types, col))
+ grouping_cols_and_types = [(col, get_expr_type(col, training_table_name))
for col in grouping_cols_list]
grouping_array_str = (
"array_to_string(array[" +
','.join("(case when " + col + " then 'True' else 'False' end)::text"
- if col_type == 'boolean' else '(' + col + ')::text'
+ if col_type.lower() == 'boolean' else '(' + col + ')::text'
for col, col_type in grouping_cols_and_types) +
"], ',')")
grouping_cols_str = ('' if grouping_cols is None
@@ -1198,8 +1198,9 @@ def _create_summary_table(**kwargs):
else:
kwargs['dep_list_str'] = "NULL"
- kwargs['indep_type'] = ', '.join(_dict_get_quoted(kwargs['all_cols_types'], col)
- for col in kwargs['cat_features'] + kwargs['con_features'])
+ kwargs['indep_type'] = ', '.join(kwargs['all_cols_types'][col]
+ for col in kwargs['cat_features'] +
+ kwargs['con_features'])
kwargs['dep_type'] = _get_dep_type(kwargs['training_table_name'],
kwargs['dependent_variable'])
kwargs['cat_features_str'] = ','.join(kwargs['cat_features'])
@@ -1408,6 +1409,7 @@ def _validate_get_tree(model, gid, sample_id):
# ------------------------------------------------------------
+
def forest_predict_help_message(schema_madlib, message, **kwargs):
if not message:
help_string = """
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/18b8486c/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index e91c71a..086e74b 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -4,25 +4,26 @@ CREATE TABLE dt_golf (
"OUTLOOK" text,
temperature double precision,
humidity double precision,
+ cont_features double precision[],
windy boolean,
class text
) ;
-INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
-(1, 'sunny', 85, 85, false, 'Don''t Play'),
-(2, 'sunny', 80, 90, true, 'Don''t Play'),
-(3, 'overcast', 83, 78, false, 'Play'),
-(4, 'rain', 70, 96, false, 'Play'),
-(5, 'rain', 68, 80, false, 'Play'),
-(6, 'rain', 65, 70, true, 'Don''t Play'),
-(7, 'overcast', 64, 65, true, 'Play'),
-(8, 'sunny', 72, 95, false, 'Don''t Play'),
-(9, 'sunny', 69, 70, false, 'Play'),
-(10, 'rain', 75, 80, false, 'Play'),
-(11, 'sunny', 75, 70, true, 'Play'),
-(12, 'overcast', 72, 90, true, 'Play'),
-(13, 'overcast', 81, 75, false, 'Play'),
-(14, 'rain', 71, 80, true, 'Don''t Play');
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,cont_features,windy,class) VALUES
+(1, 'sunny', 85, 85,ARRAY[85, 85], false, 'Don''t Play'),
+(2, 'sunny', 80, 90,ARRAY[80, 90], true, 'Don''t Play'),
+(3, 'overcast', 83, 78,ARRAY[83, 78], false, 'Play'),
+(4, 'rain', 70, 96,ARRAY[70, 96], false, 'Play'),
+(5, 'rain', 68, 80,ARRAY[68, 80], false, 'Play'),
+(6, 'rain', 65, 70,ARRAY[65, 70], true, 'Don''t Play'),
+(7, 'overcast', 64, 65,ARRAY[64, 65], true, 'Play'),
+(8, 'sunny', 72, 95,ARRAY[72, 95], false, 'Don''t Play'),
+(9, 'sunny', 69, 70,ARRAY[69, 70], false, 'Play'),
+(10, 'rain', 75, 80,ARRAY[75, 80], false, 'Play'),
+(11, 'sunny', 75, 70,ARRAY[75, 70], true, 'Play'),
+(12, 'overcast', 72, 90,ARRAY[72, 90], true, 'Play'),
+(13, 'overcast', 81, 75,ARRAY[81, 75], false, 'Play'),
+(14, 'rain', 71, 80,ARRAY[71, 80], true, 'Don''t Play');
-------------------------------------------------------------------------
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
@@ -31,7 +32,7 @@ SELECT forest_train(
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'class'::TEXT, -- response
- 'windy, temperature'::TEXT, -- features
+ 'windy, cont_features[1]'::TEXT, -- features
NULL::TEXT, -- exclude columns
NULL::TEXT, -- no grouping
5, -- num of trees
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/18b8486c/src/ports/postgres/modules/utilities/validate_args.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index 5832124..532326d 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -359,7 +359,7 @@ def get_expr_type(expr, tbl):
FROM {1}
LIMIT 1
""".format(expr, tbl))[0]['type']
- return expr_type.upper()
+ return expr_type.lower()
# -------------------------------------------------------------------------