You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/07/18 22:52:06 UTC
incubator-madlib git commit: DT: Fix bin computation for boolean
features
Repository: incubator-madlib
Updated Branches:
refs/heads/master e421cd587 -> 90c38975d
DT: Fix bin computation for boolean features
JIRA: MADLIB-1011
Special case statement was added to bin computation query since bool to text
cast was not available in GPDB platforms. This case statement was incorrectly
used as a column name. This has been fixed in this commit, with tests in
install-check verifying the fix.
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/90c38975
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/90c38975
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/90c38975
Branch: refs/heads/master
Commit: 90c38975dbe71d6d5ac47fbf5ed066df9db76d7c
Parents: e421cd5
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Jul 18 14:23:18 2016 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Jul 18 14:23:18 2016 -0700
----------------------------------------------------------------------
.../recursive_partitioning/decision_tree.py_in | 43 ++++++++++----------
.../test/decision_tree.sql_in | 30 +++++++-------
.../test/random_forest.sql_in | 34 ++++++++--------
3 files changed, 54 insertions(+), 53 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90c38975/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 1183a6d..00a0c5f 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -790,34 +790,34 @@ def _get_bins(schema_madlib, training_table_name, cat_features,
"AVG({dependent_variable})".format(dependent_variable=dependent_variable)
sql_cat_levels = """
+ SELECT
+ '{{col_name}}'::text AS colname,
+ levels
+ FROM (
SELECT
- '{{col}}' AS colname,
- levels
+ '{{col_name}}'::text AS colname,
+ array_agg(levels ORDER BY dep_avg) AS levels
FROM (
SELECT
- '{{col}}' AS colname,
- array_agg(levels ORDER BY dep_avg) AS levels
- FROM (
- SELECT
- ({{col}})::text AS levels,
- {order_fun} AS dep_avg
- FROM {training_table_name}
- WHERE {filter_null}
- AND {{col}} is not NULL
- GROUP BY {{col}}
- ) s
- ) s1
- where array_upper(levels, 1) > 1
- """.format(training_table_name=training_table_name,
- order_fun=order_fun, filter_null=filter_null)
+ ({{col}})::text AS levels,
+ {order_fun} AS dep_avg
+ FROM {training_table_name}
+ WHERE {filter_null}
+ AND {{col}} is not NULL
+ GROUP BY {{col}}
+ ) s
+ ) s1
+ WHERE array_upper(levels, 1) > 1
+ """.format(training_table_name=training_table_name,
+ order_fun=order_fun, filter_null=filter_null)
# Try to obtain all the levels in one scan of the table.
# () are needed when casting the categorical variables because
# they can be expressions.
sql_all_cats = ' UNION ALL '.join(
sql_cat_levels.format(col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
- if col in boolean_cats else col) for col in cat_features)
-
+ if col in boolean_cats else col,
+ col_name=plpy.quote_ident(col)) for col in cat_features)
all_levels = plpy.execute(sql_all_cats)
if len(all_levels) != len(cat_features):
@@ -1005,7 +1005,7 @@ def _get_bins_grps(
from (
SELECT
grp_key,
- '{{col}}' as colname,
+ '{{col_name}}' as colname,
array_agg(levels order by dep_avg) as levels
from (
SELECT
@@ -1025,7 +1025,8 @@ def _get_bins_grps(
sql_all_cats = ' UNION ALL '.join(
sql_cat_levels.format(
col=("(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
- if col in boolean_cats else col))
+ if col in boolean_cats else col),
+ col_name=plpy.quote_ident(col))
for col in cat_features)
all_levels = list(plpy.execute(sql_all_cats))
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90c38975/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index 3f750c9..8f9168c 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -239,25 +239,25 @@ CREATE TABLE dt_golf (
"OUTLOOK" text,
temperature double precision,
humidity double precision,
- windy text,
+ windy boolean,
class text
) ;
INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
-(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
-(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
-(3, 'overcast', 83, 78, 'false', 'Play'),
-(4, 'rain', 70, 96, 'false', 'Play'),
-(5, 'rain', 68, 80, 'false', 'Play'),
-(6, 'rain', 65, 70, 'true', 'Don''t Play'),
-(7, 'overcast', 64, 65, 'true', 'Play'),
-(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
-(9, 'sunny', 69, 70, 'false', 'Play'),
-(10, 'rain', 75, 80, 'false', 'Play'),
-(11, 'sunny', 75, 70, 'true', 'Play'),
-(12, 'overcast', 72, 90, 'true', 'Play'),
-(13, 'overcast', 81, 75, 'false', 'Play'),
-(14, 'rain', 71, 80, 'true', 'Don''t Play');
+(1, 'sunny', 85, 85, false, 'Don''t Play'),
+(2, 'sunny', 80, 90, true, 'Don''t Play'),
+(3, 'overcast', 83, 78, false, 'Play'),
+(4, 'rain', 70, 96, false, 'Play'),
+(5, 'rain', 68, 80, false, 'Play'),
+(6, 'rain', 65, 70, true, 'Don''t Play'),
+(7, 'overcast', 64, 65, true, 'Play'),
+(8, 'sunny', 72, 95, false, 'Don''t Play'),
+(9, 'sunny', 69, 70, false, 'Play'),
+(10, 'rain', 75, 80, false, 'Play'),
+(11, 'sunny', 75, 70, true, 'Play'),
+(12, 'overcast', 72, 90, true, 'Play'),
+(13, 'overcast', 81, 75, false, 'Play'),
+(14, 'rain', 71, 80, true, 'Don''t Play');
-- no grouping
DROP TABLE IF EXISTS train_output, train_output_summary;
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90c38975/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index 22b73fd..743bdfe 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -4,25 +4,25 @@ CREATE TABLE dt_golf (
"OUTLOOK" text,
temperature double precision,
humidity double precision,
- windy text,
+ windy boolean,
class text
) ;
INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
-(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
-(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
-(3, 'overcast', 83, 78, 'false', 'Play'),
-(4, 'rain', 70, 96, 'false', 'Play'),
-(5, 'rain', 68, 80, 'false', 'Play'),
-(6, 'rain', 65, 70, 'true', 'Don''t Play'),
-(7, 'overcast', 64, 65, 'true', 'Play'),
-(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
-(9, 'sunny', 69, 70, 'false', 'Play'),
-(10, 'rain', 75, 80, 'false', 'Play'),
-(11, 'sunny', 75, 70, 'true', 'Play'),
-(12, 'overcast', 72, 90, 'true', 'Play'),
-(13, 'overcast', 81, 75, 'false', 'Play'),
-(14, 'rain', 71, 80, 'true', 'Don''t Play');
+(1, 'sunny', 85, 85, false, 'Don''t Play'),
+(2, 'sunny', 80, 90, true, 'Don''t Play'),
+(3, 'overcast', 83, 78, false, 'Play'),
+(4, 'rain', 70, 96, false, 'Play'),
+(5, 'rain', 68, 80, false, 'Play'),
+(6, 'rain', 65, 70, true, 'Don''t Play'),
+(7, 'overcast', 64, 65, true, 'Play'),
+(8, 'sunny', 72, 95, false, 'Don''t Play'),
+(9, 'sunny', 69, 70, false, 'Play'),
+(10, 'rain', 75, 80, false, 'Play'),
+(11, 'sunny', 75, 70, true, 'Play'),
+(12, 'overcast', 72, 90, true, 'Play'),
+(13, 'overcast', 81, 75, false, 'Play'),
+(14, 'rain', 71, 80, true, 'Don''t Play');
-------------------------------------------------------------------------
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
@@ -31,7 +31,7 @@ SELECT forest_train(
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'class'::TEXT, -- response
- 'humidity, temperature'::TEXT, -- features
+ 'windy, temperature'::TEXT, -- features
NULL::TEXT, -- exclude columns
NULL::TEXT, -- no grouping
5, -- num of trees
@@ -85,7 +85,7 @@ SELECT forest_train(
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'class::TEXT'::TEXT, -- response
- 'class, humidity, temperature'::TEXT, -- features
+ 'class, windy, temperature'::TEXT, -- features
NULL::TEXT, -- exclude columns
NULL::TEXT, -- no grouping
100, -- num of trees