You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/07/18 22:52:06 UTC

incubator-madlib git commit: DT: Fix bin computation for boolean features

Repository: incubator-madlib
Updated Branches:
  refs/heads/master e421cd587 -> 90c38975d


DT: Fix bin computation for boolean features

JIRA: MADLIB-1011

Special case statement was added to bin computation query since bool to text
cast was not available in GPDB platforms. This case statement was incorrectly
used as a column name. This has been fixed in this commit, with tests in
install-check verifying the fix.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/90c38975
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/90c38975
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/90c38975

Branch: refs/heads/master
Commit: 90c38975dbe71d6d5ac47fbf5ed066df9db76d7c
Parents: e421cd5
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Jul 18 14:23:18 2016 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Jul 18 14:23:18 2016 -0700

----------------------------------------------------------------------
 .../recursive_partitioning/decision_tree.py_in  | 43 ++++++++++----------
 .../test/decision_tree.sql_in                   | 30 +++++++-------
 .../test/random_forest.sql_in                   | 34 ++++++++--------
 3 files changed, 54 insertions(+), 53 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90c38975/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 1183a6d..00a0c5f 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -790,34 +790,34 @@ def _get_bins(schema_madlib, training_table_name, cat_features,
                 "AVG({dependent_variable})".format(dependent_variable=dependent_variable)
 
         sql_cat_levels = """
+            SELECT
+                '{{col_name}}'::text AS colname,
+                levels
+            FROM (
                 SELECT
-                    '{{col}}' AS colname,
-                    levels
+                    '{{col_name}}'::text AS colname,
+                    array_agg(levels ORDER BY dep_avg) AS levels
                 FROM (
                     SELECT
-                        '{{col}}' AS colname,
-                        array_agg(levels ORDER BY dep_avg) AS levels
-                    FROM (
-                        SELECT
-                            ({{col}})::text AS levels,
-                            {order_fun} AS dep_avg
-                        FROM {training_table_name}
-                        WHERE {filter_null}
-                            AND {{col}} is not NULL
-                        GROUP BY {{col}}
-                    ) s
-                ) s1
-                where array_upper(levels, 1) > 1
-                """.format(training_table_name=training_table_name,
-                           order_fun=order_fun, filter_null=filter_null)
+                        ({{col}})::text AS levels,
+                        {order_fun} AS dep_avg
+                    FROM {training_table_name}
+                    WHERE {filter_null}
+                        AND {{col}} is not NULL
+                    GROUP BY {{col}}
+                ) s
+            ) s1
+            WHERE array_upper(levels, 1) > 1
+            """.format(training_table_name=training_table_name,
+                       order_fun=order_fun, filter_null=filter_null)
 
         # Try to obtain all the levels in one scan of the table.
         # () are needed when casting the categorical variables because
         # they can be expressions.
         sql_all_cats = ' UNION ALL '.join(
             sql_cat_levels.format(col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
-                                  if col in boolean_cats else col) for col in cat_features)
-
+                                  if col in boolean_cats else col,
+                                  col_name=plpy.quote_ident(col)) for col in cat_features)
         all_levels = plpy.execute(sql_all_cats)
 
         if len(all_levels) != len(cat_features):
@@ -1005,7 +1005,7 @@ def _get_bins_grps(
                 from (
                     SELECT
                         grp_key,
-                        '{{col}}' as colname,
+                        '{{col_name}}' as colname,
                         array_agg(levels order by dep_avg) as levels
                     from (
                         SELECT
@@ -1025,7 +1025,8 @@ def _get_bins_grps(
         sql_all_cats = ' UNION ALL '.join(
             sql_cat_levels.format(
                 col=("(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
-                     if col in boolean_cats else col))
+                     if col in boolean_cats else col),
+                col_name=plpy.quote_ident(col))
             for col in cat_features)
 
         all_levels = list(plpy.execute(sql_all_cats))

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90c38975/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index 3f750c9..8f9168c 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -239,25 +239,25 @@ CREATE TABLE dt_golf (
     "OUTLOOK" text,
     temperature double precision,
     humidity double precision,
-    windy text,
+    windy boolean,
     class text
 ) ;
 
 INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
-(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
-(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
-(3, 'overcast', 83, 78, 'false', 'Play'),
-(4, 'rain', 70, 96, 'false', 'Play'),
-(5, 'rain', 68, 80, 'false', 'Play'),
-(6, 'rain', 65, 70, 'true', 'Don''t Play'),
-(7, 'overcast', 64, 65, 'true', 'Play'),
-(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
-(9, 'sunny', 69, 70, 'false', 'Play'),
-(10, 'rain', 75, 80, 'false', 'Play'),
-(11, 'sunny', 75, 70, 'true', 'Play'),
-(12, 'overcast', 72, 90, 'true', 'Play'),
-(13, 'overcast', 81, 75, 'false', 'Play'),
-(14, 'rain', 71, 80, 'true', 'Don''t Play');
+(1, 'sunny', 85, 85, false, 'Don''t Play'),
+(2, 'sunny', 80, 90, true, 'Don''t Play'),
+(3, 'overcast', 83, 78, false, 'Play'),
+(4, 'rain', 70, 96, false, 'Play'),
+(5, 'rain', 68, 80, false, 'Play'),
+(6, 'rain', 65, 70, true, 'Don''t Play'),
+(7, 'overcast', 64, 65, true, 'Play'),
+(8, 'sunny', 72, 95, false, 'Don''t Play'),
+(9, 'sunny', 69, 70, false, 'Play'),
+(10, 'rain', 75, 80, false, 'Play'),
+(11, 'sunny', 75, 70, true, 'Play'),
+(12, 'overcast', 72, 90, true, 'Play'),
+(13, 'overcast', 81, 75, false, 'Play'),
+(14, 'rain', 71, 80, true, 'Don''t Play');
 
 -- no grouping
 DROP TABLE IF EXISTS train_output, train_output_summary;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90c38975/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index 22b73fd..743bdfe 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -4,25 +4,25 @@ CREATE TABLE dt_golf (
     "OUTLOOK" text,
     temperature double precision,
     humidity double precision,
-    windy text,
+    windy boolean,
     class text
 ) ;
 
 INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
-(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
-(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
-(3, 'overcast', 83, 78, 'false', 'Play'),
-(4, 'rain', 70, 96, 'false', 'Play'),
-(5, 'rain', 68, 80, 'false', 'Play'),
-(6, 'rain', 65, 70, 'true', 'Don''t Play'),
-(7, 'overcast', 64, 65, 'true', 'Play'),
-(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
-(9, 'sunny', 69, 70, 'false', 'Play'),
-(10, 'rain', 75, 80, 'false', 'Play'),
-(11, 'sunny', 75, 70, 'true', 'Play'),
-(12, 'overcast', 72, 90, 'true', 'Play'),
-(13, 'overcast', 81, 75, 'false', 'Play'),
-(14, 'rain', 71, 80, 'true', 'Don''t Play');
+(1, 'sunny', 85, 85, false, 'Don''t Play'),
+(2, 'sunny', 80, 90, true, 'Don''t Play'),
+(3, 'overcast', 83, 78, false, 'Play'),
+(4, 'rain', 70, 96, false, 'Play'),
+(5, 'rain', 68, 80, false, 'Play'),
+(6, 'rain', 65, 70, true, 'Don''t Play'),
+(7, 'overcast', 64, 65, true, 'Play'),
+(8, 'sunny', 72, 95, false, 'Don''t Play'),
+(9, 'sunny', 69, 70, false, 'Play'),
+(10, 'rain', 75, 80, false, 'Play'),
+(11, 'sunny', 75, 70, true, 'Play'),
+(12, 'overcast', 72, 90, true, 'Play'),
+(13, 'overcast', 81, 75, false, 'Play'),
+(14, 'rain', 71, 80, true, 'Don''t Play');
 
 -------------------------------------------------------------------------
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
@@ -31,7 +31,7 @@ SELECT forest_train(
                   'train_output'::TEXT,    -- output model table
                   'id'::TEXT,              -- id column
                   'class'::TEXT,           -- response
-                  'humidity, temperature'::TEXT,   -- features
+                  'windy, temperature'::TEXT,   -- features
                   NULL::TEXT,        -- exclude columns
                   NULL::TEXT,        -- no grouping
                   5,                -- num of trees
@@ -85,7 +85,7 @@ SELECT forest_train(
                   'train_output'::TEXT,    -- output model table
                   'id'::TEXT,              -- id column
                   'class::TEXT'::TEXT,           -- response
-                  'class, humidity, temperature'::TEXT,   -- features
+                  'class, windy, temperature'::TEXT,   -- features
                   NULL::TEXT,        -- exclude columns
                   NULL::TEXT,        -- no grouping
                   100,                -- num of trees