You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2018/03/21 23:00:00 UTC

[jira] [Assigned] (MADLIB-1217) DT and RF: null_as_category null handling does not work with grouping

     [ https://issues.apache.org/jira/browse/MADLIB-1217?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Frank McQuillan reassigned MADLIB-1217:
---------------------------------------

    Assignee: Rahul Iyer

> DT and RF:  null_as_category null handling does not work with grouping
> ----------------------------------------------------------------------
>
>                 Key: MADLIB-1217
>                 URL: https://issues.apache.org/jira/browse/MADLIB-1217
>             Project: Apache MADlib
>          Issue Type: Bug
>          Components: Module: Decision Tree, Module: Random Forest
>            Reporter: Frank McQuillan
>            Assignee: Rahul Iyer
>            Priority: Minor
>             Fix For: v1.14
>
>
> Load data: 
> {code}
> DROP TABLE IF EXISTS mt_cars;
> CREATE TABLE mt_cars (
>     id integer NOT NULL,
>     mpg double precision,
>     cyl integer,
>     disp double precision,
>     hp integer,
>     drat double precision,
>     wt double precision,
>     qsec double precision,
>     vs integer,
>     am integer,
>     gear integer,
>     carb integer
> );
> INSERT INTO mt_cars VALUES
> (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
> (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
> (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
> (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
> (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
> (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
> (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
> (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
> (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
> (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
> (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
> (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
> (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
> (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
> (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
> (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
> (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
> (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
> (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
> (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
> (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
> (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
> (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
> (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
> (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
> (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
> (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
> (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
> (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
> (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
> (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
> (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
> {code}
> DT:
> {code}
> DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv;
> SELECT madlib.tree_train('mt_cars',         -- source table
>                          'train_output',    -- output model table
>                          'id',              -- id column
>                          'mpg',             -- dependent variable
>                          '*',               -- features
>                          'id, hp, drat, am, gear, carb',  -- exclude columns
>                          'mse',             -- split criterion
>                          'am',        -- grouping
>                          NULL::text,        -- no weights, all observations treated equally
>                          10,                -- max depth
>                          8,                 -- min split
>                          3,                 -- number of bins per continuous variable
>                          10,                -- number of splits
>                          NULL,              -- pruning parameters
>                          'null_as_category=TRUE'
>                          );
> {code}
> results in error
> {code}
> ERROR:  plpy.SPIError: column "__null__" does not exist
> LINE 14:                             (COALESCE(vs, __NULL__))::text a...
>                                                    ^
> QUERY:  
>                 SELECT
>                     colname::text,
>                     levels::text[],
>                     grp_key::text
>                 from (
>                     SELECT
>                         grp_key,
>                         'vs' as colname,
>                         array_agg(levels order by dep_avg) as levels
>                     from (
>                         SELECT
>                             array_to_string(array[(am)::text]::text[], ',') as grp_key,
>                             (COALESCE(vs, __NULL__))::text as levels,
>                             COALESCE(vs, __NULL__) as dep_avg
>                         FROM mt_cars
>                         WHERE (am) is not NULL and (mpg) is not NULL AND COALESCE(vs, __NULL__) IS NOT NULL
>                         GROUP BY COALESCE(vs, __NULL__), am
>                     ) s
>                     GROUP BY grp_key
>                 ) s1
>                 where array_upper(levels, 1) > 1
>                  UNION ALL 
>                 SELECT
>                     colname::text,
>                     levels::text[],
>                     grp_key::text
>                 from (
>                     SELECT
>                         grp_key,
>                         'cyl' as colname,
>                         array_agg(levels order by dep_avg) as levels
>                     from (
>                         SELECT
>                             array_to_string(array[(am)::text]::text[], ',') as grp_key,
>                             (COALESCE(cyl, __NULL__))::text as levels,
>                             COALESCE(cyl, __NULL__) as dep_avg
>                         FROM mt_cars
>                         WHERE (am) is not NULL and (mpg) is not NULL AND COALESCE(cyl, __NULL__) IS NOT NULL
>                         GROUP BY COALESCE(cyl, __NULL__), am
>                     ) s
>                     GROUP BY grp_key
>                 ) s1
>                 where array_upper(levels, 1) > 1
>                 
> CONTEXT:  Traceback (most recent call last):
>   PL/Python function "tree_train", line 28, in <module>
>     null_handling_params, verbose_mode)
>   PL/Python function "tree_train", line 489, in tree_train
>   PL/Python function "tree_train", line 293, in _get_tree_states
>   PL/Python function "tree_train", line 963, in _get_bins_grps
> PL/Python function "tree_train"
> {code}
> RF:
> {code}
> DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary;
> SELECT madlib.forest_train('mt_cars',
>                            'mt_cars_output',
>                            'id',
>                            'mpg',
>                            '*',
>                            'id, hp, drat, am, gear, carb',  -- exclude columns
>                            'am', -- grouping
>                            10::integer,
>                            2::integer,
>                            TRUE::boolean,
>                            1,
>                            10,
>                            8,
>                            3,
>                            10,
>                            'null_as_category=TRUE'
>                            );
> {code}
> results in error:
> {code}
> ERROR:  plpy.SPIError: column "__null__" does not exist
> LINE 14:                             (COALESCE(vs, __NULL__))::text a...
>                                                    ^
> QUERY:  
>                 SELECT
>                     colname::text,
>                     levels::text[],
>                     grp_key::text
>                 from (
>                     SELECT
>                         grp_key,
>                         'vs' as colname,
>                         array_agg(levels order by dep_avg) as levels
>                     from (
>                         SELECT
>                             array_to_string(array[(am)::text], ',') as grp_key,
>                             (COALESCE(vs, __NULL__))::text as levels,
>                             COALESCE(vs, __NULL__) as dep_avg
>                         FROM mt_cars
>                         WHERE (am) is not NULL and (mpg) is not NULL AND COALESCE(vs, __NULL__) IS NOT NULL
>                         GROUP BY COALESCE(vs, __NULL__), am
>                     ) s
>                     GROUP BY grp_key
>                 ) s1
>                 where array_upper(levels, 1) > 1
>                  UNION ALL 
>                 SELECT
>                     colname::text,
>                     levels::text[],
>                     grp_key::text
>                 from (
>                     SELECT
>                         grp_key,
>                         'cyl' as colname,
>                         array_agg(levels order by dep_avg) as levels
>                     from (
>                         SELECT
>                             array_to_string(array[(am)::text], ',') as grp_key,
>                             (COALESCE(cyl, __NULL__))::text as levels,
>                             COALESCE(cyl, __NULL__) as dep_avg
>                         FROM mt_cars
>                         WHERE (am) is not NULL and (mpg) is not NULL AND COALESCE(cyl, __NULL__) IS NOT NULL
>                         GROUP BY COALESCE(cyl, __NULL__), am
>                     ) s
>                     GROUP BY grp_key
>                 ) s1
>                 where array_upper(levels, 1) > 1
>                 
> CONTEXT:  Traceback (most recent call last):
>   PL/Python function "forest_train", line 42, in <module>
>     sample_ratio
>   PL/Python function "forest_train", line 427, in forest_train
>   PL/Python function "forest_train", line 963, in _get_bins_grps
> PL/Python function "forest_train"
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)