You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Nandish Jayaram (JIRA)" <ji...@apache.org> on 2017/04/26 23:31:04 UTC

[jira] [Created] (MADLIB-1097) Random Forest does not allow NULL values in features

Nandish Jayaram created MADLIB-1097:
---------------------------------------

             Summary: Random Forest does not allow NULL values in features
                 Key: MADLIB-1097
                 URL: https://issues.apache.org/jira/browse/MADLIB-1097
             Project: Apache MADlib
          Issue Type: Bug
          Components: Module: Random Forest
            Reporter: Nandish Jayaram


Running forest_train() with features that have NULL values results in the following error:
{code}
psql:/tmp/madlib.LkFR_5/recursive_partitioning/test/random_forest.sql_in.tmp:79: ERROR:  spiexceptions.InvalidParameterValue: Function "_rf_cat_imp_score(bytea8,integer[],double precision[],integer[],integer,double precision,boolean,double precision[])": Invalid type conversion. Null where not expected.
CONTEXT:  Traceback (most recent call last):
  PL/Python function "forest_train", line 42, in <module>
    sample_ratio
  PL/Python function "forest_train", line 605, in forest_train
  PL/Python function "forest_train", line 1052, in _calculate_oob_prediction
PL/Python function "forest_train"
{code}

The following are the input table and parameters used:
{code:sql}
-- Golf data set used to reproduce the failure.
-- Only id is declared NOT NULL; every other column accepts NULL.
CREATE TABLE dt_golf (
    id          INTEGER NOT NULL,
    "OUTLOOK"   TEXT,              -- quoted identifier: the name stays upper-case
    temperature DOUBLE PRECISION,
    humidity    DOUBLE PRECISION,
    windy       BOOLEAN,
    class       TEXT
);

-- 14 sample rows. Row 4 has a NULL temperature and row 5 has a NULL windy
-- value; both columns are passed as features to forest_train().
INSERT INTO dt_golf
    (id, "OUTLOOK", temperature, humidity, windy, class)
VALUES
    ( 1, 'sunny',    85,   85, FALSE, 'Don''t Play'),
    ( 2, 'sunny',    80,   90, TRUE,  'Don''t Play'),
    ( 3, 'overcast', 83,   78, FALSE, 'Play'),
    ( 4, 'rain',     NULL, 96, FALSE, 'Play'),
    ( 5, 'rain',     68,   80, NULL,  'Play'),
    ( 6, 'rain',     65,   70, TRUE,  'Don''t Play'),
    ( 7, 'overcast', 64,   65, TRUE,  'Play'),
    ( 8, 'sunny',    72,   95, FALSE, 'Don''t Play'),
    ( 9, 'sunny',    69,   70, FALSE, 'Play'),
    (10, 'rain',     75,   80, FALSE, 'Play'),
    (11, 'sunny',    75,   70, TRUE,  'Play'),
    (12, 'overcast', 72,   90, TRUE,  'Play'),
    (13, 'overcast', 81,   75, FALSE, 'Play'),
    (14, 'rain',     71,   80, TRUE,  'Don''t Play');

-- Failing call from this report: trains a 5-tree forest on dt_golf using the
-- features windy and temperature, with variable importance enabled.
-- Arguments are positional, so their order must be kept as is.
SELECT forest_train(
                  'dt_golf'::TEXT,         -- source table
                  'train_output'::TEXT,    -- output model table
                  'id'::TEXT,              -- id column
                  'class'::TEXT,           -- response
                  'windy, temperature'::TEXT,   -- features
                  NULL::TEXT,        -- exclude columns
                  NULL::TEXT,        -- no grouping
                  5,                -- num of trees
                  1,                 -- num of random features
                  TRUE::BOOLEAN,    -- importance
                  1::INTEGER,       -- num_permutations
                  10::INTEGER,       -- max depth
                  1::INTEGER,        -- min split
                  1::INTEGER,        -- min bucket
                  8::INTEGER,        -- number of bins per continuous variable
                  'max_surrogates=0',  -- surrogate settings: max_surrogates = 0
                  FALSE              -- presumably the verbose flag; TODO confirm against forest_train signature
                  );
{code}



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)