You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2019/05/07 18:42:00 UTC

[jira] [Updated] (MADLIB-1322) MLP with minibatch fails for integer dependent variable

     [ https://issues.apache.org/jira/browse/MADLIB-1322?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Frank McQuillan updated MADLIB-1322:
------------------------------------
    Description: 
\
{code}
DROP TABLE IF EXISTS iris_data;
CREATE TABLE iris_data(
    id serial,
    attributes numeric[],
    class_text varchar,
    class integer,
    state varchar
);
INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
{code}

Works OK if dependent variable is TEXT:

{code}
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
                                     'iris_data_packed',  -- Output table
                                     'class_text',        -- Dependent variable TEXT
                                     'attributes'        -- Independent variables
                                    );

\d+ iris_data_packed
                               Table "public.iris_data_packed"
       Column        |        Type        | Modifiers | Storage  | Stats target | Description 
---------------------+--------------------+-----------+----------+--------------+-------------
 __id__              | bigint             |           | plain    |              | 
 dependent_varname   | double precision[] |           | extended |              | 
 independent_varname | double precision[] |           | extended |              | 


DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
-- Set seed so results are reproducible
SELECT setseed(0);
SELECT madlib.mlp_classification(
    'iris_data_packed',      -- Output table from mini-batch preprocessor
    'mlp_model',             -- Destination table
    'independent_varname',   -- Hardcode to this, from table iris_data_packed
    'dependent_varname',     -- Hardcode to this, from table iris_data_packed
    ARRAY[5],                -- Number of units per layer
    'learning_rate_init=0.1,
    n_iterations=5,
    tolerance=0',            -- Optimizer params
    'tanh',                  -- Activation function
    NULL,                    -- Default weight (1)
    FALSE,                   -- No warm start
    TRUE                    -- Verbose
);

INFO:  Iteration: 1, Loss: <0.990848103579>
INFO:  Iteration: 2, Loss: <0.852423978558>
INFO:  Iteration: 3, Loss: <0.689764103374>
INFO:  Iteration: 4, Loss: <0.530458765792>
 mlp_classification 
--------------------
 
(1 row)
{code}

Does not work if dependent variable is INTEGER:

{code}
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
                                     'iris_data_packed',  -- Output table
                                     'class',        -- Dependent variable INTEGER
                                     'attributes',        -- Independent variables
                                     NULL, -- grouping
                                     NULL, -- buffer size (or size of the mini-batch)
                                     TRUE -- Encode scalar int dependent variable
                                     );

\d+ iris_data_packed
                               Table "public.iris_data_packed"
       Column        |        Type        | Modifiers | Storage  | Stats target | Description 
---------------------+--------------------+-----------+----------+--------------+-------------
 __id__              | bigint             |           | plain    |              | 
 dependent_varname   | double precision[] |           | extended |              | 
 independent_varname | double precision[] |           | extended |              | 


DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
-- Set seed so results are reproducible
SELECT setseed(0);
SELECT madlib.mlp_classification(
    'iris_data_packed',      -- Output table from mini-batch preprocessor
    'mlp_model',             -- Destination table
    'independent_varname',   -- Hardcode to this, from table iris_data_packed
    'dependent_varname',     -- Hardcode to this, from table iris_data_packed
    ARRAY[5],                -- Number of units per layer
    'learning_rate_init=0.1,
    n_iterations=10,
    tolerance=0',            -- Optimizer params
    'tanh',                  -- Activation function
    NULL,                    -- Default weight (1)
    FALSE,                   -- No warm start
    TRUE                    -- Verbose
);

ERROR:  TypeError: must be string, not int
CONTEXT:  Traceback (most recent call last):
  PL/Python function "mlp_classification", line 33, in <module>
    grouping_col)
  PL/Python function "mlp_classification", line 42, in wrapper
  PL/Python function "mlp_classification", line 147, in mlp
  PL/Python function "mlp_classification", line 74, in quote_literal
PL/Python function "mlp_classification"
{code}



  was:

(1)
If I have an integer dependent variable and I mini-batch:

{code}
select madlib.minibatch_preprocessor(
'classification_train', -- input table
'mini_batch_packed_train', -- output table
'response', -- response INTEGER
'feature_vector',  -- indep vars
NULL, -- grouping
NULL, -- buffer size (or size of the mini-batch)
TRUE -- Encode scalar int dependent variable (if response is integer instead of boolean or char)
);
{code}

Then the table looks like:

{code}
madlib=# \d+ mini_batch_packed_train_summary
             Table "public.mini_batch_packed_train_summary"
          Column          |   Type    | Modifiers | Storage  | Stats target | Description 
--------------------------+-----------+-----------+----------+--------------+-------------
 source_table             | text      |           | extended |              | 
 output_table             | text      |           | extended |              | 
 dependent_varname        | text      |           | extended |              | 
 independent_varname      | text      |           | extended |              | 
 dependent_vartype        | text      |           | extended |              | 
 buffer_size              | integer   |           | plain    |              | 
 class_values             | integer[] |           | extended |              | 
 num_rows_processed       | integer   |           | plain    |              | 
 num_missing_rows_skipped | integer   |           | plain    |              | 
 grouping_cols            | text      |           | extended |              | 
{code}

Then MLP classification fails with:

{code}
InternalError: (psycopg2.InternalError) TypeError: must be string, not int
CONTEXT:  Traceback (most recent call last):
  PL/Python function "mlp_classification", line 33, in <module>
    grouping_col)
  PL/Python function "mlp_classification", line 42, in wrapper
  PL/Python function "mlp_classification", line 147, in mlp
  PL/Python function "mlp_classification", line 74, in quote_literal
{code}


(2)
If I cast to text explicitly:

{code}
select madlib.minibatch_preprocessor(
'classification_train', -- input table
'mini_batch_packed_train', -- output table
'response::TEXT', -- response
'feature_vector',  -- indep vars
NULL, -- grouping
NULL, -- buffer size (or size of the mini-batch)
TRUE -- Encode scalar int dependent variable (if response is integer instead of boolean or char)
);
{code}

The table looks like:

{code}
madlib=# \d+ mini_batch_packed_train_summary
            Table "public.mini_batch_packed_train_summary"
          Column          |  Type   | Modifiers | Storage  | Stats target | Description 
--------------------------+---------+-----------+----------+--------------+-------------
 source_table             | text    |           | extended |              | 
 output_table             | text    |           | extended |              | 
 dependent_varname        | text    |           | extended |              | 
 independent_varname      | text    |           | extended |              | 
 dependent_vartype        | text    |           | extended |              | 
 buffer_size              | integer |           | plain    |              | 
 class_values             | text[]  |           | extended |              | 
 num_rows_processed       | integer |           | plain    |              | 
 num_missing_rows_skipped | integer |           | plain    |              | 
 grouping_cols            | text    |           | extended |              | 
{code}

And MLP training works OK.


> MLP with minibatch fails for integer dependent variable
> -------------------------------------------------------
>
>                 Key: MADLIB-1322
>                 URL: https://issues.apache.org/jira/browse/MADLIB-1322
>             Project: Apache MADlib
>          Issue Type: Bug
>          Components: Module: Neural Networks
>            Reporter: Frank McQuillan
>            Priority: Minor
>             Fix For: v1.16
>
>
> \
> {code}
> DROP TABLE IF EXISTS iris_data;
> CREATE TABLE iris_data(
>     id serial,
>     attributes numeric[],
>     class_text varchar,
>     class integer,
>     state varchar
> );
> INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
> (1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
> (2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
> (3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
> (4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
> (5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
> (6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
> (7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
> (8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
> (9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
> (10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
> (11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
> (12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
> (13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
> (14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
> (15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
> (16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
> (17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
> (18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
> (19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
> (20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
> (21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
> (22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
> (23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
> (24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
> (25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
> (26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
> (27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
> (28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
> (29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
> (30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
> (31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
> (32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
> (33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
> (34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
> (35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
> (36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
> (37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
> (38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
> (39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
> (40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
> (41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
> (42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
> (43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
> (44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
> (45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
> (46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
> (47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
> (48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
> (49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
> (50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
> (51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
> (52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
> {code}
> Works OK if dependent variable is TEXT:
> {code}
> DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
> SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
>                                      'iris_data_packed',  -- Output table
>                                      'class_text',        -- Dependent variable TEXT
>                                      'attributes'        -- Independent variables
>                                     );
> \d+ iris_data_packed
>                                Table "public.iris_data_packed"
>        Column        |        Type        | Modifiers | Storage  | Stats target | Description 
> ---------------------+--------------------+-----------+----------+--------------+-------------
>  __id__              | bigint             |           | plain    |              | 
>  dependent_varname   | double precision[] |           | extended |              | 
>  independent_varname | double precision[] |           | extended |              | 
> DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
> -- Set seed so results are reproducible
> SELECT setseed(0);
> SELECT madlib.mlp_classification(
>     'iris_data_packed',      -- Output table from mini-batch preprocessor
>     'mlp_model',             -- Destination table
>     'independent_varname',   -- Hardcode to this, from table iris_data_packed
>     'dependent_varname',     -- Hardcode to this, from table iris_data_packed
>     ARRAY[5],                -- Number of units per layer
>     'learning_rate_init=0.1,
>     n_iterations=5,
>     tolerance=0',            -- Optimizer params
>     'tanh',                  -- Activation function
>     NULL,                    -- Default weight (1)
>     FALSE,                   -- No warm start
>     TRUE                    -- Verbose
> );
> INFO:  Iteration: 1, Loss: <0.990848103579>
> INFO:  Iteration: 2, Loss: <0.852423978558>
> INFO:  Iteration: 3, Loss: <0.689764103374>
> INFO:  Iteration: 4, Loss: <0.530458765792>
>  mlp_classification 
> --------------------
>  
> (1 row)
> {code}
> Does not work if dependent variable is INTEGER:
> {code}
> DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
> SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
>                                      'iris_data_packed',  -- Output table
>                                      'class',        -- Dependent variable INTEGER
>                                      'attributes',        -- Independent variables
>                                      NULL, -- grouping
>                                      NULL, -- buffer size (or size of the mini-batch)
>                                      TRUE -- Encode scalar int dependent variable
>                                      );
> \d+ iris_data_packed
>                                Table "public.iris_data_packed"
>        Column        |        Type        | Modifiers | Storage  | Stats target | Description 
> ---------------------+--------------------+-----------+----------+--------------+-------------
>  __id__              | bigint             |           | plain    |              | 
>  dependent_varname   | double precision[] |           | extended |              | 
>  independent_varname | double precision[] |           | extended |              | 
> DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
> -- Set seed so results are reproducible
> SELECT setseed(0);
> SELECT madlib.mlp_classification(
>     'iris_data_packed',      -- Output table from mini-batch preprocessor
>     'mlp_model',             -- Destination table
>     'independent_varname',   -- Hardcode to this, from table iris_data_packed
>     'dependent_varname',     -- Hardcode to this, from table iris_data_packed
>     ARRAY[5],                -- Number of units per layer
>     'learning_rate_init=0.1,
>     n_iterations=10,
>     tolerance=0',            -- Optimizer params
>     'tanh',                  -- Activation function
>     NULL,                    -- Default weight (1)
>     FALSE,                   -- No warm start
>     TRUE                    -- Verbose
> );
> ERROR:  TypeError: must be string, not int
> CONTEXT:  Traceback (most recent call last):
>   PL/Python function "mlp_classification", line 33, in <module>
>     grouping_col)
>   PL/Python function "mlp_classification", line 42, in wrapper
>   PL/Python function "mlp_classification", line 147, in mlp
>   PL/Python function "mlp_classification", line 74, in quote_literal
> PL/Python function "mlp_classification"
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)