You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2019/05/07 18:42:00 UTC
[jira] [Updated] (MADLIB-1322) MLP with minibatch fails for integer
dependent variable
[ https://issues.apache.org/jira/browse/MADLIB-1322?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Frank McQuillan updated MADLIB-1322:
------------------------------------
Description:
\
{code}
DROP TABLE IF EXISTS iris_data;
CREATE TABLE iris_data(
id serial,
attributes numeric[],
class_text varchar,
class integer,
state varchar
);
INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
{code}
Works OK if dependent variable is TEXT:
{code}
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class_text', -- Dependent variable TEXT
'attributes' -- Independent variables
);
\d+ iris_data_packed
Table "public.iris_data_packed"
Column | Type | Modifiers | Storage | Stats target | Description
---------------------+--------------------+-----------+----------+--------------+-------------
__id__ | bigint | | plain | |
dependent_varname | double precision[] | | extended | |
independent_varname | double precision[] | | extended | |
DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
-- Set seed so results are reproducible
SELECT setseed(0);
SELECT madlib.mlp_classification(
'iris_data_packed', -- Output table from mini-batch preprocessor
'mlp_model', -- Destination table
'independent_varname', -- Hardcode to this, from table iris_data_packed
'dependent_varname', -- Hardcode to this, from table iris_data_packed
ARRAY[5], -- Number of units per layer
'learning_rate_init=0.1,
n_iterations=5,
tolerance=0', -- Optimizer params
'tanh', -- Activation function
NULL, -- Default weight (1)
FALSE, -- No warm start
TRUE -- Verbose
);
INFO: Iteration: 1, Loss: <0.990848103579>
INFO: Iteration: 2, Loss: <0.852423978558>
INFO: Iteration: 3, Loss: <0.689764103374>
INFO: Iteration: 4, Loss: <0.530458765792>
mlp_classification
--------------------
(1 row)
{code}
Does not work if dependent variable is INTEGER:
{code}
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class', -- Dependent variable INTEGER
'attributes', -- Independent variables
NULL, -- grouping
NULL, -- buffer size (or size of the mini-batch)
TRUE -- Encode scalar int dependent variable
);
\d+ iris_data_packed
Table "public.iris_data_packed"
Column | Type | Modifiers | Storage | Stats target | Description
---------------------+--------------------+-----------+----------+--------------+-------------
__id__ | bigint | | plain | |
dependent_varname | double precision[] | | extended | |
independent_varname | double precision[] | | extended | |
DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
-- Set seed so results are reproducible
SELECT setseed(0);
SELECT madlib.mlp_classification(
'iris_data_packed', -- Output table from mini-batch preprocessor
'mlp_model', -- Destination table
'independent_varname', -- Hardcode to this, from table iris_data_packed
'dependent_varname', -- Hardcode to this, from table iris_data_packed
ARRAY[5], -- Number of units per layer
'learning_rate_init=0.1,
n_iterations=10,
tolerance=0', -- Optimizer params
'tanh', -- Activation function
NULL, -- Default weight (1)
FALSE, -- No warm start
TRUE -- Verbose
);
ERROR: TypeError: must be string, not int
CONTEXT: Traceback (most recent call last):
PL/Python function "mlp_classification", line 33, in <module>
grouping_col)
PL/Python function "mlp_classification", line 42, in wrapper
PL/Python function "mlp_classification", line 147, in mlp
PL/Python function "mlp_classification", line 74, in quote_literal
PL/Python function "mlp_classification"
{code}
was:
(1)
If I have an integer dependent variable and I mini-batch:
{code}
select madlib.minibatch_preprocessor(
'classification_train', -- input table
'mini_batch_packed_train', -- output table
'response', -- response INTEGER
'feature_vector', -- indep vars
NULL, -- grouping
NULL, -- buffer size (or size of the mini-batch)
TRUE -- Encode scalar int dependent variable (if response is integer instead of boolean or char)
);
{code}
Then the table looks like:
{code}
madlib=# \d+ mini_batch_packed_train_summary
Table "public.mini_batch_packed_train_summary"
Column | Type | Modifiers | Storage | Stats target | Description
--------------------------+-----------+-----------+----------+--------------+-------------
source_table | text | | extended | |
output_table | text | | extended | |
dependent_varname | text | | extended | |
independent_varname | text | | extended | |
dependent_vartype | text | | extended | |
buffer_size | integer | | plain | |
class_values | integer[] | | extended | |
num_rows_processed | integer | | plain | |
num_missing_rows_skipped | integer | | plain | |
grouping_cols | text | | extended | |
{code}
Then MLP classification fails with:
{code}
InternalError: (psycopg2.InternalError) TypeError: must be string, not int
CONTEXT: Traceback (most recent call last):
PL/Python function "mlp_classification", line 33, in <module>
grouping_col)
PL/Python function "mlp_classification", line 42, in wrapper
PL/Python function "mlp_classification", line 147, in mlp
PL/Python function "mlp_classification", line 74, in quote_literal
{code}
(2)
If I cast to text explicitly:
{code}
select madlib.minibatch_preprocessor(
'classification_train', -- input table
'mini_batch_packed_train', -- output table
'response::TEXT', -- response
'feature_vector', -- indep vars
NULL, -- grouping
NULL, -- buffer size (or size of the mini-batch)
TRUE -- Encode scalar int dependent variable (if response is integer instead of boolean or char)
);
{code}
The table looks like:
{code}
madlib=# \d+ mini_batch_packed_train_summary
Table "public.mini_batch_packed_train_summary"
Column | Type | Modifiers | Storage | Stats target | Description
--------------------------+---------+-----------+----------+--------------+-------------
source_table | text | | extended | |
output_table | text | | extended | |
dependent_varname | text | | extended | |
independent_varname | text | | extended | |
dependent_vartype | text | | extended | |
buffer_size | integer | | plain | |
class_values | text[] | | extended | |
num_rows_processed | integer | | plain | |
num_missing_rows_skipped | integer | | plain | |
grouping_cols | text | | extended | |
{code}
And MLP training works OK.
> MLP with minibatch fails for integer dependent variable
> -------------------------------------------------------
>
> Key: MADLIB-1322
> URL: https://issues.apache.org/jira/browse/MADLIB-1322
> Project: Apache MADlib
> Issue Type: Bug
> Components: Module: Neural Networks
> Reporter: Frank McQuillan
> Priority: Minor
> Fix For: v1.16
>
>
> \
> {code}
> DROP TABLE IF EXISTS iris_data;
> CREATE TABLE iris_data(
> id serial,
> attributes numeric[],
> class_text varchar,
> class integer,
> state varchar
> );
> INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
> (1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
> (2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
> (3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
> (4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
> (5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
> (6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
> (7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
> (8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
> (9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
> (10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
> (11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
> (12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
> (13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
> (14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
> (15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
> (16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
> (17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
> (18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
> (19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
> (20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
> (21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
> (22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
> (23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
> (24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
> (25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
> (26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
> (27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
> (28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
> (29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
> (30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
> (31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
> (32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
> (33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
> (34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
> (35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
> (36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
> (37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
> (38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
> (39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
> (40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
> (41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
> (42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
> (43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
> (44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
> (45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
> (46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
> (47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
> (48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
> (49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
> (50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
> (51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
> (52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
> {code}
> Works OK if dependent variable is TEXT:
> {code}
> DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
> SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
> 'iris_data_packed', -- Output table
> 'class_text', -- Dependent variable TEXT
> 'attributes' -- Independent variables
> );
> \d+ iris_data_packed
> Table "public.iris_data_packed"
> Column | Type | Modifiers | Storage | Stats target | Description
> ---------------------+--------------------+-----------+----------+--------------+-------------
> __id__ | bigint | | plain | |
> dependent_varname | double precision[] | | extended | |
> independent_varname | double precision[] | | extended | |
> DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
> -- Set seed so results are reproducible
> SELECT setseed(0);
> SELECT madlib.mlp_classification(
> 'iris_data_packed', -- Output table from mini-batch preprocessor
> 'mlp_model', -- Destination table
> 'independent_varname', -- Hardcode to this, from table iris_data_packed
> 'dependent_varname', -- Hardcode to this, from table iris_data_packed
> ARRAY[5], -- Number of units per layer
> 'learning_rate_init=0.1,
> n_iterations=5,
> tolerance=0', -- Optimizer params
> 'tanh', -- Activation function
> NULL, -- Default weight (1)
> FALSE, -- No warm start
> TRUE -- Verbose
> );
> INFO: Iteration: 1, Loss: <0.990848103579>
> INFO: Iteration: 2, Loss: <0.852423978558>
> INFO: Iteration: 3, Loss: <0.689764103374>
> INFO: Iteration: 4, Loss: <0.530458765792>
> mlp_classification
> --------------------
>
> (1 row)
> {code}
> Does not work if dependent variable is INTEGER:
> {code}
> DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
> SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
> 'iris_data_packed', -- Output table
> 'class', -- Dependent variable INTEGER
> 'attributes', -- Independent variables
> NULL, -- grouping
> NULL, -- buffer size (or size of the mini-batch)
> TRUE -- Encode scalar int dependent variable
> );
> \d+ iris_data_packed
> Table "public.iris_data_packed"
> Column | Type | Modifiers | Storage | Stats target | Description
> ---------------------+--------------------+-----------+----------+--------------+-------------
> __id__ | bigint | | plain | |
> dependent_varname | double precision[] | | extended | |
> independent_varname | double precision[] | | extended | |
> DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
> -- Set seed so results are reproducible
> SELECT setseed(0);
> SELECT madlib.mlp_classification(
> 'iris_data_packed', -- Output table from mini-batch preprocessor
> 'mlp_model', -- Destination table
> 'independent_varname', -- Hardcode to this, from table iris_data_packed
> 'dependent_varname', -- Hardcode to this, from table iris_data_packed
> ARRAY[5], -- Number of units per layer
> 'learning_rate_init=0.1,
> n_iterations=10,
> tolerance=0', -- Optimizer params
> 'tanh', -- Activation function
> NULL, -- Default weight (1)
> FALSE, -- No warm start
> TRUE -- Verbose
> );
> ERROR: TypeError: must be string, not int
> CONTEXT: Traceback (most recent call last):
> PL/Python function "mlp_classification", line 33, in <module>
> grouping_col)
> PL/Python function "mlp_classification", line 42, in wrapper
> PL/Python function "mlp_classification", line 147, in mlp
> PL/Python function "mlp_classification", line 74, in quote_literal
> PL/Python function "mlp_classification"
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)