You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2019/02/12 20:44:00 UTC

[jira] [Resolved] (MADLIB-1295) Encoding module is not handling bigint properly

     [ https://issues.apache.org/jira/browse/MADLIB-1295?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Frank McQuillan resolved MADLIB-1295.
-------------------------------------
    Resolution: Fixed

> Encoding module is not handling bigint properly
> -----------------------------------------------
>
>                 Key: MADLIB-1295
>                 URL: https://issues.apache.org/jira/browse/MADLIB-1295
>             Project: Apache MADlib
>          Issue Type: Bug
>          Components: Module: Utilities
>            Reporter: Frank McQuillan
>            Assignee: Rahul Iyer
>            Priority: Minor
>             Fix For: v1.16
>
>
> From 
> http://madlib.apache.org/docs/latest/group__grp__encode__categorical.html
> "all Boolean, integer and text columns are considered categorical columns and will be encoded when ‘*’ is specified for this argument."
> It works for int and smallint, however does not work for bigint:
> 1) int - OK
> {code}
> DROP TABLE IF EXISTS abalone;
> CREATE TABLE abalone (
>     id serial,
>     sex character varying,
>     length double precision,
>     diameter double precision,
>     height double precision,
>     rings int
> );
> INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
> ('M',    0.455,  0.365,  0.095,  15),
> ('M',    0.35,   0.265,  0.09,   7),
> ('F',    0.53,   0.42,   0.135,  9),
> ('M',    0.44,   0.365,  0.125,  10),
> ('I',    0.33,   0.255,  0.08,   7),
> ('I',    0.425,  0.3,    0.095,  8),
> ('F',    0.53,   0.415,  0.15,   20),
> ('F',    0.545,  0.425,  0.125,  16),
> ('M',    0.475,  0.37,   0.125,  9),
> (null,   0.55,   0.44,   0.15,   19),
> ('F',    0.525,  0.38,   0.14,   14),
> ('M',    0.43,   0.35,   0.11,   10),
> ('M',    0.49,   0.38,   0.135,  11),
> ('F',    0.535,  0.405,  0.145,  10),
> ('F',    0.47,   0.355,  0.1,    10),
> ('M',    0.5,    0.4,    0.13,   12),
> ('I',    0.355,  0.28,   0.085,  7),
> ('F',    0.44,   0.34,   0.1,    10),
> ('M',    0.365,  0.295,  0.08,   7),
> (null,   0.45,   0.32,   0.1,    9);
> {code}
> {code}
> DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
> SELECT madlib.encode_categorical_variables (
>         'abalone',                   -- Source table
>         'abalone_out',               -- Output table
>         '*',                         -- Categorical columns
>         NULL,                        -- Categorical columns to exclude
>         'id'                         -- Index columns
>         );
> SELECT * FROM abalone_out ORDER BY id;
> {code}
> {code}
>  id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9 
> ----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
>   1 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |       0 |       0 |       0
>   2 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
>   3 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
>   4 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
>   5 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
>   6 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       1 |       0
>   7 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |       0 |       0 |       0
>   8 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |       0 |       0 |       0
>   9 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
>  10 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |       0 |       0 |       0
>  11 |     1 |     0 |     0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |       0 |       0 |       0
>  12 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
>  13 |     0 |     0 |     1 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
>  14 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
>  15 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
>  16 |     0 |     0 |     1 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
>  17 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
>  18 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
>  19 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
>  20 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
> (20 rows)
> {code}
> 2) bigint - not OK
> {code}
> DROP TABLE IF EXISTS abalone;
> CREATE TABLE abalone (
>     id serial,
>     sex character varying,
>     length double precision,
>     diameter double precision,
>     height double precision,
>     rings bigint
> );
> INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
> ('M',    0.455,  0.365,  0.095,  15),
> ('M',    0.35,   0.265,  0.09,   7),
> ('F',    0.53,   0.42,   0.135,  9),
> ('M',    0.44,   0.365,  0.125,  10),
> ('I',    0.33,   0.255,  0.08,   7),
> ('I',    0.425,  0.3,    0.095,  8),
> ('F',    0.53,   0.415,  0.15,   20),
> ('F',    0.545,  0.425,  0.125,  16),
> ('M',    0.475,  0.37,   0.125,  9),
> (null,   0.55,   0.44,   0.15,   19),
> ('F',    0.525,  0.38,   0.14,   14),
> ('M',    0.43,   0.35,   0.11,   10),
> ('M',    0.49,   0.38,   0.135,  11),
> ('F',    0.535,  0.405,  0.145,  10),
> ('F',    0.47,   0.355,  0.1,    10),
> ('M',    0.5,    0.4,    0.13,   12),
> ('I',    0.355,  0.28,   0.085,  7),
> ('F',    0.44,   0.34,   0.1,    10),
> ('M',    0.365,  0.295,  0.08,   7),
> (null,   0.45,   0.32,   0.1,    9);
> {code}
> {code}
> DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
> SELECT madlib.encode_categorical_variables (
>         'abalone',                   -- Source table
>         'abalone_out',               -- Output table
>         '*',                         -- Categorical columns
>         NULL,                        -- Categorical columns to exclude
>         'id'                         -- Index columns
>         );
> SELECT * FROM abalone_out ORDER BY id;
> {code}
> {code}
>  id | sex_F | sex_I | sex_M 
> ----+-------+-------+-------
>   1 |     0 |     0 |     1
>   2 |     0 |     0 |     1
>   3 |     1 |     0 |     0
>   4 |     0 |     0 |     1
>   5 |     0 |     1 |     0
>   6 |     0 |     1 |     0
>   7 |     1 |     0 |     0
>   8 |     1 |     0 |     0
>   9 |     0 |     0 |     1
>  10 |     0 |     0 |     0
>  11 |     1 |     0 |     0
>  12 |     0 |     0 |     1
>  13 |     0 |     0 |     1
>  14 |     1 |     0 |     0
>  15 |     1 |     0 |     0
>  16 |     0 |     0 |     1
>  17 |     0 |     1 |     0
>  18 |     1 |     0 |     0
>  19 |     0 |     0 |     1
>  20 |     0 |     0 |     0
> (20 rows)
> {code}
> so it is not encoding rings.
> Also check other places in this module for similar issues with `bigint` .



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)