You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2016/01/09 02:28:39 UTC

[jira] [Created] (MADLIB-949) Handle multi-line list of variables better

Frank McQuillan created MADLIB-949:
--------------------------------------

             Summary: Handle multi-line list of variables better
                 Key: MADLIB-949
                 URL: https://issues.apache.org/jira/browse/MADLIB-949
             Project: Apache MADlib
          Issue Type: Bug
          Components: All Modules
            Reporter: Frank McQuillan


"
Column list input for correlation function cannot handle multi-line list of variables - see the example below where 17 columns were provided by 2 were left out of the correlation analysis due to \n being appended as a prefix to the first variable listed in a new line.

drop view if exists rrnew.rdc_model_inputs_wallcols_gas;
create or replace view rrnew.rdc_model_inputs_wallcols_gas as
    select *,
        ind_array[142] as ind_array_142,
        ind_array[140] as ind_array_140,
        ind_array[90] as ind_array_90,
        ind_array[96] as ind_array_96,
        ind_array[98] as ind_array_98,
        ind_array[134] as ind_array_134,
        ind_array[10] as ind_array_10,
        ind_array[8] as ind_array_8,
        ind_array[7] as ind_array_7,
        ind_array[95] as ind_array_95,
        ind_array[51] as ind_array_51,
        ind_array[139] as ind_array_139
    from rrnew.rdc_model_inputs_wallcols
    where wrldcat_mod = 'Gas Stations';

-- Executing query:
select madlib.correlation(
    'rrnew.rdc_model_inputs_wallcols_gas',
    'rrnew.rdc_model_test_corr_gas_01',
    'dep_var, ind_array_142, ind_array_140, ind_array_90, ind_array_96, ind_array_98,
        ind_array_134, ind_array_10, ind_array_8, ind_array_7, ind_array_95, ind_array_51, ind_array_139,
        ct_sales_txns_overw, ct_returns_txns_overw, sum_sales_amt_overw, sum_returns_amt_overw',
    TRUE);
-- ""(rrnew.rdc_model_test_corr_gas_01,15,119.778332949)""
INFO:  Summary for 'correlation' function
DETAIL: 
 Columns that don't exist in 'rrnew.rdc_model_inputs_wallcols_gas' ignored: ['\nind_array_134', '\nct_sales_txns_overw']
Producing correlation for columns: ['ind_array_7', 'ind_array_10', 'ind_array_140', 'ind_array_51', 'ind_array_142', 'ind_array_139', 'ind_array_8', 'ct_returns_txns_overw', 'ind_array_90', 'ind_array_96', 'ind_array_95', 'dep_var', 'ind_array_98', 'sum_sales_amt_overw', 'sum_returns_amt_overw']
CONTEXT:  PL/Python function ""correlation""
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'tot_cnt, mean' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CONTEXT:  SQL statement ""
        DROP TABLE IF EXISTS __madlib_temp_85896764_1449081244_37948045__;
        CREATE TABLE __madlib_temp_85896764_1449081244_37948045__ AS
        SELECT
            tot_cnt,
            count(*) AS non_null_cnt,
            mean,
            madlib.correlation_agg(x, mean) as cor_mat
        FROM
        (
            SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
            FROM rrnew.rdc_model_inputs_wallcols_gas
        ) src1,
        (
            SELECT
                count(*) AS tot_cnt,
                madlib.avg(x) AS mean
            FROM
            (
                SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
                FROM rrnew.rdc_model_inputs_wallcols_gas
            ) src2
        ) subq
        WHERE NOT madlib.array_contains_null(x)
        GROUP BY tot_cnt, mean
        ""
PL/Python function ""correlation""
NOTICE:  table ""__madlib_temp_85896764_1449081244_37948045__"" does not exist, skipping
CONTEXT:  SQL statement ""
        DROP TABLE IF EXISTS __madlib_temp_85896764_1449081244_37948045__;
        CREATE TABLE __madlib_temp_85896764_1449081244_37948045__ AS
        SELECT
            tot_cnt,
            count(*) AS non_null_cnt,
            mean,
            madlib.correlation_agg(x, mean) as cor_mat
        FROM
        (
            SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
            FROM rrnew.rdc_model_inputs_wallcols_gas
        ) src1,
        (
            SELECT
                count(*) AS tot_cnt,
                madlib.avg(x) AS mean
            FROM
            (
                SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
                FROM rrnew.rdc_model_inputs_wallcols_gas
            ) src2
        ) subq
        WHERE NOT madlib.array_contains_null(x)
        GROUP BY tot_cnt, mean
        ""
PL/Python function ""correlation""
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'method' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CONTEXT:  SQL statement ""
        DROP TABLE IF EXISTS rrnew.rdc_model_test_corr_gas_01_summary;
        CREATE TABLE rrnew.rdc_model_test_corr_gas_01_summary AS
        SELECT
            'correlation'::varchar      AS method,
            'rrnew.rdc_model_inputs_wallcols_gas'::varchar   AS source,
            'rrnew.rdc_model_test_corr_gas_01'::varchar   AS output_table,
            'ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw'::varchar  AS column_names,
            mean                        AS mean_vector,
            non_null_cnt                AS total_rows_processed,
            tot_cnt - non_null_cnt      AS total_rows_skipped
        FROM __madlib_temp_85896764_1449081244_37948045__
        ""
PL/Python function ""correlation""
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'column_position' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CONTEXT:  SQL statement ""
        DROP TABLE IF EXISTS rrnew.rdc_model_test_corr_gas_01;
        CREATE TABLE rrnew.rdc_model_test_corr_gas_01 AS
        SELECT
            *
        FROM
        (
            SELECT
                generate_series(1, 15) AS column_position,
                unnest($1) AS variable
        ) variable_subq
        JOIN
        (
            SELECT
                *
            FROM
                madlib.__deconstruct_lower_triangle(
                    (SELECT cor_mat FROM __madlib_temp_85896764_1449081244_37948045__)
                )
                AS deconstructed(column_position integer
                , ind_array_7 float8, ind_array_10 float8, ind_array_140 float8, ind_array_51 float8, ind_array_142 float8, ind_array_139 float8, ind_array_8 float8, ct_returns_txns_overw float8, ind_array_90 float8, ind_array_96 float8
                , ind_array_95 float8, dep_var float8, ind_array_98 float8, sum_sales_amt_overw float8, sum_returns_amt_overw float8)
        ) matrix_subq
        USING (column_position)
        ""
PL/Python function ""correlation""
Total query runtime: 170199 ms.
1 row retrieved.

 
select * from rrnew.rdc_model_test_corr_gas_01 order by 1;
--Total query runtime: 831 ms.
--15 rows retrieved.
1;""ind_array_7"";1;;;;;;;;;;;;;;
2;""ind_array_10"";-0.0578587333681671;1;;;;;;;;;;;;;
3;""ind_array_140"";-0.141280063490456;-0.0298824594825621;1;;;;;;;;;;;;
4;""ind_array_51"";0.598116457583963;-0.123774582622021;-0.165577429461016;1;;;;;;;;;;;
5;""ind_array_142"";-0.122703307780106;0.305285848252308;-0.0453145748949762;-0.154444832472113;1;;;;;;;;;;
6;""ind_array_139"";0.570721655517796;-0.138205302549432;-0.17841624847673;0.907429918473815;-0.167155495467681;1;;;;;;;;;
7;""ind_array_8"";-0.0984959107166764;0.0508444910170194;0.406225489292519;-0.134253845525128;-0.0272393303189428;-0.148165608861266;1;;;;;;;;
8;""ct_returns_txns_overw"";0.225717110532388;-0.0515613223161471;-0.0626670782890204;0.206218995895436;-0.0583236749061465;0.20466918711459;-0.053965315058231;1;;;;;;;
9;""ind_array_90"";-0.0931577631832484;-0.019237688633164;-0.0468388474306542;-0.116091871910697;-0.0436128383781105;-0.126957508414138;-0.0245612547247926;-0.0393587080444247;1;;;;;;
10;""ind_array_96"";-0.100053104765554;0.0485304192805481;0.407296182153894;-0.135679783095363;-0.0282837081849224;-0.148863287595196;0.925708384948968;-0.054183058576181;-0.022015894670012;1;;;;;
11;""ind_array_95"";0.937696314043715;-0.0722795513203419;-0.148796403017105;0.600446785418866;-0.131698370371973;0.59523202625587;-0.106821466241569;0.24195998270111;-0.0967319753989314;-0.104855952107336;1;;;;
12;""dep_var"";0.587437837717076;-0.175279017797318;-0.232814494915998;0.697484081384999;-0.2155480075257;0.761346299277184;-0.192642044082485;0.218586005606964;-0.16722483537619;-0.194369965415433;0.609343396712704;1;;;
13;""ind_array_98"";-0.0612907301601465;0.923526436187384;-0.0295445046974965;-0.124735781736682;0.302142475697116;-0.138233539678189;0.0506370953977326;-0.0512638644969288;-0.0154606924847994;0.0580709575395514;-0.0677296211289303;-0.176583834731307;1;;
14;""sum_sales_amt_overw"";0.718463532026027;-0.084061149182067;-0.12867635960654;0.684077740786926;-0.116794135493021;0.679447067897773;-0.100884986280351;0.236085719541536;-0.0853048449641809;-0.100468026927479;0.747788764485255;0.520951866439684;-0.0824610959973814;1;
15;""sum_returns_amt_overw"";0.146426924665982;-0.0290252464863019;-0.0360714326537947;0.175403909662651;-0.0333782028379655;0.173672108949183;-0.030740431726783;0.586611434412563;-0.022849342897844;-0.0308901206349653;0.153035222403974;0.128197057078992;-0.028901463717606;0.236068463207792;1

"





--
This message was sent by Atlassian JIRA
(v6.3.4#6332)