You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2018/06/04 05:37:00 UTC

[jira] [Commented] (MADLIB-1237) Mini-batch preprocessor fails for dt_golf dataset

    [ https://issues.apache.org/jira/browse/MADLIB-1237?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16499764#comment-16499764 ] 

Frank McQuillan commented on MADLIB-1237:
-----------------------------------------

Create the golf data set:
{code}
-- Rebuild the dt_golf sample table from scratch so this script can be re-run.
DROP TABLE IF EXISTS dt_golf CASCADE;
CREATE TABLE dt_golf (
    id integer NOT NULL,
    -- "OUTLOOK" and "Temp_Humidity" are quoted identifiers: they are
    -- case-sensitive and must be double-quoted wherever they are referenced.
    "OUTLOOK" text,
    temperature double precision,
    humidity double precision,
    -- Array column holding the same two values as temperature and humidity.
    "Temp_Humidity" double precision[],
    clouds_airquality text[],
    windy boolean,
    -- Label column; the value 'Don''t Play' contains an apostrophe, which is
    -- the value that appears unescaped in the failing query of MADLIB-1237.
    class text,
    observation_weight double precision
);

-- Explicit column list: the rows stay correct even if the table's column
-- order changes, and each value's target column is visible to the reader.
INSERT INTO dt_golf (
    id,
    "OUTLOOK",
    temperature,
    humidity,
    "Temp_Humidity",
    clouds_airquality,
    windy,
    class,
    observation_weight
) VALUES
(1, 'sunny', 85, 85, ARRAY[85, 85], ARRAY['none', 'unhealthy'], FALSE, 'Don''t Play', 5.0),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], TRUE, 'Don''t Play', 5.0),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], FALSE, 'Play', 1.5),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], FALSE, 'Play', 1.0),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], FALSE, 'Play', 1.0),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], TRUE, 'Don''t Play', 1.0),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], TRUE, 'Play', 1.5),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], FALSE, 'Don''t Play', 5.0),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], FALSE, 'Play', 5.0),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], FALSE, 'Play', 1.0),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], TRUE, 'Play', 5.0),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], TRUE, 'Play', 1.5),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], FALSE, 'Play', 1.5),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], TRUE, 'Don''t Play', 1.0);
{code}

Run the mini-batch preprocessor:
{code}
-- Run the mini-batch preprocessor on dt_golf, then inspect the accompanying
-- summary table.
-- NOTE(review): the argument roles below are inferred from the column names
-- of the summary output; confirm them against the MADlib documentation.
SELECT madlib.minibatch_preprocessor(
    'dt_golf',            -- source table
    'dt_golf_packed_2',   -- output table
    'class',              -- dependent variable
    '"Temp_Humidity"',    -- independent variable (double-quoted: case-sensitive column)
    NULL,                 -- presumably grouping columns (none)
    1,                    -- presumably buffer size
    TRUE                  -- NOTE(review): meaning of this flag is not shown here; verify
);

SELECT *
FROM dt_golf_packed_2_summary;
{code}

This produces:
{code}
 -[ RECORD 1 ]------------+--------------------
source_table             | dt_golf
output_table             | dt_golf_packed_2
dependent_varname        | class
independent_varname      | "Temp_Humidity"
dependent_vartype        | text
buffer_size              | 1
class_values             | {"Don't Play",Play}
num_rows_processed       | 14
num_missing_rows_skipped | 0
grouping_cols            | 
{code}

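LGTM: the preprocessor now runs without the syntax error, and the class value containing an apostrophe ("Don't Play") appears correctly in class_values.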

> Mini-batch preprocessor fails for dt_golf dataset 
> --------------------------------------------------
>
>                 Key: MADLIB-1237
>                 URL: https://issues.apache.org/jira/browse/MADLIB-1237
>             Project: Apache MADlib
>          Issue Type: Bug
>          Components: Module: Utilities
>            Reporter: Frank McQuillan
>            Assignee: Jingyi Mei
>            Priority: Major
>             Fix For: v1.15
>
>
> For the dt_golf data set from 
> http://madlib.apache.org/docs/latest/group__grp__decision__tree.html#examples
> minibatch pre-processor fails
> {code}
> SELECT madlib.minibatch_preprocessor('dt_golf',
> 'dt_golf_packed_2', 
> 'class', 
> '"Temp_Humidity"', NULL ,1, True);
> ERROR: spiexceptions.SyntaxError: syntax error at or near "t"
> LINE 8: ...T madlib.array_contains_null(ARRAY[(class) = 'Don't Play', (...
>  ^
> QUERY:
>  SELECT SUM(source_table_row_count_by_group) AS source_table_row_count,
>  SUM(num_rows_processed_by_group) AS total_num_rows_processed,
>  AVG(num_rows_processed_by_group) AS avg_num_rows_processed
>  FROM (
>  SELECT COUNT(*) AS source_table_row_count_by_group,
>  SUM(CASE
>  WHEN NOT madlib.array_contains_null(ARRAY[(class) = 'Don't Play', (class) = 'Play']::INTEGER[]) AND
>  NOT madlib.array_contains_null(("Temp_Humidity")::DOUBLE PRECISION[])
>  THEN 1
>  ELSE 0
>  END) AS num_rows_processed_by_group
>  FROM dt_golf
> ) AS s
> CONTEXT: Traceback (most recent call last):
>  PL/Python function "minibatch_preprocessor", line 24, in <module>
>  minibatch_preprocessor_obj.minibatch_preprocessor()
>  PL/Python function "minibatch_preprocessor", line 45, in wrapper
>  PL/Python function "minibatch_preprocessor", line 104, in minibatch_preprocessor
>  PL/Python function "minibatch_preprocessor", line 236, in _get_skipped_rows_processed_count
> PL/Python function "minibatch_preprocessor"
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)