You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2018/06/04 05:37:00 UTC
[jira] [Commented] (MADLIB-1237) Mini-batch preprocessor fails for
dt_golf dataset
[ https://issues.apache.org/jira/browse/MADLIB-1237?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16499764#comment-16499764 ]
Frank McQuillan commented on MADLIB-1237:
-----------------------------------------
Create the golf data set:
{code}
-- Rebuild the dt_golf example table from scratch so this script can be re-run.
-- CASCADE also removes any objects that depend on a previous copy of the table.
DROP TABLE IF EXISTS dt_golf CASCADE;

-- "OUTLOOK" and "Temp_Humidity" are double-quoted so that they keep their
-- upper/mixed case; unquoted identifiers would be folded to lower case.
CREATE TABLE dt_golf (
    id integer NOT NULL,
    "OUTLOOK" text,
    temperature double precision,
    humidity double precision,
    "Temp_Humidity" double precision[],
    clouds_airquality text[],
    windy boolean,
    class text,
    observation_weight double precision
);

-- Explicit column list: the load no longer depends on the physical column
-- order of the table. windy uses boolean literals rather than quoted strings.
-- The class value 'Don''t Play' contains an embedded single quote, which is
-- the value that appears unescaped in the failing query of MADLIB-1237.
INSERT INTO dt_golf (
    id,
    "OUTLOOK",
    temperature,
    humidity,
    "Temp_Humidity",
    clouds_airquality,
    windy,
    class,
    observation_weight
) VALUES
    (1, 'sunny', 85, 85, ARRAY[85, 85], ARRAY['none', 'unhealthy'], FALSE, 'Don''t Play', 5.0),
    (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], TRUE, 'Don''t Play', 5.0),
    (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], FALSE, 'Play', 1.5),
    (4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], FALSE, 'Play', 1.0),
    (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], FALSE, 'Play', 1.0),
    (6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], TRUE, 'Don''t Play', 1.0),
    (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], TRUE, 'Play', 1.5),
    (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], FALSE, 'Don''t Play', 5.0),
    (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], FALSE, 'Play', 5.0),
    (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], FALSE, 'Play', 1.0),
    (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], TRUE, 'Play', 5.0),
    (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], TRUE, 'Play', 1.5),
    (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], FALSE, 'Play', 1.5),
    (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], TRUE, 'Don''t Play', 1.0);
{code}
Run the mini-batch preprocessor:
{code}
-- Pack dt_golf into mini-batches. This is the same call that raised the
-- syntax error reported in MADLIB-1237.
SELECT madlib.minibatch_preprocessor(
    'dt_golf',            -- source table
    'dt_golf_packed_2',   -- output table
    'class',              -- dependent variable (text column)
    '"Temp_Humidity"',    -- independent variable; inner double quotes keep the mixed-case name
    NULL,                 -- grouping columns: none
    1,                    -- buffer size
    TRUE                  -- NOTE(review): presumably the one-hot-encoding flag; confirm against the MADlib docs
);

-- Show the summary table written next to the packed output.
SELECT * FROM dt_golf_packed_2_summary;
{code}
This produces:
{code}
-[ RECORD 1 ]------------+--------------------
source_table | dt_golf
output_table | dt_golf_packed_2
dependent_varname | class
independent_varname | "Temp_Humidity"
dependent_vartype | text
buffer_size | 1
class_values | {"Don't Play",Play}
num_rows_processed | 14
num_missing_rows_skipped | 0
grouping_cols |
{code}
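LGTM: the preprocessor now completes, and class_values correctly includes the value with the embedded single quote ("Don't Play") that previously caused the syntax error.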
> Mini-batch preprocessor fails for dt_golf dataset
> --------------------------------------------------
>
> Key: MADLIB-1237
> URL: https://issues.apache.org/jira/browse/MADLIB-1237
> Project: Apache MADlib
> Issue Type: Bug
> Components: Module: Utilities
> Reporter: Frank McQuillan
> Assignee: Jingyi Mei
> Priority: Major
> Fix For: v1.15
>
>
> For the dt_golf data set from
> http://madlib.apache.org/docs/latest/group__grp__decision__tree.html#examples
> the mini-batch preprocessor fails:
> {code}
> SELECT madlib.minibatch_preprocessor('dt_golf',
> 'dt_golf_packed_2',
> 'class',
> '"Temp_Humidity"', NULL ,1, True);
> ERROR: spiexceptions.SyntaxError: syntax error at or near "t"
> LINE 8: ...T madlib.array_contains_null(ARRAY[(class) = 'Don't Play', (...
> ^
> QUERY:
> SELECT SUM(source_table_row_count_by_group) AS source_table_row_count,
> SUM(num_rows_processed_by_group) AS total_num_rows_processed,
> AVG(num_rows_processed_by_group) AS avg_num_rows_processed
> FROM (
> SELECT COUNT(*) AS source_table_row_count_by_group,
> SUM(CASE
> WHEN NOT madlib.array_contains_null(ARRAY[(class) = 'Don't Play', (class) = 'Play']::INTEGER[]) AND
> NOT madlib.array_contains_null(("Temp_Humidity")::DOUBLE PRECISION[])
> THEN 1
> ELSE 0
> END) AS num_rows_processed_by_group
> FROM dt_golf
> ) AS s
> CONTEXT: Traceback (most recent call last):
> PL/Python function "minibatch_preprocessor", line 24, in <module>
> minibatch_preprocessor_obj.minibatch_preprocessor()
> PL/Python function "minibatch_preprocessor", line 45, in wrapper
> PL/Python function "minibatch_preprocessor", line 104, in minibatch_preprocessor
> PL/Python function "minibatch_preprocessor", line 236, in _get_skipped_rows_processed_count
> PL/Python function "minibatch_preprocessor"
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)