You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2016/03/03 19:42:18 UTC
[jira] [Commented] (MADLIB-917) Path - window functions (multiple
matches per partition, 1 window per match)
[ https://issues.apache.org/jira/browse/MADLIB-917?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15178348#comment-15178348 ]
Frank McQuillan commented on MADLIB-917:
----------------------------------------
Test a)
{code:sql}
DROP TABLE IF EXISTS weblog, path_output, path_output_tuples;
CREATE TABLE weblog (event_timestamp TIMESTAMP,
user_id INT,
age_group INT,
income_group INT,
gender TEXT,
region TEXT,
household_size INT,
click_event INT,
purchase_event INT,
revenue FLOAT,
margin FLOAT);
INSERT INTO weblog VALUES
('04/14/2012 23:43:00', 102201, 3, 3, 'Female', 'East', 3, 1, 1, 112, 36),
('04/14/2012 23:56:00', 101881, 2, 4, 'Male', 'West', 5, 0, 0, 0, 0),
('04/15/2012 01:04:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
('04/15/2012 01:15:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
('04/15/2012 02:15:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
('04/15/2012 02:53:00', 102201, 3, 3, 'Female', 'East', 3, 1, 1, 117, 28),
('04/15/2012 02:59:00', 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),
('04/15/2012 04:11:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/15/2012 04:25:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 91, 28),
('04/15/2012 04:32:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 112, 36),
('04/15/2012 06:26:00', 102871, 3, 4, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/15/2012 06:32:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
('04/15/2012 07:02:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 118, 39),
('04/15/2012 08:51:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
('04/15/2012 09:28:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 103, 32),
('04/15/2012 10:19:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/15/2012 11:40:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
('04/15/2012 12:58:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 148, 23),
('04/15/2012 14:18:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 113, 29),
('04/15/2012 22:20:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 108, 38),
('04/15/2012 23:13:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
('04/15/2012 23:14:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/16/2012 01:55:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
('04/16/2012 02:12:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 153, 26),
('04/16/2012 04:20:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
('04/16/2012 05:38:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 0, 0, 0),
('04/16/2012 05:44:00', 102201, 3, 3, 'Female', 'East', 3, 1, 0, 0, 0),
('04/16/2012 05:59:00', 102871, 3, 4, 'Female', 'Central', 5, 1, 0, 0, 0),
('04/16/2012 09:35:00', 102871, 3, 4, 'Female', 'Central', 5, 1, 0, 0, 0),
('04/16/2012 10:40:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
('04/16/2012 14:23:00', 102871, 3, 4, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/16/2012 20:46:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 131, 28),
('04/16/2012 21:11:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 127, 27),
('04/16/2012 22:35:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
('04/16/2012 23:51:00', 101881, 2, 4, 'Male', 'West', 5, 0, 0, 0, 0),
('04/16/2012 23:55:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
('04/16/2012 23:56:00', 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),
('04/16/2012 23:57:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 456, 77);
/*
SELECT * FROM weblog ORDER BY event_timestamp ASC;
*/
SELECT madlib.path(
'weblog', -- Name of the table
'path_output', -- Table name to store the path results
'user_id', -- Partition expression to group the data table
'event_timestamp ASC', å -- Order expression to sort the tuples of the data table
'IMPR:=click_event=0 AND purchase_event=0, CLICK:=click_event=1 AND purchase_event=0, CONV:=purchase_event=1', -- Definition of various symbols used in the pattern definition
'(IMPR){1}(CLICK){1}(CONV){1}', -- Definition of the path pattern to search for
'SUM(margin) as sum_of_margin, SUM(revenue) as sum_of_revenue', -- Aggregate/window functions to be applied on the matched paths
TRUE -- Persist matches
);
SELECT * FROM path_output;
/*
SELECT * FROM path_output_tuples order by event_timestamp ASC;
*/
{code}
produces
{code:sql}
madlib=# SELECT * FROM path_output;
user_id | sum_of_margin | sum_of_revenue
---------+---------------+----------------
101121 | 28 | 131
101331 | 113 | 568
(2 rows)
{code}
> Path - window functions (multiple matches per partition, 1 window per match)
> ----------------------------------------------------------------------------
>
> Key: MADLIB-917
> URL: https://issues.apache.org/jira/browse/MADLIB-917
> Project: Apache MADlib
> Issue Type: New Feature
> Components: Module: Utilities
> Affects Versions: v1.9
> Reporter: Frank McQuillan
> Assignee: Rahul Iyer
> Fix For: v1.9
>
> Attachments: Ecommerce data set for path test 3.csv, path query3.sql
>
>
> Story
> As a user, I want to define symbols so that I can define a regular expression of symbols to identify sequences of events that I care about.
> Partition:
> 1) Multiple matches per partition in this story.
> 2) Note that the match in the data might not span the whole partition, that is, that matched rows could just be a subset of the rows in the partition.
> Window:
> 1) Limited to 1 window per partition.
> Other
> 1) Club rows together in the case where there are multiple matches per partition, when doing aggregate/window functions. E.g., if doing sum of a revenue column, then sum all rows from all matches (as opposed to a separate sum for each match).
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)