You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Frank McQuillan (JIRA)" <ji...@apache.org> on 2016/03/03 19:42:18 UTC

[jira] [Commented] (MADLIB-917) Path - window functions (multiple matches per partition, 1 window per match)

    [ https://issues.apache.org/jira/browse/MADLIB-917?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15178348#comment-15178348 ] 

Frank McQuillan commented on MADLIB-917:
----------------------------------------

Test a)

{code:sql}

DROP TABLE IF EXISTS weblog, path_output, path_output_tuples;

CREATE TABLE weblog (event_timestamp TIMESTAMP,
			user_id INT,
			age_group INT,
			income_group INT,
			gender TEXT,
			region TEXT,
			household_size INT,
			click_event INT,
			purchase_event INT,
			revenue FLOAT,
			margin FLOAT);
                          
INSERT INTO weblog VALUES                                                     
('04/14/2012 23:43:00', 102201, 3, 3, 'Female', 'East', 3, 1, 1, 112, 36),
('04/14/2012 23:56:00', 101881, 2, 4, 'Male', 'West', 5, 0, 0, 0, 0),
('04/15/2012 01:04:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
('04/15/2012 01:15:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
('04/15/2012 02:15:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
('04/15/2012 02:53:00', 102201, 3, 3, 'Female', 'East', 3, 1, 1, 117, 28),
('04/15/2012 02:59:00', 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),
('04/15/2012 04:11:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/15/2012 04:25:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 91, 28),
('04/15/2012 04:32:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 112, 36),
('04/15/2012 06:26:00', 102871, 3, 4, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/15/2012 06:32:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
('04/15/2012 07:02:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 118, 39),
('04/15/2012 08:51:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
('04/15/2012 09:28:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 103, 32),
('04/15/2012 10:19:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/15/2012 11:40:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
('04/15/2012 12:58:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 148, 23),
('04/15/2012 14:18:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 113, 29),
('04/15/2012 22:20:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 108, 38),
('04/15/2012 23:13:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
('04/15/2012 23:14:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/16/2012 01:55:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
('04/16/2012 02:12:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 153, 26),
('04/16/2012 04:20:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
('04/16/2012 05:38:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 0, 0, 0),
('04/16/2012 05:44:00', 102201, 3, 3, 'Female', 'East', 3, 1, 0, 0, 0),
('04/16/2012 05:59:00', 102871, 3, 4, 'Female', 'Central', 5, 1, 0, 0, 0),
('04/16/2012 09:35:00', 102871, 3, 4, 'Female', 'Central', 5, 1, 0, 0, 0),
('04/16/2012 10:40:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
('04/16/2012 14:23:00', 102871, 3, 4, 'Female', 'Central', 5, 0, 0, 0, 0),
('04/16/2012 20:46:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 131, 28),
('04/16/2012 21:11:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 127, 27),
('04/16/2012 22:35:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
('04/16/2012 23:51:00', 101881, 2, 4, 'Male', 'West', 5, 0, 0, 0, 0),
('04/16/2012 23:55:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
('04/16/2012 23:56:00', 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),
('04/16/2012 23:57:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 456, 77);

/*
SELECT * FROM weblog ORDER BY event_timestamp ASC;
*/


SELECT madlib.path(                                                                   
     'weblog',    			-- Name of the table                                           
     'path_output',    		-- Table name to store the path results                         
     'user_id',  	-- Partition expression to group the data table                 
     'event_timestamp ASC',      å	-- Order expression to sort the tuples of the data table        
     'IMPR:=click_event=0 AND purchase_event=0, CLICK:=click_event=1 AND purchase_event=0, CONV:=purchase_event=1',	-- Definition of various symbols used in the pattern definition 
     '(IMPR){1}(CLICK){1}(CONV){1}',     	-- Definition of the path pattern to search for  
     'SUM(margin) as sum_of_margin, SUM(revenue) as sum_of_revenue',	-- Aggregate/window functions to be applied on the matched paths
     TRUE  -- Persist matches
     );


SELECT * FROM path_output; 


/*
SELECT * FROM path_output_tuples order by event_timestamp ASC; 
*/

{code}

produces 

{code:sql}
madlib=# SELECT * FROM path_output;
 user_id | sum_of_margin | sum_of_revenue 
---------+---------------+----------------
  101121 |            28 |            131
  101331 |           113 |            568
(2 rows)
{code}

> Path - window functions (multiple matches per partition, 1 window per match)
> ----------------------------------------------------------------------------
>
>                 Key: MADLIB-917
>                 URL: https://issues.apache.org/jira/browse/MADLIB-917
>             Project: Apache MADlib
>          Issue Type: New Feature
>          Components: Module: Utilities
>    Affects Versions: v1.9
>            Reporter: Frank McQuillan
>            Assignee: Rahul Iyer
>             Fix For: v1.9
>
>         Attachments: Ecommerce data set for path test 3.csv, path query3.sql
>
>
> Story
> As a user, I want to define symbols so that I can define a regular expression of symbols to identify sequences of events that I care about.
> Partition:
> 1) Multiple matches per partition in this story.
> 2) Note that the match in the data might not span the whole partition, that is, that matched rows could just be a subset of the rows in the partition.
> Window:
> 1) Limited to 1 window per partition.
> Other
> 1) Club rows together in the case where there are multiple matches per partition, when doing aggregate/window functions.  E.g., if doing sum of a revenue column, then sum all rows from all matches (as opposed to a separate sum for each match).



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)