You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by xt...@apache.org on 2016/07/28 16:51:50 UTC

incubator-madlib git commit: Path: Add support for overlapping patterns

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 628560a61 -> 195ad6436


Path: Add support for overlapping patterns

JIRA: MADLIB-995

- Better handling cases when partition_expr is NULL
- Explicitly set boolean default value to false
- Update docs

Closes #56


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/195ad643
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/195ad643
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/195ad643

Branch: refs/heads/master
Commit: 195ad64367303ede9a0fbc933b106126241e294c
Parents: 628560a
Author: Xiaocheng Tang <xi...@gmail.com>
Authored: Wed Jul 13 17:29:40 2016 -0700
Committer: Xiaocheng Tang <xi...@gmail.com>
Committed: Thu Jul 28 09:47:40 2016 -0700

----------------------------------------------------------------------
 src/modules/utilities/path.cpp                  | 24 ++++++--
 src/ports/postgres/modules/utilities/path.py_in | 42 +++++++++-----
 .../postgres/modules/utilities/path.sql_in      | 59 +++++++++++++-------
 .../postgres/modules/utilities/test/path.sql_in | 16 ++++++
 4 files changed, 100 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/195ad643/src/modules/utilities/path.cpp
----------------------------------------------------------------------
diff --git a/src/modules/utilities/path.cpp b/src/modules/utilities/path.cpp
index a16553c..459d361 100644
--- a/src/modules/utilities/path.cpp
+++ b/src/modules/utilities/path.cpp
@@ -27,6 +27,7 @@ AnyType path_pattern_match::run(AnyType & args)
     std::string sym_str = args[0].getAs<char *>();
     std::string reg_str = args[1].getAs<char *>();
     MappedColumnVector row_id = args[2].getAs<MappedColumnVector>();
+    bool overlapping_patterns = args[3].getAs<bool>();
 
     if (sym_str.size() != row_id.size()) {
         std::stringstream errorMsg;
@@ -50,12 +51,23 @@ AnyType path_pattern_match::run(AnyType & args)
     smatch matches;
     // prefer regex_search over sregex_iterator so that match_results<> object
     // caches dynamically allocated memory across regex searches
-    while (regex_search(start, end, matches, reg)) {
-        size_t i0 = matches[0].first - sym_start;
-        size_t i1 = matches[0].second - sym_start;
-        _match_row_id.insert(_match_row_id.end(), row_start+i0, row_start+i1);
-        _match_id.insert(_match_id.end(), matches[0].length(), match_count++);
-        start = matches[0].second;
+    if (overlapping_patterns) {
+        while (regex_search(start, end, matches, reg)) {
+            size_t i0 = matches[0].first - sym_start;
+            size_t i1 = matches[0].second - sym_start;
+            _match_row_id.insert(_match_row_id.end(), row_start+i0, row_start+i1);
+            _match_id.insert(_match_id.end(), matches[0].length(), match_count++);
+            start = matches[0].first+1;
+        }
+    }
+    else {
+        while (regex_search(start, end, matches, reg)) {
+            size_t i0 = matches[0].first - sym_start;
+            size_t i1 = matches[0].second - sym_start;
+            _match_row_id.insert(_match_row_id.end(), row_start+i0, row_start+i1);
+            _match_id.insert(_match_id.end(), matches[0].length(), match_count++);
+            start = matches[0].second;
+        }
     }
 
     MappedColumnVector match_id(_match_id.data(), _match_id.size());

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/195ad643/src/ports/postgres/modules/utilities/path.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/path.py_in b/src/ports/postgres/modules/utilities/path.py_in
index 1330d8e..2b3085a 100644
--- a/src/ports/postgres/modules/utilities/path.py_in
+++ b/src/ports/postgres/modules/utilities/path.py_in
@@ -40,7 +40,7 @@ m4_changequote(`<!', `!>')
 
 def path(schema_madlib, source_table, output_table, partition_expr,
          order_expr, pattern_expr, symbol_expr, agg_func,
-         persist_rows=None, **kwargs):
+         persist_rows, overlapping_patterns, **kwargs):
     """
         Perform regular pattern matching over a sequence of rows.
 
@@ -56,15 +56,15 @@ def path(schema_madlib, source_table, output_table, partition_expr,
 
     """
     with MinWarning("error"):
+        # check for both false and None
         if not partition_expr:
             partition_expr = "1 = 1"
-            as_partition_expr = "1 = 1 as all"
-        else:
-            as_partition_expr = partition_expr
-        if persist_rows is None:
+        if not persist_rows:
             # persist_rows = None implies no preference
             # persist_rows = False implies do not store the matched rows
             persist_rows = not bool(agg_func)
+        if not overlapping_patterns:
+            overlapping_patterns = False
 
         _validate(source_table, output_table, partition_expr, order_expr,
                   pattern_expr, symbol_expr, agg_func, persist_rows)
@@ -141,7 +141,8 @@ def path(schema_madlib, source_table, output_table, partition_expr,
                             {m}.path_pattern_match(
                                 array_to_string(array_agg({short_sym_name_str} ORDER BY {order_expr}), '')::text,
                                 '{new_pattern_expr}'::text,
-                                array_agg({id_col_name} ORDER BY {order_expr})::float8[]
+                                array_agg({id_col_name} ORDER BY {order_expr})::float8[],
+                                {overlapping_patterns}::boolean
                             ) as matched
                         FROM {input_with_id}
                         WHERE {short_sym_name_str} is NOT NULL
@@ -155,15 +156,26 @@ def path(schema_madlib, source_table, output_table, partition_expr,
         p_col_orig_name_str = ','.join(
             [i + " AS " + j for i, j in zip(split_p_cols, quoted_split_p_cols)])
         if agg_func:
-            plpy.execute("""
-                CREATE TABLE {output_table} AS
-                   SELECT
-                        {p_col_orig_name_str},
-                        {match_id_name},
-                        {agg_func}
-                   FROM {matched_rows}
-                   GROUP BY {as_partition_expr}, {match_id_name}
-                """.format(**locals()))
+            if partition_expr == '1 = 1':
+                # no partition
+                plpy.execute("""
+                    CREATE TABLE {output_table} AS
+                       SELECT
+                            {match_id_name},
+                            {agg_func}
+                       FROM {matched_rows}
+                       GROUP BY {match_id_name}
+                    """.format(**locals()))
+            else:
+                plpy.execute("""
+                    CREATE TABLE {output_table} AS
+                       SELECT
+                            {p_col_orig_name_str},
+                            {match_id_name},
+                            {agg_func}
+                       FROM {matched_rows}
+                       GROUP BY {partition_expr}, {match_id_name}
+                    """.format(**locals()))
             result = "Aggregation result available in table " + output_table
         else:
             result = "No aggregation table created"

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/195ad643/src/ports/postgres/modules/utilities/path.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/path.sql_in b/src/ports/postgres/modules/utilities/path.sql_in
index 0883d07..d832ab5 100644
--- a/src/ports/postgres/modules/utilities/path.sql_in
+++ b/src/ports/postgres/modules/utilities/path.sql_in
@@ -83,7 +83,8 @@ path(
     symbol,
     pattern,
     aggregate_func,
-    persist_rows
+    persist_rows,
+    overlapping_patterns
 )
 </pre>
 
@@ -149,8 +150,8 @@ path(
         </ul>
     </dd>
 
-    <dt>aggregate_func</dt>
-    <dd>VARCHAR. A comma-separated list of aggregates to be
+    <dt>aggregate_func (optional)</dt>
+    <dd>VARCHAR, default NULL. A comma-separated list of aggregates to be
     applied to the pattern matches [3].  Please note that window functions
     cannot currently be used in the parameter 'aggregate_func'.  If you want
     to use a window function [4], output the pattern matches and write a SQL
@@ -161,12 +162,16 @@ path(
     aggregates, you can put NULL or '' in the 'aggregate_func' parameter.
     </dd>
 
-    <dt>persist_rows</dt>
-    <dd>BOOLEAN. If TRUE the matched rows are persisted in a separate output
+    <dt>persist_rows (optional)</dt>
+    <dd>BOOLEAN, default FALSE. If TRUE the matched rows are persisted in a separate output
     table. This table is named as <output_table>_tuples (the string
     "_tuples" is added as suffix to the value of <em>output_table</em>).
     </dd>
 
+    <dt>overlapping_patterns (optional)</dt>
+    <dd>BOOLEAN, default FALSE. If TRUE find every occurrence of the pattern in the partition, regardless of whether it might have been part of a previously found match.
+    </dd>
+
 </dl>
 
 
@@ -414,13 +419,10 @@ than the beer page just before checkout.
 
 @anchor note
 @note
-Please note some current limitations of the path algorithm.  These
-limitations will be addressed in subsequent releases.
+Please note some current limitations of the path algorithm.
 - Window functions cannot currently be used in the parameter
 'aggregate_func'.  Instead, output the pattern matches and write
 a SQL query with a window function over the output tuples.
-- Overlapping pattern matches are not supported.  That is,
-a given row can only belong to one pattern match (non-overlapping).
 - A given row can only match one symbol.  If a row matches
 multiple symbols, the symbol that comes <em>first</em> in the symbol
 definition list will take precedence.
@@ -481,9 +483,10 @@ CREATE TYPE MADLIB_SCHEMA.path_match_result AS (
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path_pattern_match
 (
-    symbols    TEXT,
-    pattern    TEXT,
-    row_id    FLOAT8[]
+    symbols                   TEXT,
+    pattern                   TEXT,
+    row_id                    FLOAT8[],
+    overlapping_patterns      BOOLEAN
 ) RETURNS MADLIB_SCHEMA.path_match_result
 AS 'MODULE_PATHNAME', 'path_pattern_match'
 LANGUAGE C STRICT
@@ -491,6 +494,26 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
+    source_table          VARCHAR,
+    output_table          VARCHAR,
+    partition_expr        VARCHAR,
+    order_expr            VARCHAR,
+    symbol_expr           VARCHAR,
+    pattern_expr          VARCHAR,
+    agg_func              VARCHAR,
+    persist_rows          BOOLEAN,
+    overlapping_patterns  BOOLEAN
+) RETURNS TEXT AS $$
+PythonFunction(utilities, path, path)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+-------------------------------------------------------------------------
+-- Overloaded functions for default arguments ---------------
+
+-- Do not allow overlapping by default
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
     source_table        VARCHAR,
     output_table        VARCHAR,
     partition_expr      VARCHAR,
@@ -500,14 +523,10 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
     agg_func            VARCHAR,
     persist_rows        BOOLEAN
 ) RETURNS TEXT AS $$
-PythonFunction(utilities, path, path)
-$$ LANGUAGE plpythonu
+    SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, $7, $8, False)
+$$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
-
--------------------------------------------------------------------------
--- Overloaded functions for default arguments ---------------
-
 -- Do not output matched rows by default
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
     source_table        VARCHAR,
@@ -518,7 +537,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
     pattern_expr        VARCHAR,
     agg_func            VARCHAR
 ) RETURNS TEXT AS $$
-    SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, $7, NULL)
+    SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, $7, False, False)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -531,7 +550,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
     symbol_expr         VARCHAR,
     pattern_expr        VARCHAR
 ) RETURNS TEXT AS $$
-    SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, NULL, NULL)
+    SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, NULL, False, False)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/195ad643/src/ports/postgres/modules/utilities/test/path.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/path.sql_in b/src/ports/postgres/modules/utilities/test/path.sql_in
index bd9dad2..9de15eb 100644
--- a/src/ports/postgres/modules/utilities/test/path.sql_in
+++ b/src/ports/postgres/modules/utilities/test/path.sql_in
@@ -98,6 +98,22 @@ SELECT * FROM "Path_output_tuples";
 
 SELECT * FROM "Path_output";
 
+DROP TABLE "Path_output", "Path_output_tuples";
+SELECT path(
+     '"Weblog"',              -- Name of the table
+     '"Path_output"',         -- Table name to store the path results
+     NULL,             -- Partition expression to group the data table
+     'event_timestamp ASC',         -- Order expression to sort the tuples of the data table
+     'I:="Click_event"=0 AND purchase_event=0, Click:="Click_event"=1 AND purchase_event=0, Conv:=purchase_event=1',    -- Definition of various symbols used in the pattern definition
+     'I(click){1}(CONV){1}',        -- Definition of the path pattern to search for
+     'COUNT(*) as count'             -- Aggregate/window functions to be applied on the matched paths
+    ,TRUE
+     );
+------------------------------------------------------------
+
+SELECT assert(count::integer=3::integer, 'wrong results in path')
+FROM "Path_output";
+
 INSERT INTO "Weblog" VALUES
 ('04/15/2012 02:15:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
 ('04/15/2012 02:59:00', 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),