You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by xt...@apache.org on 2016/07/28 16:51:50 UTC
incubator-madlib git commit: Path: Add support for overlapping
patterns
Repository: incubator-madlib
Updated Branches:
refs/heads/master 628560a61 -> 195ad6436
Path: Add support for overlapping patterns
JIRA: MADLIB-995
- Better handling cases when partition_expr is NULL
- Explicitly set boolean default value to false
- Update docs
Closes #56
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/195ad643
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/195ad643
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/195ad643
Branch: refs/heads/master
Commit: 195ad64367303ede9a0fbc933b106126241e294c
Parents: 628560a
Author: Xiaocheng Tang <xi...@gmail.com>
Authored: Wed Jul 13 17:29:40 2016 -0700
Committer: Xiaocheng Tang <xi...@gmail.com>
Committed: Thu Jul 28 09:47:40 2016 -0700
----------------------------------------------------------------------
src/modules/utilities/path.cpp | 24 ++++++--
src/ports/postgres/modules/utilities/path.py_in | 42 +++++++++-----
.../postgres/modules/utilities/path.sql_in | 59 +++++++++++++-------
.../postgres/modules/utilities/test/path.sql_in | 16 ++++++
4 files changed, 100 insertions(+), 41 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/195ad643/src/modules/utilities/path.cpp
----------------------------------------------------------------------
diff --git a/src/modules/utilities/path.cpp b/src/modules/utilities/path.cpp
index a16553c..459d361 100644
--- a/src/modules/utilities/path.cpp
+++ b/src/modules/utilities/path.cpp
@@ -27,6 +27,7 @@ AnyType path_pattern_match::run(AnyType & args)
std::string sym_str = args[0].getAs<char *>();
std::string reg_str = args[1].getAs<char *>();
MappedColumnVector row_id = args[2].getAs<MappedColumnVector>();
+ bool overlapping_patterns = args[3].getAs<bool>();
if (sym_str.size() != row_id.size()) {
std::stringstream errorMsg;
@@ -50,12 +51,23 @@ AnyType path_pattern_match::run(AnyType & args)
smatch matches;
// prefer regex_search over sregex_iterator so that match_results<> object
// caches dynamically allocated memory across regex searches
- while (regex_search(start, end, matches, reg)) {
- size_t i0 = matches[0].first - sym_start;
- size_t i1 = matches[0].second - sym_start;
- _match_row_id.insert(_match_row_id.end(), row_start+i0, row_start+i1);
- _match_id.insert(_match_id.end(), matches[0].length(), match_count++);
- start = matches[0].second;
+ if (overlapping_patterns) {
+ while (regex_search(start, end, matches, reg)) {
+ size_t i0 = matches[0].first - sym_start;
+ size_t i1 = matches[0].second - sym_start;
+ _match_row_id.insert(_match_row_id.end(), row_start+i0, row_start+i1);
+ _match_id.insert(_match_id.end(), matches[0].length(), match_count++);
+ start = matches[0].first+1;
+ }
+ }
+ else {
+ while (regex_search(start, end, matches, reg)) {
+ size_t i0 = matches[0].first - sym_start;
+ size_t i1 = matches[0].second - sym_start;
+ _match_row_id.insert(_match_row_id.end(), row_start+i0, row_start+i1);
+ _match_id.insert(_match_id.end(), matches[0].length(), match_count++);
+ start = matches[0].second;
+ }
}
MappedColumnVector match_id(_match_id.data(), _match_id.size());
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/195ad643/src/ports/postgres/modules/utilities/path.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/path.py_in b/src/ports/postgres/modules/utilities/path.py_in
index 1330d8e..2b3085a 100644
--- a/src/ports/postgres/modules/utilities/path.py_in
+++ b/src/ports/postgres/modules/utilities/path.py_in
@@ -40,7 +40,7 @@ m4_changequote(`<!', `!>')
def path(schema_madlib, source_table, output_table, partition_expr,
order_expr, pattern_expr, symbol_expr, agg_func,
- persist_rows=None, **kwargs):
+ persist_rows, overlapping_patterns, **kwargs):
"""
Perform regular pattern matching over a sequence of rows.
@@ -56,15 +56,15 @@ def path(schema_madlib, source_table, output_table, partition_expr,
"""
with MinWarning("error"):
+ # check for both false and None
if not partition_expr:
partition_expr = "1 = 1"
- as_partition_expr = "1 = 1 as all"
- else:
- as_partition_expr = partition_expr
- if persist_rows is None:
+ if not persist_rows:
# persist_rows = None implies no preference
# persist_rows = False implies do not store the matched rows
persist_rows = not bool(agg_func)
+ if not overlapping_patterns:
+ overlapping_patterns = False
_validate(source_table, output_table, partition_expr, order_expr,
pattern_expr, symbol_expr, agg_func, persist_rows)
@@ -141,7 +141,8 @@ def path(schema_madlib, source_table, output_table, partition_expr,
{m}.path_pattern_match(
array_to_string(array_agg({short_sym_name_str} ORDER BY {order_expr}), '')::text,
'{new_pattern_expr}'::text,
- array_agg({id_col_name} ORDER BY {order_expr})::float8[]
+ array_agg({id_col_name} ORDER BY {order_expr})::float8[],
+ {overlapping_patterns}::boolean
) as matched
FROM {input_with_id}
WHERE {short_sym_name_str} is NOT NULL
@@ -155,15 +156,26 @@ def path(schema_madlib, source_table, output_table, partition_expr,
p_col_orig_name_str = ','.join(
[i + " AS " + j for i, j in zip(split_p_cols, quoted_split_p_cols)])
if agg_func:
- plpy.execute("""
- CREATE TABLE {output_table} AS
- SELECT
- {p_col_orig_name_str},
- {match_id_name},
- {agg_func}
- FROM {matched_rows}
- GROUP BY {as_partition_expr}, {match_id_name}
- """.format(**locals()))
+ if partition_expr == '1 = 1':
+ # no partition
+ plpy.execute("""
+ CREATE TABLE {output_table} AS
+ SELECT
+ {match_id_name},
+ {agg_func}
+ FROM {matched_rows}
+ GROUP BY {match_id_name}
+ """.format(**locals()))
+ else:
+ plpy.execute("""
+ CREATE TABLE {output_table} AS
+ SELECT
+ {p_col_orig_name_str},
+ {match_id_name},
+ {agg_func}
+ FROM {matched_rows}
+ GROUP BY {partition_expr}, {match_id_name}
+ """.format(**locals()))
result = "Aggregation result available in table " + output_table
else:
result = "No aggregation table created"
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/195ad643/src/ports/postgres/modules/utilities/path.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/path.sql_in b/src/ports/postgres/modules/utilities/path.sql_in
index 0883d07..d832ab5 100644
--- a/src/ports/postgres/modules/utilities/path.sql_in
+++ b/src/ports/postgres/modules/utilities/path.sql_in
@@ -83,7 +83,8 @@ path(
symbol,
pattern,
aggregate_func,
- persist_rows
+ persist_rows,
+ overlapping_patterns
)
</pre>
@@ -149,8 +150,8 @@ path(
</ul>
</dd>
- <dt>aggregate_func</dt>
- <dd>VARCHAR. A comma-separated list of aggregates to be
+ <dt>aggregate_func (optional)</dt>
+ <dd>VARCHAR, default NULL. A comma-separated list of aggregates to be
applied to the pattern matches [3]. Please note that window functions
cannot currently be used in the parameter 'aggregate_func'. If you want
to use a window function [4], output the pattern matches and write a SQL
@@ -161,12 +162,16 @@ path(
aggregates, you can put NULL or '' in the 'aggregate_func' parameter.
</dd>
- <dt>persist_rows</dt>
- <dd>BOOLEAN. If TRUE the matched rows are persisted in a separate output
+ <dt>persist_rows (optional)</dt>
+ <dd>BOOLEAN, default FALSE. If TRUE the matched rows are persisted in a separate output
table. This table is named as <output_table>_tuples (the string
"_tuples" is added as suffix to the value of <em>output_table</em>).
</dd>
+ <dt>overlapping_patterns (optional)</dt>
+ <dd>BOOLEAN, default FALSE. If TRUE find every occurrence of the pattern in the partition, regardless of whether it might have been part of a previously found match.
+ </dd>
+
</dl>
@@ -414,13 +419,10 @@ than the beer page just before checkout.
@anchor note
@note
-Please note some current limitations of the path algorithm. These
-limitations will be addressed in subsequent releases.
+Please note some current limitations of the path algorithm.
- Window functions cannot currently be used in the parameter
'aggregate_func'. Instead, output the pattern matches and write
a SQL query with a window function over the output tuples.
-- Overlapping pattern matches are not supported. That is,
-a given row can only belong to one pattern match (non-overlapping).
- A given row can only match one symbol. If a row matches
multiple symbols, the symbol that comes <em>first</em> in the symbol
definition list will take precedence.
@@ -481,9 +483,10 @@ CREATE TYPE MADLIB_SCHEMA.path_match_result AS (
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path_pattern_match
(
- symbols TEXT,
- pattern TEXT,
- row_id FLOAT8[]
+ symbols TEXT,
+ pattern TEXT,
+ row_id FLOAT8[],
+ overlapping_patterns BOOLEAN
) RETURNS MADLIB_SCHEMA.path_match_result
AS 'MODULE_PATHNAME', 'path_pattern_match'
LANGUAGE C STRICT
@@ -491,6 +494,26 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ partition_expr VARCHAR,
+ order_expr VARCHAR,
+ symbol_expr VARCHAR,
+ pattern_expr VARCHAR,
+ agg_func VARCHAR,
+ persist_rows BOOLEAN,
+ overlapping_patterns BOOLEAN
+) RETURNS TEXT AS $$
+PythonFunction(utilities, path, path)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+-------------------------------------------------------------------------
+-- Overloaded functions for default arguments ---------------
+
+-- Do not allow overlapping by default
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
source_table VARCHAR,
output_table VARCHAR,
partition_expr VARCHAR,
@@ -500,14 +523,10 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
agg_func VARCHAR,
persist_rows BOOLEAN
) RETURNS TEXT AS $$
-PythonFunction(utilities, path, path)
-$$ LANGUAGE plpythonu
+ SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, $7, $8, False)
+$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
--------------------------------------------------------------------------
--- Overloaded functions for default arguments ---------------
-
-- Do not output matched rows by default
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
source_table VARCHAR,
@@ -518,7 +537,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
pattern_expr VARCHAR,
agg_func VARCHAR
) RETURNS TEXT AS $$
- SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, $7, NULL)
+ SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, $7, False, False)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -531,7 +550,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.path(
symbol_expr VARCHAR,
pattern_expr VARCHAR
) RETURNS TEXT AS $$
- SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, NULL, NULL)
+ SELECT MADLIB_SCHEMA.path($1, $2, $3, $4, $5, $6, NULL, False, False)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/195ad643/src/ports/postgres/modules/utilities/test/path.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/path.sql_in b/src/ports/postgres/modules/utilities/test/path.sql_in
index bd9dad2..9de15eb 100644
--- a/src/ports/postgres/modules/utilities/test/path.sql_in
+++ b/src/ports/postgres/modules/utilities/test/path.sql_in
@@ -98,6 +98,22 @@ SELECT * FROM "Path_output_tuples";
SELECT * FROM "Path_output";
+DROP TABLE "Path_output", "Path_output_tuples";
+SELECT path(
+ '"Weblog"', -- Name of the table
+ '"Path_output"', -- Table name to store the path results
+ NULL, -- Partition expression to group the data table
+ 'event_timestamp ASC', -- Order expression to sort the tuples of the data table
+ 'I:="Click_event"=0 AND purchase_event=0, Click:="Click_event"=1 AND purchase_event=0, Conv:=purchase_event=1', -- Definition of various symbols used in the pattern definition
+ 'I(click){1}(CONV){1}', -- Definition of the path pattern to search for
+ 'COUNT(*) as count' -- Aggregate/window functions to be applied on the matched paths
+ ,TRUE
+ );
+------------------------------------------------------------
+
+SELECT assert(count::integer=3::integer, 'wrong results in path')
+FROM "Path_output";
+
INSERT INTO "Weblog" VALUES
('04/15/2012 02:15:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
('04/15/2012 02:59:00', 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),