You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2018/07/17 22:12:46 UTC

madlib git commit: Pagerank: Remove duplicate entries from grouping output

Repository: madlib
Updated Branches:
  refs/heads/master 8b2f33ff2 -> 62b53dca7


Pagerank: Remove duplicate entries from grouping output

JIRA: MADLIB-1229
JIRA: MADLIB-1253

This commit also fixes the bug where output was missing for complete graphs.

Closes #294

Co-authored-by: Orhan Kislal <ok...@pivotal.io>


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/62b53dca
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/62b53dca
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/62b53dca

Branch: refs/heads/master
Commit: 62b53dca750bf3a3cb7dfc5f59bb7961d497a066
Parents: 8b2f33f
Author: Nandish Jayaram <nj...@apache.org>
Authored: Fri Jul 13 17:09:11 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Tue Jul 17 15:09:26 2018 -0700

----------------------------------------------------------------------
 .../postgres/modules/graph/graph_utils.py_in    | 12 ++--
 src/ports/postgres/modules/graph/pagerank.py_in | 71 +++++++++++++-------
 .../postgres/modules/graph/test/pagerank.sql_in | 40 +++++++++++
 3 files changed, 94 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/62b53dca/src/ports/postgres/modules/graph/graph_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/graph_utils.py_in b/src/ports/postgres/modules/graph/graph_utils.py_in
index 8a41560..b0eaee4 100644
--- a/src/ports/postgres/modules/graph/graph_utils.py_in
+++ b/src/ports/postgres/modules/graph/graph_utils.py_in
@@ -128,11 +128,13 @@ def validate_params_for_link_analysis(schema_madlib, func_name,
                 format(func_name))
 
 def update_output_grouping_tables_for_link_analysis(temp_summary_table,
-                                                          iter_num, summary_table,
-                                                          out_table, res_table,
-                                                          grouping_cols_list,
-                                                          cur_unconv,
-                                                          message_unconv=None):
+                                                    iter_num,
+                                                    summary_table,
+                                                    out_table,
+                                                    res_table,
+                                                    grouping_cols_list,
+                                                    cur_unconv,
+                                                    message_unconv=None):
     """
         This function updates the summary and output tables only for those
         groups that have converged. This is found out by looking at groups

http://git-wip-us.apache.org/repos/asf/madlib/blob/62b53dca/src/ports/postgres/modules/graph/pagerank.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/pagerank.py_in b/src/ports/postgres/modules/graph/pagerank.py_in
index 31377c0..71cddd2 100644
--- a/src/ports/postgres/modules/graph/pagerank.py_in
+++ b/src/ports/postgres/modules/graph/pagerank.py_in
@@ -455,6 +455,7 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, out_
                             WHERE __t1__.{src}=__t2__.{dest}
                             AND {where_group_clause}
                         ) {vpg_where_clause_ins}
+                        GROUP BY {select_group_cols}, {vertex_id}, pagerank
                     """.format(
                     select_group_cols=get_table_qualified_col_str(
                         '__t1__', grouping_cols_list),
@@ -526,6 +527,7 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, out_
                 # This is used only when grouping is set. This essentially will have
                 # the condition that will help skip the PageRank computation on groups
                 # that have converged.
+
                 plpy.execute("""
                         CREATE TABLE {message} AS
                         SELECT {grouping_cols_select_pr}
@@ -543,6 +545,7 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, out_
                     """.format(ignore_group_clause=ignore_group_clause_pr
                                if iteration_num > 0 else ignore_group_clause_first,
                                **locals()))
+
                 # If there are nodes that have no incoming edges, they are not
                 # captured in the message table. Insert entries for such nodes,
                 # with random_prob.
@@ -564,7 +567,19 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, out_
                     # If no grouping columns are specified, then we check if there is
                     # at least one unconverged node (limit 1 is used in the
                     # query).
-                    plpy.execute("""
+                    if iteration_num == 0 and grouping_cols:
+                        # Hack to address corner case:
+                        # With grouping, if a group's graph converged in the
+                        # very first iteration (e.g. a complete graph), the
+                        # pagerank scores for that group did not show up in
+                        # the output. The following code simply prevents
+                        # convergence in the first iteration.
+                        plpy.execute("""
+                            CREATE TEMP TABLE {message_unconv} AS
+                            SELECT * FROM {distinct_grp_table}
+                        """.format(**locals()))
+                    else:
+                        plpy.execute("""
                             CREATE TEMP TABLE {message_unconv} AS
                             SELECT {grouping_cols_select_conv}
                             FROM {message}
@@ -578,19 +593,23 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, out_
                         """.format(ignore_group_clause=ignore_group_clause_ins
                                    if iteration_num > 0 else ignore_group_clause_conv,
                                    **locals()))
+
                     unconverged = plpy.execute("""SELECT COUNT(*) AS cnt FROM {0}
                         """.format(message_unconv))[0]["cnt"]
                     if iteration_num > 0 and grouping_cols:
                         # Update result and summary tables for groups that have
                         # converged
                         # since the last iteration.
-                        update_output_grouping_tables_for_link_analysis(temp_summary_table,
-                                                                        iteration_num,
-                                                                        summary_table,
-                                                                        out_table, message,
-                                                                        grouping_cols_list,
-                                                                        cur_unconv,
-                                                                        message_unconv)
+                        update_output_grouping_tables_for_link_analysis(
+                            temp_summary_table,
+                            iteration_num,
+                            summary_table,
+                            out_table,
+                            message,
+                            grouping_cols_list,
+                            cur_unconv,
+                            message_unconv)
+
                     plpy.execute("DROP TABLE IF EXISTS {0}".format(cur_unconv))
                     plpy.execute("""ALTER TABLE {message_unconv} RENAME TO
                         {cur_unconv} """.format(**locals()))
@@ -613,21 +632,24 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, out_
                         # We completed max_iters, but there are still some unconverged
                         # groups # Update the result and summary tables for unconverged
                         # groups.
-                        update_output_grouping_tables_for_link_analysis(temp_summary_table,
-                                                                        iteration_num,
-                                                                        summary_table,
-                                                                        out_table, cur,
-                                                                        grouping_cols_list,
-                                                                        cur_unconv)
+                        update_output_grouping_tables_for_link_analysis(
+                            temp_summary_table,
+                            iteration_num,
+                            summary_table,
+                            out_table,
+                            cur,
+                            grouping_cols_list,
+                            cur_unconv)
                     else:
                         # No group has converged. List of all group values are in
                         # distinct_grp_table.
-                        update_output_grouping_tables_for_link_analysis(temp_summary_table,
-                                                                        iteration_num,
-                                                                        summary_table,
-                                                                        out_table, cur,
-                                                                        grouping_cols_list,
-                                                                        distinct_grp_table)
+                        update_output_grouping_tables_for_link_analysis(
+                            temp_summary_table,
+                            iteration_num,
+                            summary_table,
+                            out_table, cur,
+                            grouping_cols_list,
+                            distinct_grp_table)
 
                 # updating the calculated pagerank value in case of
                 # Personalized Page Rank.
@@ -639,15 +661,16 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, out_
                                     """.format(out_table=out_table,
                                                total_ppr_nodes=total_ppr_nodes))
             else:
-                    # updating the calculated pagerank value in case of
-                    # Personalized Page Rank.
-                    # Ref :
-                    # https://docs.oracle.com/cd/E56133_01/latest/reference/algorithms/pagerank.html
+                # updating the calculated pagerank value in case of
+                # Personalized Page Rank.
+                # Ref :
+                # https://docs.oracle.com/cd/E56133_01/latest/reference/algorithms/pagerank.html
                 if total_ppr_nodes > 1:
                     plpy.execute("""UPDATE {table_name} set pagerank =
                                     pagerank / {total_ppr_nodes}::DOUBLE PRECISION
                                  """.format(table_name=cur,
                                             total_ppr_nodes=total_ppr_nodes))
+
                 plpy.execute("""
                         ALTER TABLE {table_name}
                         RENAME TO {out_table}

http://git-wip-us.apache.org/repos/asf/madlib/blob/62b53dca/src/ports/postgres/modules/graph/test/pagerank.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/test/pagerank.sql_in b/src/ports/postgres/modules/graph/test/pagerank.sql_in
index 4b93075..14d3371 100644
--- a/src/ports/postgres/modules/graph/test/pagerank.sql_in
+++ b/src/ports/postgres/modules/graph/test/pagerank.sql_in
@@ -149,3 +149,43 @@ select assert(array_agg(user_id order by pagerank desc)= '{2, 2, 1, 1, 2, 2, 1,
 -- SELECT assert(relative_error(__iterations__, 31) = 0,
 --         'PageRank: Incorrect iterations for group 2.'
 --     ) FROM pagerank_gr_out_summary WHERE user_id=2;
+
+-- Test to capture corner case reported in https://issues.apache.org/jira/browse/MADLIB-1229
+
+DROP TABLE IF EXISTS vertex, "EDGE";
+CREATE TABLE vertex(
+id INTEGER
+);
+CREATE TABLE "EDGE"(
+src INTEGER,
+dest INTEGER,
+user_id INTEGER
+);
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2);
+INSERT INTO "EDGE" VALUES
+(0, 1, 1),
+(0, 2, 1),
+(1, 2, 1),
+(2, 1, 1),
+(0, 1, 2);
+
+
+DROP TABLE IF EXISTS pagerank_gr_out;
+DROP TABLE IF EXISTS pagerank_gr_out_summary;
+SELECT pagerank(
+'vertex', -- Vertex table
+'id', -- Vertex id column
+'"EDGE"', -- "EDGE" table
+'src=src, dest=dest', -- "EDGE" args
+'pagerank_gr_out', -- Output table of PageRank
+NULL, -- Default damping factor (0.85)
+NULL, -- Default max iters (100)
+NULL, -- Default Threshold
+'user_id');
+
+SELECT assert(relative_error(SUM(pagerank), 1) < 0.00001,
+        'PageRank: Scores do not sum up to 1 for group 1.'
+    ) FROM pagerank_gr_out WHERE user_id=1;