You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@madlib.apache.org by ok...@apache.org on 2017/11/21 18:51:50 UTC

[1/2] madlib git commit: Feature: Add grouping support to HITS

Repository: madlib
Updated Branches:
  refs/heads/master 5d66df8fd -> 4b7d9cc55


http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/pagerank.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/pagerank.py_in b/src/ports/postgres/modules/graph/pagerank.py_in
index 6ee09ff..ad9fe80 100644
--- a/src/ports/postgres/modules/graph/pagerank.py_in
+++ b/src/ports/postgres/modules/graph/pagerank.py_in
@@ -28,18 +28,25 @@
 """
 
 import plpy
+from graph_utils import get_graph_usage
+from graph_utils import get_default_threshold_for_link_analysis
+from graph_utils import update_output_grouping_tables_for_link_analysis
+from graph_utils import validate_graph_coding
+from graph_utils import validate_output_and_summary_tables
+from graph_utils import validate_params_for_link_analysis
+
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import _check_groups
-from utilities.utilities import _grp_from_table
+from utilities.utilities import get_filtered_cols_subquery_str
+from utilities.utilities import get_table_qualified_col_str
 from utilities.utilities import add_postfix
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string, split_quoted_delimited_str
 from utilities.utilities import is_platform_pg
 
 from utilities.validate_args import columns_exist_in_table, get_cols_and_types
-from graph_utils import *
-
+from utilities.validate_args import table_exists
 
 def validate_pagerank_args(schema_madlib, vertex_table, vertex_id, edge_table,
                            edge_params, out_table, damping_factor, max_iter,
@@ -49,20 +56,13 @@ def validate_pagerank_args(schema_madlib, vertex_table, vertex_id, edge_table,
     """
     validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
                           out_table, 'PageRank')
+    ## Validate args such as threshold and max_iter
+    validate_params_for_link_analysis(schema_madlib, "PageRank",
+                                            threshold, max_iter,
+                                            edge_table, grouping_cols_list)
     _assert(damping_factor >= 0.0 and damping_factor <= 1.0,
             "PageRank: Invalid damping factor value ({0}), must be between 0 and 1.".
             format(damping_factor))
-    _assert(not threshold or (threshold >= 0.0 and threshold <= 1.0),
-            "PageRank: Invalid threshold value ({0}), must be between 0 and 1.".
-            format(threshold))
-    _assert(max_iter > 0,
-            """PageRank: Invalid max_iter value ({0}), must be a positive integer.""".
-            format(max_iter))
-    if grouping_cols_list:
-        # validate the grouping columns. We currently only support grouping_cols
-        # to be column names in the edge_table, and not expressions!
-        _assert(columns_exist_in_table(edge_table, grouping_cols_list, schema_madlib),
-                "PageRank error: One or more grouping columns specified do not exist!")
 
 
 def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
@@ -111,13 +111,9 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
                     SELECT COUNT({0}) AS cnt
                     FROM {1}
                 """.format(vertex_id, vertex_table))[0]["cnt"]
-        # A fixed threshold value, of say 1e-5, might not work well when the
-        # number of vertices is a billion, since the initial pagerank value
-        # of all nodes would then be 1/1e-9. So, assign default threshold
-        # value based on number of nodes in the graph.
-        # NOTE: The heuristic below is not based on any scientific evidence.
+
         if threshold is None:
-            threshold = 1.0 / (n_vertices * 1000)
+            threshold = get_default_threshold_for_link_analysis(n_vertices)
 
         # table/column names used when grouping_cols is set.
         distinct_grp_table = ''
@@ -211,7 +207,7 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
             rand_damp = 1 - damping_factor
             grouping_where_clause = _check_groups(
                 distinct_grp_table, subq, grouping_cols_list)
-            group_by_clause = _grp_from_table(
+            group_by_clause = get_table_qualified_col_str(
                 distinct_grp_table, grouping_cols_list)
             # Find number of vertices in each group, this is the normalizing factor
             # for computing the random_prob
@@ -233,7 +229,7 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
 
             grouping_where_clause = _check_groups(
                 vertices_per_group, subq, grouping_cols_list)
-            group_by_clause = _grp_from_table(
+            group_by_clause = get_table_qualified_col_str(
                 vertices_per_group, grouping_cols_list)
 
             plpy.execute("""
@@ -278,9 +274,9 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
 
             # join clause specific to populating random_prob for nodes without any
             # incoming edges.
-            edge_grouping_cols_select = _grp_from_table(
+            edge_grouping_cols_select = get_table_qualified_col_str(
                 edge_temp_table, grouping_cols_list)
-            cur_grouping_cols_select = _grp_from_table(cur, grouping_cols_list)
+            cur_grouping_cols_select = get_table_qualified_col_str(cur, grouping_cols_list)
 
             # Create output summary table:
             cols_names_types = get_cols_and_types(edge_table)
@@ -318,11 +314,10 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
             vertices_per_group_inner_join_pr = """
                 INNER JOIN {vertices_per_group}
                 AS {vpg} ON {vpg_join_clause}""".format(**locals())
-            ignore_group_clause_pr = ' WHERE ' + get_ignore_groups(
-                summary_table, edge_temp_table, grouping_cols_list)
-            ignore_group_clause_ins_noincoming = ' WHERE ' + get_ignore_groups(
-                summary_table, nodes_with_no_incoming_edges,
-                grouping_cols_list)
+            ignore_group_clause_pr = ' WHERE ' + get_filtered_cols_subquery_str(
+                edge_temp_table, summary_table, grouping_cols_list)
+            ignore_group_clause_ins_noincoming = ' WHERE ' + get_filtered_cols_subquery_str(
+                nodes_with_no_incoming_edges, summary_table, grouping_cols_list)
             # Strings required for updating PageRank scores of vertices that have
             # no incoming edges
             grouping_cols_select_ins = cur_grouping_cols_select + ','
@@ -331,15 +326,15 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
             vpg_where_clause_ins = ' AND {vpg_t1_join_clause} '.format(
                 **locals())
             message_grp_where_ins = 'WHERE {message_grp}'.format(**locals())
-            ignore_group_clause_ins = ' AND ' + get_ignore_groups(
-                summary_table, cur, grouping_cols_list)
+            ignore_group_clause_ins = ' AND ' + get_filtered_cols_subquery_str(
+                cur, summary_table, grouping_cols_list)
             # Strings required for convergence test query
             grouping_cols_select_conv = cur_grouping_cols_select
             group_by_grouping_cols_conv = ' GROUP BY {0}'.format(
                 cur_grouping_cols_select)
             message_grp_clause_conv = '{0} AND '.format(message_grp)
-            ignore_group_clause_conv = ' AND ' + get_ignore_groups(
-                summary_table, cur, grouping_cols_list)
+            ignore_group_clause_conv = ' AND ' + get_filtered_cols_subquery_str(
+                cur, summary_table, grouping_cols_list)
             limit = ''
 
             # Find all nodes, in each group, that have no incoming edges. The PageRank
@@ -359,7 +354,7 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
                     ) {vpg_where_clause_ins}
                 """.format(
                     select_group_cols =
-                        _grp_from_table('__t1__', grouping_cols_list),
+                        get_table_qualified_col_str('__t1__', grouping_cols_list),
                     where_group_clause=
                         _check_groups('__t1__', '__t2__', grouping_cols_list),
                     **locals()))
@@ -478,9 +473,13 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
                     # Update result and summary tables for groups that have
                     # converged
                     # since the last iteration.
-                    update_result_tables(temp_summary_table, iteration_num,
-                                         summary_table, out_table, message, grouping_cols_list,
-                                         cur_unconv, message_unconv)
+                    update_output_grouping_tables_for_link_analysis(temp_summary_table,
+                                                                          iteration_num,
+                                                                          summary_table,
+                                                                          out_table, message,
+                                                                          grouping_cols_list,
+                                                                          cur_unconv,
+                                                                          message_unconv)
                 plpy.execute("DROP TABLE IF EXISTS {0}".format(cur_unconv))
                 plpy.execute("""ALTER TABLE {message_unconv} RENAME TO
                     {cur_unconv} """.format(**locals()))
@@ -502,15 +501,21 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
                     # We completed max_iters, but there are still some unconverged
                     # groups # Update the result and summary tables for unconverged
                     # groups.
-                    update_result_tables(temp_summary_table, iteration_num,
-                                         summary_table, out_table, cur, grouping_cols_list,
-                                         cur_unconv)
+                    update_output_grouping_tables_for_link_analysis(temp_summary_table,
+                                                                          iteration_num,
+                                                                          summary_table,
+                                                                          out_table, cur,
+                                                                          grouping_cols_list,
+                                                                          cur_unconv)
                 else:
                     # No group has converged. List of all group values are in
                     # distinct_grp_table.
-                    update_result_tables(temp_summary_table, iteration_num,
-                                         summary_table, out_table, cur, grouping_cols_list,
-                                         distinct_grp_table)
+                    update_output_grouping_tables_for_link_analysis(temp_summary_table,
+                                                                          iteration_num,
+                                                                          summary_table,
+                                                                          out_table, cur,
+                                                                          grouping_cols_list,
+                                                                          distinct_grp_table)
         else:
             plpy.execute("""
                 ALTER TABLE {table_name}
@@ -530,69 +535,6 @@ def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
                 """.format(vertices_per_group, temp_summary_table,
                            distinct_grp_table))
 
-
-def update_result_tables(temp_summary_table, i, summary_table, out_table,
-                         res_table, grouping_cols_list, cur_unconv,
-                         message_unconv=None):
-    """
-        This function updates the summary and output tables only for those
-        groups that have converged. This is found out by looking at groups
-        that appear in cur_unvonv but not in message_unconv: message_unconv
-        consists of groups that have not converged in the current iteration,
-        while cur_unconv contains groups that had not converged in the
-        previous iterations. The entries in cur_unconv is a superset of the
-        entries in message_unconv. So the difference in the groups across
-        the two tables represents the groups that converged in the current
-        iteration.
-    """
-    plpy.execute("TRUNCATE TABLE {0}".format(temp_summary_table))
-    if message_unconv is None:
-        # If this function is called after max_iter is completed, without
-        # convergence, all the unconverged groups from cur_unconv is used
-        # (note that message_unconv is renamed to cur_unconv before checking
-        # for unconverged==0 in the pagerank function's for loop)
-        plpy.execute("""
-            INSERT INTO {temp_summary_table}
-            SELECT * FROM {cur_unconv}
-            """.format(**locals()))
-    else:
-        plpy.execute("""
-            INSERT INTO {temp_summary_table}
-            SELECT {cur_unconv}.*
-            FROM {cur_unconv}
-            WHERE {join_condition}
-            """.format(join_condition=get_ignore_groups(
-            message_unconv, cur_unconv, grouping_cols_list), **locals()))
-    plpy.execute("""
-        INSERT INTO {summary_table}
-        SELECT *, {i}+1 AS __iteration__
-        FROM {temp_summary_table}
-        """.format(**locals()))
-    plpy.execute("""
-        INSERT INTO {out_table}
-        SELECT {res_table}.*
-        FROM {res_table}
-        INNER JOIN {temp_summary_table}
-        ON {join_condition}
-        """.format(join_condition=' AND '.join(
-        ["{res_table}.{col}={temp_summary_table}.{col}".format(
-            **locals())
-         for col in grouping_cols_list]), **locals()))
-
-
-def get_ignore_groups(first_table, second_table, grouping_cols_list):
-    """
-        This function generates the necessary clause to only select the
-        groups that appear in second_table and not in first_table.
-    """
-    second_table_cols = _grp_from_table(second_table, grouping_cols_list)
-    grouping_cols = ', '.join([col for col in grouping_cols_list])
-    return """({second_table_cols}) NOT IN
-                (SELECT {grouping_cols}
-                 FROM {first_table})
-           """.format(**locals())
-
-
 def pagerank_help(schema_madlib, message, **kwargs):
     """
     Help function for pagerank

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/pagerank.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in b/src/ports/postgres/modules/graph/pagerank.sql_in
index a4c2004..a6858bd 100644
--- a/src/ports/postgres/modules/graph/pagerank.sql_in
+++ b/src/ports/postgres/modules/graph/pagerank.sql_in
@@ -52,15 +52,15 @@ algorithm initially proposed by Larry Page and Sergey Brin is implemented here [
 @par PageRank
 <pre class="syntax">
 pagerank( vertex_table,
-            vertex_id,
-            edge_table,
-            edge_args,
-            out_table,
-            damping_factor,
-            max_iter,
-            threshold,
-            grouping_cols
-          )
+          vertex_id,
+          edge_table,
+          edge_args,
+          damping_factor,
+          out_table,
+          max_iter,
+          threshold,
+          grouping_cols
+        )
 </pre>
 
 \b Arguments
@@ -169,15 +169,15 @@ INSERT INTO edge VALUES
 (6, 3, 2);
 </pre>
 
--# Compute the PageRank:
+-# Running PageRank with default values for optional parameters:
 <pre class="syntax">
 DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
-                         'vertex',             -- Vertex table
-                         'id',                 -- Vertix id column
-                         'edge',               -- Edge table
-                         'src=src, dest=dest', -- Comma delimted string of edge arguments
-                         'pagerank_out');      -- Output table of PageRank
+                       'vertex',             -- Vertex table
+                       'id',                 -- Vertix id column
+                       'edge',               -- Edge table
+                       'src=src, dest=dest', -- Comma delimted string of edge arguments
+                       'pagerank_out');      -- Output table of PageRank
 SELECT * FROM pagerank_out ORDER BY pagerank DESC;
 </pre>
 <pre class="result">
@@ -215,7 +215,7 @@ SELECT madlib.pagerank(
 SELECT * FROM pagerank_out ORDER BY pagerank DESC;
 </pre>
 <pre class="result">
- id |      pagerank      
+ id |      pagerank
 ----+--------------------
   0 |  0.225477161441199
   3 |  0.199090328586664

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/sssp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/sssp.py_in b/src/ports/postgres/modules/graph/sssp.py_in
index 4d2bf29..e82abf9 100644
--- a/src/ports/postgres/modules/graph/sssp.py_in
+++ b/src/ports/postgres/modules/graph/sssp.py_in
@@ -34,7 +34,7 @@ from utilities.control import MinWarning
 
 from utilities.utilities import _assert
 from utilities.utilities import _check_groups
-from utilities.utilities import _grp_from_table
+from utilities.utilities import get_table_qualified_col_str
 from utilities.utilities import add_postfix
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string
@@ -129,9 +129,9 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, edge_table,
 
         if grouping_cols:
             comma_grp = " , " + grouping_cols
-            group_by = " , " + _grp_from_table(edge_table, glist)
-            comma_grp_e = " , " + _grp_from_table(edge_table, glist)
-            comma_grp_m = " , " + _grp_from_table("message", glist)
+            group_by = " , " + get_table_qualified_col_str(edge_table, glist)
+            comma_grp_e = " , " + get_table_qualified_col_str(edge_table, glist)
+            comma_grp_m = " , " + get_table_qualified_col_str("message", glist)
             grp_comma = grouping_cols + " , "
 
             checkg_oo_sub = _check_groups(out_table, "oldupdate", glist)
@@ -223,7 +223,7 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, edge_table,
             subq = unique_string(desp='subquery')
 
             checkg_ds_sub = _check_groups(distinct_grp_table, subq, glist)
-            grp_d_comma = _grp_from_table(distinct_grp_table, glist) + ","
+            grp_d_comma = get_table_qualified_col_str(distinct_grp_table, glist) + ","
 
             plpy.execute("""
                 INSERT INTO {out_table}
@@ -478,7 +478,7 @@ def graph_sssp_get_path(schema_madlib, sssp_table, dest_vertex, path_table,
 
         if grouping_cols:
             glist = split_quoted_delimited_str(grouping_cols)
-            select_grps = _grp_from_table(sssp_table, glist) + " , "
+            select_grps = get_table_qualified_col_str(sssp_table, glist) + " , "
             check_grps_t1 = " AND " + _check_groups(
                 sssp_table, temp1_name, glist)
             check_grps_t2 = " AND " + _check_groups(

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/test/hits.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/test/hits.sql_in b/src/ports/postgres/modules/graph/test/hits.sql_in
index 5a521f7..0cbe962 100644
--- a/src/ports/postgres/modules/graph/test/hits.sql_in
+++ b/src/ports/postgres/modules/graph/test/hits.sql_in
@@ -19,11 +19,12 @@
  *
  *//* ----------------------------------------------------------------------- */
 
-DROP TABLE IF EXISTS vertex, "EDGE";
+\set tolerance 10e-10
+DROP TABLE IF EXISTS vertex, edge;
 CREATE TABLE vertex(
         id INTEGER
         );
-CREATE TABLE "EDGE"(
+CREATE TABLE edge(
         src INTEGER,
         dest INTEGER,
         user_id INTEGER
@@ -36,7 +37,7 @@ INSERT INTO vertex VALUES
 (4),
 (5),
 (6);
-INSERT INTO "EDGE" VALUES
+INSERT INTO edge VALUES
 (0, 1, 1),
 (0, 2, 1),
 (0, 4, 1),
@@ -54,18 +55,104 @@ DROP TABLE IF EXISTS hits_out, hits_out_summary;
 SELECT hits(
          'vertex',        -- Vertex table
          'id',            -- Vertex id column
-         '"EDGE"',          -- "EDGE" table
-         'src=src, dest=dest', -- "EDGE" args
+         'edge',          -- edge table
+         'src=src, dest=dest', -- edge args
          'hits_out',       -- Output table of HITS
          3);                  -- Max number of iteration
 
+SELECT assert(count(*) = 7, 'Tuple count for hits out table != 7') FROM hits_out;
+
+\set expected_authority 0.0865332738777835
+\set expected_hub       0.375721659592363
 
 -- Test Case: when max_iter=3, the authority score for node 0 should be close to 0.086533
-SELECT assert(relative_error(authority, 0.086533) < 0.00001,
-        'HITS: calculation for authority score is wrong'
+SELECT assert(relative_error(authority, :expected_authority) < :tolerance,
+        'HITS: Incorrect value for authority score. Expected: ' || :expected_authority || ' Actual: ' || authority
     ) FROM hits_out WHERE id=0;
 
 -- Test Case: when max_iter=3, the authority score for node 0 should be close to 0.375721
-SELECT assert(relative_error(hub, 0.375721) < 0.00001,
-        'HITS: calculation for hub score is wrong'
+SELECT assert(relative_error(hub, :expected_hub) < :tolerance,
+        'HITS: Incorrect value for hub score. Expected: ' || :expected_hub || ' Actual: ' || hub
     ) FROM hits_out WHERE id=0;
+
+
+-- Test case for grouping columns
+TRUNCATE TABLE vertex, edge;
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6);
+INSERT INTO edge VALUES
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 1),
+(1, 2, 1),
+(1, 3, 1),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 1),
+(3, 0, 1),
+(4, 0, 1),
+(5, 6, 1),
+(6, 3, 1),
+(0, 1, 2),
+(0, 2, 2),
+(0, 4, 2),
+(1, 2, 2),
+(1, 3, 2),
+(2, 3, 2),
+(3, 0, 2),
+(4, 0, 2),
+(5, 6, 2),
+(6, 3, 2);
+
+DROP TABLE IF EXISTS hits_out, hits_out_summary;
+SELECT hits(
+         'vertex',        -- Vertex table
+         'id',            -- Vertex id column
+         'edge',          -- edge table'
+         'src=src, dest=dest', -- edge args
+         'hits_out',       -- Output table of HITS
+         3,                -- Max number of iteration
+         0,                -- threshold
+         'user_id');        -- grouping cols
+
+SELECT assert(count(*) = 14, 'Tuple count for hits out table != 14') FROM hits_out;
+
+\set expected_authority1 0.432666369388918
+\set expected_authority2 0.016273613872603
+SELECT assert(relative_error(authority , :expected_authority1) < :tolerance,
+        'HITS: Incorrect value for authority score. Expected: ' || :expected_authority1 || ' Actual: ' || authority
+    ) FROM hits_out WHERE id=2 and user_id = 1;
+
+SELECT assert(relative_error(authority , :expected_authority2) < :tolerance,
+        'HITS: Incorrect value for authority score. Expected: ' || :expected_authority2 || ' Actual: ' || authority
+    ) FROM hits_out WHERE id=6 and user_id = 2;
+
+-- Different max_iter and threshold values
+DROP TABLE IF EXISTS hits_out, hits_out_summary;
+SELECT hits(
+         'vertex',        -- Vertex table
+         'id',            -- Vertex id column
+         'edge',          -- edge table'
+         'src=src, dest=dest', -- edge args
+         'hits_out',      -- Output table of HITS
+         3,               -- Max number of iteration
+         0.01,            -- threshold
+         'user_id');      -- grouping cols
+
+SELECT assert(count(*) = 14, 'Tuple count for hits out table != 14') FROM hits_out;
+
+\set expected_hub1 0.040618557793769
+\set expected_hub2 0.00818161871503306
+SELECT assert(relative_error(hub , :expected_hub1) < :tolerance,
+        'HITS: Incorrect value for hub score. Expected: ' || :expected_hub1 || ' Actual: ' || hub
+    ) FROM hits_out WHERE id=3 and user_id = 1;
+
+SELECT assert(relative_error(hub , :expected_hub2) < :tolerance,
+        'HITS: Incorrect value for hub score. Expected: ' || :expected_hub2 || ' Actual: ' || hub
+    ) FROM hits_out WHERE id=5 and user_id = 2;

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/wcc.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/wcc.py_in b/src/ports/postgres/modules/graph/wcc.py_in
index 64a7dd9..3905c10 100644
--- a/src/ports/postgres/modules/graph/wcc.py_in
+++ b/src/ports/postgres/modules/graph/wcc.py_in
@@ -30,7 +30,7 @@
 import plpy
 from utilities.utilities import _assert
 from utilities.utilities import _check_groups
-from utilities.utilities import _grp_from_table
+from utilities.utilities import get_table_qualified_col_str
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string, split_quoted_delimited_str
 from utilities.validate_args import columns_exist_in_table, get_expr_type
@@ -137,10 +137,10 @@ def wcc(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
             """.format(**locals()))
 
         comma_toupdate_prefixed_grouping_cols = ', ' + \
-            _grp_from_table(toupdate, grouping_cols_list)
+            get_table_qualified_col_str(toupdate, grouping_cols_list)
         comma_oldupdate_prefixed_grouping_cols = ', ' + \
-            _grp_from_table(oldupdate, grouping_cols_list)
-        subq_prefixed_grouping_cols = _grp_from_table(subq, grouping_cols_list)
+            get_table_qualified_col_str(oldupdate, grouping_cols_list)
+        subq_prefixed_grouping_cols = get_table_qualified_col_str(subq, grouping_cols_list)
         old_new_update_where_condition = ' AND ' + \
             _check_groups(oldupdate, newupdate, grouping_cols_list)
         new_to_update_where_condition = ' AND ' + \
@@ -449,7 +449,7 @@ def graph_wcc_largest_cpt(schema_madlib, wcc_table, largest_cpt_table,
         groupby_clause_subt = ''
         grouping_cols_join = ''
         if grouping_cols:
-            select_grouping_cols_subq = _grp_from_table(subq, glist) + ','
+            select_grouping_cols_subq = get_table_qualified_col_str(subq, glist) + ','
             groupby_clause_subt = ' GROUP BY {0}'.format(grouping_cols)
             grouping_cols_join = ' AND ' + _check_groups (subq, subt, glist)
 
@@ -575,7 +575,7 @@ def graph_wcc_reachable_vertices(schema_madlib, wcc_table, src,
         grouping_cols_join = '' if not grouping_cols else ' AND ' + \
             _check_groups(wcc_table, subq, glist)
         subq_grouping_cols = '' if not grouping_cols else \
-            _grp_from_table(subq, glist) + ', '
+            get_table_qualified_col_str(subq, glist) + ', '
         plpy.execute("""
                 CREATE TABLE {reachable_vertices_table} AS
                 SELECT {subq_grouping_cols} {subq}.component_id,

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/sample/stratified_sample.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/stratified_sample.py_in b/src/ports/postgres/modules/sample/stratified_sample.py_in
index 165844b..0621d61 100644
--- a/src/ports/postgres/modules/sample/stratified_sample.py_in
+++ b/src/ports/postgres/modules/sample/stratified_sample.py_in
@@ -21,7 +21,7 @@ import plpy
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import _check_groups
-from utilities.utilities import _grp_from_table
+from utilities.utilities import get_table_qualified_col_str
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string
 from utilities.utilities import split_quoted_delimited_str
@@ -65,7 +65,7 @@ def stratified_sample(schema_madlib, source_table, output_table, proportion,
             checkg_lp = " AND " + _check_groups(label,perc,glist)
             window = "PARTITION BY {0}".format(grouping_cols)
             grp_by = "GROUP BY {0}".format(grouping_cols)
-            grp_from_perc = _grp_from_table(perc,glist) + " , "
+            grp_from_perc = get_table_qualified_col_str(perc,glist) + " , "
             grp_comma = grouping_cols + " , "
 
         validate_strs(source_table, output_table, proportion, glist, target_cols)

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/sample/train_test_split.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/train_test_split.py_in b/src/ports/postgres/modules/sample/train_test_split.py_in
index 37a6505..011b14f 100644
--- a/src/ports/postgres/modules/sample/train_test_split.py_in
+++ b/src/ports/postgres/modules/sample/train_test_split.py_in
@@ -21,7 +21,7 @@ import plpy
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import _check_groups
-from utilities.utilities import _grp_from_table
+from utilities.utilities import get_table_qualified_col_str
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import add_postfix
 from utilities.utilities import unique_string

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 5acb24b..8beb701 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -709,16 +709,35 @@ def _check_groups(tbl1, tbl2, grp_list):
     return ' AND '.join([" {tbl1}.{i} = {tbl2}.{i} ".format(**locals())
                          for i in grp_list])
 
-
-def _grp_from_table(tbl, grp_list):
-    """
-    Helper function for selecting grouping columns of a table
+def get_filtered_cols_subquery_str(include_from_table, exclude_from_table,
+                                   filter_cols_list):
+    """
+    This function returns a subquery string with columns in the filter_cols_list
+    that appear in include_from_table but NOT IN exclude_from_table.
+    :param include_from_table: table from which cols in filter_cols_list should
+                               be included
+    :param exclude_from_table: table from which cols in filter_cols_list should
+                               be excluded
+    :param filter_cols_list: list of column names
+    :return: query string with relevant column names
+    """
+    included_cols = get_table_qualified_col_str(include_from_table, filter_cols_list)
+    cols = ', '.join([col for col in filter_cols_list])
+    return """({included_cols}) NOT IN
+                (SELECT {cols}
+                 FROM {exclude_from_table})
+           """.format(**locals())
+
+def get_table_qualified_col_str(tbl_name, col_list):
+    """
+    Helper function for selecting columns of a table
     Args:
             @param tbl        Name of the table
             @param grp_list   The list of grouping columns
     """
-    return ' , '.join([" {tbl}.{i} ".format(**locals())
-                       for i in grp_list])
+    return ' , '.join([" {tbl_name}.{col} ".format(**locals())
+                       for col in col_list])
+
 
 def get_grouping_col_str(schema_madlib, module_name, reserved_cols,
                          source_table, grouping_col):

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/utilities/validate_args.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index 50cd87d..baefa5d 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -262,6 +262,13 @@ def get_first_schema(table_name):
     return None
 # -------------------------------------------------------------------------
 
+def drop_tables(table_list):
+    """
+        Drop tables specified in table_list.
+    """
+    drop_str = ', '.join(table_list)
+    if drop_str:
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(drop_str))
 
 def table_is_empty(tbl):
     """

[2/2] madlib git commit: Feature: Add grouping support to HITS

Posted by ok...@apache.org.

Feature: Add grouping support to HITS

JIRA: MADLIB-1151

Additional Authors:
Jingyi Mei <jm...@pivotal.io>
Nandish Jayaram <nj...@apache.org>

- Changes to support grouping column in HITS. Update queries to
use group by and other necessary sql constructs.
- Add the design doc section for HITS.
- Refactor some common graph utilities.

Closes #195


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/4b7d9cc5
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/4b7d9cc5
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/4b7d9cc5

Branch: refs/heads/master
Commit: 4b7d9cc55dde1b3b222237b6ee2b43fb068e28a9
Parents: 5d66df8
Author: Nikhil Kak <nk...@pivotal.io>
Authored: Tue Nov 21 10:45:12 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Tue Nov 21 10:49:27 2017 -0800

----------------------------------------------------------------------
 doc/design/figures/hits_example.pdf             | 310 +++++++++
 doc/design/modules/graph.tex                    | 126 +++-
 doc/literature.bib                              |  15 +
 src/ports/postgres/modules/graph/apsp.py_in     |  14 +-
 src/ports/postgres/modules/graph/bfs.py_in      |   6 +-
 .../postgres/modules/graph/graph_utils.py_in    |  82 ++-
 src/ports/postgres/modules/graph/hits.py_in     | 631 ++++++++++++-------
 src/ports/postgres/modules/graph/hits.sql_in    | 240 +++++--
 src/ports/postgres/modules/graph/pagerank.py_in | 154 ++---
 .../postgres/modules/graph/pagerank.sql_in      |  32 +-
 src/ports/postgres/modules/graph/sssp.py_in     |  12 +-
 .../postgres/modules/graph/test/hits.sql_in     | 105 ++-
 src/ports/postgres/modules/graph/wcc.py_in      |  12 +-
 .../modules/sample/stratified_sample.py_in      |   4 +-
 .../modules/sample/train_test_split.py_in       |   2 +-
 .../postgres/modules/utilities/utilities.py_in  |  31 +-
 .../modules/utilities/validate_args.py_in       |   7 +
 17 files changed, 1339 insertions(+), 444 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/doc/design/figures/hits_example.pdf
----------------------------------------------------------------------
diff --git a/doc/design/figures/hits_example.pdf b/doc/design/figures/hits_example.pdf
new file mode 100644
index 0000000..d86efc0
--- /dev/null
+++ b/doc/design/figures/hits_example.pdf
@@ -0,0 +1,310 @@
+%PDF-1.4
+%����
+1 0 obj
+   << 
+      /Title ()
+      /Author ()
+      /Subject ()
+      /Keywords ()
+      /Creator (yExport 1.5)
+      /Producer (org.freehep.graphicsio.pdf.YPDFGraphics2D 1.5)
+      /CreationDate (D:20171108113042-08'00')
+      /ModDate (D:20171108113042-08'00')
+      /Trapped /False
+   >>
+endobj
+2 0 obj
+   << 
+      /Type /Catalog
+      /Pages 3 0 R
+      /ViewerPreferences 4 0 R
+      /OpenAction [5 0 R /Fit]
+   >>
+endobj
+4 0 obj
+   << 
+      /FitWindow true
+      /CenterWindow false
+   >>
+endobj
+5 0 obj
+   << 
+      /Parent 3 0 R
+      /Type /Page
+      /Contents 6 0 R
+   >>
+endobj
+6 0 obj
+   << 
+      /Length 7 0 R
+      /Filter [/ASCII85Decode /FlateDecode]
+   >>
+stream
+Gb!T<]<5_WNa5su;*WF^H9X_d"q3J)GMq-+JGTONL*(trh\p!de6Hc(g:6j!"5P?(1U].5TI>[A.B`T6
+c@8(-b<LKC?XQWCI\i[o^OQ1U+0QE/s2_=NYJ@mmgF'ab]Jf<Ga'X;nV]Ys3[.A58mB?Q`QiES*ro=$+
+CXW4X+92-!^]2?(pET_&msk8q)=cVa1\UK:S+jJ.e)m3EU<VSQ_LL+_s/F/IinpAd&="@0bV(hpd.<<J
+5P4k8C[CMk[F91;kC;M&^\eVSN%\Z?jPpdFPJ@%j0E5jHr&:\S5)m=#'jg\3CTu7=H<Z.hrr#O$Qi>O]
++6pX+^E&ZL^=)f0C<&QH0)Vu?b(#)YJ!?J<G&s0HIXGkNgAV!_V-gj;TD%C(%<N?p48f(drV@UF;#L*2
+aPU7po^gr^U,>pL&-":&n?7E`s6ue6*1YP&`sZ`Vl<grhnShUF*Uirlq`$[9V#Ah-EBYVK%JYl6cOYEh
+QhaWFZ1W=UJ&f*J[J]C[%=A&gr#eH&@g7sS0O4CI*V%JeSgZ8Zcr0i8ISfkZh9fB2ZW8SFBq8V=o#j0,
+)IEY]TF0uXYM9WU?_a1?p=6:sGu;CS:ffWID`bs\qJ#=1Qh7]>',O=XBOK%aUH5c`m&:]-^!C:;hMF%g
+PG:0k/I`*TkgVc%TDOoqb%.qBp<TC96?D$M`u&;6;h!ZF9(_N#XOo(YpK@%8g\G<?O5Fs&53p,4B?B^)
+YL]E+QqelBM4'lc#G'6_Z'QQ/B.Z0ZXSJOZ2%GZe<lqARf^@_GFB7\"F_71?kNejnIoI6Z:uNsRR:kb#
+.?k(lj^-m]6/D5l#5P$bn]nIC.sP^nY;m\pF_4Rl/a_QoM+K]J^e'0$O%?HL7oIoR5h:RnkBlpQcjAF7
+2*]unk<'ffST1dI)"Yu4XORC61UpsO+thpe*%d@rS4tfnN\mNYDX!9ENGH(GbimE`1s2Js3Yq#K;TFQN
+\ZaTh4+H85?1u8%@?sCk_2n</4'$d:;"IgodIg2opV@kM%G*C<m^\0dlX:R`<?42$1!NJ(I4/Ec=um,T
+Hr7jtU9="]j=DOj4S4D:SSAt9/63S#;Eh7i!Z9<Kde<&,,>BoLh&W&tVt7fc?@%@WC4(<F;H]'(c_S'8
+e8*aOP%VZo_K[Ts)I?fDs719?B!cnG)lh(fan+^RA,n(RV!kck>b&>qn\]O^4%<H0>oBG\,_DsB@%T`9
+&C)_7mho[FJVm1-a\#@batM)/i@%aI0RC^*)]7-roJ0aui(3\;3*cTLZ[_qPOLO<^3;BrUj?$^#*'A=\
+*'mMa'[;g.B9ND%s!$-&HdQPm^kK0tSk`p[7lJoS=a!oE@@u/XkLq$`*"KqY(-[&bE>$m0IrkS(\,MNH
+Z]K(fFC9$F,KkkZ^#>#1-dq8#bulJU.X#Gf&fGdS]<@g64`U_si,">sOTT`sUk)<;LEbp)R^eZ=RHfpF
+S!Ot]jeOeW7\($1eAY>e46!J\?.(lh:>I-<HG?9i1@q/eQG9TompNaFo""1adjF)XWB4U_TZ&As2+c<5
+'4\%4cBqj/Vs`d_;GqU5]/Z4O^RVmI:nrQRc%375W,%bu*keq^F+(;0dbhDmb[sIj=g)tg5$UOSL$cc=
+\+Boj:-1$C2EduJH$%cjr]W*pG!JOQZWG*4B1W.NCd->`5-nsmX3)/q>/_jRO@.=YJFAO!4>QMIa;UT9
+O\&QR1IP1.G"3O+Aim@$OJiCM'0H,Tc`"LI!YT1s+ed1R%aaS_-m:$)RCOAZj4p*g\>9MT_Cn3Cc(0f:
+Vf?$AYGBq6g^i*Le5:t3L'JJ9?Xe0'gotut]3/fC>92:knAL-%Z6-)3qS(<M<(p[km)l`GcSkI\9[sG4
+n]Vs)aqt<5rN&dE2QC#<GNR1SBR>qq$l/23e2:LPMp=,DCGegJD'5E^cm;&[p>dhL;%LXd\.TPB;;3\i
+dRLfkdZDc(9N%L*?J5pf&p>hWlAQ"Scr"rr0q>a7#=3u'Es'D5e.,bB2tS=UoZ8gQ8'Y'P61H005u5*U
+2)/B-[W^;uUl]EIZm&("7kWI7;mNfA3\2R]4,cTl0ZhlYF`Wa_%8AC??0sIu.YVDK;OP3T;e2A.4+@j_
+Rq-;I.^Q]sCM0)Q\mu1ZOtA@6)d4Hc:pMsWcg1"4ZjSLJA8?sfUUXiUjH;)K,o_/u+kb5cH:YJf=?D4I
+_b<Qk[YCj!6aC-;p:-s=CQrY.2G\L_Cch8u8@CmfE[@2"QY_C&g!C@5]P/+Qr#cJPdP+A#2:q@Hfs!Ma
+\mu1ZOtA@,"mX&Q>J%IUh6uT),[3@TpX"1>L,?)ECIe#UIS45cOCZ6;7_Y%FS/7=tW.\[hA<JMu,ob2:
+abFun6aEE;&s<\KRaWS;/'UK$+mEO#%B];bD!7jRAA_#Lh-I;e\*T9;40,r'F6&K90I7Z`-7#'Z_Bg/@
+Y`_X;N3Iu<#[Y0T/F%+,R[7Km,aW=L='C#GZ@U&;82d+QOtBLR%P^Z!##msqU"qH,A7IjW,ocZV\RYf]
+ESGb0[^.i3gu*YqRiFY]BnO6t]4MC<ID[nj<l/<s0qTE'=K\(#.]\2;VQFk`KP&n0HdPe%LF1_Y`@Tm9
+XabH^O^'\RDo4H]<g29^9JcW#576g4LCZmBNH9/'Ur@%kPjLc+';St-Eo&ZL.p1!:C\hsQX`,aW7Fi7q
+lM^l%dRs1@V-9,"J^8@&ijF?T>#m9BKtoJ-7=Efd$MT".^bNA$ee\r_+/S:f_cCg]#&<1Yh`CGZkn\dr
+T(16&Hbe^VK"VF73GRHsDG:0b=Dli]XOe>'CpOlo51brl.Q#8QG<EB4*dH`%M,?R=cD4<;op[[8N4Eq`
+9^Ye$g]@sHN.h#cAgeot'f7cu"_AV,4^6P&G/c3r5Jo4qXqU7E5?7iOVs`&em:DFWN1-['AG$V<+<4P'
+5Hp8,#_OZA$kW+5k*Q2jC=I'BJ\0Yi#0$Kfp5s\"GFTdQIrV"*0,j46(J"EQ=ndKB5HFg=1&Yb^R*D@J
+W+r368=_ujj[_;.ngH(PREn*-dE2S4GOjs]AWPL<%d[r)5E+Bqj]7!005Ep:'=@*#qKLl85a=7JV)uj^
+m`qRt)bf0#l6bV+Nf]'P7d"/0\0H/eB;Ab<o$U,:fjt3@NRG7Z=hr2JZqu:dngPSA.D%.SN]*o1Y&I4q
+o^bB]_o$&:@@19H[a)b(1oebulL_GS?/-p[o/XQ?,A=rbek]DlTJL]!KgK]Q[oCr3:.<q%G/hlgB#AiM
+^s5j?h)DQjQ0,S'2@B8&\u\^][(ciamSJGBA-4ArU,1k$]85_Ql5Is\CF)Q;:5,,'Al2iV#B(<IM@)if
+l\G#=Hrh,m\6a'P/A_&AOln5Xg+Cb&XOeXJ%O5^)27L"3jqX"cB$M*7S?2Shkb'Yl5tIhDlX1Yt_&)rs
+)oZUc>mC2gJ<A,2Cg0VS0pP!Y.`7$sE2U,XYn,MSNA>*mLOHM*."qt6ge1kMcs7Z#o\*t-Xn.S:3F9u2
+k4-L[FmLl`etTn8Z0gCOH&UGEX`aF7KJfeea56Kkng*&>V:)\0Yi<K2fA1'&GiE-0`G`A!iuVqq];0Ou
+T+Li:4<_@:[td`5^5md=/_9E$(H#>qJg,r%!pAp>r=a[hTQB)#i4=\nnW*Pf3@tY#$gpb2M)R?Q_mAiT
+p^('K`6,_??j4ZLT7F<[f...@n>1cA+.b0^L9>-*U<hr
+FMbQ#4du!Ej6`iU&Z&e(N@,jq0)#JWChniFBCccJG2AX>qCcV/9<<$I7j-";5)8HYf%Qop'!n1X>q]O6
+]EV0,?aSIE-c[h'[^Z6\oLF?j-]mkRGh+V<U?@Fdqo]5Y4DS;G)Lu]5dAME_JQ`#>h/,jK,o%=NGhmAW
+7DD^&idRN/.$Q$#@BI<RcT86uCB[m&Mc.\d7p[RLQrHXL>%6RSlZ<:g?.-fDXG^D]otsOG7KiP9kW$J)
+Tj%ermcKMH9$\L_[q5I-83(?bd;J>4s4&V>s5I5=LP/.FaVq\,XlEc[RDKSHm6Z(s?1eQrI-)HQ[7e=`
+IJl',[I`XX`(c2:Rqt8ghGMXA>ibT%_`e1W[ktT!AV`U>_Jsg7dRhEkj![sKn\XsM?fi3QbBWEOpl(F3
+qu=,rRLJ#Uhp;tQ*WV?G!rT:TrAA\-[:e^Em*F#j6P82fV2YDX00M/1)SVmif!#)s#/uX1L]"W@V"n(X
+NdiOAAsF\2LfmXbLD=<ih3C0D7*_lGp@q^:lZ^LQG9!2Ds8F$cj@o4?L9ILq@H0R7Q%:NMAds"g'hP_C
+dEk"M@H$o_DS3kJk>IeYe?!(XB,<0]ZHU`h^:M9g:`(e5+;@B\a<G=<]9aW<jSXitOsVk-Bn5DqW#AZ5
+^D9DIcH0fa+)-9D-$^t44fD;FHR'&4ho775rC`805rX^tE?&K,@@]XQ@jP/uC,1slE(&!Q7BPhq7jQ>_
+dbtm3ZZ^L1@8!,[Bc/jmAo6AB]QDQ9Ue$p`bP/`51cs)bDilk<8STc=-BR%)*@m9W?Og3PoAQ5lT9i'r
+TN@&YC)AU`bK`3(i\I%1cCK+0-p9@lMV:#s\^SB)?PYaealt4r:A;4F;0Jun7acfb[cR[eP\q5<ST4=K
+9:$i)=NQ[qdKF/-%DJ>E/&N6a)bGZgCQ?g^[rOAuXDC;QC]nbM]1W61jNPnK`'/0[Q[dtV1le+KrJPlI
+,AD=JHC.J9\.9=p]2%`R]&!)-0l0q]XPZP91lK+./6p_g,&%,3^!!Z)C&CUmKWs9\,@9#<o6?Y))k+ag
+@(`8i8=FtVWmP`6Xf)0[;60D8[#:<1*5*C^_c@BsC-s%6':=n=rhUfQ]NpSoZ!K#FL*O8DCp+n-UNlQ.
+&&k%+3l5RRHj4513q>f[a#4$I)1Eg*9_aTj?'7o34\+DN*_GDUY)]_1C%*Dn'Pp1`nT"iir,9liN\,9`
++6gE3+7SCH.IV\YYP%Y/:%u7_/LmSa?et="1X:6BDCn&D]bHfAV(6md%V-<K"KLo5e?+slKjM='RX:VD
+>U#EAdDU#pGA+I#pRW%?:ISt'I<3%:@DgM?h[NjYHg30!^?<'5a6C>TDfP65>['`,>NIekp2I19T/9;P
+bcjV^5\35<8YbVjRa[34>g/g00$*iK0%sD7e6VL%iV&K6[*XeeFtM:@DPO8s'3f76KBN!!ge#3^0`J@q
+Bp5_p+\aV;6!i=:L'FP8G##mLZ^as7Jk+[CiLp&?i08j)pHM9r*R$P*T_4R&V]:("`&WNb3i=rX:'o_#
+>sH&T;[uD9Z#9D1_'s$h2$KnJ\b5LN']Iu/Z^">.#7f&;m.VlN:>u,';`NgqPg`ZBR;pBcj_HoTYc$TQ
+'$*&OYrIGE0'<qW].^(D<lm@UZjP[JH9Gdb`ud#:>j2lko"l4$PV__#of,[&\,/E$db<.THI2H0f*=p\
+g]r/P6E,@*#Aqc;I%WKjj(VgAk4p+EpC0>f$)-)qpebCZIBklkeXLrZ*6Ciqh@\\,^\eQM1\O`4h:C^'
+&*W>o]`)Ct7s45AQn'b[)/&(1J>7Ons+[_Y59nff_cu&T\[-13_PsO=>ZKrf,bFmYkE_anrOd2g\&=c.
+GGVO?d8IB8]Z9HlSP,QnMu1E("J"9T+q0Gc?!HTIDETUY#SVNhcE>T&8)W$I40-I"00enTq;L"4H+eG&
+(9I5kQ8X&<jQBh$D#O3U(H1Pt5;,jTmcS`RIU:'4D*pO%Ht)jA#,\"/W,25"!ms2KIbpoO+nP:,mHpM#
+Q+,LEWO61./Oq6<;r#^bFE9[Miu$h0+ME9aa%E`.44;7\e%rQ$DMd))XPBl1aNVf?Wqg@BjT^=..P%tQ
+[rOQRX7E:'H@00DWRlX?B'%#lbJVKi39nBk;#Z!Y5Hk*Ah9,+e<pnL[YEp8V[DllW[rS[LR!^RS^7#.2
+R[)L+gjs;GQ;!i]j[REV?g9l-"qkcq?EES/1$jAFFf'eDoGtQA#^1k@St\)e<B"7rli?0jU8J9@s13*I
+lmhSt_@D+NW;CjK#tWVF]KijCb3n60r4J&B!^i)rn(@TONofG06SVm;o)c1(oCFU+2os=')6N7s>hD2O
+]7+)1^9/jfAVU)-SSmlYFo9lNckaZ3hlNPc%':MD>Eer0f@696p[^#dlK%,^*Mb=^o*-sG@UT%XrC_E@
+d;FF.A(bX;D_4J$f5E*i5?<!1kM!fT0]Ph#4*\C#7%*">?<_Rlq0<(17l;thRXZbNYIEhH?M<+/c\iCk
+W.nCMIN-7#%arZ2Ma<9b$FBseL&ATDm\kg#k?W6Tb**d/EQaH_,$X;r7m_Z27sB*/jlb`l,=_;\?J.%'
+96?A-QVe\Op\Y-%h7N3Q3Vi)srM"u.*nH5HnDaWFoIMm"l'_=leRN97HZO"P?IZD#S21YJ'G3&]BD2GU
+c\kZ]*i2Fd&G^U"r9i`jI!=!3PKt[e3%+bO6Mn]n/\%G)Z0NJ1`B1e_?<C0M\N:]&Q6#f*CbZ]\K3N<>
+nu(.I\>qaQ3WXHXY/OfBLAV0rlmL)e8AbrXF58K^H!hRO`ebXrT7DrsIbi9]YLraDqJY"^5fYnd2pUUE
+D-EM,j_`rG-s`!SA>=!]!e(1T[Pc"4X7uu,lcS"LqTlR37As.KE^)5D9DKa!pX*+:_e4l'q8kom/N,2A
+IboL?I\oX2OD]E[hiIcX0CVo$#L>Fh^[=9[YP\WQH.K]K5B#HsK>.,riTJ2Yc?*Ai!O5S^V;+T+_P8bK
+mljW.YNtB-4"15L.F_ZUaj1pI4Gf*[kN/&drKV%40h:YmAZs"b[29M:O45,-(V5=tH-^?8i1=H$&+8/n
+^N<\4\')DXoNp@&+@!%Sp[\UWH5<G4jlc;[h9YE#b0S6Coh3+KID;?4j8+&W?6\ae!r7*DR@i)Nbb0"E
+9+Mo"1?RcU>Obd$Wq^N.Goalh32W=7?R?lBlZdOkcFh1ZhZi5;nh*]'_Zp[c4kJ!X[@]#V+4Y=KI51_W
+c'.;_>o''e/nt*@Tsci;M;/%8NqTUS)h&YZ/N%8D,K>H"V+'C8e,jIkiX2.-Ip\]l,nCb-R/sj^_jh+=
+ig8f?hmZ*)/O-So"7KjW=CkJu(_3NP=EbTHd=c@Mpl7%5oSdQbj#0*&3Dq'JGDnIQa-Scan<kgj]d)cB
+lM.O&,]u<?`n\Hqnq$\&/Xu>RNp1ZRe.$(;4;q\,?F=TE[!Wk`bM>-l#on8Wk@hABdP)sM^XPtpB=4dB
+j;tJlmH6EB(%E6MVX1q<3WB"+V4A!,ci3a_(YVY8fj@Ds(s[$"=^(bJ45"@EH*OjhQW'Q;hpO)?b-Ln]
+g1bP9Fk5BUA<*Yb*jdBdo[I$bmOe?[MiP$M/shT?I,RAb,EAJ&+"sHb'36@I(_Ws,j/,jbIoVhS#;'._
+4no]U\s!<KK7&%qr<pb&g,3RlHf:uJ'(A36;5)/r,o3W.pC)V*pQf\geKITNV?O;Chd)^mm8Er3VTlE9
+/'ZG,NoBcW>S$c\B>.o[h5&FDQa/5(I-Pu:kW:EsXD[rGg]](IUK$$B7ia7qADC2!FMg;Cnb_<[O,I[A
+4_&(@aeE;`p=:Tl;XPRc[Bk$='@3[s8$'<O"0?Sliu^IEK'K#X[6-FPles(Mm\5(EChPrr.WUpS@p^"_
+Ffeq3$(nqpdNhqp&V2=8`]G2\]KETr[!g%'`2>Ab6Vk#G"dQCgdO8eTC7b&D`Nsq(EOt[g(\dBqA"%F\
+.[U[K]?IiTadH_>7rek>\"k[rcLIkV/l8>tmu/.X?V*spJ[9&Tn(=Hhj"(+ZUOIBSc=c_!gSSBb0ai!]
+r7a_gFZldoBj%/QiIt?;GWj%?4o=l+\>H>i-plfU8n*mF@1;dWBuo!Sc1!sr.$`s/qm*-iB\*fihq4u<
+)j,RE3o7Vkd\MB_[.l")-HX[q;mK(D_?75fImI'6J60,e8MH:.:E9%9lMnfR?dsSG@h$VCjScdOW;d[J
+77#q;R"=bZ__NQU^9/HF16o%Le/pHt^nWt2IIk.r\VislK7\_/nZE&k)#?PA1+"SB)NW.]0kKn=@'B0r
+Af6d@cfNhbBEe:K$IO.W`f*L1r3-^s)V%[!__NR@iB8Dq&e=r\;'G+^hN8boHPmag]Q@'TU0V;'Ek3"/
+Qt/1r\G8AXpotHV*H=Ijrk^WJ_dbA'kgtlaW>p<DA[=iaRA1UYWGDUGiRgMII'k]FcA(iVK\^rg.71D`
+0Z+`P-He'bgaejVMH:aNG3?,t![Q-Up"6O4=&]@LJO4Xd?1Pn?HjanI0ChW&I-ks7,NBfW3>q.'(1rcn
+/M6&S:%,XA@8G(fOf/m2*VK9MC7%c=[jIS[5"-o)F2"FfC>2sVMqUhs[Ki>1>+fH@Z+phHJM6aA*G8lo
+7kHZq)4d`pBn<:>[6unW?eOje??ciUp>)R"ai/XU:hbf3a%u-/0;-(G7gmV'hSM?TID,QON<sGOA(%%%
+cu>8=2<8r"b7E?-b%:\Ykj\0*;ci%@RA`cJSKhuMe2<baho!0-dL!>iH0SNUjK3r[6uEdYE2&SJhAG]"
+N_l&H!ABuPXn==p?]*Z_-P[cmF`q-6PV;=q>08O2UUl[C]VBba4#3h7=7A=YDpU0N'R\@'gF9kYH[<@+
+Mq;p,[^`sEeBlV.p'T'KblD*[[2D"DPc;))8![sh.5%6]4#/-#\uR'gX2`BU'gA@YnOejm>'s@alM8uX
++$9Gj;m'p^.](b5b%YY<S!Ma)Bq0g)pfY)O^oS?d%np3)eH_tRo&<D/r02gRR""gg#Q=-hs,=b=Yo&&P
+.c4H:*;hq1-T#lN9%9`CQd$F0HHDs)5"b'p#>2Rfis]U,IWucgd;G.PS=Xot3\SXQ`>^F8ekl!k#l&UK
+\%EuHRlQ/cLS%;eK-pQd;bt<Yp/GBn%l*<1!9/WBHFjWW\r2#^04"4fUKqQh7^otjLNa_J>kGXXi4BUJ
+dFW0`qb<&h"Kb=k+.i.$Dqdb4jb:dFN`M)I2sa773Tlb^b47kY/ggPfVVh=cqaeF"4"g6`3mg_e9=FTG
+Nidb%oXfT^f+,Im)b,_-BD\(@3cq#:SpYUdVaNb^Tl4N4>dq*;C>upLU>Ht^n#[U+cWrdl!\iJ.,T;Ah
+qTlS.HYWdEa1cPpB2QgR!,_BL(ZWo;^u#IqkS]jVk!:6)>qYce)`f3D)NH5B+!+cUIk:60cSFn$FeKlu
+eAQ_"6;K[iSUkOapXY(O!n(aGE&Cc0QqqpanuYYL>1uC2GM@K1d_'5*[k@<jF,f]9rU>P<2ed^jc1JoK
+?+dX)]%(QaXURMg9qqo?pJ%D)hH-l.%hJIn3\uX?is<(oktS'l3*L27SLZijJVJ(0JZ7H%Q^(^4cu)UQ
+]'@]SO#6uU>'[8Xj$.a1gNp#NV]Xd1m.?Y%G!!D%hJ9$eSKhkc%@u%#q;f?+4@;,<akLZ,*[17273uNq
+0\Ku>2(j<lNT)8il%^pOIX'amSRNBuV%W!PatSHSMk,ZX\p7,l1j<PO>u=eiC$1iYU0&FEo\,Q0Fi9)B
+`lf6Y/D9F&f5Sg06.Os.qsGKs_u&jBgWq7Kfdtk1^:oe[pdaRLl'XMO5#gqAY.B@sr:!,b<)?s5I[dc=
+8n<V=JWNBS>KoeGrUAT2b0PY/CF=VPHlfo@:Hk\4iLn!/OudOVZNZm-[ftAHr/5dckD8Qh]0(9-Xub+O
+LRWa;3rtJiU%Wg7ikATNT7G#H?s3tNn&S6ha)U)iqV=AUb]U[jjam@5h"6ff'@=#9hqL0N(!$:*lG):;
+!?LJ/gat?1jNs:BkkkS5jXj)kjI8o[9dAmc!?Jebht4h`$+T5T&*cC2L)PE%%c<aim!:2tXjog(I*<N%
+o7M[7LGpe\n#\DEn@-&$\%U\f7t81JW@3!LE4,GjKjO@H*'EOoP\olk*QK#h`<Y8fLW:FOFC%h<dgp@6
+q*J#R>XbfL?::K%=Y;c6n6"iuN,)?b'p%*%.e<?p<g$X8#nCe`H>'c5nK7(?qVe^.[Vh@l5;8!PhJo2m
+#^u;VMf[TE#P3]FQZ0FY8%[sX!0,*H*R'th&X>W31RU5"Oib@;BSO!SUthuN?5q-f*)XB`I+%cD.6b95
+A`4J+onh'sQhH6J$\iE$]=D@BJaGPTP)V69n'Jc6CpMJ=`4b1Z<lZoVQ]s9a`J@j?X%W[m*Tm+87=KGm
+P"^A@QZ%)6*l(a4a:hr",MBD@k'+hf+LC*j!M_[OAfeuT*&FdhoX<>=aUU'DB.Vr1-b_l:5:\J[>63a&
+/eAJeW%6YJKuKBYpT2ls/p6J,BoVJ#ci=[H;4)"C,Wf%S:VZWPF67Ag0!(9<e'q>N;`]5LHW,YlAn[T1
+*0C2`)X9^DJZ<;M1gHD8%UYBNQESm3EF-JGTm#(K&q,2S'^!,_^*t?(VWPB;UCRoWkB5X`Eiau!&mYHG
+QGFgf7)";<FAnojX>3/?,LIfu>:PX]]UQY+LU%=hBs.p!l1`.jmZ7$#_o'W;b]KY5;FUBi3?5Dl1gfU<
+8tT:p1ILAq)(@7H05Wg=FQ3@f6HcB&o.I%$nP`Js@G!%ZI:e%bj\p.MCXOhu$<a^;Y&Q(lbRh7SnT.)E
+G2o1"9%\%"CnN'^qb.sdME;iI]eFIV6-$&kSUujliK_-Q\FQZ?/e:WkdTL?f7:i=:b-nQ)^TYfC:`=%F
+PSM(a<M_dnX(/W"WJh<cS;c6fNk4RrJbdM"W"eMLZdC.S4^540D9,8(G"_D=&YPC7?osWG^[_t>4tc!m
+2d9`4<9$[kH+nbfc:jYtMqLI:PKXqbAhIFPK@%q:pOpDNE"htD=j*X1+r_d$hF;S/*jVW!jir/ki'+eG
+.i4U*1GDao!*-?l,j,#T1cTtahV"XpX&?)C-Y0ZT+=PsN41aP7/6_4/ZdUSiikYMl;KfE'E1Ti^,m!O"
+=tQpnjmR4p*cW0_^U9qC>mAgi=1,,4n;qA/DUOIU#h3C+$VWX+'B;f`KP?Ea#sh/>VpJ3s3hLCM#(jH\
+COA'h-!obB]X1VQEbu=5]1s%k0fbs![j9$PW,%iFjuM@u8s^"24'k:YL3,;SVB.Z6*RntiXb([UCodY\
+PG;u?D@AJhmnEft4@;#RZ."#\AnO$>Ca,.XR`,rg]Ic\T4/lBk$PH+5c9j>pW,4SHjd;/g$Sir"b`YH6
+6nNtQhQ])\ZfAu(V+e!p`@!lHm^SNB@A=nSl"3(a4Q3p:4G00jLni=5].DOSlI2g%.;U-\alZao]M;l]
+Ob:18RA<IX]tD(bU<u(<V.L*.?o_'.[+DaVoB^Y.84Dg:HiY.aa^5jOXO/<_0fPV:Xilk_op-;:$'X8E
+1h^G#HD-b*3;`p8bbN\_9.N-*2LC%h8+IOmJA%`ge!_2-\=e$g%rWt]*DA+$BNpL!_i7]Vi#.E!;PHu+
+ZtV=pi`'H,C3Dhjajm3%ppl.P-unN\Xh)7-mO05iI+>.g"*.mqqQ_h"A=7S)S3qr[073JWR_g8f&&JgY
+e)o6G+=>tuNhK'?c:Wsm8iH(C6h)&k7@C3Rh<;89&Cf>1-Ip,_NPTL1D9-@iRoC?W2,e%gEb)bmA)3sY
+9AKm+DS7tMp:C/cKFu`Q-dab2kNC?*f"ib?DXG6+/*RK9G4A3/EIVc)\t'4lc&ghY!qRc#?g9!"mJ9Q"
+Hdtgt,H@)^0D#^2#7#h>;t\*cF%_k^:!/UUU`A+tnE#]/D3Tn5O!(F&.?*M8=.C%\mck)D-Iif:'dJ$b
+p]!bG._(@Rr/P3H_Eg$9oEZJM`6\D%n)/%#*rZOCfDrO5BdhtW$sOF+I;'9Seo&Fp=KdHr7/#a.GlQ?/
+X8\sEih+FtF\_`qDiB8+LcrVu_"7luYtJjf1WBZ[qNG"+G8r=E<rX11lr*KMMB=P_aKm\W@g9d^bJ_Ud
+a^^a+ql"a5$1mS\)f``=F?N7JP`MG#,@hF=2r.u3edB/NE`OdK+"W^,OMrt8p,EHPPP-Bt=JilmAHJmN
+A!ql0^9LAQL1EFlB,/9RMao;VQ!mEHQ3jm9m?TMGUW"tlh(4l6Z&`lT*R6`qX9@22W6&-g_\'Mf1V*8_
+HS!Xsmo3C?e(0MoU6Y4KXgB%jZjSMk;9GNcYZRd)?Pn`(VHoMIQ"X(J\fb*+;]t\I3E@[E^)Y[5RIE#-
+fGbmTKO/Hf==/tT=6H,+[6?Qcl\>eZ1c$90@[W_r*dr]#L_^5,YtS$4cDRcd6*OH<;QBNd4)_OoXhfc2
+O]!1jn5q#-*m57bYtUs5nRs5oW81Q&oZGf1Pp5]UGq;mqCrA*f/R*P>9H&rD-+nn8o)h4fbJ*IJ-HEr`
+f"sM_YtTC(AoWdP=Z'_#(>bg#cS9NQ,7(s>=Jh'OYK::BOL_;AO(E@md5$7jbJZ'?a5jJ\H"2lC;GgOa
+&k=cA=pa40l,GJ%bKYi6Vld<9roN3nZ.n\Og)oU&DI5urXE%kGJ&dFXnh1a$jR%7EPP;(Z-B@:U_'aZG
+NT.Y,==3)_)Ta_f)U2c[*-j*!A"NcRAeqK_Pd+=;]Gg/6Q80JO`A0H2eM4.--@Qi[<NkHE%+[53m^H]m
+7-1B$8M]6>5P]KgNZB\WPG3G5]W'TSXI)4(GKncdR]1LVOA8r^P$hJUBp^K9%pgDFZ71FO\_Lk`HQ]s\
+d45@;p2auii-/#,<++r`O`9oS`<c*8IN?'_0^A5ri.L_.[u37qnM_,]/Z9r]Ki68P2j\(olK$pB[uY8q
+.9k3G,Ee*tWjah^Qe=,%A[ZA-/on9q0UG]*Ce6j7lH&=UN\)7g:aQYIdKor^3]T#$"I2nLeu=TsM5ua_
+f@*jNik"X=;K8F60E)SghH@%&?mb!8o;D$GF#M;W2PtYC`:/D[pn"]pXtgF2c'BB+@sc8&amSBc-P+K-
+0)cb#c]+LL(Z4qTq",<9N"B(Xk6Qnd?hF.o5OfbflJE0lNY18&rH!Y/[lqBXr4js1S9PtU=gmt*aRF0:
+`6Y@G]^L0K[9PWZ[8so&K(!R!UQk\cVd:\?ob6)fG=M0s&^nX)oaJssE`(5f,!!YiY9*'`V('H@P+urT
+[laS31"/hrdd&^.F>m&2kIb<=-IR1BpLZfa_Ig"!KIo:;l6[U_k8/F&8qaaInj9+=Q9bn^#2,$Mi%n'5
+bT6:KQ@r3hfq)RHntaZFh,4C@9*WUuVFsZmr*3d_q`3mFqa"6E/@0P.FqQC?3Lj9()JBq:.;66UB*D'a
+>%t]@>[3nL?h@(MZghAF:.q?jS=RZ:;+aRddZ:^tpf=k0?7P9#\6TYUq.E+NY0=f^&TPDiKiiS2OB\\H
+,&EC@C9K#Q*X/7m1l8BHMrE4N2l[%LS9fWl3><MMI,t=,O/RcJ(1ju!ri(]XA0G.,j";+;#`gABj?uW3
+QLZJu7GK)ZP$+<?q$J9e('SjK;jECu/`J[M7GKrt\;+))9*C2`O19>6Lg-P.ITjZAo/=][V9E"`HH[]k
+?e+8P3om`LgLE(*q5P/unuB&SO4H\\/#/S(gcCHMp@JtTm.dgu7ggm\iUWI]MgB)skB'`h0mE5R4]RoR
+qGIG6j,JSP,!1n^b)sX^3VN4h=;tH>"-/LIrHE)J7`[((kX(1d$LIH+I#2/;m/;5.2sg'(Ja(TN%]-7p
+,,`[^<.r:2/[Lgjl1!4kjt$BoGJkF(p:W9C.os:p1JNp'rTDlgqp2X:7Gg%Y4i.u&O_4R1*j^W,8Q22k
+AA270c0WT47'h?\m(qtLl7JQa9/J.(;^t8T/M%PNUPI&pYY/*%1QY,^3XuLui?"d2lk-^X)0$$P+%5[I
+:.#W4b'^V.4M)"(`/L/(Pi9e8qlJWJQS)uYWo&97Lg-dH8)-/>qdubbEF[.XPKG+FDsO]I&[uYUnq^LP
+?ab@T6adiSDarLdZC&qQN4AO6(&Q-sA]<978JqpgDK_rS=?ha'5N8Z12eb\E;5n3HlG<nUgS<SGU=AVE
+o"I=Ni\ln;[K^?^Xad=+gRToc_'3dMX2V>;ATCD0Jku5l42iGi>rJS8*&I2M`P.X'G>s;7EP(U+<(q.*
+`pHFqNQ<N5:JVS3JS]HK/)lk%7Z?a#GWCCS,eIpf3d)AE.oS'M^p\Suh&>*n0C2^)Y$fdaP$$UF'n'Jd
+<Lc@kKe,\bd1M\ffs':6:qMSd>IgA<7"tc.k"&OZQ(kW^Bs`\#+2:>$3e$E>5P^fnL4]PH!q&k^fn)`L
+3d0>s^NddXicb(T0*&ACnb_IB?f5!1Eq0kT3PJG'-fFDad4m`aJP;!<llFackh&\$h<%?5DIB*;f71h!
+EfAIpdE6ju39kGFLf`sAH2T\g-X^>RJ@e&+a,r#iV9@U48M;-]b9Mh)q+<(^iWK[+CP^[[LWuIkQlZK&
+l9EliFsVDN4lVc1TcT@7RB+=P@Wq'sg;Pj@o<O0VfY$Joc5E%KT+S\^VFQ-NbUt^nYJgQIKIuOUD>XEA
+T!^46i\n%8BjZ8GLq%"sU3upX3TYOkfg*bWCtY)k:Mf_Ho+RT6ao').MW`V(QClgEPeWFLmd&3l?KLS9
+Ll-P23SnfT`!=n\df0.W^ZY*&(P_b?.qFS8M&j.!3:Klg%t/06d&^OI'76qR'DmTUJku5l5:IY!D@e/W
+fPND7'0C5!Z"'DW^9jL[1<Pikg#hfUc0[#*DB1l@VgV"PXa_35HAko]^]iijD;aEAY%5TK_"XC16LLuq
+EI4oZ\*ZZu``GR`1945ooIV$,\tqses3i00/LOFagbO4Rp4uqIm1HgY/E56c,&W6ZcUsL=8U@sS#boq,
+\2C<aX>iW#?#aI;HBsjM]`?SY,,?lMQBENo3Epr9`9fkVX*DR"+H*aPXGC)0Bq3!]BujY=]J*P4_T4k!
+f!8=ZKGh+4V&Op(fuur8Ekkeq/aZp47bg&uLg/H%5:tl\/Qnl9eiM*K&TPuO?]WWEH]p\br8\CY**b!Q
+i\iKcPLsgJLg/&d'bRnpa7+mhc8T3K/Orf/gfg#D>=tBaDB'X9]b.`LCA*HfNk'MTD4oi-@sH0lVCr[(
+leLq@\*R+m*e0tV,P'ZHX7G)+S5$\nR].KKSIQ]\lVpd#X])HlI!F*6gF\o@\'0jaRaS9T(3++J2iG1W
+rhY#fDJ@d_)!G=a4kLfq9:M5(_;$]LLtf_i/7,^!(Ae+I[Gqk.["g)NL2m]`MQ[/YjmO?7Ol#hu:-U7B
+:S$Liig*5GO18cgjq5)ZP+P$u.t(2AB&1%pdro2gDV^5,>MB'#ig+c(p<t4'".@1]Y^S-"3M.k^QL0nU
+EGO,rMmtX6Va]B]I,d]OP(rd%gfgj9Mn%kR[Hcg,QiVY*`%=JVT0-TX8q-bgb`0-WpalQ\[GVRG,gAYY
+\K"bjD;_-14A#?!,&T2WppV2^CK[8\imr-G\P2%%,X!CoE]cKWF0dMUEsM5'nVGqYhe#JF/`N);?5u(h
+7GJ*uO!UMHSQ*-JY>im->?Rl$8)._SrknGcgHZNolW=b.lm3infoAhn+e$gUA.c;p?PTb//BcU-WK=Bc
+4A)[)`W!k<WEnaN)0Y)5(3+[>g*TtD'o":!f!-E:\SfY:DI<3*H,HrZ/gG7*.9TkYfBS`O@9mCU[t/6j
+aI0dP@M-)s:6)t_`E%"_Nj0>I+=HCn/h=_%qb]E@,b%F_:qM[rFj<q(m6U,4;:t,a/E/QA,Ar?K).ThS
+7'4l^DV8h*Bb@huR&9hrf?[Ne=n]7;-R^3Wg"Q"Q>P>F/4qj2#EQaUh*c^L-9+eXa/E56c,&W$GPiULG
+i=WPhJaRi$s7?oWaT[Qga194IL:O-O#k6:eDRlnsARa2$+^_[-pJ6^W%?qnRZ9@35BR%HIgYFgJQFM1X
+Xn.GpDE5H/N4;$9lePd<GYQMt<iBZ`9a\X,1ESP)raJUWUl^^Va%a^=lfs5l-=pQ<NPO8-)isZ,pRtiu
+CO5ohZ$$`Ef(i4TT3jM.-`u3*+86?E5!k:!PbGTY3S$pI^9jAo[2V%Rld]lN2Bo'WPJ/[apR>-bg_7sh
+/c"dU"H&\5@@r+bo_gAagtc>dZa;:\P,F#\@s++1](s6)krGL$GlJV#3u'OI^Kq!n]52EUX/Amn1W-tV
+qu_GPoX+n*g0f\h)NNPdAqD&llX^KS*W]2;:&]<NduVApq9YeYBE\+Rd4-#/gA(Ge1?Nh(V/Je#1;e84
+?]8CW\JX2fB8'FY(G#>ifEXm]i:HZK:Bhi1)N,d;IIJQTYO_p9J\BP"qL:5OaTqH<a2r6(YFWAaG*@46
+*uH?VO9q"$iSP%^XGBs88r(&&ZO[L'`Vj..Ji;kX5]21]pd7CV3cQ68JW(2B_LsOinpSJq3`M.Uh7L5`
+n`*)/U[;R'6MB#j^olG*9FGB#0[%i?lH0\mA-K`Rb73sq6[Pg\?T4_`INSRh5_HiG^V>M%s7)0-^YcrZ
+s6:0_^S\3d56PS[';Hb?ITPhE>;]+N5Q1A\B>f<"j*q0$bO]!~>
+endstream
+endobj
+7 0 obj
+   16332
+endobj
+3 0 obj
+   << 
+      /Parent null
+      /Type /Pages
+      /MediaBox [0.0000 0.0000 1434.0 1007.0]
+      /Resources 8 0 R
+      /Kids [5 0 R]
+      /Count 1
+   >>
+endobj
+9 0 obj
+   [/PDF /Text /ImageC]
+endobj
+10 0 obj
+   << 
+      /S /Transparency
+      /CS /DeviceRGB
+      /I true
+      /K false
+   >>
+endobj
+11 0 obj
+   << 
+      /Alpha1
+      << 
+         /ca 1.0000
+         /CA 1.0000
+         /BM /Normal
+         /AIS false
+      >>
+   >>
+endobj
+8 0 obj
+   << 
+      /ProcSet 9 0 R
+      /ExtGState 11 0 R
+   >>
+endobj
+xref
+0 12
+0000000000 65535 f 
+0000000015 00000 n 
+0000000315 00000 n 
+0000017075 00000 n 
+0000000445 00000 n 
+0000000521 00000 n 
+0000000609 00000 n 
+0000017051 00000 n 
+0000017529 00000 n 
+0000017245 00000 n 
+0000017284 00000 n 
+0000017386 00000 n 
+trailer
+<< 
+   /Size 12
+   /Root 2 0 R
+   /Info 1 0 R
+>>
+startxref
+17602
+%%EOF

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/doc/design/modules/graph.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/graph.tex b/doc/design/modules/graph.tex
index c631e16..a794c8f 100644
--- a/doc/design/modules/graph.tex
+++ b/doc/design/modules/graph.tex
@@ -22,7 +22,9 @@
 \chapter[Graph]{Graph}
 
 \begin{moduleinfo}
-\item[Authors] \href{mailto:okislal@pivotal.io}{Orhan Kislal}, \href{mailto:njayaram@pivotal.io}{Nandish Jayaram}, \href{mailto:rraghu@pivotal.io}{Rashmi Raghu}
+\item[Authors] \href{mailto:okislal@pivotal.io}{Orhan Kislal}, \href{mailto:njayaram@pivotal.io}{Nandish Jayaram},
+			   \href{mailto:rraghu@pivotal.io}{Rashmi Raghu}, \href{mailto:jmei@pivotal.io@pivotal.io}{Jingyi Mei},
+			   \href{mailto:nkak@pivotal.io}{Nikhil Kak}
 \item[History]
 	\begin{modulehistory}
 		\item[v0.1] Initial version, SSSP only.
@@ -31,6 +33,7 @@
         \item[v0.4] APSP
         \item[v0.5] Weakly Connected Components
         \item[v0.6] Breadth First Search (BFS)
+        \item[v0.7] Hyperlink-Induced Topic Search (HITS)
 	\end{modulehistory}
 \end{moduleinfo}
 
@@ -665,3 +668,124 @@ table, except that for each new neighboring vertex considered, only one of the
 parents is recorded as its parent (the node with the smallest id among all
 parent nodes). The content of this updated $message$ is then copied
 to the $out$ table, and this process continues until $message$ table is empty.
+
+
+\section{HITS} \label{sec:graph:hits}
+\begin{figure}[h]
+	\centering
+	\includegraphics[width=0.5\textwidth]{figures/hits_example.pdf}
+\caption{An example graph for HITS showing normalized hub and authority scores}
+\label{hits:example}
+\end{figure}
+
+Hyperlink-Induced Topic Search (HITS)~\cite{hits} developed by Jon Kleinberg is
+a link analysis algorithm that rates Web pages. The idea behind the algorithm is
+to assign Hub and Authority scores to all vertices. Authorities are analagous to
+web pages that have good authoritative content and get linked by other web pages.
+Hubs can be thought of as large directories that themselves do not hold any
+authoritative content but provide direct links to other authoritative pages.
+
+HITS is an iterative algorithm where the Hub and Authority scores of vertices
+from the previous iteration are used to compute the new scores. The
+Hub and Authority scores of a vertex $v$, at the $i^{th}$ iteration, denoted by
+$HUB(v_i)$ and $AUTHORITY(v_i)$ is computed as:
+\begin{equation}
+    \begin{aligned}
+        AUTHORITY(v_i) = \sum_{u \in M(v)}({HUB(u_{i-1})})\\
+        HUB(v_i) = \sum_{u \in M(v)}({AUTHORITY(v_{i})})
+    \end{aligned}
+\label{eq:hits}
+\end{equation}
+
+where $N$ is the number of vertices in the graph, $M(v)$ represents the set of
+vertices that have an edge to vertex $v$, and $HUB(u_{i-1})$ and $AUTHORITY
+(v_{i})$ represent the Hub score of vertex {u} in the $(i-1) ^{th}$ iteration and
+Authority score of vertex $v$ in the $(i)^{th}$ iteration.
+
+\subsection{Implementation Details} \label{sec:hits:implementation}
+
+In this section, we discuss the MADlib implementation of HITS in depth.
+We maintain two tables at every iteration: $message$ and $cur$. The
+$cur$ table maintains the Hub and Authority scores of all vertices
+computed in the previous iteration, while $message$ maintains the updated scores
+of all vertices in the current iteration.
+
+\begin{algorithm}[HITS$(V,E)$] \label{alg:hits:high}
+\begin{algorithmic}[1]
+	\State Create $cur$ table with a default Hub and Authority score of
+			${1}$ for every vertex
+	\Repeat
+        \State Create empty table $message$.
+        \State Update Authority score in $message$ using Hub scores of vertices
+        in $cur$
+        \State Update Hub score in $message$ using Authority scores of vertices
+        in $message$
+        \State Normalize Hub and Authority scores in $message$ using L2
+        normalization
+        \State Rename $message$ to $cur$
+	\Until {both Authority and Hub scores have converged or \emph{max}
+	iterations have elapsed}
+\end{algorithmic}
+\end{algorithm}
+
+The following query is used to create and update the Hub and Authority scores
+in $message$ table using the Hub scores in $cur$ table.
+
+\begin{algorithm}[Update Hub and Authority scores$(cur, edge\_table)$]
+\label{alg:hits:update}
+\begin{lstlisting}
+-- Create message table and update authority scores
+CREATE TABLE message AS
+	SELECT cur.id AS id,
+		COALESCE(SUM(curalias.hub), 0.0) AS authority,
+	cur.hub AS hub
+	FROM cur
+        LEFT JOIN edge_table ON cur.id = edge_table.dest
+        LEFT JOIN cur AS curalias ON curalias.id = edge_table.dest
+	GROUP BY cur.id, cur.hub
+	ORDER BY cur.id
+
+-- Update hub scores in message table:
+UPDATE message
+	SET hub = subquery.hub FROM
+		(
+		SELECT  message.id AS id, COALESCE(SUM(msgalias.authority), 0) AS hub
+		FROM message
+		LEFT JOIN edge_table ON message.id = edge_table.src
+		LEFT JOIN message AS msgalias ON message.id = edge_table.dest
+		GROUP BY  message.id
+		) AS subquery
+	WHERE subquery.id = message.id
+
+\end{lstlisting}
+\end{algorithm}
+
+
+The Hub and Authority computations are terminated either when a fixed number of
+iterations are completed, or when both the Hub and Authority scores of all
+vertices have converged. The Hub/Authority score of a vertex is deemed
+converged if the absolute difference in its Hub/Authority scores from $cur$ and
+$message$ are less than a specified threshold.
+The following query is used to find all the vertices whose Hub/Authority scores
+have not converged yet.
+\begin{algorithm}[Check for Hub and Authority convergence$(cur, message,
+threshold)$]
+\label{alg:hits:update1}
+\begin{lstlisting}
+		SELECT DISTINCT cur.id FROM message
+		INNER JOIN cur ON cur.id=message.id
+		WHERE ABS(cur.authority-message.authority) > threshold
+		OR ABS(cur.hub-message.hub) > threshold
+\end{lstlisting}
+\end{algorithm}
+
+\subsection{Best Practices} \label{sec:hits:bestpractices}
+
+The HITS module in MADlib has a few optional parameters: number of iterations $max$,
+and the threshold for convergence $threshold$.
+The default values for these parameters when not specified by the user are $100$
+and $\frac{1}{N*1000}$ respectively.
+It is to be noted that this is not based on any experimental study. Users of
+MADlib are encouraged to consider this factor when setting a value for threshold,
+since a high $threshold$ value would lead to early termination of computation,
+thus resulting in incorrect Hub and Authority scores.

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index 6784f5e..8c53813 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -960,3 +960,18 @@ Applied Survival Analysis},
     Title = {{Accelerating Recurrent Neural Network Training via Two Stage Classes and Parallelization}},
     Author = {{Zhiheng Huang}}
 }
+
+ @article{hits,
+ author = {Kleinberg, Jon M.},
+ title = {Authoritative Sources in a Hyperlinked Environment},
+ journal = {J. ACM},
+ issue_date = {Sept. 1999},
+ volume = {46},
+ number = {5},
+ month = sep,
+ year = {1999},
+ pages = {604--632},
+ doi = {10.1145/324133.324140},
+ acmid = {324140},
+ publisher = {ACM}
+}

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/apsp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/apsp.py_in b/src/ports/postgres/modules/graph/apsp.py_in
index dce6c33..884ac99 100644
--- a/src/ports/postgres/modules/graph/apsp.py_in
+++ b/src/ports/postgres/modules/graph/apsp.py_in
@@ -34,7 +34,7 @@ from graph_utils import get_graph_usage
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import _check_groups
-from utilities.utilities import _grp_from_table
+from utilities.utilities import get_table_qualified_col_str
 from utilities.utilities import add_postfix
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string
@@ -137,12 +137,12 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
             # necessary. In other cases, they are used to simplify debugging.
 
             comma_grp = " , " + grouping_cols
-            comma_grp_e = " , " + _grp_from_table("edge", glist)
-            comma_grp_m = " , " + _grp_from_table(message, glist)
+            comma_grp_e = " , " + get_table_qualified_col_str("edge", glist)
+            comma_grp_m = " , " + get_table_qualified_col_str(message, glist)
             grp_comma = grouping_cols + " , "
-            grp_v1_comma = _grp_from_table("v1", glist) + " , "
-            grp_o1_comma = _grp_from_table(out_table_1, glist) + " , "
-            grp_o_comma = _grp_from_table("out", glist) + " , "
+            grp_v1_comma = get_table_qualified_col_str("v1", glist) + " , "
+            grp_o1_comma = get_table_qualified_col_str(out_table_1, glist) + " , "
+            grp_o_comma = get_table_qualified_col_str("out", glist) + " , "
 
             checkg_eo = " AND " + _check_groups(edge_table, out_table, glist)
             checkg_eout = " AND " + _check_groups("edge", "out", glist)
@@ -545,7 +545,7 @@ def graph_apsp_get_path(schema_madlib, apsp_table,
 
         if grouping_cols:
             glist = split_quoted_delimited_str(grouping_cols)
-            select_grps = _grp_from_table(apsp_table, glist) + " , "
+            select_grps = get_table_qualified_col_str(apsp_table, glist) + " , "
             check_grps_t1 = " AND " + _check_groups(
                 apsp_table, temp1_name, glist)
             check_grps_t2 = " AND " + _check_groups(

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/bfs.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/bfs.py_in b/src/ports/postgres/modules/graph/bfs.py_in
index 61af8f5..0504d91 100644
--- a/src/ports/postgres/modules/graph/bfs.py_in
+++ b/src/ports/postgres/modules/graph/bfs.py_in
@@ -33,7 +33,7 @@ from graph_utils import get_graph_usage
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import _check_groups
-from utilities.utilities import _grp_from_table
+from utilities.utilities import get_table_qualified_col_str
 from utilities.utilities import _grp_null_checks
 from utilities.utilities import add_postfix
 from utilities.utilities import extract_keyvalue_params
@@ -186,8 +186,8 @@ def graph_bfs(schema_madlib, vertex_table, vertex_id, edge_table,
         if grouping_cols and grouping_cols is not '':
             grp_comma = grouping_cols + ", "
             and_grp_null_checks = " AND " + _grp_null_checks(glist)
-            grp_sube_comma = _grp_from_table(sube, glist) + " , "
-            grp_sube1_comma = _grp_from_table(sube1, glist) + " , "
+            grp_sube_comma = get_table_qualified_col_str(sube, glist) + " , "
+            grp_sube1_comma = get_table_qualified_col_str(sube1, glist) + " , "
 
         # We keep a table of every vertex, the distance to that vertex from source
         # and the parent in the path to the vertex.

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/graph_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/graph_utils.py_in b/src/ports/postgres/modules/graph/graph_utils.py_in
index 0113877..8a41560 100644
--- a/src/ports/postgres/modules/graph/graph_utils.py_in
+++ b/src/ports/postgres/modules/graph/graph_utils.py_in
@@ -26,8 +26,9 @@
 
 @namespace graph
 """
-
+import plpy
 from utilities.utilities import _assert, add_postfix
+from utilities.utilities import get_filtered_cols_subquery_str
 from utilities.validate_args import get_cols
 from utilities.validate_args import unquote_ident
 from utilities.validate_args import table_exists
@@ -109,6 +110,85 @@ def validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
 
     return None
 
+def validate_params_for_link_analysis(schema_madlib, func_name,
+                                            threshold, max_iter,
+                                            edge_table=None,
+                                            grouping_cols_list=None):
+    _assert(not threshold or (threshold >= 0.0 and threshold <= 1.0),
+            "{0}: Invalid threshold value ({1}), must be between 0 and 1.".
+            format(func_name, threshold))
+    _assert(max_iter > 0,
+            """{0}: Invalid max_iter value ({1}), must be a positive integer.""".
+            format(func_name, max_iter))
+    if grouping_cols_list:
+        # validate the grouping columns. We currently only support grouping_cols
+        # to be column names in the edge_table, and not expressions!
+        _assert(columns_exist_in_table(edge_table, grouping_cols_list, schema_madlib),
+                "{0} error: One or more grouping columns specified do not exist!".
+                format(func_name))
+
+def update_output_grouping_tables_for_link_analysis(temp_summary_table,
+                                                          iter_num, summary_table,
+                                                          out_table, res_table,
+                                                          grouping_cols_list,
+                                                          cur_unconv,
+                                                          message_unconv=None):
+    """
+        This function updates the summary and output tables only for those
+        groups that have converged. This is found out by looking at groups
+        that appear in cur_unvonv but not in message_unconv: message_unconv
+        consists of groups that have not converged in the current iteration,
+        while cur_unconv contains groups that had not converged in the
+        previous iterations. The entries in cur_unconv is a superset of the
+        entries in message_unconv. So the difference in the groups across
+        the two tables represents the groups that converged in the current
+        iteration.
+    """
+    plpy.execute("TRUNCATE TABLE {0}".format(temp_summary_table))
+    if message_unconv is None:
+        # If this function is called after max_iter is completed, without
+        # convergence, all the unconverged groups from cur_unconv is used
+        # (note that message_unconv is renamed to cur_unconv before checking
+        # for unconverged==0 in the pagerank function's for loop)
+        plpy.execute("""
+            INSERT INTO {temp_summary_table}
+            SELECT * FROM {cur_unconv}
+            """.format(**locals()))
+    else:
+        plpy.execute("""
+            INSERT INTO {temp_summary_table}
+            SELECT {cur_unconv}.*
+            FROM {cur_unconv}
+            WHERE {join_condition}
+            """.format(join_condition=get_filtered_cols_subquery_str(
+            cur_unconv, message_unconv, grouping_cols_list), **locals()))
+
+    plpy.execute("""
+        INSERT INTO {summary_table}
+        SELECT *, {iter_num}+1 AS __iteration__
+        FROM {temp_summary_table}
+        """.format(**locals()))
+    plpy.execute("""
+        INSERT INTO {out_table}
+        SELECT {res_table}.*
+        FROM {res_table}
+        INNER JOIN {temp_summary_table}
+        ON {join_condition}
+        """.format(join_condition=' AND '.join(
+        ["{res_table}.{col}={temp_summary_table}.{col}".format(
+            **locals())
+         for col in grouping_cols_list]), **locals()))
+
+def get_default_threshold_for_link_analysis(n_vertices):
+    """
+        A fixed threshold value, of say 1e-5, might not work well when the
+        number of vertices is a billion, since the initial score
+        (PageRank/Authority/Hub etc.) value of all nodes would then be 1/1e-9.
+        So, assign default threshold value based on number of nodes in the graph.
+        NOTE: The heuristic below is not based on any scientific evidence.
+    """
+    _assert(n_vertices > 0, """Number of vertices must be greater than 0""")
+    return 1.0 / (n_vertices * 1000)
 
 def get_graph_usage(schema_madlib, func_name, other_text):
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/hits.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/hits.py_in b/src/ports/postgres/modules/graph/hits.py_in
index 05cd30e..5299a46 100644
--- a/src/ports/postgres/modules/graph/hits.py_in
+++ b/src/ports/postgres/modules/graph/hits.py_in
@@ -30,59 +30,73 @@
 import math
 import plpy
 import sys
+from graph_utils import get_graph_usage
+from graph_utils import get_default_threshold_for_link_analysis
+from graph_utils import update_output_grouping_tables_for_link_analysis
+from graph_utils import validate_graph_coding
+from graph_utils import validate_params_for_link_analysis
+
 from utilities.control import MinWarning
 from utilities.utilities import _assert
+from utilities.utilities import _check_groups
+from utilities.utilities import get_table_qualified_col_str
 from utilities.utilities import add_postfix
 from utilities.utilities import extract_keyvalue_params
-from utilities.utilities import unique_string
+from utilities.utilities import unique_string, split_quoted_delimited_str
 from utilities.utilities import is_platform_pg
 
-from graph_utils import *
-
+from utilities.validate_args import columns_exist_in_table, drop_tables
+from utilities.validate_args import get_cols_and_types, table_exists
 
 def validate_hits_args(schema_madlib, vertex_table, vertex_id, edge_table,
-                       edge_params, out_table, max_iter, threshold):
+                       edge_params, out_table, max_iter, threshold,
+                       grouping_cols_list=None):
     """
     Function to validate input parameters for HITS
     """
     validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
                           out_table, 'HITS')
-    _assert(not threshold or (threshold >= 0.0 and threshold <= 1.0),
-            "HITS: Invalid threshold value ({0}), must be between 0 and 1.".
-            format(threshold))
-    _assert(max_iter > 0,
-            """HITS: Invalid max_iter value ({0}), must be a positive integer.""".format(
-                max_iter))
+    # Validate args such as threshold and max_iter
+    validate_params_for_link_analysis(schema_madlib, "HITS",
+                                            threshold, max_iter,
+                                            edge_table, grouping_cols_list)
+
 
 def hits(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
-         out_table, max_iter, threshold, **kwargs):
+         out_table, max_iter, threshold, grouping_cols, **kwargs):
     """
     Function that computes the HITS scores
-
     Args:
-        @param vertex_table
-        @param vertex_id
-        @param edge_table
-        @param source_vertex
-        @param dest_vertex
-        @param out_table
-        @param max_iter
-        @param threshold
+        @param schema_madlib:
+        @param vertex_table:
+        @param vertex_id:
+        @param edge_table:
+        @param edge_args:
+        @param out_table:
+        @param max_iter:
+        @param threshold:
+        @param grouping_cols:
+        @param kwargs:
     """
     with MinWarning('warning'):
         params_types = {'src': str, 'dest': str}
         default_args = {'src': 'src', 'dest': 'dest'}
         edge_params = extract_keyvalue_params(
-                          edge_args, params_types, default_args)
+            edge_args, params_types, default_args)
 
         # populate default values for optional params if null
         if max_iter is None:
             max_iter = 100
         if not vertex_id:
             vertex_id = "id"
+        if not grouping_cols:
+            grouping_cols = ''
 
+        grouping_cols_list = split_quoted_delimited_str(grouping_cols)
         validate_hits_args(schema_madlib, vertex_table, vertex_id, edge_table,
-                               edge_params, out_table, max_iter, threshold)
+                           edge_params, out_table, max_iter, threshold,
+                           grouping_cols_list)
+
         summary_table = add_postfix(out_table, "_summary")
         _assert(not table_exists(summary_table),
                 """Graph HITS: Output summary table ({summary_table}) already
@@ -95,234 +109,391 @@ def hits(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
                             FROM {1}
                         """.format(vertex_id, vertex_table))[0]["cnt"]
 
-        # Assign default threshold value based on number of nodes in the graph.
         if threshold is None:
-            threshold = 1.0 / (n_vertices * 1000)
+            threshold = get_default_threshold_for_link_analysis(n_vertices)
 
+        # table/column names used when grouping_cols is set.
         edge_temp_table = unique_string(desp='temp_edge')
+        grouping_cols_comma = grouping_cols + ',' if grouping_cols else ''
         distribution = ('' if is_platform_pg() else
-                        "DISTRIBUTED BY ({0})".format(dest))
-        plpy.execute("DROP TABLE IF EXISTS {0}".format(edge_temp_table))
+                        "DISTRIBUTED BY ({0}{1})".format(
+                            grouping_cols_comma, dest))
+        drop_tables([edge_temp_table])
         plpy.execute("""
                 CREATE TEMP TABLE {edge_temp_table} AS
                 SELECT * FROM {edge_table}
                 {distribution}
             """.format(**locals()))
 
-        # GPDB and HAWQ have distributed by clauses to help them with indexing.
-        # For Postgres we add the index explicitly.
-        if is_platform_pg():
-            plpy.execute("CREATE INDEX ON {0}({1})".format(
-                edge_temp_table, dest))
+        ######################################################################
+        # Set initial authority_norm and hub_norm as 1, so that later the final
+        # norm should be positive number
+        authority_init_value = 1.0
+        hub_init_value = 1.0
+
+        subquery1 = unique_string(desp='subquery1')
+
+        distinct_grp_table = ''
+        select_grouping_cols_comma = ''
+        select_subquery1_grouping_cols_comma = ''
+        group_by_clause = ''
+        grouping_cols_for_create_table = ''
+        grouping_cols_for_create_table_comma = ''
+        # This table is created only when grouping is used.
+        temp_summary_table = None
+        if grouping_cols:
+            distinct_grp_table = unique_string(desp='grp')
+            drop_tables([distinct_grp_table])
+            plpy.execute("""CREATE TEMP TABLE {distinct_grp_table} AS
+                    SELECT DISTINCT {grouping_cols} FROM {edge_temp_table}
+                """.format(**locals()))
+            group_by_clause = get_table_qualified_col_str(subquery1, grouping_cols_list)
+            select_grouping_cols_comma = group_by_clause + ','
+            select_subquery1_grouping_cols_comma = grouping_cols + ','
+            group_by_clause = 'GROUP BY ' + grouping_cols + ',' + vertex_id
+            cols_names_types = get_cols_and_types(edge_table)
+            grouping_cols_for_create_table = ', '.join([c_name + " " + c_type
+                                                        for (c_name, c_type)
+                                                        in cols_names_types
+                                                        if c_name in
+                                                        grouping_cols_list])
+            grouping_cols_for_create_table_comma = \
+                grouping_cols_for_create_table + ','
+            temp_summary_table = unique_string(desp='temp_summary')
+            drop_tables([temp_summary_table])
+            plpy.execute("""
+                CREATE TEMP TABLE {temp_summary_table} (
+                    {grouping_cols_for_create_table}
+                )
+            """.format(**locals()))
+            # Create output table. This will be updated whenever a group converges
+            # Note that vertex_id is assumed to be an integer (as described in
+            # documentation)
+            plpy.execute("""
+                    CREATE TABLE {out_table} (
+                        {grouping_cols_for_create_table_comma}
+                        {vertex_id} INTEGER,
+                        authority DOUBLE PRECISION,
+                        hub DOUBLE PRECISION
+                    )
+                """.format(**locals()))
 
         # Intermediate tables required.
         cur = unique_string(desp='cur')
         message = unique_string(desp='message')
-        v1 = unique_string(desp='v1')
-        message_unconv_authority = unique_string(
-                                       desp='message_unconv_authority')
-        message_unconv_hub = unique_string(desp="message_unconv_hub")
-        tmp = unique_string(desp='tmp')
-        tmp2 = unique_string(desp='tmp2')
-        tmp3 = unique_string(desp='tmp3')
-        v2 = unique_string(desp='v2')
+        # curalias and msgalias are used as aliases for current and
+        # message tables respectively during self joins
+        curalias = unique_string(desp='curalias')
+        msgalias = unique_string(desp='msgalias')
+        message_unconv = unique_string(desp='message_unconv')
+        subquery2 = unique_string(desp='subquery2')
+
+        # GPDB has distributed by clauses to help them with indexing.
+        # For Postgres we add the index explicitly.
+        if is_platform_pg():
+            plpy.execute("CREATE INDEX ON {0}({1})".format(
+                edge_temp_table, dest))
 
         if is_platform_pg():
             cur_distribution = cnts_distribution = ''
         else:
             cur_distribution = cnts_distribution = \
-                "DISTRIBUTED BY ({0})".format(vertex_id)
-        cur_join_clause = " {cur}.{vertex_id} = {edge_temp_table}.{dest}".format(
-            **locals())
-        v1_join_clause = "{v1}.{vertex_id} = {edge_temp_table}.{src}".format(
-            **locals())
-
-        authority_init_value = 1.0
-        hub_init_value = 1.0
+                "DISTRIBUTED BY ({0}{1})".format(
+                    grouping_cols_comma, vertex_id)
+        cur_join_clause = " {cur}.{vertex_id} = {edge_temp_table}.{dest}"\
+            .format(**locals())
+        curalias_join_clause = "{curalias}.{vertex_id} = {edge_temp_table}.{src}"\
+            .format(**locals())
+        drop_tables([cur])
         plpy.execute("""
-                CREATE TEMP TABLE {cur} AS
-                SELECT {vertex_id}, {authority_init_value}::DOUBLE PRECISION 
-                AS authority,
-                {hub_init_value}::DOUBLE PRECISION AS hub
-                FROM {vertex_table}
-                {cur_distribution}
-            """.format(**locals()))
-
-        # The summary table contains the total number of iterations.
+            CREATE TEMP TABLE {cur} AS
+            SELECT {select_grouping_cols_comma} {subquery1}.{vertex_id},
+                    {authority_init_value}::DOUBLE PRECISION AS authority,
+                    {hub_init_value}::DOUBLE PRECISION AS hub
+            FROM (
+                SELECT {select_subquery1_grouping_cols_comma} {vertex_id}
+                FROM {edge_temp_table} JOIN {vertex_table}
+                ON {edge_temp_table}.{src}={vertex_table}.{vertex_id}
+                UNION
+                SELECT {select_subquery1_grouping_cols_comma} {vertex_id}
+                FROM {edge_temp_table} JOIN {vertex_table}
+                ON {edge_temp_table}.{dest}={vertex_table}.{vertex_id}
+            ) {subquery1}
+            {group_by_clause}
+            {cur_distribution}
+        """.format(**locals()))
+
+        # The summary table contains the total number of iterations for each
+        # group
         plpy.execute("""
                 CREATE TABLE {summary_table} (
+                    {grouping_cols_for_create_table_comma}
                     __iterations__ INTEGER
                 )
             """.format(**locals()))
 
-        unconverged_authority_num = 0
-        unconverged_hub_num = 0
+        # create message table
+        cur_grouping_cols_comma = ''
+        grouping_cols_join_condition_cur_and_edge = ''
+        grouping_cols_join_condition_cur_and_curalias = ''
+
+        # update message table
+        msg_grouping_cols_comma = ''
+        grouping_cols_where_clause_msg_subquery2 = ''
+        grouping_cols_join_condition_msg_and_edge = ''
+        grouping_cols_join_condition_msg_and_msgalias = ''
+        grouping_cols_where_condition_msg_subquery1 = ''
+        group_by_msg_grouping_cols = ''
+
+        # check for convergence
+        select_distinct_unconverged_rows = '{0}.{1}'.format(cur, vertex_id)
+        grouping_cols_join_condition_cur_and_msg = ''
+
+        if grouping_cols:
+            cur_grouping_cols = get_table_qualified_col_str(cur, grouping_cols_list)
+            cur_grouping_cols_comma = cur_grouping_cols + ','
+            grouping_cols_join_condition_cur_and_edge = ' AND ' + \
+                _check_groups(cur, edge_temp_table, grouping_cols_list)
+            grouping_cols_join_condition_cur_and_curalias = ' AND ' + \
+                _check_groups(cur, curalias, grouping_cols_list)
+
+            msg_grouping_cols = get_table_qualified_col_str(message, grouping_cols_list)
+            msg_grouping_cols_comma = msg_grouping_cols + ','
+            grouping_cols_where_clause_msg_subquery2 = ' AND ' + \
+                _check_groups(message, subquery2, grouping_cols_list)
+            grouping_cols_join_condition_msg_and_edge = ' AND ' + \
+                _check_groups(message, edge_temp_table, grouping_cols_list)
+            grouping_cols_join_condition_msg_and_msgalias = ' AND ' + \
+                _check_groups(message, msgalias, grouping_cols_list)
+
+            group_by_msg_grouping_cols = ' GROUP BY ' + msg_grouping_cols
+
+            grouping_cols_where_condition_msg_subquery1 = ' WHERE ' + \
+                _check_groups(message, subquery1, grouping_cols_list)
+
+            # this is used for the message_unconv table to find out how many groups
+            # have converged
+            select_distinct_unconverged_rows = cur_grouping_cols
+            grouping_cols_join_condition_cur_and_msg = ' AND ' + \
+                _check_groups(cur, message, grouping_cols_list)
+
+        # Variables common to both grouping and non-grouping cases
+        message_join_clause = "{message}.{vertex_id} = \
+        {edge_temp_table}.{src}".format(**locals())
+        msgalias_join_clause = "{msgalias}.{vertex_id} = {edge_temp_table}.{dest}".format(
+            **locals())
+        sum_norm_square_root = unique_string(desp='sum_norm_sqr_root')
+        cur_unconv = unique_string(desp='cur_unconv')
+
+        converged = False
         iteration_num = 0
+        """
+        We need to calculate the hub and authority scores for each iteration
+        and pass the current iteration values to the next iteration.
+        To achieve this, we use the following tables
+        1. cur -> This gets initialized with default values for both authority
+        and hub.
+        2. message -> This gets created for each iteration with newly
+        computed scores based on cur. At the end of each iteration, we rename
+        message to cur as a way to pass authority and hub values to the next
+        iteration.
+        This convention is similar to message passing paradigm in
+        distributed systems such as spark.
+        """
+        for iteration_num in range(max_iter):
 
-        ## Set initial authority_norm and hub_norm as 1, so that later the final 
-        ## norm should be positive number
-        authority_norm = 1
-        hub_norm = 1
+            calculate_authority_and_hub_scores(**locals())
 
-        plpy.execute("""
-                CREATE TABLE {message_unconv_authority} AS
-                SELECT {cur}.{vertex_id}
-                FROM {cur}
-                LIMIT 0
-            """.format(**locals()))
-        plpy.execute("""
-                CREATE TABLE {message_unconv_hub} AS
-                SELECT {cur}.{vertex_id}
-                FROM {cur}
-                LIMIT 0
-            """.format(**locals()))
+            # Check for convergence only if threshold != 0.
+            if threshold != 0:
 
-        for iteration_num in range(max_iter):
-            ###################################################################
-            # HITS scores for nodes in a graph at any given iteration 'i' is
-            # calculated as following:
-            # authority_i(A) = hub_i(B) + hub_i(C) + ..., where B, C are nodes 
-            # that have edges that point to node A
-            # After calculating authority scores for all nodes, hub scores are 
-            # calculated as following:
-            # hub_i(A) = authority_i(D) + authority_i(E) + ..., where D, E are 
-            # nodes that A points to
-            # At the end of each iteration, a normalization will
-            # be done for all authority scores and hub scores using L2 distance
-            ###################################################################
-
-            ###################################################################
-            # calculate authority
-            # if there is no node that point to A, authority_i(A) = 0
-            ###################################################################
-            plpy.execute("""
-                    CREATE TABLE {message} AS
-                    SELECT {cur}.{vertex_id} AS {vertex_id},
-                            COALESCE(SUM({v1}.hub), 0.0) AS authority,
-                            {cur}.hub AS hub
-                    FROM {cur}
-                        LEFT JOIN {edge_temp_table} ON {cur_join_clause}
-                        LEFT JOIN {cur} AS {v1} ON {v1_join_clause}
-                    GROUP BY {cur}.{vertex_id}, {cur}.hub
-                    ORDER BY {cur}.{vertex_id}
-                    {cur_distribution}
-                """.format(**locals()))
+                converged = check_for_convergence(**locals())
+
+                if iteration_num > 0 and grouping_cols:
+                    # Update result and summary tables for groups that have
+                    # converged
+                    # since the last iteration.
+                    update_output_grouping_tables_for_link_analysis(temp_summary_table,
+                                                                          iteration_num,
+                                                                          summary_table,
+                                                                          out_table,
+                                                                          message,
+                                                                          grouping_cols_list,
+                                                                          cur_unconv,
+                                                                          message_unconv)
+
+                drop_tables([cur_unconv])
+                plpy.execute("""ALTER TABLE {message_unconv} RENAME TO
+                    {cur_unconv} """.format(**locals()))
+
+            drop_tables([cur])
+            plpy.execute("""ALTER TABLE {message} RENAME TO {cur}
+                    """.format(**locals()))
+            drop_tables([sum_norm_square_root])
 
-            ###################################################################
-            # calculate hub
-            # if node A doesn't point to any node, hub_i(A) = 0
-            ###################################################################
-            message_join_clause = "{message}.{vertex_id} = \
-                {edge_temp_table}.{src}".format(**locals())
-            v2_join_clause = "{v2}.{vertex_id} = {edge_temp_table}.{dest}".format(
-                **locals())
-            plpy.execute("""
-                    UPDATE {message}
-                    SET hub = {tmp2}.hub
-                    FROM
-                    (SELECT {message}.{vertex_id} AS {vertex_id},
-                            COALESCE(SUM({v2}.authority), 0) AS hub
-                    FROM {message}
-                        LEFT JOIN {edge_temp_table} ON {message_join_clause}
-                        LEFT JOIN {message} AS {v2} ON {v2_join_clause}
-                    GROUP BY {message}.{vertex_id}) AS {tmp2}
-                    WHERE {tmp2}.{vertex_id} = {message}.{vertex_id}
-                """.format(**locals()))
+            if converged:
+                break
 
-            # normalize authority value with L2 distance
-            authority_norm = math.sqrt(plpy.execute("""
-                    SELECT SUM({tmp}.authority_square) AS sum_norm_square
-                    FROM
-                    (SELECT POWER(authority, 2) AS authority_square
-                    FROM {message}) AS {tmp}
-                """.format(**locals()))[0]["sum_norm_square"])
+        update_final_results(converged, threshold, cur, temp_summary_table,
+                             iteration_num, summary_table, out_table,
+                             grouping_cols_list, cur_unconv, distinct_grp_table)
 
-            if authority_norm == 0:
-                plpy.error("Error while normalizing authority score, please \
-                            make sure your graph is a directed graph")
-            plpy.execute("""
-                    UPDATE {message}
-                    SET authority = authority/{authority_norm}
-                """.format(**locals()))
+        # Cleanup All the intermediate tables
+        drop_tables([cur, message, cur_unconv, message_unconv, edge_temp_table])
 
-            # normalize hub value with L2 distance
-            hub_norm = math.sqrt(plpy.execute("""
-                    SELECT SUM({tmp3}.hub_square) AS sum_hub_square
-                    FROM
-                    (SELECT POWER(hub, 2) AS hub_square
-                    FROM {message}) AS {tmp3}
-                """.format(**locals()))[0]["sum_hub_square"])
+        if grouping_cols:
+            drop_tables([distinct_grp_table, temp_summary_table])
 
-            if hub_norm == 0:
-                plpy.error("Error happened while normalizing hub score, please \
-                            make sure your graph is a directed graph")
-            plpy.execute("""
-                    UPDATE {message}
-                    SET hub = hub/{hub_norm}
-                """.format(**locals()))
 
-            # Check for convergence:
-            # Check for convergence only if threshold != 0.
+def update_final_results(converged, threshold, cur, temp_summary_table,
+                         iteration_num, summary_table, out_table,
+                         grouping_cols_list, cur_unconv, distinct_grp_table):
+    """
+        If there still are some converged/unconverged nodes (within groups/entire
+        table), update results table for those nodes.
+    """
+    if grouping_cols_list:
+        if not converged:
             if threshold != 0:
-                # message_unconv and cur_unconv will contain the unconverged 
-                # groups after current and previous iterations respectively. 
-                # we check if there is at least one unconverged node (limit 1
-                # is used in the query).
-                limit = ' LIMIT 1 '
-                plpy.execute("""
-                        TRUNCATE TABLE {message_unconv_authority};
-                        INSERT INTO {message_unconv_authority}
-                        SELECT {cur}.{vertex_id}
-                        FROM {message}
-                        INNER JOIN {cur}
-                        ON {cur}.{vertex_id}={message}.{vertex_id}
-                        WHERE ABS({cur}.authority-{message}.authority) > {threshold}
-                        {limit}
-                    """.format(**locals()))
-                unconverged_authority_num = plpy.execute("""
-                        SELECT COUNT(*) AS cnt FROM {0}
-                    """.format(message_unconv_authority))[0]["cnt"]
-
-                plpy.execute("""
-                        TRUNCATE TABLE {message_unconv_hub};
-                        INSERT INTO {message_unconv_hub}
-                        SELECT {cur}.{vertex_id}
-                        FROM {message}
-                        INNER JOIN {cur}
-                        ON {cur}.{vertex_id}={message}.{vertex_id}
-                        WHERE ABS({cur}.hub-{message}.hub) > {threshold}
-                        {limit}
-                    """.format(**locals()))
-                unconverged_hub_num = plpy.execute("""SELECT COUNT(*) 
-                    AS cnt FROM {0}""".format(message_unconv_hub))[0]["cnt"]
+                # We completed max_iters, but there are still some unconverged
+                # groups # Update the result and summary tables for unconverged
+                # groups.
+                update_output_grouping_tables_for_link_analysis(temp_summary_table,
+                                                                      iteration_num,
+                                                                      summary_table,
+                                                                      out_table, cur,
+                                                                      grouping_cols_list,
+                                                                      cur_unconv)
             else:
-                # Do not run convergence test if threshold=0, since that implies
-                # the user wants to run max_iter iterations.
-                unconverged_authority_num = 1
-                unconverged_hub_num = 1
-
-            plpy.execute("DROP TABLE IF EXISTS {0}".format(cur))
-            plpy.execute("""ALTER TABLE {message} RENAME TO {cur}
-                """.format(**locals()))
-
-            if unconverged_authority_num == 0 and unconverged_hub_num == 0:
-                break
-
+                # No group has converged. List of all group values are in
+                # distinct_grp_table.
+                update_output_grouping_tables_for_link_analysis(temp_summary_table,
+                                                                      iteration_num,
+                                                                      summary_table,
+                                                                      out_table, cur,
+                                                                      grouping_cols_list,
+                                                                      distinct_grp_table)
+    else:
         plpy.execute("""
                 ALTER TABLE {table_name}
                 RENAME TO {out_table}
-            """.format(table_name=cur, **locals()))
-
-        # update summary_table
+                """.format(table_name=cur, **locals()))
         plpy.execute("""
-                INSERT INTO {summary_table}
-                VALUES({iteration_num}+1)
-            """.format(**locals()))
+                INSERT INTO {summary_table} VALUES
+                ({iteration_num}+1)
+                """.format(**locals()))
 
-        # Cleanup All the intermediate tables
-        plpy.execute("""DROP TABLE IF EXISTS {0},{1},{2},{3},{4}
-            """.format(edge_temp_table, cur, message,
-                       message_unconv_authority, message_unconv_hub))
+
+def check_for_convergence(**kwargs):
+    # message_unconv and cur_unconv will contain the unconverged
+    # groups after current and previous iterations respectively.
+    # we check if there is at least one unconverged node (limit 1
+    # is used in the query).
+    plpy.execute("""
+                    CREATE TABLE {message_unconv} AS
+                    SELECT DISTINCT {select_distinct_unconverged_rows}
+                    FROM {message}
+                    INNER JOIN {cur}
+                    ON {cur}.{vertex_id}={message}.{vertex_id}
+                    {grouping_cols_join_condition_cur_and_msg}
+                    WHERE ABS({cur}.authority-{message}.authority) > {threshold}
+                    OR ABS({cur}.hub-{message}.hub) > {threshold}
+                """.format(**kwargs))
+    unconverged_node_num = plpy.execute("""
+                        SELECT COUNT(*) AS cnt FROM {message_unconv}
+                    """.format(**kwargs))[0]["cnt"]
+    return unconverged_node_num == 0
+
+
+def calculate_authority_and_hub_scores(**kwargs):
+    """
+    This function is responsible for calculating the authority and hub scores.
+    This is done in a two-step process:
+    1. create message table and compute authority score.
+    2. Use authority scores computed to update the hub score.
+
+    :param kwargs: dict of locals() of the calling function.
+    :return:
+    """
+    ###################################################################
+    # HITS scores for nodes in a graph at any given iteration 'i' is
+    # calculated as following:
+    # authority_i(A) = hub_i(B) + hub_i(C) + ..., where B, C are nodes
+    # that have edges that point to node A
+    # After calculating authority scores for all nodes, hub scores are
+    # calculated as following:
+    # hub_i(A) = authority_i(D) + authority_i(E) + ..., where D, E are
+    # nodes that A points to
+    # At the end of each iteration, a normalization will
+    # be done for all authority scores and hub scores using L2 distance
+    ###################################################################
+
+    ###################################################################
+    # calculate authority
+    # if there is no node that point to A, authority_i(A) = 0
+    ###################################################################
+    plpy.execute("""
+                    CREATE TABLE {message} AS
+                    SELECT {cur_grouping_cols_comma} {cur}.{vertex_id} AS {vertex_id},
+                            COALESCE(SUM({curalias}.hub), 0.0) AS authority,
+                            {cur}.hub AS hub
+                    FROM {cur}
+                        LEFT JOIN {edge_temp_table} ON
+                        {cur_join_clause} {grouping_cols_join_condition_cur_and_edge}
+                        LEFT JOIN {cur} AS {curalias} ON
+                        {curalias_join_clause} {grouping_cols_join_condition_cur_and_curalias}
+                    GROUP BY {cur_grouping_cols_comma} {cur}.{vertex_id}, {cur}.hub
+                    {cur_distribution}
+                """.format(**kwargs))
+    ###################################################################
+    # calculate hub
+    # if node A doesn't point to any node, hub_i(A) = 0
+    ###################################################################
+    plpy.execute("""
+                    UPDATE {message}
+                    SET hub = {subquery2}.hub
+                    FROM
+                    (SELECT {msg_grouping_cols_comma} {message}.{vertex_id} AS {vertex_id},
+                            COALESCE(SUM({msgalias}.authority), 0) AS hub
+                    FROM {message}
+                        LEFT JOIN {edge_temp_table} ON
+                        {message_join_clause} {grouping_cols_join_condition_msg_and_edge}
+                        LEFT JOIN {message} AS {msgalias} ON
+                        {msgalias_join_clause} {grouping_cols_join_condition_msg_and_msgalias}
+                    GROUP BY {msg_grouping_cols_comma} {message}.{vertex_id}) AS {subquery2}
+                    WHERE {subquery2}.{vertex_id} = {message}.{vertex_id}
+                    {grouping_cols_where_clause_msg_subquery2}
+                """.format(**kwargs))
+    # normalize authority and hub score with L2 distance
+    plpy.execute("""
+                    CREATE TEMP TABLE {sum_norm_square_root} AS
+                        SELECT {msg_grouping_cols_comma}
+                            SQRT(SUM(POWER(authority, 2))) AS auth_sum_norm_square_root,
+                            SQRT(SUM(POWER(hub, 2))) AS hub_sum_norm_square_root
+                        FROM {message}
+                    {group_by_msg_grouping_cols}
+                """.format(**kwargs))
+
+    num_zero_sum_norm_square_root = plpy.execute("""
+                                                    SELECT COUNT(*) AS cnt
+                                                    FROM {sum_norm_square_root}
+                                                    WHERE auth_sum_norm_square_root = 0
+                                                    OR hub_sum_norm_square_root = 0
+                                                """.format(**kwargs))[0]["cnt"]
+
+    if num_zero_sum_norm_square_root > 0:
+        plpy.error("Error while normalizing authority score, please \
+                            make sure your graph is a directed graph")
+    plpy.execute("""
+                    UPDATE {message}
+                        SET authority = {message}.authority/{subquery1}.auth_sum_norm_square_root,
+                            hub = {message}.hub/{subquery1}.hub_sum_norm_square_root
+                    from (SELECT {grouping_cols_comma}
+                            auth_sum_norm_square_root,
+                            hub_sum_norm_square_root
+                        FROM {sum_norm_square_root}) {subquery1}
+                    {grouping_cols_where_condition_msg_subquery1}
+                """.format(**kwargs))
 
 
 def hits_help(schema_madlib, message, **kwargs):
@@ -341,10 +512,12 @@ def hits_help(schema_madlib, message, **kwargs):
             message.lower() in ("usage", "help", "?"):
         help_string = "Get from method below"
         help_string = get_graph_usage(schema_madlib, 'HITS',
-"""out_table     TEXT, -- Name of the output table for HITS
-    max_iter     INTEGER, -- Maximum iteration number (DEFAULT = 100)
-    threshold    DOUBLE PRECISION, -- Stopping criteria (DEFAULT = 1/(N*1000),
-                                   -- N is number of vertices in the graph)
+     """out_table     TEXT, -- Name of the output table for HITS
+        max_iter      INTEGER, -- Maximum iteration number (DEFAULT = 100)
+        threshold     DOUBLE PRECISION, -- Stopping criteria (DEFAULT = 1/(N*1000),
+                                        -- N is number of vertices in the graph)
+        grouping_cols TEXT -- Comma separated column names to group on
+                           -- (DEFAULT = NULL, no grouping)
 """) + """
 
 A summary table is also created that contains information regarding the
@@ -388,7 +561,17 @@ INSERT INTO edge VALUES
 (3, 0, 1),
 (4, 0, 1),
 (5, 6, 1),
-(6, 3, 1);
+(6, 3, 1),
+(0, 1, 2),
+(0, 2, 2),
+(0, 4, 2),
+(1, 2, 2),
+(1, 3, 2),
+(2, 3, 2),
+(3, 0, 2),
+(4, 0, 2),
+(5, 6, 2),
+(6, 3, 2);
 
 -- Compute the HITS score:
 DROP TABLE IF EXISTS hits_out, hits_out_summary;
@@ -397,15 +580,29 @@ SELECT {schema_madlib}.hits(
                      'id',                 -- Vertex id column
                      'edge',               -- Edge table
                      'src=src, dest=dest', -- Comma delimited string of edge arguments
-                     'hits_out');      -- Output table of HITS
+                     'hits_out');          -- Output table of HITS
+-- View the authority and hub scores of all vertices, ordered by their id.
+SELECT * FROM hits_out ORDER BY id;
+
+-- Compute the HITS score of nodes associated with each user:
+DROP TABLE IF EXISTS hits_out, hits_out_summary;
+SELECT {schema_madlib}.hits(
+             'vertex',             -- Vertex table
+             'id',                 -- Vertix id column
+             'edge',               -- Edge table
+             'src=src, dest=dest', -- Comma delimted string of edge arguments
+             'hits_out',           -- Output table of HITS
+             NULL,                 -- Default max_iter
+             NULL,                 -- Threshold
+             'user_id');           -- Grouping column
+
+-- View the authority and hub scores of all vertices, ordered by the grouping column.
+SELECT * FROM hits_out ORDER BY user_id, id;
 
--- View the authority of all vertices, sorted by their scores.
-SELECT authority FROM hits_out ORDER BY authority DESC;
--- View the hub of all vertices, sorted by their scores.
-SELECT hub FROM hits_out ORDER BY hub DESC;
 -- View the summary table to find the number of iterations required for
 -- convergence.
 SELECT * FROM hits_out_summary;
+
 """
         else:
             help_string = """

http://git-wip-us.apache.org/repos/asf/madlib/blob/4b7d9cc5/src/ports/postgres/modules/graph/hits.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/hits.sql_in b/src/ports/postgres/modules/graph/hits.sql_in
index b54c256..d06bbb8 100644
--- a/src/ports/postgres/modules/graph/hits.sql_in
+++ b/src/ports/postgres/modules/graph/hits.sql_in
@@ -40,10 +40,11 @@ m4_include(`SQLCommon.m4')
 </ul>
 </div>
 
-@brief Find the HITS scores(Authority and Hub) of all vertices in a directed graph.
+@brief Find the HITS scores(authority and hub) of all vertices in a directed
+graph.
 
 Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the 
-authority score and hub score of every vertex, where authority estimates the 
+authority score and hub score of every vertex, where authority estimates the
 value of the content of the page and hub estimates the value of its links to
 other pages. This algorithm was developed by Jon Kleinberg to rate web pages.
 
@@ -51,14 +52,14 @@ other pages. This algorithm was developed by Jon Kleinberg to rate web pages.
 @par HITS
 <pre class="syntax">
 hits( vertex_table,
-        vertex_id,
-        edge_table,
-        edge_args,
-        out_table,
-        max_iter,
-        threshold,
-        grouping_cols
-          )
+      vertex_id,
+      edge_table,
+      edge_args,
+      out_table,
+      max_iter,
+      threshold,
+      grouping_cols
+    )
 </pre>
 
 \b Arguments
@@ -90,8 +91,11 @@ this string argument:
     a row for every vertex from 'vertex_table' with the following columns:
     - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' 
                   for column naming.
-    - authority : The vertex's Authority score.
-    - hub : The vertex's Hub score.</dd>
+    - authority : The vertex's authority score.
+    - hub : The vertex's hub score.
+    - grouping_cols : Grouping column (if any) values associated with the vertex_id.
+
+</dd>
 
 A summary table is also created that contains information 
 regarding the number of iterations required for convergence.
@@ -100,26 +104,26 @@ parameter.
 
 <dt>max_iter (optional) </dt>
 <dd>INTEGER, default: 100. The maximum number of iterations allowed. An 
-    iteration consists of both Authority and Hub phases.</dd>
+    iteration consists of both authority and hub phases.</dd>
 
 <dt>threshold (optional) </dt>
-<dd>FLOAT8, default: (1/number of vertices * 1000). If the difference 
-    between the values of both scores (Authority and Hub) for every vertex of 
-    two consecutive iterations is smaller than 'threshold', or the iteration 
-    number is larger than 'max_iter', the computation stops. If you set the 
-    threshold to zero, then you will force the algorithm to run for the full 
-    number of iterations specified in 'max_iter'. Threshold need to be set to 
-    a value equal or less than 1 since both values (Authority and Hub) of nodes
-    are initialized as 1. Note that both Authority and Hub value difference 
-    must be below threshold for the algorithm to stop.</dd>
+<dd>FLOAT8, default: (1/number of vertices * 1000).
+    If the difference between two consecutive iterations of authority AND two
+    consecutive iterations of hub is smaller than 'threshold', then the
+    computation stops. If you set the threshold to zero, then you will force the
+    algorithm to run for the full number of iterations specified in 'max_iter'.
+    Threshold needs to be set to a value between 0 and 1. Note that both
+    authority and hub value difference must be below threshold for the
+    algorithm to stop.
+</dd>
 
-<dt>grouping_cols (optional)[not support yet]</dt>
+<dt>grouping_cols (optional)</dt>
 <dd>TEXT, default: NULL. A single column or a list of comma-separated columns 
     that divides the input data into discrete groups, resulting in one 
     distribution per group. When this value is NULL, no grouping is used and a
     single model is generated for all data.
     @note Expressions are not currently supported for 'grouping_cols'.
-    The grouping support will be added later.</dd>
+</dd>
 
 </dl>
 
@@ -127,8 +131,8 @@ parameter.
 @par Notes
 
 1. The HITS algorithm is based on Kleinberg's paper [1].
-2. This algorithm supports multigraph, and duplicated edges do count for multiple
-   times while calculating authority and hub scores. 
+2. This algorithm supports multigraph and each duplicated edge is considered
+   for counting when calculating authority and hub scores.
 
 @anchor examples
 @examp
@@ -167,7 +171,7 @@ INSERT INTO edge VALUES
 (6, 3, 1);
 </pre>
 
--# Compute the HITS scores:
+-# Running HITS with default values for optional parameters:
 <pre class="syntax">
 DROP TABLE IF EXISTS hits_out, hits_out_summary;
 SELECT madlib.hits(
@@ -181,13 +185,13 @@ SELECT * FROM hits_out ORDER BY id;
 <pre class="result">
  id |      authority       |         hub
 ----+----------------------+----------------------
-  0 | 8.43871829094767e-07 |    0.338306115082446
-  1 |    0.158459587238488 |    0.527865350447717
-  2 |      0.4056279696903 |    0.675800764727121
-  3 |    0.721775835522934 | 3.95111934817192e-07
-  4 |    0.158459587238488 | 3.95111934817192e-07
-  5 |    0.316385413093535 |    0.189719957843093
-  6 |    0.405199928761725 |    0.337944978189023
+  0 |    8.43871829093e-07 |    0.338306115082665
+  1 |    0.158459587238244 |    0.527865350448059
+  2 |    0.405627969689677 |    0.675800764727558
+  3 |    0.721775835521825 |    3.95111934817e-07
+  4 |    0.158459587238244 |    3.95111934817e-07
+  5 |    0.316385413093048 |    0.189719957843216
+  6 |    0.405199928761102 |    0.337944978189241
 (7 rows)
 </pre>
 <pre class="syntax">
@@ -199,7 +203,8 @@ SELECT * FROM hits_out_summary;
               17
 (1 row)
 </pre>
--# Running HITS with a max_iter of 3 results in different final values:
+-# Running HITS with max_iter of 3 results in different authority and hub
+scores:
 <pre class="syntax">
 DROP TABLE IF EXISTS hits_out, hits_out_summary;
 SELECT madlib.hits(
@@ -207,20 +212,20 @@ SELECT madlib.hits(
              'id',                 -- Vertex id column
              'edge',               -- Edge table
              'src=src, dest=dest', -- Comma delimited string of edge arguments
-             'hits_out',           -- Output table of HITS
-             3);                   -- Max iteration
+             'hits_out',           -- Output table
+              3);                  -- Max iteration
 SELECT * FROM hits_out ORDER BY id;
 </pre>
 <pre class="result">
- id |    authority    |        hub
-----+-----------------+--------------------
-  0 | 0.0865332738776 |  0.375721659592658
-  1 | 0.1838832069899 |  0.533118571043637
-  2 |  0.432666369388 |   0.65497424442504
-  3 | 0.7030828502555 | 0.0406185577938009
-  4 | 0.1838832069899 | 0.0406185577938009
-  5 | 0.3028664585716 |  0.182783510072104
-  6 | 0.3893997324492 |  0.330025782074632
+ id |     authority     |        hub
+----+-------------------+--------------------
+  0 |   0.08653327387778 | 0.375721659592363
+  1 |   0.18388320699029 | 0.533118571043218
+  2 |   0.43266636938891 | 0.654974244424525
+  3 |   0.70308285025699 | 0.040618557793769
+  4 |   0.18388320699029 | 0.040618557793769
+  5 |   0.30286645857224 | 0.182783510071961
+  6 |   0.38939973245002 | 0.330025782074373
 (7 rows)
 </pre>
 <pre class="syntax">
@@ -233,7 +238,43 @@ SELECT * FROM hits_out_summary;
 (1 row)
 </pre>
 
--# Running HITS with a threshold of 0.01 results in different final values:
+-# Running HITS with a low threshold of 0.00001 results in more iterations for convergence:
+<pre class="syntax">
+DROP TABLE IF EXISTS hits_out, hits_out_summary;
+SELECT madlib.hits(
+             'vertex',             -- Vertex table
+             'id',                 -- Vertex id column
+             'edge',               -- Edge table
+             'src=src, dest=dest', -- Comma delimited string of edge arguments
+             'hits_out',           -- Output table
+             NULL,                 -- Default max_iter
+             0.00001);             -- Threshold
+SELECT * FROM hits_out ORDER BY id;
+</pre>
+<pre class="result">
+ id |      authority       |         hub
+----+----------------------+---------------------
+  0 |    1.15243075426e-09 |     0.33800946769422
+  1 |    0.158264459912827 |    0.527792117750177
+  2 |    0.405384672299625 |    0.675965453766535
+  3 |     0.72186275724613 |    5.39583282614e-10
+  4 |    0.158264459912827 |    5.39583282614e-10
+  5 |    0.316493740997913 |    0.189793242747412
+  6 |    0.405356461070609 |    0.337985666133163
+(7 rows)
+</pre>
+<pre class="syntax">
+SELECT * FROM hits_out_summary;
+</pre>
+<pre class="result">
+ __iterations__
+-----------------+
+              25
+(1 row)
+</pre>
+
+
+-# Running HITS with both max_iter and threshold:
 <pre class="syntax">
 DROP TABLE IF EXISTS hits_out, hits_out_summary;
 SELECT madlib.hits(
@@ -241,21 +282,21 @@ SELECT madlib.hits(
              'id',                 -- Vertex id column
              'edge',               -- Edge table
              'src=src, dest=dest', -- Comma delimited string of edge arguments
-             'hits_out',           -- Output table of HITS
-             NULL,
-             0.01);                -- Threshold
+             'hits_out',           -- Output table
+             20,                   -- Default max_iter
+             0.00001);             -- Threshold
 SELECT * FROM hits_out ORDER BY id;
 </pre>
 <pre class="result">
- id |      authority      |         hub
-----+---------------------+---------------------
-  0 | 0.00732986181947514 |   0.351368354456342
-  1 |   0.167097943665847 |   0.530913722392824
-  2 |   0.416198716437073 |   0.668522269025145
-  3 |   0.717639283762987 | 0.00343216951849906
-  4 |   0.167097943665847 | 0.00343216951849906
-  5 |   0.311633656418623 |    0.18657058991966
-  6 |   0.398446707343031 |   0.336030846920549
+ id |      authority       |         hub
+----+----------------------+---------------------
+  0 |    7.11260011825e-08 |    0.33810307986005
+  1 |    0.158326035587958 |   0.527815233930963
+  2 |    0.405461453180491 |   0.675913495026452
+  3 |    0.721835343230399 |   3.33021322089e-08
+  4 |    0.158326035587958 |   3.33021322089e-08
+  5 |    0.316459563893809 |   0.189770119973925
+  6 |    0.405307074424261 |   0.337972831786458
 (7 rows)
 </pre>
 <pre class="syntax">
@@ -264,9 +305,68 @@ SELECT * FROM hits_out_summary;
 <pre class="result">
  __iterations__
 -----------------+
-              6
+             20
 (1 row)
 </pre>
+The algorithm stopped at 20 iterations even though the convergence for threshold
+of 0.00001 is at 25 iterations. This is because max_iter was set to 20.
+
+-# Running HITS with grouping column and default values for max_iter and threshold.
+Add more rows to the edge table to create different graphs based on the user_id column.
+<pre class="syntax">
+INSERT INTO edge VALUES
+(0, 1, 2),
+(0, 2, 2),
+(0, 4, 2),
+(1, 2, 2),
+(1, 3, 2),
+(2, 3, 2),
+(3, 0, 2),
+(4, 0, 2),
+(5, 6, 2),
+(6, 3, 2);
+DROP TABLE IF EXISTS hits_out, hits_out_summary;
+SELECT madlib.hits(
+             'vertex',             -- Vertex table
+             'id',                 -- Vertex id column
+             'edge',               -- Edge table
+             'src=src, dest=dest', -- Comma delimited string of edge arguments
+             'hits_out',           -- Output table
+             NULL,                 -- Default max_iter
+             NULL,                 -- Threshold
+             'user_id');           -- Grouping column
+SELECT * FROM hits_out ORDER BY user_id, id;
+</pre>
+<pre class="result">
+ user_id | id |      authority       |         hub
+---------+----+----------------------+----------------------
+       1 |  0 |    8.43871829093e-07 |    0.338306115082665
+       1 |  1 |    0.158459587238244 |    0.527865350448059
+       1 |  2 |    0.405627969689677 |    0.675800764727558
+       1 |  3 |    0.721775835521825 |    3.95111934817e-07
+       1 |  4 |    0.158459587238244 |    3.95111934817e-07
+       1 |  5 |    0.316385413093048 |    0.189719957843216
+       1 |  6 |    0.405199928761102 |    0.337944978189241
+       2 |  0 |    1.60841750444e-05 |    0.632262085114062
+       2 |  1 |    0.316079985713431 |    0.632529390899584
+       2 |  2 |    0.632364174872359 |    0.316347297480213
+       2 |  3 |    0.632694582987791 |    8.04208767442e-06
+       2 |  4 |    0.316079985713431 |    8.04208767442e-06
+       2 |  5 |                    0 |    1.22712519446e-10
+       2 |  6 |    2.45425034248e-10 |    0.316347297480213
+(14 rows)
+</pre>
+<pre class="syntax">
+SELECT * FROM hits_out_summary order by user_id;
+</pre>
+<pre class="result">
+ user_id | __iterations__
+---------+----------------
+       1 |             17
+       2 |             16
+(2 rows)
+</pre>
+
 @anchor literature
 @par Literature
 
@@ -280,7 +380,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
     edge_args       TEXT,
     out_table       TEXT,
     max_iter        INTEGER,
-    threshold       FLOAT8
+    threshold       FLOAT8,
+    grouping_cols   VARCHAR
 ) RETURNS VOID AS $$
     PythonFunction(graph, hits, hits)
 $$ LANGUAGE plpythonu VOLATILE
@@ -292,9 +393,22 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
     edge_table      TEXT,
     edge_args       TEXT,
     out_table       TEXT,
+    max_iter        INTEGER,
+    threshold       FLOAT8
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, $6, $7, NULL )
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
+    vertex_table    TEXT,
+    vertex_id       TEXT,
+    edge_table      TEXT,
+    edge_args       TEXT,
+    out_table       TEXT,
     max_iter        INTEGER
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, $6,NULL)
+    SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, $6, NULL, NULL )
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
@@ -305,7 +419,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
     edge_args       TEXT,
     out_table       TEXT
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, 100, NULL)
+    SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, 100, NULL, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------