You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2020/03/11 18:29:20 UTC

[madlib] 01/02: Graph: Filter out infinite paths

This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit eabaea93f833d442a8264696e2e8e1d151d46a74
Author: Orhan Kislal <ok...@apache.org>
AuthorDate: Tue Mar 10 19:42:45 2020 -0400

    Graph: Filter out infinite paths
    
    JIRA: MADLIB-1415
    
    SSSP, APSP and BFS output tables had entries for unconnected vertices,
    denoted by a NULL parent column. This commit filters these rows to
    clean up the output tables.
    
    The BFS output had a slightly different output format where vertices
    parent column was left NULL for both unreachable vertices as well as
    the source vertex itself. This is different than SSSP and APSP so BFS
    is changed to follow the same pattern.
---
 src/ports/postgres/modules/graph/apsp.py_in           |  5 ++++-
 src/ports/postgres/modules/graph/bfs.py_in            |  8 ++++++++
 src/ports/postgres/modules/graph/bfs.sql_in           | 14 +++++++-------
 src/ports/postgres/modules/graph/measures.py_in       |  8 +-------
 src/ports/postgres/modules/graph/measures.sql_in      | 13 ++++++++-----
 src/ports/postgres/modules/graph/sssp.py_in           |  4 ++++
 src/ports/postgres/modules/graph/test/apsp.sql_in     | 11 +++++++++++
 src/ports/postgres/modules/graph/test/bfs.sql_in      |  9 +++++++++
 src/ports/postgres/modules/graph/test/measures.sql_in |  2 +-
 src/ports/postgres/modules/graph/test/sssp.sql_in     | 11 +++++++++++
 10 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/src/ports/postgres/modules/graph/apsp.py_in b/src/ports/postgres/modules/graph/apsp.py_in
index 1d55696..e854ab9 100644
--- a/src/ports/postgres/modules/graph/apsp.py_in
+++ b/src/ports/postgres/modules/graph/apsp.py_in
@@ -381,10 +381,13 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
                     """Graph APSP: Detected a negative cycle in the """ +
                     """sub-graphs of following groups: {0}.""".
                     format(str(negs)[1:-1]))
-
         else:
             plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
 
+
+    # Filter out the infinite paths (disconnected pairs)
+    plpy.execute(""" DELETE FROM {0} WHERE parent IS NULL
+        """.format(out_table))
     return None
 
 
diff --git a/src/ports/postgres/modules/graph/bfs.py_in b/src/ports/postgres/modules/graph/bfs.py_in
index 6fc008e..e802aac 100644
--- a/src/ports/postgres/modules/graph/bfs.py_in
+++ b/src/ports/postgres/modules/graph/bfs.py_in
@@ -375,6 +375,14 @@ def graph_bfs(schema_madlib, vertex_table, vertex_id, edge_table,
             # stored in this iteration. This is used to check if the iterations
             # need to continue.
             vct = plpy.execute(count_qry.format(**locals()))[0]['count']
+
+        # Filter out the infinite paths (disconnected pairs)
+        plpy.execute(""" UPDATE {out_table} SET parent = {source_vertex}
+                         WHERE {vertex_id} = {source_vertex}
+                    """.format(**locals()))
+
+        plpy.execute(""" DELETE FROM {0} WHERE parent IS NULL
+            """.format(out_table))
         # Drop temp tables
         plpy.execute("DROP TABLE IF EXISTS {0},{1}".format(toupdate, message))
 
diff --git a/src/ports/postgres/modules/graph/bfs.sql_in b/src/ports/postgres/modules/graph/bfs.sql_in
index ea991fa..ac945a1 100644
--- a/src/ports/postgres/modules/graph/bfs.sql_in
+++ b/src/ports/postgres/modules/graph/bfs.sql_in
@@ -200,7 +200,7 @@ SELECT * FROM out ORDER BY dist,id;
 <pre class="result">
  id | dist | parent
 ----+------+--------
-  3 |    0 |
+  3 |    0 |      3
   1 |    1 |      3
   4 |    1 |      3
   5 |    1 |      3
@@ -236,7 +236,7 @@ SELECT * FROM out_max ORDER BY dist,id;
 <pre class="result">
  id | dist | parent
 ----+------+--------
-  3 |    0 |
+  3 |    0 |      3
   1 |    1 |      3
   4 |    1 |      3
   5 |    1 |      3
@@ -269,7 +269,7 @@ SELECT * FROM out_alt ORDER BY v_id;
 <pre class="result">
  v_id | dist | parent
 ------+------+--------
-    8 |    0 |
+    8 |    0 |      8
     9 |    1 |      8
    10 |    1 |      8
    11 |    2 |      9
@@ -292,7 +292,7 @@ SELECT * FROM out_alt_dir ORDER BY v_id;
 <pre class="result">
  v_id | dist | parent
 ------+------+--------
-    8 |    0 |
+    8 |    0 |      8
     9 |    1 |      8
    10 |    2 |      9
    11 |    2 |      9
@@ -348,11 +348,11 @@ SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
 <pre class="result">
  g1  | g2 | id | dist | parent
 -----+----+----+------+--------
- 100 | a  |  8 |    0 |
+ 100 | a  |  8 |    0 |      8
  100 | a  |  9 |    1 |      8
  100 | a  | 10 |    1 |      8
  100 | a  | 11 |    2 |      9
- 202 | c  |  8 |    0 |
+ 202 | c  |  8 |    0 |      8
  202 | c  |  9 |    1 |      8
  202 | c  | 10 |    1 |      8
  202 | c  | 11 |    2 |      9
@@ -378,7 +378,7 @@ SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
 <pre class="result">
  g1  | g2 | id | dist | parent
 -----+----+----+------+--------
- 100 | a  |  3 |    0 |
+ 100 | a  |  3 |    0 |      3
  100 | a  |  1 |    1 |      3
  100 | a  |  4 |    1 |      3
  100 | a  |  5 |    1 |      3
diff --git a/src/ports/postgres/modules/graph/measures.py_in b/src/ports/postgres/modules/graph/measures.py_in
index a3dd778..3ec07e9 100644
--- a/src/ports/postgres/modules/graph/measures.py_in
+++ b/src/ports/postgres/modules/graph/measures.py_in
@@ -181,13 +181,7 @@ class Graph(object):
             CREATE TABLE {out_table} AS
             SELECT
                 {grouping_cols_comma}
-                -- Filtering 'Infinity' occurs in CASE instead of WHERE clause
-                -- so that the edge is part of the average value i.e. part of
-                -- the count of paths but zero addition to sum of distances.
-                AVG(CASE WHEN {e.weight} = 'Infinity'::double precision
-                            THEN 0::double precision
-                            ELSE {e.weight}::double precision
-                    END) as avg_path_length
+                AVG({e.weight}::double precision) as avg_path_length
             FROM {apsp_table}
             WHERE {e.src} != {e.dest}
                   {filter_clause}
diff --git a/src/ports/postgres/modules/graph/measures.sql_in b/src/ports/postgres/modules/graph/measures.sql_in
index 92680ba..0879832 100644
--- a/src/ports/postgres/modules/graph/measures.sql_in
+++ b/src/ports/postgres/modules/graph/measures.sql_in
@@ -443,7 +443,10 @@ m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
 
 This function computes the average of the shortest paths between each pair of
 vertices. Average path length is based on "reachable target vertices", so it
-ignores infinite-length paths between vertices that are not connected.
+ignores infinite-length paths between vertices that are not connected. This means
+the function will average the path lengths in each connected component. If the
+user requires the average path length of a particular component, the
+weakly_connected_components function may be used to isolate the relevant vertices.
 
 @note This function assumes a valid output from a prior APSP run - both the APSP
 table and the associated output summary table. APSP is a computationally
@@ -532,7 +535,7 @@ SELECT * FROM out_avg_path_length;
 <pre class="result">
  avg_path_length
 \------------------
- 2.01785714285714
+ 2.973684210526316
 (1 row)
 </pre>
 
@@ -570,9 +573,9 @@ SELECT * FROM out_gr_path ORDER BY grp;
 </pre>
 <pre class="result">
  grp |  avg_path_length
-\-------+--------------------
-   0 |  2.01785714285714
-   1 | 0.466666666666667
+\-------+----------------------
+   0 |  2.973684210526316
+   1 |               0.56
 (2 rows)
 </pre>
 */
diff --git a/src/ports/postgres/modules/graph/sssp.py_in b/src/ports/postgres/modules/graph/sssp.py_in
index 0819132..26ddf88 100644
--- a/src/ports/postgres/modules/graph/sssp.py_in
+++ b/src/ports/postgres/modules/graph/sssp.py_in
@@ -381,6 +381,10 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, edge_table,
                     """sub-graphs of following groups: {0}.""".
                     format(str(negs)[1:-1]))
 
+        # Filter out the infinite paths (disconnected pairs)
+        plpy.execute(""" DELETE FROM {0} WHERE parent IS NULL
+            """.format(out_table))
+
         plpy.execute("DROP TABLE IF EXISTS {0}".format(oldupdate))
     return None
 
diff --git a/src/ports/postgres/modules/graph/test/apsp.sql_in b/src/ports/postgres/modules/graph/test/apsp.sql_in
index f98b76b..bf46260 100644
--- a/src/ports/postgres/modules/graph/test/apsp.sql_in
+++ b/src/ports/postgres/modules/graph/test/apsp.sql_in
@@ -119,3 +119,14 @@ CREATE TABLE e2 AS SELECT src::bigint, "DEST"::bigint, weight FROM "EDGE";
 DROP TABLE IF EXISTS public.out2, public.out2_summary, public.out2_path;
 SELECT graph_apsp('v2',NULL,'e2','dest="DEST"','public.out2');
 SELECT graph_apsp_get_path('public.out2',0,7,'public.out2_path');
+
+-- Test for infinite paths
+DROP TABLE IF EXISTS out, out_summary, out_path;
+DELETE FROM "EDGE" WHERE "DEST" = 7;
+
+SELECT graph_apsp('vertex',NULL,'"EDGE"','dest="DEST"','out');
+
+SELECT assert(count(*) = 0, 'Null parent in the out table') FROM
+(SELECT * FROM out WHERE parent IS NULL)q1;
+
+SELECT graph_apsp_get_path('out',0,5,'out_path');
diff --git a/src/ports/postgres/modules/graph/test/bfs.sql_in b/src/ports/postgres/modules/graph/test/bfs.sql_in
index 5a32b96..f36eeb6 100644
--- a/src/ports/postgres/modules/graph/test/bfs.sql_in
+++ b/src/ports/postgres/modules/graph/test/bfs.sql_in
@@ -296,3 +296,12 @@ CREATE TABLE e2 AS SELECT "SRC"::bigint, dest::bigint, weight FROM "EDGE";
 
 DROP TABLE IF EXISTS public.out2, public.out2_summary;
 SELECT graph_bfs('v2',NULL,'e2','src="SRC"',3,'public.out2');
+
+-- Test for infinite paths
+DROP TABLE IF EXISTS out, out_summary, out_path;
+DELETE FROM "EDGE" WHERE dest = 7;
+
+SELECT graph_bfs('vertex',NULL,'"EDGE"','src="SRC"',3,'out');
+
+SELECT assert(count(*) = 0, 'Null parent in the out table')
+    FROM (SELECT * FROM out WHERE parent IS NULL)q1;
diff --git a/src/ports/postgres/modules/graph/test/measures.sql_in b/src/ports/postgres/modules/graph/test/measures.sql_in
index 38e2e72..a98faff 100644
--- a/src/ports/postgres/modules/graph/test/measures.sql_in
+++ b/src/ports/postgres/modules/graph/test/measures.sql_in
@@ -86,7 +86,7 @@ SELECT assert(diameter=14, 'Invalid value for diameter') FROM public.__madlib__o
 DROP TABLE IF EXISTS public.__madlib__out_avg_path_length;
 SELECT graph_avg_path_length('out_apsp', 'public.__madlib__out_avg_path_length');
 SELECT * FROM public.__madlib__out_avg_path_length;
-SELECT assert(relative_error(avg_path_length, 2.0178) < 1e-2,
+SELECT assert(relative_error(avg_path_length, 2.97) < 1e-2,
               'Invalid value for avg_path_length') FROM public.__madlib__out_avg_path_length;
 
 -- Compute the in and out degrees
diff --git a/src/ports/postgres/modules/graph/test/sssp.sql_in b/src/ports/postgres/modules/graph/test/sssp.sql_in
index 9f1a913..6a2f881 100644
--- a/src/ports/postgres/modules/graph/test/sssp.sql_in
+++ b/src/ports/postgres/modules/graph/test/sssp.sql_in
@@ -164,3 +164,14 @@ CREATE TABLE e2 AS SELECT src::bigint, dest::bigint, weight FROM "EDGE";
 DROP TABLE IF EXISTS public.out2, public.out2_summary, public.out2_path;
 SELECT graph_sssp('v2',NULL,'e2',NULL,0,'public.out2');
 SELECT graph_sssp_get_path('public.out2',5,'public.out2_path');
+
+-- Test for infinite paths
+DROP TABLE IF EXISTS out, out_summary, out_path;
+DELETE FROM "EDGE" WHERE dest = 7;
+
+SELECT graph_sssp('vertex',NULL,'"EDGE"',NULL,0,'out');
+
+SELECT assert(count(*) = 0, 'Null parent in the out table') FROM
+(SELECT * FROM out WHERE parent IS NULL)q1;
+
+SELECT graph_sssp_get_path('out',5,'out_path');