You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/08/29 20:41:41 UTC

[04/50] [abbrv] incubator-madlib git commit: APSP: Remove multiple updates for GPDB compatibility

APSP: Remove multiple updates for GPDB compatibility

In addition, add the design document section for APSP.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/08ed9266
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/08ed9266
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/08ed9266

Branch: refs/heads/latest_release
Commit: 08ed926618dd50610dcdaffb3d19ce79ab012bfa
Parents: f50f76d
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Jun 14 13:22:22 2017 -0700
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Jun 14 13:22:22 2017 -0700

----------------------------------------------------------------------
 doc/design/modules/graph.tex                | 98 ++++++++++++++++++++++--
 doc/literature.bib                          |  9 ++-
 src/ports/postgres/modules/graph/apsp.py_in | 36 ++++++---
 3 files changed, 125 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08ed9266/doc/design/modules/graph.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/graph.tex b/doc/design/modules/graph.tex
index ec68743..ec46842 100644
--- a/doc/design/modules/graph.tex
+++ b/doc/design/modules/graph.tex
@@ -28,6 +28,7 @@
 		\item[v0.1] Initial version, SSSP only.
 		\item[v0.2] Graph Framework, SSSP implementation details.
         \item[v0.3] PageRank
+        \item[v0.4] APSP
 	\end{modulehistory}
 \end{moduleinfo}
 
@@ -142,9 +143,9 @@ negative cycle in the graph.
 \end{algorithm}
 
 \begin{description}
-\item edge: (src,dest,val). The edges of the graph.
-\item cur: id -> (val,parent). The intermediate SSSP results.
-\item toupdate: iter -> (id -> (val,parent)). The set of updates.
+\item edge: $(src,dest,val)$. The edges of the graph.
+\item cur: $id \rightarrow (val,parent)$. The intermediate SSSP results.
+\item toupdate: $iter \rightarrow (id \rightarrow (val,parent))$. The set of updates.
 \end{description}
 
 Changes from the standard Bellman-Ford algorithm:
@@ -220,14 +221,14 @@ values, find parents, eliminate duplicates and ensure improvement}.
 
 \item We begin our analysis at the innermost subquery, emph{find values}
 (lines 11-16). This subquery takes a set of vertices (in the table
-$old_update$) and finds the reachable vertices. In case a vertex is reachable
+$old\_update$) and finds the reachable vertices. In case a vertex is reachable
 by multiple vertices, only the path that has the minimum cost is considered
 (hence the name find values). There are two important points to note:
 	\begin{itemize}
 	\item The input vertices need the value of their path as well.
 		\begin{itemize}
 		\item In our example, both $v_1$ and $v_2$ can reach $v_3$. We would
-		have to use $v_2$ -> $v_3$ edge since that gives the lowest possible
+		have to use $v_2 \rightarrow v_3$ edge since that gives the lowest possible
 		path value.
 		\end{itemize}
 	\item The subquery is aggregating the rows using the $min$ operator for
@@ -273,6 +274,91 @@ of the algorithm.
 Please note that, for ideal performance, \emph{vertex} and \emph{edge} tables
 should be distributed on \emph{vertex id} and \emph{source id} respectively.
 
+\section{All Pairs Shortest Paths} \label{sec:graph:apsp}
+
+Given a graph and a source vertex, all pairs shortest paths (APSP) algorithm
+finds a path for every vertex pair such that the sum of the weights of its
+constituent edges is minimized. Please refer to the
+Section~\ref{sec:graph:sssp} on single source shortest path for the
+mathematical definition of shortest path.
+
+Our implementation has a dynamic programming approach, based on the matrix
+multiplication inspired APSP algorithm \cite{apsp}. The idea is similar to
+the one from SSSP implementation. We start with a naive approximation for the
+cost of every vertex pair (infinite). At each iteration, these values are
+refined based on the edge list and the existing approximations. This
+refinement step is similar to a matrix multiplication. For every vertex pair
+$i,j$, we check every edge $e: j \rightarrow k$ to see if it is possible to
+use $e$ to reduce the cost of path $i \rightarrow k$. If there are no
+refinements at any given step, the algorithm returns the calculated results.
+If the algorithm does not converge in $|V|-1$ iterations, this indicates the
+existence of a negative cycle in the graph.
+
+\begin{algorithm}[APSP$(V,E)$] \label{alg:apsp}
+\alginput{Vertex set $v$, edge set $E$}
+\algoutput{Distance and parent set for every vertex pair}
+\begin{algorithmic}[1]
+
+	\While {$update$ is $True$}
+		\State $update \set False$
+		\For{every vertex pair $i,j$}
+			\For{every edge $j \rightarrow k$}
+		 		\If{$ val(i \rightarrow j) + val(j \rightarrow k) < val(i \rightarrow k)$}
+		 			\State $val(i \rightarrow k) \set  val(i \rightarrow j) + val(j \rightarrow k)$
+		 			\State $parent(i \rightarrow k) \set j$
+		 			\State $update \set True$
+		 		\EndIf
+			\EndFor
+		\EndFor
+	\EndWhile
+\end{algorithmic}
+\end{algorithm}
+
+\subsection{Implementation Details}
+
+The implementation details are similar to the SSSP as the requirements and
+restrictions such as finding the parent, distinct updates, etc. are common in
+both cases. This section will mostly focus on the differences in the APSP
+implementation.
+
+\begin{algorithm}[Find Updates$(E,out)$]
+\label{alg:apsp:findu}
+\begin{lstlisting}
+INSERT INTO update
+	SELECT DISTINCT ON (y.src, y.dest) y.src AS src, y.dest AS dest
+		y.val AS val,
+		y.parent AS parent
+	FROM out INNER JOIN (
+			SELECT
+				x.src AS src, x.dest AS dest,
+				x.val AS val, out.dest AS parent
+			FROM out
+				INNER JOIN edge_table
+				ON (edge_table.src = out.dest)
+				INNER JOIN (
+					SELECT out.src AS src, edge_table.dest AS dest,
+						min(out.val + edge_table.weight) AS val
+					FROM out INNER JOIN
+						edge_table ON
+						(edge_table.src=out.dest)
+					GROUP BY out.src, edge_table.dest
+				) x
+				ON (edge_table.src = x.src AND edge_table.dest = x.dest)
+			WHERE ABS(out.val + edge_table.weight - x.val) < EPSILON
+		) AS y ON (y.src = out.src AND y.dest = out.dest)
+	WHERE y.val < out.val
+\end{lstlisting}
+\end{algorithm}
+
+The only major difference comes in the innermost subquery (lines 13-18). The
+\emph{group by} clause ensures that we try to reduce the weight for every
+$out.src$ ($i$) and $edge\_table.dest$ ($k$) pair. The \emph{inner join on}
+clause ensures that there is a connecting edge ($j\rightarrow k$) that can be
+used for the $i,j$ pair. The rest of the changes are mostly trivial as the
+algorithm needs to check for both source and destination during joins (instead
+of just the destination).
+
+
 \section{PageRank} \label{sec:graph:pagerank}
 \begin{figure}[h]
     \centering
@@ -417,3 +503,5 @@ noted that this is not based on any experimental study. Users of MADlib are
 encouraged to consider this factor when setting a value for threshold, since
 a high $threshold$ value would lead to early termination of PageRank
 computation, thus resulting in incorrect PageRank values.
+
+

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08ed9266/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index c98a131..d6941c4 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -913,4 +913,11 @@ Applied Survival Analysis},
            title = {The Anatomy of a Large-Scale Hypertextual Web Search Engine},
           author = {S. Brin and L. Page},
             year = {1998}
-}
\ No newline at end of file
+}
+
+@misc{apsp,
+	title = {All Pairs Shortest Paths},
+	author = {Rendell, Alistair},
+	howpublished = {\url{http://users.cecs.anu.edu.au/~Alistair.Rendell/Teaching/apac_comp3600/module4/all_pairs_shortest_paths.xhtml}},
+	note = {Accessed: 2017-06-07}
+}

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08ed9266/src/ports/postgres/modules/graph/apsp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/apsp.py_in b/src/ports/postgres/modules/graph/apsp.py_in
index 298f57c..1331301 100644
--- a/src/ports/postgres/modules/graph/apsp.py_in
+++ b/src/ports/postgres/modules/graph/apsp.py_in
@@ -123,8 +123,8 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
 		checkg_eout = ""
 		checkg_ex = ""
 		checkg_om = ""
-		checkg_o1v_sub = ""
-		checkg_ov_sub = ""
+		checkg_o1t_sub = ""
+		checkg_ot_sub = ""
 		checkg_ot = ""
 		checkg_o1t = ""
 		checkg_vv = ""
@@ -151,8 +151,8 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
 			checkg_eout = " AND " + _check_groups("edge","out",glist)
 			checkg_ex = " AND " + _check_groups("edge","x",glist)
 			checkg_om = " AND " + _check_groups(out_table,message,glist)
-			checkg_o1v_sub = _check_groups("out",tmp_view,glist)
-			checkg_ov_sub = _check_groups(out_table,tmp_view,glist)
+			checkg_o1t_sub = _check_groups("out",tmp_view,glist)
+			checkg_ot_sub = _check_groups(out_table,tmp_view,glist)
 			checkg_ot = " AND " + _check_groups(out_table,tmp_view,glist)
 			checkg_o1t = " AND " + _check_groups("out","t",glist)
 			checkg_vv = " AND " + _check_groups("v1","v2",glist)
@@ -277,14 +277,26 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
 				""".format(**locals()))
 
 			# Distance = 1: every edge means there is a path from src to dest
+
+			# There may be multiple edges defined as a->b,
+			# we only need the minimum weighted one.
+
+			plpy.execute(
+				""" CREATE VIEW {tmp_view} AS
+					SELECT {grp_comma} {src}, {dest},
+						min({weight}) AS {weight}
+					FROM {edge_table}
+					GROUP BY {grp_comma} {src}, {dest}
+				""".format(**locals()))
 			plpy.execute(
 				""" UPDATE {out_table} SET
-				{weight} = {edge_table}.{weight}, parent = {edge_table}.{dest}
-				FROM {edge_table}
-				WHERE {out_table}.{src} = {edge_table}.{src}
-				AND {out_table}.{dest} = {edge_table}.{dest}
-				AND {out_table}.{weight} > {edge_table}.{weight} {checkg_eo}
+				{weight} = {tmp_view}.{weight}, parent = {tmp_view}.{dest}
+				FROM {tmp_view}
+				WHERE {out_table}.{src} = {tmp_view}.{src}
+				AND {out_table}.{dest} = {tmp_view}.{dest}
+				AND {out_table}.{weight} > {tmp_view}.{weight} {checkg_ot}
 				""".format(**locals()))
+			plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
 
 			ot_sql1 = ot_sql.format(**locals())
 			out_table_1 = out_table
@@ -293,7 +305,7 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
 		# If not done by v_cnt iterations, there is a negative cycle.
 		v_cnt = plpy.execute(
 			""" SELECT max(count) as max FROM (
-					SELECT count({src}) AS count
+					SELECT count(DISTINCT {src}) AS count
 					FROM {out_table_1}
 					{grp_by}) x
 			""".format(**locals()))[0]['max']
@@ -451,7 +463,7 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
 						WHERE NOT EXISTS(
 							SELECT 1
 							FROM {tmp_view}
-							WHERE {checkg_o1v_sub}
+							WHERE {checkg_o1t_sub}
 							);"""
 					plpy.execute(sql_del.format(**locals()))
 					plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
@@ -460,7 +472,7 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
 				else:
 					sql_del = """ DELETE FROM {out_table}
 						USING {tmp_view}
-						WHERE {checkg_ov_sub}"""
+						WHERE {checkg_ot_sub}"""
 					plpy.execute(sql_del.format(**locals()))
 
 				plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))