You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/08/29 20:41:41 UTC
[04/50] [abbrv] incubator-madlib git commit: APSP: Remove multiple
updates for GPDB compatibility
APSP: Remove multiple updates for GPDB compatibility
In addition, add the design document section for APSP.
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/08ed9266
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/08ed9266
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/08ed9266
Branch: refs/heads/latest_release
Commit: 08ed926618dd50610dcdaffb3d19ce79ab012bfa
Parents: f50f76d
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Jun 14 13:22:22 2017 -0700
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Jun 14 13:22:22 2017 -0700
----------------------------------------------------------------------
doc/design/modules/graph.tex | 98 ++++++++++++++++++++++--
doc/literature.bib | 9 ++-
src/ports/postgres/modules/graph/apsp.py_in | 36 ++++++---
3 files changed, 125 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08ed9266/doc/design/modules/graph.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/graph.tex b/doc/design/modules/graph.tex
index ec68743..ec46842 100644
--- a/doc/design/modules/graph.tex
+++ b/doc/design/modules/graph.tex
@@ -28,6 +28,7 @@
\item[v0.1] Initial version, SSSP only.
\item[v0.2] Graph Framework, SSSP implementation details.
\item[v0.3] PageRank
+ \item[v0.4] APSP
\end{modulehistory}
\end{moduleinfo}
@@ -142,9 +143,9 @@ negative cycle in the graph.
\end{algorithm}
\begin{description}
-\item edge: (src,dest,val). The edges of the graph.
-\item cur: id -> (val,parent). The intermediate SSSP results.
-\item toupdate: iter -> (id -> (val,parent)). The set of updates.
+\item edge: $(src,dest,val)$. The edges of the graph.
+\item cur: $id \rightarrow (val,parent)$. The intermediate SSSP results.
+\item toupdate: $iter \rightarrow (id \rightarrow (val,parent))$. The set of updates.
\end{description}
Changes from the standard Bellman-Ford algorithm:
@@ -220,14 +221,14 @@ values, find parents, eliminate duplicates and ensure improvement}.
\item We begin our analysis at the innermost subquery, emph{find values}
(lines 11-16). This subquery takes a set of vertices (in the table
-$old_update$) and finds the reachable vertices. In case a vertex is reachable
+$old\_update$) and finds the reachable vertices. In case a vertex is reachable
by multiple vertices, only the path that has the minimum cost is considered
(hence the name find values). There are two important points to note:
\begin{itemize}
\item The input vertices need the value of their path as well.
\begin{itemize}
\item In our example, both $v_1$ and $v_2$ can reach $v_3$. We would
- have to use $v_2$ -> $v_3$ edge since that gives the lowest possible
+ have to use $v_2 \rightarrow v_3$ edge since that gives the lowest possible
path value.
\end{itemize}
\item The subquery is aggregating the rows using the $min$ operator for
@@ -273,6 +274,91 @@ of the algorithm.
Please note that, for ideal performance, \emph{vertex} and \emph{edge} tables
should be distributed on \emph{vertex id} and \emph{source id} respectively.
+\section{All Pairs Shortest Paths} \label{sec:graph:apsp}
+
+Given a graph and a source vertex, all pairs shortest paths (APSP) algorithm
+finds a path for every vertex pair such that the sum of the weights of its
+constituent edges is minimized. Please refer to the
+Section~\ref{sec:graph:sssp} on single source shortest path for the
+mathematical definition of shortest path.
+
+Our implementation has a dynamic programming approach, based on the matrix
+multiplication inspired APSP algorithm \cite{apsp}. The idea is similar to
+the one from SSSP implementation. We start with a naive approximation for the
+cost of every vertex pair (infinite). At each iteration, these values are
+refined based on the edge list and the existing approximations. This
+refinement step is similar to a matrix multiplication. For every vertex pair
+$i,j$, we check every edge $e: j \rightarrow k$ to see if it is possible to
+use $e$ to reduce the cost of path $i \rightarrow k$. If there are no
+refinements at any given step, the algorithm returns the calculated results.
+If the algorithm does not converge in $|V|-1$ iterations, this indicates the
+existence of a negative cycle in the graph.
+
+\begin{algorithm}[APSP$(V,E)$] \label{alg:apsp}
+\alginput{Vertex set $v$, edge set $E$}
+\algoutput{Distance and parent set for every vertex pair}
+\begin{algorithmic}[1]
+
+ \While {$update$ is $True$}
+ \State $update \set False$
+ \For{every vertex pair $i,j$}
+ \For{every edge $j \rightarrow k$}
+ \If{$ val(i \rightarrow j) + val(j \rightarrow k) < val(i \rightarrow k)$}
+ \State $val(i \rightarrow k) \set val(i \rightarrow j) + val(j \rightarrow k)$
+ \State $parent(i \rightarrow k) \set j$
+ \State $update \set True$
+ \EndIf
+ \EndFor
+ \EndFor
+ \EndWhile
+\end{algorithmic}
+\end{algorithm}
+
+\subsection{Implementation Details}
+
+The implementation details are similar to the SSSP as the requirements and
+restrictions such as finding the parent, distinct updates, etc. are common in
+both cases. This section will mostly focus on the differences in the APSP
+implementation.
+
+\begin{algorithm}[Find Updates$(E,out)$]
+\label{alg:apsp:findu}
+\begin{lstlisting}
+INSERT INTO update
+ SELECT DISTINCT ON (y.src, y.dest) y.src AS src, y.dest AS dest
+ y.val AS val,
+ y.parent AS parent
+ FROM out INNER JOIN (
+ SELECT
+ x.src AS src, x.dest AS dest,
+ x.val AS val, out.dest AS parent
+ FROM out
+ INNER JOIN edge_table
+ ON (edge_table.src = out.dest)
+ INNER JOIN (
+ SELECT out.src AS src, edge_table.dest AS dest,
+ min(out.val + edge_table.weight) AS val
+ FROM out INNER JOIN
+ edge_table ON
+ (edge_table.src=out.dest)
+ GROUP BY out.src, edge_table.dest
+ ) x
+ ON (edge_table.src = x.src AND edge_table.dest = x.dest)
+ WHERE ABS(out.val + edge_table.weight - x.val) < EPSILON
+ ) AS y ON (y.src = out.src AND y.dest = out.dest)
+ WHERE y.val < out.val
+\end{lstlisting}
+\end{algorithm}
+
+The only major difference comes in the innermost subquery (lines 13-18). The
+\emph{group by} clause ensures that we try to reduce the weight for every
+$out.src$ ($i$) and $edge\_table.dest$ ($k$) pair. The \emph{inner join on}
+clause ensures that there is a connecting edge ($j\rightarrow k$) that can be
+used for the $i,j$ pair. The rest of the changes are mostly trivial as the
+algorithm needs to check for both source and destination during joins (instead
+of just the destination).
+
+
\section{PageRank} \label{sec:graph:pagerank}
\begin{figure}[h]
\centering
@@ -417,3 +503,5 @@ noted that this is not based on any experimental study. Users of MADlib are
encouraged to consider this factor when setting a value for threshold, since
a high $threshold$ value would lead to early termination of PageRank
computation, thus resulting in incorrect PageRank values.
+
+
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08ed9266/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index c98a131..d6941c4 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -913,4 +913,11 @@ Applied Survival Analysis},
title = {The Anatomy of a Large-Scale Hypertextual Web Search Engine},
author = {S. Brin and L. Page},
year = {1998}
-}
\ No newline at end of file
+}
+
+@misc{apsp,
+ title = {All Pairs Shortest Paths},
+ author = {Rendell, Alistair},
+ howpublished = {\url{http://users.cecs.anu.edu.au/~Alistair.Rendell/Teaching/apac_comp3600/module4/all_pairs_shortest_paths.xhtml}},
+ note = {Accessed: 2017-06-07}
+}
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08ed9266/src/ports/postgres/modules/graph/apsp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/apsp.py_in b/src/ports/postgres/modules/graph/apsp.py_in
index 298f57c..1331301 100644
--- a/src/ports/postgres/modules/graph/apsp.py_in
+++ b/src/ports/postgres/modules/graph/apsp.py_in
@@ -123,8 +123,8 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
checkg_eout = ""
checkg_ex = ""
checkg_om = ""
- checkg_o1v_sub = ""
- checkg_ov_sub = ""
+ checkg_o1t_sub = ""
+ checkg_ot_sub = ""
checkg_ot = ""
checkg_o1t = ""
checkg_vv = ""
@@ -151,8 +151,8 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
checkg_eout = " AND " + _check_groups("edge","out",glist)
checkg_ex = " AND " + _check_groups("edge","x",glist)
checkg_om = " AND " + _check_groups(out_table,message,glist)
- checkg_o1v_sub = _check_groups("out",tmp_view,glist)
- checkg_ov_sub = _check_groups(out_table,tmp_view,glist)
+ checkg_o1t_sub = _check_groups("out",tmp_view,glist)
+ checkg_ot_sub = _check_groups(out_table,tmp_view,glist)
checkg_ot = " AND " + _check_groups(out_table,tmp_view,glist)
checkg_o1t = " AND " + _check_groups("out","t",glist)
checkg_vv = " AND " + _check_groups("v1","v2",glist)
@@ -277,14 +277,26 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
""".format(**locals()))
# Distance = 1: every edge means there is a path from src to dest
+
+ # There may be multiple edges defined as a->b,
+ # we only need the minimum weighted one.
+
+ plpy.execute(
+ """ CREATE VIEW {tmp_view} AS
+ SELECT {grp_comma} {src}, {dest},
+ min({weight}) AS {weight}
+ FROM {edge_table}
+ GROUP BY {grp_comma} {src}, {dest}
+ """.format(**locals()))
plpy.execute(
""" UPDATE {out_table} SET
- {weight} = {edge_table}.{weight}, parent = {edge_table}.{dest}
- FROM {edge_table}
- WHERE {out_table}.{src} = {edge_table}.{src}
- AND {out_table}.{dest} = {edge_table}.{dest}
- AND {out_table}.{weight} > {edge_table}.{weight} {checkg_eo}
+ {weight} = {tmp_view}.{weight}, parent = {tmp_view}.{dest}
+ FROM {tmp_view}
+ WHERE {out_table}.{src} = {tmp_view}.{src}
+ AND {out_table}.{dest} = {tmp_view}.{dest}
+ AND {out_table}.{weight} > {tmp_view}.{weight} {checkg_ot}
""".format(**locals()))
+ plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
ot_sql1 = ot_sql.format(**locals())
out_table_1 = out_table
@@ -293,7 +305,7 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
# If not done by v_cnt iterations, there is a negative cycle.
v_cnt = plpy.execute(
""" SELECT max(count) as max FROM (
- SELECT count({src}) AS count
+ SELECT count(DISTINCT {src}) AS count
FROM {out_table_1}
{grp_by}) x
""".format(**locals()))[0]['max']
@@ -451,7 +463,7 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
WHERE NOT EXISTS(
SELECT 1
FROM {tmp_view}
- WHERE {checkg_o1v_sub}
+ WHERE {checkg_o1t_sub}
);"""
plpy.execute(sql_del.format(**locals()))
plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
@@ -460,7 +472,7 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
else:
sql_del = """ DELETE FROM {out_table}
USING {tmp_view}
- WHERE {checkg_ov_sub}"""
+ WHERE {checkg_ot_sub}"""
plpy.execute(sql_del.format(**locals()))
plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))