You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/08/29 20:42:04 UTC
[27/50] [abbrv] incubator-madlib git commit: Graph: Add initial set
of centrality measures
Graph: Add initial set of centrality measures
JIRA: MADLIB-1073
This commit adds the following graph measures:
- Closeness (uses APSP)
- Graph diameter (uses APSP)
- Average path length (uses APSP)
- In/out degrees
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/06788cc4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/06788cc4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/06788cc4
Branch: refs/heads/latest_release
Commit: 06788cc486fbd15391e3d477748a37fe0657baf1
Parents: 3f599c9
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Jul 31 13:54:48 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Jul 31 13:54:48 2017 -0700
----------------------------------------------------------------------
doc/mainpage.dox.in | 23 +-
src/ports/postgres/modules/graph/apsp.py_in | 13 +-
src/ports/postgres/modules/graph/measures.py_in | 639 +++++++++++++++
.../postgres/modules/graph/measures.sql_in | 816 +++++++++++++++++++
.../postgres/modules/graph/test/measures.sql_in | 150 ++++
5 files changed, 1623 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index de70d5d..30f566d 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -123,23 +123,24 @@ complete matrix stored as a distributed table.
@defgroup grp_stemmer Stemming
@ingroup grp_datatrans
-@defgroup grp_graph Graph
-@{Contains graph algorithms. @}
+@defgroup grp_graph Graph
+Contains graph algorithms.
+@{
@defgroup grp_apsp All Pairs Shortest Path
- @ingroup grp_graph
-
@defgroup grp_bfs Breadth-First Search
- @ingroup grp_graph
-
+ @defgroup grp_graph_measures Measures
+ Graph Measures
+ @{
+ @defgroup grp_graph_closeness Closeness
+ @defgroup grp_graph_diameter Graph Diameter
+ @defgroup grp_graph_avg_path_length Average Path Length
+ @defgroup grp_graph_vertex_degrees In-Out degree
+ @}
@defgroup grp_pagerank PageRank
- @ingroup grp_graph
-
@defgroup grp_sssp Single Source Shortest Path
- @ingroup grp_graph
-
@defgroup grp_wcc Weakly Connected Components
- @ingroup grp_graph
+@}
@defgroup grp_mdl Model Evaluation
@{Contains functions for evaluating accuracy and validation of predictive methods. @}
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/src/ports/postgres/modules/graph/apsp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/apsp.py_in b/src/ports/postgres/modules/graph/apsp.py_in
index ab8a566..42ee698 100644
--- a/src/ports/postgres/modules/graph/apsp.py_in
+++ b/src/ports/postgres/modules/graph/apsp.py_in
@@ -73,16 +73,16 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
# Prepare the input for recording in the summary table
if vertex_id is None:
- v_st = "NULL"
+ v_st = ''
vertex_id = "id"
else:
v_st = vertex_id
if edge_args is None:
- e_st = "NULL"
+ e_st = ''
else:
e_st = edge_args
if grouping_cols is None:
- g_st = "NULL"
+ g_st = ''
glist = None
else:
g_st = grouping_cols
@@ -127,8 +127,7 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
checkg_vv_sub = "TRUE"
grp_by = ""
- if grouping_cols is not None:
-
+ if grouping_cols:
# We use actual table names in some cases and aliases in others
# In some cases, we swap the table names so use of an alias is
# necessary. In other cases, they are used to simplify debugging.
@@ -526,10 +525,10 @@ def graph_apsp_get_path(schema_madlib, apsp_table,
dest = edge_params["dest"]
weight = edge_params["weight"]
- if vertex_id == "NULL":
+ if not vertex_id or vertex_id == "NULL":
vertex_id = "id"
- if grouping_cols == "NULL":
+ if not grouping_cols or grouping_cols == "NULL":
grouping_cols = None
select_grps = ""
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/src/ports/postgres/modules/graph/measures.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/measures.py_in b/src/ports/postgres/modules/graph/measures.py_in
new file mode 100644
index 0000000..f5c38b4
--- /dev/null
+++ b/src/ports/postgres/modules/graph/measures.py_in
@@ -0,0 +1,639 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Graph Measures
+
+"""
+@file measures.py_in
+
+@namespace graph
+"""
+
+import plpy
+
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import add_postfix
+from utilities.utilities import extract_keyvalue_params
+
+from utilities.validate_args import get_cols
+from utilities.validate_args import unquote_ident
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+from utilities.validate_args import columns_exist_in_table
+
+from graph_utils import get_graph_usage
+
+from collections import namedtuple
+from functools import partial
+
+
+class Graph(object):
+ """Class representing a Graph object"""
+ def __init__(self,
+ vertex_table, vertex_col_names,
+ edge_table, edge_col_names,
+ grouping_cols=None,
+ should_validate=True,
+ schema_madlib='madlib'):
+ self.vertex_table = vertex_table
+ self.vertex_id_col = vertex_col_names['id']
+
+ self.edge_table = edge_table
+ self.edge_params = namedtuple('Edge', 'src dest weight')(
+ edge_col_names['src'], edge_col_names['dest'], edge_col_names['weight'])
+
+ self.grouping_cols = grouping_cols
+ self._madlib = schema_madlib
+
+ if should_validate:
+ self._validate()
+
+ # ----------------------------------------------------------------------
+
+ @staticmethod
+ def get_edge_params(edge_arg_str):
+ params_types = {'src': str, 'dest': str, 'weight': str}
+ default_args = {'src': 'src', 'dest': 'dest', 'weight': 'weight'}
+ return extract_keyvalue_params(edge_arg_str, params_types, default_args)
+ # ----------------------------------------------------------------------
+
+ def _validate(self):
+ _assert(self.vertex_table and
+ self.vertex_table.strip().lower() not in ('null', ''),
+ "Graph: Invalid vertex table name".format(**locals()))
+ _assert(table_exists(self.vertex_table),
+ "Graph: Vertex table ({0}) is missing".format(self.vertex_table))
+ _assert(not table_is_empty(self.vertex_table),
+ "Graph: Vertex table ({0}) is empty".format(self.vertex_table))
+
+ _assert(self.edge_table and self.edge_table.strip().lower() not in ('null', ''),
+ "Graph: Invalid edge table name".format(**locals()))
+ _assert(table_exists(self.edge_table),
+ "Graph: Edge table ({0}) is missing".format(self.edge_table))
+ _assert(not table_is_empty(self.edge_table),
+ "Graph: Edge table ({0}) is empty".format(self.edge_table))
+
+ existing_cols = set(unquote_ident(i) for i in get_cols(self.vertex_table))
+ _assert(self.vertex_id_col in existing_cols,
+ "Graph: The vertex column {0} is not present in "
+ "vertex table ({1})".format(self.vertex_id_col, self.vertex_table))
+
+ _assert(columns_exist_in_table(self.edge_table, self.edge_params),
+ "Graph: Not all columns from {0} are present in edge table ({1})".
+ format(self.edge_params, self.edge_table))
+ # ----------------------------------------------------------------------
+
+ def closeness(self, out_table, apsp_table, vertex_filter_expr=None, **kwargs):
+ """ Compute various centrality metrics
+
+ Following metrics are computed for a vertex, considering only
+ reachable vertices for that vertex:
+ - inverse_sum_dist: Inverse of the sum of shortest distances
+ - inverse_average_dist: Inverse of the average of shortest distances
+ - sum_inverse_dist: Sum of the inverse of shortest distances
+ - k_degree: Total number of reachable vertices
+
+ Args:
+ @param out_table: str. Name of table to store results in
+ @param apsp_table: str. APSP output is used to compute these metrics.
+ @param vertex_filter_expr: str. A WHERE clause can be specified to restrict
+ the output vertices
+ Returns:
+ None
+ """
+ grouping_cols_comma = self.grouping_cols + ", " if self.grouping_cols else ''
+ e = self.edge_params
+ filter_clause = "WHERE " + vertex_filter_expr if vertex_filter_expr else ''
+
+ plpy.execute("""
+ CREATE TABLE {out_table} AS
+ SELECT
+ {grouping_cols_comma}
+ {e.src},
+ 1.0 / sum_dist AS inverse_sum_dist,
+ CASE WHEN k_degree = 0 THEN NULL
+ ELSE k_degree::double precision / sum_dist
+ END AS inverse_avg_dist,
+ sum_inverse_dist,
+ k_degree
+ FROM (
+ -- For below measures treat 'Infinity' as NULL so that
+ -- 'sum' and 'avg' ignore the value.
+ SELECT
+ {grouping_cols_comma}
+ {e.src},
+ sum(CASE WHEN {e.weight} = 'Infinity'::double precision THEN NULL
+ WHEN {e.src} = {e.dest} THEN NULL
+ ELSE {e.weight} END) AS sum_dist,
+ sum(CASE WHEN {e.weight} = 'Infinity'::double precision THEN NULL
+ WHEN {e.weight} = 0::double precision THEN 0.
+ ELSE 1.0 / {e.weight} END) AS sum_inverse_dist,
+ count(CASE WHEN {e.weight} = 'Infinity'::double precision THEN NULL
+ WHEN {e.src} = {e.dest} THEN NULL
+ ELSE 1 END) AS k_degree
+ FROM {apsp_table}
+ {filter_clause}
+ GROUP BY {grouping_cols_comma}
+ {e.src}
+ -- Don't place the 'Infinity' checks above in a WHERE clause
+ -- since groups with only 'Infinity' rows need to show up
+ -- in output table
+ ) AS q
+ """.format(**locals()))
+ # --------------------------------------------------------------------------
+
+ def avg_path_length(self, out_table, apsp_table, vertex_filter_expr=None, **kwargs):
+ """ Compute the average path length of graph
+
+ This is the average of the shortest path distances between unique vertices.
+
+ Args:
+ @param out_table: str. Name of table to store results in
+ @param apsp_table: str. APSP output is used to compute these metrics.
+ Returns:
+ None
+ """
+ if self.grouping_cols:
+ grouping_cols_comma = self.grouping_cols + ", "
+ group_by_str = 'GROUP BY ' + self.grouping_cols
+ else:
+ grouping_cols_comma = group_by_str = ''
+ e = self.edge_params
+ filter_clause = "AND " + vertex_filter_expr if vertex_filter_expr else ''
+ plpy.execute("""
+ CREATE TABLE {out_table} AS
+ SELECT
+ {grouping_cols_comma}
+ -- Filtering 'Infinity' occurs in CASE instead of WHERE clause
+ -- so that the edge is part of the average value i.e. part of
+ -- the count of paths but zero addition to sum of distances.
+ AVG(CASE WHEN {e.weight} = 'Infinity'::double precision
+ THEN 0::double precision
+ ELSE {e.weight}::double precision
+ END) as avg_path_length
+ FROM {apsp_table}
+ WHERE {e.src} != {e.dest}
+ {filter_clause}
+ {group_by_str}
+ """.format(**locals()))
+ # ----------------------------------------------------------------------
+
+ def in_out_degrees(self, out_table, **kwargs):
+ """
+ Args:
+ @param out_table: str. Name of table to store results
+
+ Returns:
+ None
+ """
+ # TODO: validate if output columns names are in grouping_cols
+ if self.grouping_cols:
+ grouping_cols_comma = self.grouping_cols + ", "
+ else:
+ grouping_cols_comma = ''
+ e = self.edge_params
+
+ plpy.execute("""
+ CREATE TABLE {out_table} AS
+ SELECT
+ {grouping_cols_comma}
+ in_q.vertex as {self.vertex_id_col},
+ indegree,
+ outdegree
+ FROM
+ (
+ SELECT
+ {grouping_cols_comma}
+ {e.dest} as vertex,
+ count(*) as indegree
+ FROM {self.edge_table}
+ WHERE {e.src} != {e.dest}
+ GROUP BY {grouping_cols_comma}
+ {e.dest}
+ ) as in_q
+ JOIN
+ (
+ SELECT
+ {grouping_cols_comma}
+ {e.src} as vertex,
+ count(*) as outdegree
+ FROM {self.edge_table}
+ WHERE {e.src} != {e.dest}
+ GROUP BY {grouping_cols_comma}
+ {e.src}
+ ) as out_q
+ USING ({grouping_cols_comma} vertex)
+ """.format(**locals()))
+ # --------------------------------------------------------------------------
+
+ def diameter(self, out_table, apsp_table, **kwargs):
+ """ Compute the diameter of graph
+
+ Diameter is defined as the maximum of the shortest path distances between
+ any pair of vertices
+ Args:
+ @param out_table: str. Name of table to store results in
+ @param apsp_table: str. APSP output is used to compute these metrics.
+ Returns:
+ None
+ """
+ if self.grouping_cols:
+ grouping_cols_comma = self.grouping_cols + ", "
+ group_by_str = 'GROUP BY ' + self.grouping_cols
+ else:
+ grouping_cols_comma = group_by_str = ''
+ e = self.edge_params
+ plpy.execute("""
+ CREATE TABLE {out_table} AS
+ SELECT
+ {grouping_cols_comma}
+ {e.weight} AS diameter,
+ {self._madlib}.matrix_agg(
+ ARRAY[{e.src}, {e.dest}]::double precision[])::integer[]
+ AS diameter_end_vertices
+ FROM
+ {apsp_table} JOIN
+ (
+ SELECT
+ {grouping_cols_comma}
+ max({e.weight}) as {e.weight}
+ FROM {apsp_table}
+ WHERE {e.weight} != 'Infinity'::double precision
+ {group_by_str}
+ ) q
+ USING ({grouping_cols_comma} {e.weight})
+ GROUP BY {grouping_cols_comma} {e.weight}
+ """.format(**locals()))
+# ------------------------------------------------------------------------------
+
+
+def graph_apsp_measures(schema_madlib, apsp_table, out_table,
+ measure_name, vertex_filter_expr=None, **kwargs):
+ """ Define measure that depend on APSP output
+
+ This function acts as a stub for all functions that depend on APSP being run
+ prior.
+
+ """
+
+ _assert(table_exists(apsp_table) and not table_is_empty(apsp_table),
+ "Graph: Invalid APSP table: {0}".format(apsp_table))
+
+ summary_table_name = add_postfix(apsp_table, "_summary")
+ summary_cols = ['vertex_table', 'vertex_id',
+ 'edge_table', 'edge_args', 'grouping_cols']
+ _assert(table_exists(summary_table_name),
+ "Graph: Summary APSP table ({0}) does not exist".format(summary_table_name))
+ _assert(columns_exist_in_table(summary_table_name, summary_cols, schema_madlib),
+ "Graph: Missing some columns from summary table ({0})".format(summary_table_name))
+ _assert(out_table and out_table.strip().lower() not in ('null', ''),
+ "Graph: Invalid output table name ({0})".format(out_table))
+ _assert(not table_exists(out_table),
+ "Graph: Output table ({0}) already exists".format(out_table))
+
+ with MinWarning('warning'):
+ s = plpy.execute("SELECT * FROM {0}".format(summary_table_name))[0]
+ edge_col_names = Graph.get_edge_params(s['edge_args'])
+ g = Graph(s['vertex_table'], dict([('id', s['vertex_id'])]),
+ s['edge_table'], edge_col_names,
+ s['grouping_cols'],
+ should_validate=False,
+ schema_madlib=schema_madlib)
+ try:
+ measure_func = getattr(g, measure_name)
+ measure_func(out_table, apsp_table,
+ vertex_filter_expr=vertex_filter_expr)
+ except AttributeError:
+ plpy.error('Measure {0} not implemented yet'.format(measure_name))
+# ----------------------------------------------------------------------
+
+
+graph_closeness = partial(graph_apsp_measures, measure_name='closeness')
+graph_diameter = partial(graph_apsp_measures, measure_name='diameter')
+graph_avg_path_length = partial(graph_apsp_measures, measure_name='avg_path_length')
+
+
+def graph_vertex_degrees(schema_madlib, vertex_table, vertex_id, edge_table,
+ edge_args, out_table, grouping_cols, **kwargs):
+ """
+ Args:
+ @param schema_madlib: str. Name of MADlib schema
+ @param vertex_table: str. Table name containing vertex data
+ @param vertex_id: str. Column name containing ids of vertices
+ @param edge_table: str. Table name containing edge data
+ @param edge_args: str. Parameters describing edges
+ @param out_table: str. Name of table to store results
+ @param grouping_cols: str. Columns to group computation by
+
+
+ Returns:
+ None
+ """
+ _assert(out_table and out_table.strip().lower() not in ('null', ''),
+ "Graph: Invalid output table name!".format(**locals()))
+ _assert(not table_exists(out_table),
+ "Graph: Output table already exists!".format(**locals()))
+
+ if not vertex_id:
+ vertex_id = 'id'
+ edge_col_names = Graph.get_edge_params(edge_args)
+ g = Graph(vertex_table, dict([('id', vertex_id)]),
+ edge_table, edge_col_names, grouping_cols,
+ schema_madlib=schema_madlib)
+ g.in_out_degrees(out_table)
+# ----------------------------------------------------------------------
+
+# -----------------------------------------------------------------------
+# All help functions
+# -----------------------------------------------------------------------
+
+
+CREATE_GRAPH_EXAMPLE = """
+-- Create a graph, represented as vertex and edge tables.
+DROP TABLE IF EXISTS vertex,edge,out,out_summary,out_path;
+CREATE TABLE vertex(
+ id INTEGER
+ );
+CREATE TABLE edge(
+ src INTEGER,
+ dest INTEGER,
+ weight DOUBLE PRECISION
+);
+
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6),
+(7)
+;
+INSERT INTO edge VALUES
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 10),
+(1, 2, 2),
+(1, 3, 10),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 3),
+(3, 0, 1),
+(4, 0, -2),
+(5, 6, 1),
+(6, 7, 1)
+;
+"""
+
+COMPUTE_APSP_EXAMPLE = """
+-- Compute the apsp:
+DROP TABLE IF EXISTS out;
+SELECT graph_apsp(
+ 'vertex', -- Vertex table
+ 'id', -- Vertix id column
+ 'edge', -- Edge table
+ 'src=src, dest=dest, weight=weight', -- Comma delimited string of edge arguments
+ 'out' -- Output table of apsp
+);
+"""
+
+
+def graph_closeness_help(schema_madlib, message, **kwargs):
+
+ intro = """
+The Closeness function returns various closeness centrality measures and the
+k-degree for given subset of vertices. The closeness measures are the inverse of
+the sum, the inverse of the average, and the sum of inverses of the shortest
+distances to all reachable target vertices (excluding the source vertex).
+ """
+
+ if not message:
+ help_string = intro + """
+For more details:
+ SELECT {schema_madlib}.graph_closeness('usage')
+ """
+ elif message.lower() in ['usage', 'help', '?']:
+ help_string = intro + """
+
+----------------------------------------------------------------------------
+ USAGE
+----------------------------------------------------------------------------
+SELECT {schema_madlib}.graph_closeness(
+ apsp_table TEXT, -- Name of table containing APSP results
+ out_table TEXT, -- Name of table to store Closeness measuress
+ vertex_filter_expr TEXT -- Valid PostgreSQL expression that describes the
+ -- vertices to generate closeness measures for.
+)
+
+----------------------------------------------------------------------------
+ OUTPUT
+----------------------------------------------------------------------------
+The output table contains a row for every vertex of every group and have
+the following columns (in addition to the grouping columns):
+ - inverse_sum_dist : Inverse of the sum of shortest distances to all reachable
+ vertices.
+ - inverse_average_dist: Inverse of the average of shortest distances to all
+ reachable vertices.
+ - sum_inverse_dist : Sum of the inverse of shortest distances to all reachable
+ vertices.
+ - k_degree : Total number of reachable vertices.
+
+ """
+ elif message.lower() in ['example', 'examples']:
+ help_string = """
+----------------------------------------------------------------------------
+ EXAMPLES
+----------------------------------------------------------------------------
+{create_graph_example}
+{compute_apsp_example}
+
+-# Compute the closeness measure for all nodes:
+DROP TABLE IF EXISTS out_closeness;
+SELECT {schema_madlib}.graph_closeness('out_apsp', 'out_closeness');
+SELECT * FROM out_closeness;
+ """
+ else:
+ help_string = "No such option. Use {schema_madlib}.graph_closeness()"
+
+ return help_string.format(schema_madlib=schema_madlib,
+ create_graph_example=CREATE_GRAPH_EXAMPLE,
+ compute_apsp_example=COMPUTE_APSP_EXAMPLE)
+# -------------------------------------------------------------------------
+
+
+def graph_diameter_help(schema_madlib, message, **kwargs):
+
+ intro = """
+Diameter is defined as the longest of all shortest paths in a graph.
+ """
+
+ if not message:
+ help_string = intro + """
+For more details:
+ SELECT {schema_madlib}.graph_diameter('usage')
+ """
+ elif message.lower() in ['usage', 'help', '?']:
+ help_string = intro + """
+
+----------------------------------------------------------------------------
+ USAGE
+----------------------------------------------------------------------------
+SELECT {schema_madlib}.graph_diameter(
+ apsp_table TEXT, -- Name of table containing APSP results
+ out_table TEXT -- Name of table to store Closeness measuress
+)
+
+----------------------------------------------------------------------------
+ OUTPUT
+----------------------------------------------------------------------------
+It contains a row for every group, the diameter value and the two vertices
+that are the farthest apart.
+ """
+ elif message.lower() in ['example', 'examples']:
+ help_string = """
+----------------------------------------------------------------------------
+ EXAMPLES
+----------------------------------------------------------------------------
+{create_graph_example}
+{compute_apsp_example}
+
+-# Compute the diameter measure for the graph:
+DROP TABLE IF EXISTS out_diameter;
+SELECT {schema_madlib}.graph_diameter('out_apsp', 'out_diameter');
+SELECT * FROM out_diameter;
+ """
+ else:
+ help_string = "No such option. Use {schema_madlib}.graph_diameter()"
+
+ return help_string.format(schema_madlib=schema_madlib,
+ create_graph_example=CREATE_GRAPH_EXAMPLE,
+ compute_apsp_example=COMPUTE_APSP_EXAMPLE)
+# -------------------------------------------------------------------------
+
+
+def graph_avg_path_length_help(schema_madlib, message, **kwargs):
+
+ intro = """
+This function computes the average of the shortest paths between each pair of
+vertices. Average path length is based on "reachable target vertices", so it
+ignores infinite-length paths between vertices that are not connected.
+ """
+
+ if not message:
+ help_string = intro + """
+For more details:
+ SELECT {schema_madlib}.graph_avg_path_length('usage')
+ """
+ elif message.lower() in ['usage', 'help', '?']:
+ help_string = intro + """
+
+----------------------------------------------------------------------------
+ USAGE
+----------------------------------------------------------------------------
+SELECT {schema_madlib}.graph_avg_path_length(
+ apsp_table TEXT, -- Name of table containing APSP results
+ out_table TEXT -- Name of table to store Closeness measuress
+)
+
+----------------------------------------------------------------------------
+ OUTPUT
+----------------------------------------------------------------------------
+It contains a row for every group, and the average path value.
+"""
+ elif message.lower() in ['example', 'examples']:
+ help_string = """
+----------------------------------------------------------------------------
+ EXAMPLES
+----------------------------------------------------------------------------
+{create_graph_example}
+{compute_apsp_example}
+
+-# Compute the average path length for the graph:
+DROP TABLE IF EXISTS out_avg_path_length;
+SELECT {schema_madlib}.graph_avg_path_length('out_apsp', 'out_avg_path_length');
+SELECT * FROM out_avg_path_length;
+ """
+ else:
+ help_string = "No such option. Use {schema_madlib}.graph_avg_path_length()"
+
+ return help_string.format(schema_madlib=schema_madlib,
+ create_graph_example=CREATE_GRAPH_EXAMPLE,
+ compute_apsp_example=COMPUTE_APSP_EXAMPLE)
+# -------------------------------------------------------------------------
+
+
+def graph_vertex_degrees_help(schema_madlib, message, **kwargs):
+
+ intro = """
+This function computes the degree of each node. The node degree is the number of
+edges adjacent to that node. The node in-degree is the number of edges pointing
+in to the node and node out-degree is the number of edges pointing out of the
+node.
+ """
+ usage_text = get_graph_usage(schema_madlib,
+ 'graph_vertex_degrees',
+ """
+ out_table TEXT, -- Name of the table to store the result of apsp.
+ grouping_cols TEXT -- The list of grouping columns.""")
+
+ if not message:
+ help_string = intro + """
+For more details:
+ SELECT {schema_madlib}.graph_vertex_degrees('usage')
+ """
+ elif message.lower() in ['usage', 'help', '?']:
+ help_string = intro + """
+{graph_usage}
+
+----------------------------------------------------------------------------
+ OUTPUT
+----------------------------------------------------------------------------
+It contains a row for every vertex of every group and has the following columns
+(in addition to the grouping columns):
+ - vertex : The id for the source vertex. Will use the input vertex
+ column 'id' for column naming.
+ - indegree : Number of incoming edges to the vertex.
+ - outdegree : Number of outgoing edges from the vertex.
+
+"""
+ elif message.lower() in ['example', 'examples']:
+ help_string = """
+----------------------------------------------------------------------------
+ EXAMPLES
+----------------------------------------------------------------------------
+{create_graph_example}
+
+DROP TABLE IF EXISTS degrees;
+SELECT {schema_madlib}.graph_vertex_degrees(
+ 'vertex', -- Vertex table
+ 'id', -- Vertix id column (NULL means use default naming)
+ 'edge', -- Edge table
+ 'src=src, dest=dest, weight=weight',
+ 'degrees'); -- Output table of shortest paths
+SELECT * FROM degrees ORDER BY id;
+ """
+ else:
+ help_string = "No such option. Use {schema_madlib}.graph_vertex_degrees()"
+
+ return help_string.format(schema_madlib=schema_madlib,
+ create_graph_example=CREATE_GRAPH_EXAMPLE,
+ graph_usage=usage_text)
+# -------------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/src/ports/postgres/modules/graph/measures.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/measures.sql_in b/src/ports/postgres/modules/graph/measures.sql_in
new file mode 100644
index 0000000..1992139
--- /dev/null
+++ b/src/ports/postgres/modules/graph/measures.sql_in
@@ -0,0 +1,816 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * @file closeness.sql_in
+ *
+ * @brief SQL functions for graph analytics
+ * @date Jun 2017
+ *
+ * @sa Provides analytical measures for graphs
+ *
+ *//* ----------------------------------------------------------------------- */
+m4_include(`SQLCommon.m4')
+
+/**
+@addtogroup grp_graph_closeness
+@brief Computes the closeness centrality value of each node in the graph.
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#closeness">Closeness</a></li>
+<li><a href="#examples">Examples</a></li>
+</ul>
+</div>
+
+The Closeness function returns various closeness centrality measures and the
+k-degree for given subset of vertices. The closeness measures are the inverse of
+the sum, the inverse of the average, and the sum of inverses of the shortest
+distances to all reachable target vertices (excluding the source vertex).
+
+@note The closeness measures require a valid output from a prior APSP run - both
+the APSP table and the associated output summary table. APSP is a
+computationally expensive algorithm because it finds the shortest path between
+all nodes in the graph. The worst case run-time for this implementation is O(V^2
+* E) where V is the number of vertices and E is the number of edges. In
+practice, run-time will be generally be much less than this, depending on the
+graph.
+
+@anchor closeness
+@par Closeness
+<pre class="syntax">
+graph_closeness( apsp_table,
+ output_table,
+ vertex_filter_expr
+ )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>apsp_table</dt>
+<dd>TEXT. Name of the output table generated by a prior run of all pairs shortest path (APSP).
+</dd>
+
+<dt>out_table</dt>
+<dd>TEXT. Name of the table to store the closeness measures.
+It contains a row for every vertex of every group and have
+the following columns (in addition to the grouping columns):
+ - inverse_sum_dist: Inverse of the sum of shortest distances to all reachable
+ vertices.
+ - inverse_average_dist: Inverse of the average of shortest distances to all
+ reachable vertices.
+ - sum_inverse_dist: Sum of the inverse of shortest distances to all reachable
+ vertices.
+ - k_degree: Total number of reachable vertices.
+</dd>
+
+<dt>vertex_filter_expr (optional)</dt>
+
+<dd>TEXT, default = NULL. Valid PostgreSQL expression that describes the
+ vertices to generate closeness measures for. If this parameter is not
+specified, closeness measures are generated for all vertices in the apsp table.
+This input should be treated like a WHERE clause.
+
+Some example inputs:
+- If you want a short list of vertices, say 1, 2 and 3:
+<pre>vertex_id IN (1, 2, 3)</pre>
+- If you want a range of vertices between 1000 and 2000:
+<pre>vertix_id BETWEEN 1000 AND 2000</pre>
+- If you want a set of vertices from a separate table satisfying to a condition
+<pre>EXISTS (SELECT vertex_id FROM vertices_of_interest
+ WHERE vertex_id > 5000 AND condition = 'xyz')
+</pre>
+
+</dd>
+</dl>
+
+@anchor examples
+@examp
+
+-# Create vertex and edge tables to represent the graph:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+ id INTEGER,
+ name TEXT
+ );
+CREATE TABLE edge(
+ src_id INTEGER,
+ dest_id INTEGER,
+ edge_weight FLOAT8
+ );
+INSERT INTO vertex VALUES
+(0, 'A'),
+(1, 'B'),
+(2, 'C'),
+(3, 'D'),
+(4, 'E'),
+(5, 'F'),
+(6, 'G'),
+(7, 'H');
+INSERT INTO edge VALUES
+(0, 1, 1.0),
+(0, 2, 1.0),
+(0, 4, 10.0),
+(1, 2, 2.0),
+(1, 3, 10.0),
+(2, 3, 1.0),
+(2, 5, 1.0),
+(2, 6, 3.0),
+(3, 0, 1.0),
+(4, 0, -2.0),
+(5, 6, 1.0),
+(6, 7, 1.0);
+</pre>
+
+-# Calculate the all-pair shortest paths:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_apsp, out_apsp_summary;
+SELECT madlib.graph_apsp('vertex', -- Vertex table
+ 'id', -- Vertix id column (NULL means use default naming)
+ 'edge', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ -- Edge arguments (NULL means use default naming)
+ 'out_apsp'); -- Output table of shortest paths
+</pre>
+
+-# Compute the closeness measure for all nodes:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_closeness;
+SELECT madlib.graph_closeness('out_apsp', 'out_closeness');
+SELECT * FROM out_closeness;
+</pre>
+<pre class="result">
+ src_id | inverse_sum_dist | inverse_avg_dist | sum_inverse_dist | k_degree
+\--------+--------------------+-------------------+------------------+----------
+ 1 | 0.0285714285714286 | 0.2 | 1.93809523809524 | 7
+ 3 | 0.0357142857142857 | 0.25 | 2.87424242424242 | 7
+ 4 | -1 | -7 | -1 | 7
+ 0 | 0.0434782608695652 | 0.304347826086957 | 3.68333333333333 | 7
+ 6 | 1 | 1 | 1 | 1
+ 2 | 0.0416666666666667 | 0.291666666666667 | 3.75 | 7
+ 5 | 0.333333333333333 | 0.666666666666667 | 1.5 | 2
+ 7 | [NULL] | [NULL] | 0 | 0
+(8 rows)
+</pre>
+
+-# Create a graph with 2 groups and find APSP for each group:
+<pre class="syntax">
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+ SELECT *, 0 AS grp FROM edge
+ UNION
+ SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+</pre>
+
+-# Find APSP for all groups:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr, out_gr_summary;
+SELECT madlib.graph_apsp(
+ 'vertex', -- Vertex table
+ NULL, -- Vertex id column (NULL means use default naming)
+ 'edge_gr', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ 'out_gr', -- Output table of shortest paths
+ 'grp' -- Grouping columns
+);
+</pre>
+
+-# Compute closeness measure for vertex 0 to vertex 5 in every group
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr_path;
+SELECT madlib.graph_closeness('out_gr', 'out_gr_closeness', 'src_id >= 0 and src_id <=5');
+SELECT * FROM out_gr_closeness ORDER BY grp;
+</pre>
+<pre class="result">
+ grp | src_id | inverse_sum_dist | inverse_avg_dist | sum_inverse_dist | k_degree
+-------+----------+-----------------------+----------------------+---------------------+------------
+ 0 | 0 | 0.0434782608695652 | 0.304347826086957 | 3.68333333333333 | 7
+ 0 | 5 | 0.333333333333333 | 0.666666666666667 | 1.5 | 2
+ 0 | 4 | -1 | -7 | -1 | 7
+ 0 | 3 | 0.0357142857142857 | 0.25 | 2.87424242424242 | 7
+ 0 | 1 | 0.0285714285714286 | 0.2 | 1.93809523809524 | 7
+ 0 | 2 | 0.0416666666666667 | 0.291666666666667 | 3.75 | 7
+ 1 | 3 | 0.142857142857143 | 0.714285714285714 | 1.97979797979798 | 5
+ 1 | 5 | [NULL] | [NULL] | 0 | 0
+ 1 | 0 | 0.25 | 1.25 | 2.5 | 5
+ 1 | 1 | 0.0588235294117647 | 0.294117647058824 | 0.988095238095238 | 5
+ 1 | 2 | 0.1 | 0.5 | 1.79166666666667 | 5
+ 1 | 4 | -0.0416666666666667 | -0.208333333333333 | -2.55 | 5
+(12 rows)
+</pre>
+
+*/
+-------------------------------------------------------------------------
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_closeness(
+ apsp_table TEXT,
+ out_table TEXT,
+ vertex_filter_expr TEXT
+) RETURNS VOID AS $$
+ PythonFunction(graph, measures, graph_closeness)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_closeness(
+ apsp_table TEXT,
+ out_table TEXT
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.graph_closeness($1, $2, NULL);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_closeness(
+ message VARCHAR
+) RETURNS VARCHAR AS $$
+ PythonFunction(graph, measures, graph_closeness_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_closeness()
+RETURNS VARCHAR AS $$
+ SELECT MADLIB_SCHEMA.graph_closeness('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+/**
+@addtogroup grp_graph_diameter
+@brief Computes the diameter of a graph.
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#diameter">Diameter</a></li>
+<li><a href="#examples">Examples</a></li>
+</ul>
+</div>
+
+Diameter is defined as the longest of all shortest paths in a graph.
+
+@note This function assumes a valid output from a prior APSP run - both the APSP
+table and the associated output summary table. APSP is a computationally
+expensive algorithm because it finds the shortest path between all nodes in the
+graph. The worst case run-time for this implementation is O(V^2 * E) where V is
+the number of vertices and E is the number of edges. In practice, run-time will
+be generally be much less than this, depending on the graph.
+
+@anchor diameter
+@par Diameter
+<pre class="syntax">
+graph_diameter( apsp_table,
+ output_table
+ )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>apsp_table</dt>
+<dd>TEXT. Name of the output table generated by a prior run of all pairs shortest path (APSP).
+</dd>
+
+<dt>out_table</dt>
+<dd>TEXT. Name of the table to store the diameter. It contains a row for every
+group, the diameter value and the two vertices that are the farthest apart.
+</dd>
+
+</dl>
+
+@anchor examples
+@examp
+
+-# Create vertex and edge tables to represent the graph:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+ id INTEGER,
+ name TEXT
+ );
+CREATE TABLE edge(
+ src_id INTEGER,
+ dest_id INTEGER,
+ edge_weight FLOAT8
+ );
+INSERT INTO vertex VALUES
+(0, 'A'),
+(1, 'B'),
+(2, 'C'),
+(3, 'D'),
+(4, 'E'),
+(5, 'F'),
+(6, 'G'),
+(7, 'H');
+INSERT INTO edge VALUES
+(0, 1, 1.0),
+(0, 2, 1.0),
+(0, 4, 10.0),
+(1, 2, 2.0),
+(1, 3, 10.0),
+(2, 3, 1.0),
+(2, 5, 1.0),
+(2, 6, 3.0),
+(3, 0, 1.0),
+(4, 0, -2.0),
+(5, 6, 1.0),
+(6, 7, 1.0);
+</pre>
+
+-# Calculate the all-pair shortest paths:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_apsp, out_apsp_summary;
+SELECT madlib.graph_apsp('vertex', -- Vertex table
+ 'id', -- Vertix id column (NULL means use default naming)
+ 'edge', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ -- Edge arguments (NULL means use default naming)
+ 'out_apsp'); -- Output table of shortest paths
+</pre>
+
+-# Compute the diameter measure for the graph:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_diameter;
+SELECT madlib.graph_diameter('out_apsp', 'out_diameter');
+SELECT * FROM out_diameter;
+</pre>
+<pre class="result">
+diameter | diameter_end_vertices
+\---------+-----------------------
+ 14 | {{1,4}}
+(1 row)
+</pre>
+
+-# Create a graph with 2 groups and find APSP for each group:
+<pre class="syntax">
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+ SELECT *, 0 AS grp FROM edge
+ UNION
+ SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+</pre>
+
+-# Find APSP for all groups:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr, out_gr_summary;
+SELECT madlib.graph_apsp(
+ 'vertex', -- Vertex table
+ NULL, -- Vertex id column (NULL means use default naming)
+ 'edge_gr', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ 'out_gr', -- Output table of shortest paths
+ 'grp' -- Grouping columns
+);
+</pre>
+
+-# Find the diameter of graph in every group
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr_path;
+SELECT madlib.graph_diameter('out_gr', 'out_gr_diameter');
+SELECT * FROM out_gr_diameter ORDER BY grp;
+</pre>
+<pre class="result">
+grp | diameter | diameter_end_vertices
+\------+------------+-------------------------
+ 0 | 14 | {{1,4}}
+ 1 | 14 | {{1,4}}
+(2 rows)
+</pre>
+
+*/
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_diameter(
+ apsp_table TEXT,
+ out_table TEXT
+) RETURNS VOID AS $$
+ PythonFunction(graph, measures, graph_diameter)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_diameter(
+ message VARCHAR
+) RETURNS VARCHAR AS $$
+ PythonFunction(graph, measures, graph_diameter_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_diameter()
+RETURNS VARCHAR AS $$
+ SELECT MADLIB_SCHEMA.graph_diameter('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+/**
+@addtogroup grp_graph_avg_path_length
+@brief Computes the average shortest-path length of a graph.
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#avg_path_length">Average Path Length</a></li>
+<li><a href="#examples">Examples</a></li>
+</ul>
+</div>
+
+This function computes the average of the shortest paths between each pair of
+vertices. Average path length is based on "reachable target vertices", so it
+ignores infinite-length paths between vertices that are not connected.
+
+@note This function assumes a valid output from a prior APSP run - both the APSP
+table and the associated output summary table. APSP is a computationally
+expensive algorithm because it finds the shortest path between all nodes in the
+graph. The worst case run-time for this implementation is O(V^2 * E) where V is
+the number of vertices and E is the number of edges. In practice, run-time will
+be generally be much less than this, depending on the graph.
+
+@anchor avg_path_length
+@par Average Path Length
+<pre class="syntax">
+graph_avg_path_length( apsp_table,
+ output_table
+ )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>apsp_table</dt>
+<dd>TEXT. Name of the output table generated by a prior run of all pairs
+shortest path (APSP).
+</dd>
+
+<dt>out_table</dt>
+<dd>TEXT. Name of the table to store the average path length. It contains a row
+for every group, and the average path value.
+</dd>
+</dl>
+
+@anchor examples
+@examp
+
+-# Create vertex and edge tables to represent the graph:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+ id INTEGER,
+ name TEXT
+ );
+CREATE TABLE edge(
+ src_id INTEGER,
+ dest_id INTEGER,
+ edge_weight FLOAT8
+ );
+INSERT INTO vertex VALUES
+(0, 'A'),
+(1, 'B'),
+(2, 'C'),
+(3, 'D'),
+(4, 'E'),
+(5, 'F'),
+(6, 'G'),
+(7, 'H');
+INSERT INTO edge VALUES
+(0, 1, 1.0),
+(0, 2, 1.0),
+(0, 4, 10.0),
+(1, 2, 2.0),
+(1, 3, 10.0),
+(2, 3, 1.0),
+(2, 5, 1.0),
+(2, 6, 3.0),
+(3, 0, 1.0),
+(4, 0, -2.0),
+(5, 6, 1.0),
+(6, 7, 1.0);
+</pre>
+
+-# Calculate the all-pair shortest paths:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_apsp, out_apsp_summary;
+SELECT madlib.graph_apsp('vertex', -- Vertex table
+ 'id', -- Vertix id column (NULL means use default naming)
+ 'edge', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ -- Edge arguments (NULL means use default naming)
+ 'out_apsp'); -- Output table of shortest paths
+</pre>
+
+-# Compute the average path length measure:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_avg_path_length;
+SELECT madlib.graph_avg_path_length('out_apsp', 'out_avg_path_length');
+SELECT * FROM out_avg_path_length;
+</pre>
+<pre class="result">
+ avg_path_length
+\------------------
+ 2.01785714285714
+(1 row)
+</pre>
+
+-# Create a graph with 2 groups and find APSP for each group:
+<pre class="syntax">
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+ SELECT *, 0 AS grp FROM edge
+ UNION
+ SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+</pre>
+
+-# Find APSP for all groups:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr, out_gr_summary;
+SELECT madlib.graph_apsp(
+ 'vertex', -- Vertex table
+ NULL, -- Vertex id column (NULL means use default naming)
+ 'edge_gr', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ 'out_gr', -- Output table of shortest paths
+ 'grp' -- Grouping columns
+);
+</pre>
+
+-# Find the average path length in every group
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr_path;
+SELECT madlib.graph_avg_path_length('out_gr', 'out_gr_path');
+SELECT * FROM out_gr_path ORDER BY grp;
+</pre>
+<pre class="result">
+ grp | avg_path_length
+\-------+--------------------
+ 0 | 2.01785714285714
+ 1 | 0.466666666666667
+(2 rows)
+</pre>
+*/
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_avg_path_length(
+ apsp_table TEXT,
+ out_table TEXT
+) RETURNS VOID AS $$
+ PythonFunction(graph, measures, graph_avg_path_length)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_avg_path_length(
+ message VARCHAR
+) RETURNS VARCHAR AS $$
+ PythonFunction(graph, measures, graph_avg_path_length_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_avg_path_length()
+RETURNS VARCHAR AS $$
+ SELECT MADLIB_SCHEMA.graph_avg_path_length('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+/**
+@addtogroup grp_graph_vertex_degrees
+@brief Computes the degrees for each vertex
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#degrees">In-out degrees</a></li>
+<li><a href="#examples">Examples</a></li>
+</ul>
+</div>
+
+This function computes the degree of each node. The node degree is the number of
+edges adjacent to that node. The node in-degree is the number of edges pointing
+in to the node and node out-degree is the number of edges pointing out of the
+node.
+
+@anchor degrees
+@par In-out degrees
+<pre class="syntax">
+graph_vertex_degrees(
+ vertex_table,
+ vertex_id,
+ edge_table,
+ edge_args,
+ out_table,
+ grouping_cols
+)
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>vertex_table</dt>
+<dd>TEXT. Name of the table containing the vertex data for the graph. Must
+contain the column specified in the 'vertex_id' parameter below.</dd>
+
+<dt>vertex_id</dt>
+<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
+vertex ids. The vertex ids are of type INTEGER with no duplicates.
+They do not need to be contiguous.</dd>
+
+<dt>edge_table</dt>
+<dd>TEXT. Name of the table containing the edge data. The edge table must
+contain columns for source vertex, destination vertex and edge weight.
+Column naming convention is described below in the 'edge_args' parameter.</dd>
+
+<dt>edge_args</dt>
+<dd>TEXT. A comma-delimited string containing multiple named arguments of
+the form "name=value". The following parameters are supported for
+this string argument:
+ - src (INTEGER): Name of the column containing the source vertex ids in the
+ edge table. Default column name is 'src'.
+ - dest (INTEGER): Name of the column containing the destination vertex ids
+ in the edge table. Default column name is 'dest'.
+ - weight (FLOAT8): Name of the column containing the edge weights in the
+ edge table. Default column name is 'weight'.</dd>
+
+<dt>out_table</dt>
+<dd>TEXT. Name of the table to store the result.
+It contains a row for every vertex of every group and has
+the following columns (in addition to the grouping columns):
+ - vertex: The id for the source vertex. Will use the input vertex
+ column 'id' for column naming.
+ - indegree: Number of incoming edges to the vertex.
+ - outdegree: Number of outgoing edges from the vertex.
+
+<dt>grouping_cols</dt>
+<dd>TEXT, default = NULL. List of columns used to group the input into discrete
+subgraphs. These columns must exist in the edge table. When this value is null,
+no grouping is used and a single result is generated. </dd>
+</dl>
+
+@anchor examples
+@examp
+
+-# Create vertex and edge tables to represent the graph:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+ id INTEGER,
+ name TEXT
+ );
+CREATE TABLE edge(
+ src_id INTEGER,
+ dest_id INTEGER,
+ edge_weight FLOAT8
+ );
+INSERT INTO vertex VALUES
+(0, 'A'),
+(1, 'B'),
+(2, 'C'),
+(3, 'D'),
+(4, 'E'),
+(5, 'F'),
+(6, 'G'),
+(7, 'H');
+INSERT INTO edge VALUES
+(0, 1, 1.0),
+(0, 2, 1.0),
+(0, 4, 10.0),
+(1, 2, 2.0),
+(1, 3, 10.0),
+(2, 3, 1.0),
+(2, 5, 1.0),
+(2, 6, 3.0),
+(3, 0, 1.0),
+(4, 0, -2.0),
+(5, 6, 1.0),
+(6, 7, 1.0);
+</pre>
+
+-# Calculate the in-out degrees for each node:
+<pre class="syntax">
+DROP TABLE IF EXISTS degrees;
+SELECT madlib.graph_vertex_degrees(
+ 'vertex', -- Vertex table
+ 'id', -- Vertix id column (NULL means use default naming)
+ 'edge', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ 'degrees'); -- Output table of shortest paths
+SELECT * FROM degrees ORDER BY id;
+</pre>
+<pre class="result">
+ id | indegree | outdegree
+\----+----------+-----------
+ 0 | 2 | 3
+ 1 | 1 | 2
+ 2 | 2 | 3
+ 3 | 2 | 1
+ 4 | 1 | 1
+ 5 | 1 | 1
+ 6 | 2 | 1
+(7 rows)
+</pre>
+
+-# Create a graph with 2 groups and find degrees for each group:
+<pre class="syntax">
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+ SELECT *, 0 AS grp FROM edge
+ UNION
+ SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+</pre>
+
+-# Find APSP for all groups:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr;
+SELECT madlib.graph_vertex_degrees(
+ 'vertex', -- Vertex table
+ NULL, -- Vertex id column (NULL means use default naming)
+ 'edge_gr', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ 'out_gr', -- Output table of shortest paths
+ 'grp' -- Grouping columns
+);
+SELECT * FROM out_gr WHERE id < 2 ORDER BY grp, id;
+</pre>
+<pre class="result">
+ grp | id | indegree | outdegree
+\-------+------+------------+-------------
+ 0 | 0 | 2 | 3
+ 0 | 1 | 1 | 2
+ 1 | 0 | 2 | 3
+ 1 | 1 | 1 | 2
+(4 rows)
+</pre>
+*/
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_vertex_degrees(
+ vertex_table TEXT,
+ vertex_id TEXT,
+ edge_table TEXT,
+ edge_args TEXT,
+ out_table TEXT,
+ grouping_cols TEXT
+) RETURNS VOID AS $$
+ PythonFunction(graph, measures, graph_vertex_degrees)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_vertex_degrees(
+ vertex_table TEXT,
+ vertex_id TEXT,
+ edge_table TEXT,
+ edge_args TEXT,
+ out_table TEXT
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.graph_vertex_degrees($1, $2, $3, $4, $5, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_vertex_degrees(
+ message VARCHAR
+) RETURNS VARCHAR AS $$
+ PythonFunction(graph, measures, graph_vertex_degrees_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_vertex_degrees()
+RETURNS VARCHAR AS $$
+ SELECT MADLIB_SCHEMA.graph_vertex_degrees('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/src/ports/postgres/modules/graph/test/measures.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/test/measures.sql_in b/src/ports/postgres/modules/graph/test/measures.sql_in
new file mode 100644
index 0000000..b0bf95e
--- /dev/null
+++ b/src/ports/postgres/modules/graph/test/measures.sql_in
@@ -0,0 +1,150 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+/* ----------------------------------------------------------------------- */
+
+-- Create vertex and edge tables to represent the graph:
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+ id INTEGER,
+ name TEXT
+ );
+CREATE TABLE edge(
+ src_id INTEGER,
+ dest_id INTEGER,
+ edge_weight FLOAT8
+ );
+INSERT INTO vertex VALUES
+(0, 'A'),
+(1, 'B'),
+(2, 'C'),
+(3, 'D'),
+(4, 'E'),
+(5, 'F'),
+(6, 'G'),
+(7, 'H');
+INSERT INTO edge VALUES
+(0, 1, 1.0),
+(0, 2, 1.0),
+(0, 4, 10.0),
+(1, 2, 2.0),
+(1, 3, 10.0),
+(2, 3, 1.0),
+(2, 5, 1.0),
+(2, 6, 3.0),
+(3, 0, 1.0),
+(4, 0, -2.0),
+(5, 6, 1.0),
+(6, 7, 1.0);
+
+-- Calculate the all-pair shortest paths:
+DROP TABLE IF EXISTS out_apsp, out_apsp_summary;
+SELECT graph_apsp('vertex', -- Vertex table
+ 'id', -- Vertix id column (NULL means use default naming)
+ 'edge', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ -- Edge arguments (NULL means use default naming)
+ 'out_apsp'); -- Output table of shortest paths
+
+-- Compute the closeness measure for all nodes:
+DROP TABLE IF EXISTS out_closeness;
+SELECT graph_closeness('out_apsp', 'out_closeness');
+SELECT * FROM out_closeness;
+
+SELECT assert(relative_error(inverse_sum_dist, 0.04347) < 1e-2 and
+ relative_error(inverse_avg_dist, 0.3043) < 1e-2 and
+ relative_error(sum_inverse_dist, 3.6833) < 1e-2 and
+ k_degree = 7,
+ 'Incorrect value for closeness')
+FROM out_closeness
+WHERE src_id = 0;
+
+-- Compute the diameter measure for graph
+DROP TABLE IF EXISTS out_diameter;
+SELECT graph_diameter('out_apsp', 'out_diameter');
+SELECT * FROM out_diameter;
+SELECT assert(diameter=14, 'Invalid value for diameter') FROM out_diameter;
+
+-- Compute the average path length measure for graph
+DROP TABLE IF EXISTS out_avg_path_length;
+SELECT graph_avg_path_length('out_apsp', 'out_avg_path_length');
+SELECT * FROM out_avg_path_length;
+SELECT assert(relative_error(avg_path_length, 2.0178) < 1e-2,
+ 'Invalid value for avg_path_length') FROM out_avg_path_length;
+
+-- Compute the in and out degrees
+DROP TABLE IF EXISTS out_degrees;
+SELECT graph_vertex_degrees('vertex', -- Vertex table
+ 'id', -- Vertix id column (NULL means use default naming)
+ 'edge', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ -- Edge arguments (NULL means use default naming)
+ 'out_degrees');
+SELECT * FROM out_degrees;
+SELECT assert(indegree = 2 and outdegree = 3, 'Invalid value for degrees')
+FROM out_degrees
+WHERE id = 0;
+-------------------------------------------------------------------------
+-- Grouping -----------------------------------------------------------
+------------------------------------------------------------------------------
+
+-- Create a graph with 2 groups and find APSP for each group:
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+ SELECT *, 0 AS grp FROM edge
+ UNION
+ SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+
+-- Find APSP for all groups:
+DROP TABLE IF EXISTS out_apsp, out_apsp_summary;
+SELECT graph_apsp('vertex', -- Vertex table
+ 'id', -- Vertex id column (NULL means use default naming)
+ 'edge_gr', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ 'out_apsp', -- Output table of shortest paths
+ 'grp' -- Grouping columns
+);
+
+DROP TABLE IF EXISTS out_closeness;
+SELECT graph_closeness('out_apsp', 'out_closeness');
+SELECT * FROM out_closeness ORDER BY grp, src_id;
+
+DROP TABLE IF EXISTS out_diameter;
+SELECT graph_diameter('out_apsp', 'out_diameter');
+SELECT * FROM out_diameter ORDER BY grp;
+
+-- Compute the closeness measure for all nodes:
+DROP TABLE IF EXISTS out;
+SELECT graph_avg_path_length('out_apsp', 'out');
+SELECT * FROM out ORDER BY grp;
+
+-- Compute the closeness measure for all nodes:
+DROP TABLE IF EXISTS out_degrees;
+SELECT graph_vertex_degrees('vertex', -- Vertex table
+ 'id', -- Vertix id column (NULL means use default naming)
+ 'edge_gr', -- Edge table
+ 'src=src_id, dest=dest_id, weight=edge_weight',
+ 'out_degrees',
+ 'grp');
+SELECT * FROM out_degrees ORDER BY grp, id;