You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by fm...@apache.org on 2017/07/28 18:55:16 UTC
incubator-madlib-site git commit: update weakly connected cpts
workbook with helper functions
Repository: incubator-madlib-site
Updated Branches:
refs/heads/asf-site 12422c78a -> de140c36b
update weakly connected cpts workbook with helper functions
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/commit/de140c36
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/tree/de140c36
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/diff/de140c36
Branch: refs/heads/asf-site
Commit: de140c36b2cd6ad091637d3f5444edecbc0f739d
Parents: 12422c7
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Fri Jul 28 11:54:29 2017 -0700
Committer: Frank McQuillan <fm...@pivotal.io>
Committed: Fri Jul 28 11:54:29 2017 -0700
----------------------------------------------------------------------
.../Weakly-connected-cpts-v1.ipynb | 584 ------------
.../Weakly-connected-cpts-v2.ipynb | 908 +++++++++++++++++++
2 files changed, 908 insertions(+), 584 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/de140c36/community-artifacts/Weakly-connected-cpts-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Weakly-connected-cpts-v1.ipynb b/community-artifacts/Weakly-connected-cpts-v1.ipynb
deleted file mode 100644
index 813692b..0000000
--- a/community-artifacts/Weakly-connected-cpts-v1.ipynb
+++ /dev/null
@@ -1,584 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Weakly connected components\n",
- "\n",
- "Weakly connected components was added in MADlib 1.12."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The sql extension is already loaded. To reload it, use:\n",
- " %reload_ext sql\n"
- ]
- }
- ],
- "source": [
- "%load_ext sql"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "u'Connected: gpdbchina@madlib'"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Greenplum 4.3.10.0\n",
- "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
- " \n",
- "# PostgreSQL local\n",
- "#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
- "\n",
- "# Greenplum 4.2.3.0\n",
- "#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>version</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-188-ge47dc8a, cmake configuration time: Wed Jul 5 18:23:50 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(u'MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-188-ge47dc8a, cmake configuration time: Wed Jul 5 18:23:50 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%sql select madlib.version();\n",
- "#%sql select version();"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 1. Create vertex and edge tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done.\n",
- "Done.\n",
- "Done.\n",
- "14 rows affected.\n",
- "18 rows affected.\n",
- "18 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>src</th>\n",
- " <th>dest</th>\n",
- " <th>user_id</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>0</td>\n",
- " <td>2</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>1</td>\n",
- " <td>2</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>1</td>\n",
- " <td>3</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>3</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>5</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>6</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>3</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>5</td>\n",
- " <td>6</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>6</td>\n",
- " <td>3</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>10</td>\n",
- " <td>11</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>10</td>\n",
- " <td>12</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>11</td>\n",
- " <td>12</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>11</td>\n",
- " <td>13</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>12</td>\n",
- " <td>13</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>13</td>\n",
- " <td>10</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>15</td>\n",
- " <td>14</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>15</td>\n",
- " <td>16</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(0, 1, 1),\n",
- " (0, 2, 1),\n",
- " (1, 2, 1),\n",
- " (1, 3, 1),\n",
- " (2, 3, 1),\n",
- " (2, 5, 1),\n",
- " (2, 6, 1),\n",
- " (3, 0, 1),\n",
- " (5, 6, 1),\n",
- " (6, 3, 1),\n",
- " (10, 11, 2),\n",
- " (10, 12, 2),\n",
- " (11, 12, 2),\n",
- " (11, 13, 2),\n",
- " (12, 13, 2),\n",
- " (13, 10, 2),\n",
- " (15, 14, 2),\n",
- " (15, 16, 2)]"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%%sql \n",
- "DROP TABLE IF EXISTS vertex, edge;\n",
- "\n",
- "CREATE TABLE vertex(\n",
- " id INTEGER\n",
- ");\n",
- "\n",
- "CREATE TABLE edge(\n",
- " src INTEGER,\n",
- " dest INTEGER,\n",
- " user_id INTEGER\n",
- ");\n",
- "\n",
- "INSERT INTO vertex VALUES\n",
- "(0),\n",
- "(1),\n",
- "(2),\n",
- "(3),\n",
- "(4),\n",
- "(5),\n",
- "(6),\n",
- "(10),\n",
- "(11),\n",
- "(12),\n",
- "(13),\n",
- "(14),\n",
- "(15),\n",
- "(16);\n",
- "\n",
- "INSERT INTO edge VALUES\n",
- "(0, 1, 1),\n",
- "(0, 2, 1),\n",
- "(1, 2, 1),\n",
- "(1, 3, 1),\n",
- "(2, 3, 1),\n",
- "(2, 5, 1),\n",
- "(2, 6, 1),\n",
- "(3, 0, 1),\n",
- "(5, 6, 1),\n",
- "(6, 3, 1),\n",
- "(10, 11, 2),\n",
- "(10, 12, 2),\n",
- "(11, 12, 2),\n",
- "(11, 13, 2),\n",
- "(12, 13, 2),\n",
- "(13, 10, 2),\n",
- "(15, 16, 2),\n",
- "(15, 14, 2);\n",
- "\n",
- "SELECT * FROM edge ORDER BY src, dest;"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 2. Find all weakly connected components"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done.\n",
- "1 rows affected.\n",
- "14 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>id</th>\n",
- " <th>component_id</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>3</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>5</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>6</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>4</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>10</td>\n",
- " <td>10</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>11</td>\n",
- " <td>10</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>12</td>\n",
- " <td>10</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>13</td>\n",
- " <td>10</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>14</td>\n",
- " <td>14</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>15</td>\n",
- " <td>14</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>16</td>\n",
- " <td>14</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(0, 0),\n",
- " (1, 0),\n",
- " (2, 0),\n",
- " (3, 0),\n",
- " (5, 0),\n",
- " (6, 0),\n",
- " (4, 4),\n",
- " (10, 10),\n",
- " (11, 10),\n",
- " (12, 10),\n",
- " (13, 10),\n",
- " (14, 14),\n",
- " (15, 14),\n",
- " (16, 14)]"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%%sql\n",
- "DROP TABLE IF EXISTS wcc_out;\n",
- "\n",
- "SELECT madlib.weakly_connected_components(\n",
- " 'vertex', -- Vertex table\n",
- " 'id', -- Vertix id column\n",
- " 'edge', -- Edge table\n",
- " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
- " 'wcc_out'); -- Output table of weakly connected components\n",
- "\n",
- "SELECT * FROM wcc_out ORDER BY component_id, id;"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 3. Weakly connected components with grouping"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done.\n",
- "1 rows affected.\n",
- "13 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>id</th>\n",
- " <th>component_id</th>\n",
- " <th>user_id</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>3</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>5</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>6</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>10</td>\n",
- " <td>10</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>11</td>\n",
- " <td>10</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>12</td>\n",
- " <td>10</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>13</td>\n",
- " <td>10</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>14</td>\n",
- " <td>14</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>15</td>\n",
- " <td>14</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>16</td>\n",
- " <td>14</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(0, 0, 1),\n",
- " (1, 0, 1),\n",
- " (2, 0, 1),\n",
- " (3, 0, 1),\n",
- " (5, 0, 1),\n",
- " (6, 0, 1),\n",
- " (10, 10, 2),\n",
- " (11, 10, 2),\n",
- " (12, 10, 2),\n",
- " (13, 10, 2),\n",
- " (14, 14, 2),\n",
- " (15, 14, 2),\n",
- " (16, 14, 2)]"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%%sql\n",
- "DROP TABLE IF EXISTS wcc_out, wcc_out_summary;\n",
- "\n",
- "SELECT madlib.weakly_connected_components(\n",
- " 'vertex', -- Vertex table\n",
- " 'id', -- Vertix id column\n",
- " 'edge', -- Edge table\n",
- " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
- " 'wcc_out', -- Output table of weakly connected components\n",
- " 'user_id'); -- Grouping column name\n",
- "\n",
- "SELECT * FROM wcc_out ORDER BY component_id, id, user_id;"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Note that vertex '4' is not identified as a separate component in the above result. This is because disconnected nodes cannot be assigned to a particular group with the current graph representation in MADlib."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/de140c36/community-artifacts/Weakly-connected-cpts-v2.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Weakly-connected-cpts-v2.ipynb b/community-artifacts/Weakly-connected-cpts-v2.ipynb
new file mode 100644
index 0000000..3258e44
--- /dev/null
+++ b/community-artifacts/Weakly-connected-cpts-v2.ipynb
@@ -0,0 +1,908 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Weakly connected components\n",
+ "\n",
+ "Weakly connected components was added in MADlib 1.12. This notebook also includes helper functions based on the WCC output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
+ " \"You should import from traitlets.config instead.\", ShimWarning)\n",
+ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
+ " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext sql"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "u'Connected: gpdbchina@madlib'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Greenplum 4.3.10.0\n",
+ "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+ " \n",
+ "# PostgreSQL local\n",
+ "#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+ "\n",
+ "# Greenplum 4.2.3.0\n",
+ "#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>version</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-195-g85e89ef, cmake configuration time: Wed Jul 26 23:21:17 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-195-g85e89ef, cmake configuration time: Wed Jul 26 23:21:17 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql select madlib.version();\n",
+ "#%sql select version();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Create vertex and edge tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "Done.\n",
+ "Done.\n",
+ "14 rows affected.\n",
+ "18 rows affected.\n",
+ "18 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>src</th>\n",
+ " <th>dest</th>\n",
+ " <th>user_id</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>12</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>12</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>13</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>13</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>10</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>14</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>16</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(0, 1, 1),\n",
+ " (0, 2, 1),\n",
+ " (1, 2, 1),\n",
+ " (1, 3, 1),\n",
+ " (2, 3, 1),\n",
+ " (2, 5, 1),\n",
+ " (2, 6, 1),\n",
+ " (3, 0, 1),\n",
+ " (5, 6, 1),\n",
+ " (6, 3, 1),\n",
+ " (10, 11, 2),\n",
+ " (10, 12, 2),\n",
+ " (11, 12, 2),\n",
+ " (11, 13, 2),\n",
+ " (12, 13, 2),\n",
+ " (13, 10, 2),\n",
+ " (15, 14, 2),\n",
+ " (15, 16, 2)]"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql \n",
+ "DROP TABLE IF EXISTS vertex, edge;\n",
+ "\n",
+ "CREATE TABLE vertex(\n",
+ " id INTEGER\n",
+ ");\n",
+ "\n",
+ "CREATE TABLE edge(\n",
+ " src INTEGER,\n",
+ " dest INTEGER,\n",
+ " user_id INTEGER\n",
+ ");\n",
+ "\n",
+ "INSERT INTO vertex VALUES\n",
+ "(0),\n",
+ "(1),\n",
+ "(2),\n",
+ "(3),\n",
+ "(4),\n",
+ "(5),\n",
+ "(6),\n",
+ "(10),\n",
+ "(11),\n",
+ "(12),\n",
+ "(13),\n",
+ "(14),\n",
+ "(15),\n",
+ "(16);\n",
+ "\n",
+ "INSERT INTO edge VALUES\n",
+ "(0, 1, 1),\n",
+ "(0, 2, 1),\n",
+ "(1, 2, 1),\n",
+ "(1, 3, 1),\n",
+ "(2, 3, 1),\n",
+ "(2, 5, 1),\n",
+ "(2, 6, 1),\n",
+ "(3, 0, 1),\n",
+ "(5, 6, 1),\n",
+ "(6, 3, 1),\n",
+ "(10, 11, 2),\n",
+ "(10, 12, 2),\n",
+ "(11, 12, 2),\n",
+ "(11, 13, 2),\n",
+ "(12, 13, 2),\n",
+ "(13, 10, 2),\n",
+ "(15, 16, 2),\n",
+ "(15, 14, 2);\n",
+ "\n",
+ "SELECT * FROM edge ORDER BY src, dest;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Find all weakly connected components"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "14 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>component_id</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>14</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>14</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>14</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(0, 0),\n",
+ " (1, 0),\n",
+ " (2, 0),\n",
+ " (3, 0),\n",
+ " (5, 0),\n",
+ " (6, 0),\n",
+ " (4, 4),\n",
+ " (10, 10),\n",
+ " (11, 10),\n",
+ " (12, 10),\n",
+ " (13, 10),\n",
+ " (14, 14),\n",
+ " (15, 14),\n",
+ " (16, 14)]"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS wcc_out, wcc_out_summary;\n",
+ "\n",
+ "SELECT madlib.weakly_connected_components(\n",
+ " 'vertex', -- Vertex table\n",
+ " 'id', -- Vertix id column\n",
+ " 'edge', -- Edge table\n",
+ " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
+ " 'wcc_out'); -- Output table of weakly connected components\n",
+ "\n",
+ "SELECT * FROM wcc_out ORDER BY component_id, id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. Weakly connected components with grouping"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "13 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>component_id</th>\n",
+ " <th>user_id</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>10</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>10</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>10</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>10</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>14</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>14</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>14</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(0, 0, 1),\n",
+ " (1, 0, 1),\n",
+ " (2, 0, 1),\n",
+ " (3, 0, 1),\n",
+ " (5, 0, 1),\n",
+ " (6, 0, 1),\n",
+ " (10, 10, 2),\n",
+ " (11, 10, 2),\n",
+ " (12, 10, 2),\n",
+ " (13, 10, 2),\n",
+ " (14, 14, 2),\n",
+ " (15, 14, 2),\n",
+ " (16, 14, 2)]"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS wcc_out, wcc_out_summary;\n",
+ "\n",
+ "SELECT madlib.weakly_connected_components(\n",
+ " 'vertex', -- Vertex table\n",
+ " 'id', -- Vertix id column\n",
+ " 'edge', -- Edge table\n",
+ " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
+ " 'wcc_out', -- Output table of weakly connected components\n",
+ " 'user_id'); -- Grouping column name\n",
+ "\n",
+ "SELECT * FROM wcc_out ORDER BY component_id, id, user_id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that vertex '4' is not identified as a separate component in the above result. This is because disconnected nodes cannot be assigned to a particular group with the current graph representation in MADlib."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 4. Largest connected component"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>user_id</th>\n",
+ " <th>component_id</th>\n",
+ " <th>num_vertices</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>6</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>10</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 0, 6L), (2, 10, 4L)]"
+ ]
+ },
+ "execution_count": 106,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql \n",
+ "DROP TABLE IF EXISTS largest_cpt_table;\n",
+ "\n",
+ "SELECT madlib.graph_wcc_largest_cpt(\n",
+ " 'wcc_out', -- WCC output table\n",
+ " 'largest_cpt_table'); -- output table containing largest component ID\n",
+ "\n",
+ "SELECT * FROM largest_cpt_table ORDER BY component_id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 5. Histogram of number of vertices per connected component"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "3 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>user_id</th>\n",
+ " <th>component_id</th>\n",
+ " <th>num_vertices</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>6</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>10</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>14</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 0, 6L), (2, 10, 4L), (2, 14, 3L)]"
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS histogram_table; \n",
+ "\n",
+ "SELECT madlib.graph_wcc_histogram(\n",
+ " 'wcc_out', -- WCC's output table\n",
+ " 'histogram_table'); -- output table containing the histogram of vertices\n",
+ "\n",
+ "SELECT * FROM histogram_table ORDER BY component_id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 6. Check if two vertices belong to the same component"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>user_id</th>\n",
+ " <th>component_id</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>14</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(2, 14)]"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS vc_table;\n",
+ "\n",
+ "SELECT madlib.graph_wcc_vertex_check(\n",
+ " 'wcc_out', -- WCC output table\n",
+ " '14,15', -- Pair of vertex IDs\n",
+ " 'vc_table'); -- output table containing components that contain the two vertices\n",
+ "\n",
+ "SELECT * FROM vc_table ORDER BY component_id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 7. Find all reachable vertices from a source vertex"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "5 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>user_id</th>\n",
+ " <th>component_id</th>\n",
+ " <th>dest</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>6</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 0, 5), (1, 0, 6)]"
+ ]
+ },
+ "execution_count": 109,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS reach_table;\n",
+ "\n",
+ "SELECT madlib.graph_wcc_reachable_vertices(\n",
+ " 'wcc_out', -- WCC output table\n",
+ " '0', -- source vertex\n",
+ " 'reach_table'); -- output table containing all vertices reachable from source vertex\n",
+ "\n",
+ "SELECT * FROM reach_table ORDER BY component_id, dest;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 8. Count of connected components"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>user_id</th>\n",
+ " <th>num_components</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 1L), (2, 2L)]"
+ ]
+ },
+ "execution_count": 110,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS count_table;\n",
+ "\n",
+ "SELECT madlib.graph_wcc_num_cpts(\n",
+ " 'wcc_out', -- WCC output table\n",
+ " 'count_table'); -- output table containing number of components per group\n",
+ "\n",
+ "SELECT * FROM count_table;"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}