You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by fm...@apache.org on 2017/12/08 20:55:15 UTC

[5/7] madlib-site git commit: add new workbooks for 1dot13

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/HITS-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/HITS-v1.ipynb b/community-artifacts/HITS-v1.ipynb
new file mode 100644
index 0000000..40fef57
--- /dev/null
+++ b/community-artifacts/HITS-v1.ipynb
@@ -0,0 +1,1240 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# HITS (Hyperlink-Induced Topic Search) \n",
+    "Outputs the authority score and hub score of every vertex, where authority estimates the value of the content of the page and hub estimates the value of its links to other pages.  Added in MADlib 1.13."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
+      "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+      "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
+      "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "u'Connected: gpdbchina@madlib'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Greenplum 4.3.10.0\n",
+    "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+    "        \n",
+    "# PostgreSQL local\n",
+    "#%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>version</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>MADlib version: 1.13-dev, git revision: rel/v1.12-31-g4b7d9cc, cmake configuration time: Tue Nov 21 22:31:28 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-31-g4b7d9cc, cmake configuration time: Tue Nov 21 22:31:28 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%sql select madlib.version();\n",
+    "#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 1.  Create vertex and edge tables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "Done.\n",
+      "Done.\n",
+      "7 rows affected.\n",
+      "12 rows affected.\n",
+      "12 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>src</th>\n",
+       "        <th>dest</th>\n",
+       "        <th>user_id</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>1</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>5</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>6</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>0</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>0</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>6</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(0, 1, 1),\n",
+       " (0, 2, 1),\n",
+       " (0, 4, 1),\n",
+       " (1, 2, 1),\n",
+       " (1, 3, 1),\n",
+       " (2, 3, 1),\n",
+       " (2, 5, 1),\n",
+       " (2, 6, 1),\n",
+       " (3, 0, 1),\n",
+       " (4, 0, 1),\n",
+       " (5, 6, 1),\n",
+       " (6, 3, 1)]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS vertex, edge;\n",
+    "\n",
+    "CREATE TABLE vertex(\n",
+    "        id INTEGER\n",
+    "        );\n",
+    "\n",
+    "CREATE TABLE edge(\n",
+    "        src INTEGER,\n",
+    "        dest INTEGER,\n",
+    "        user_id INTEGER\n",
+    "        );\n",
+    "\n",
+    "INSERT INTO vertex VALUES\n",
+    "(0),\n",
+    "(1),\n",
+    "(2),\n",
+    "(3),\n",
+    "(4),\n",
+    "(5),\n",
+    "(6);\n",
+    "\n",
+    "INSERT INTO edge VALUES\n",
+    "(0, 1, 1),\n",
+    "(0, 2, 1),\n",
+    "(0, 4, 1),\n",
+    "(1, 2, 1),\n",
+    "(1, 3, 1),\n",
+    "(2, 3, 1),\n",
+    "(2, 5, 1),\n",
+    "(2, 6, 1),\n",
+    "(3, 0, 1),\n",
+    "(4, 0, 1),\n",
+    "(5, 6, 1),\n",
+    "(6, 3, 1);\n",
+    "\n",
+    "SELECT * from edge ORDER BY src, dest;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2.  Compute the HITS scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "7 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>authority</th>\n",
+       "        <th>hub</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>8.43871829093e-07</td>\n",
+       "        <td>0.338306115083</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>0.158459587238</td>\n",
+       "        <td>0.527865350448</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>0.40562796969</td>\n",
+       "        <td>0.675800764728</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>0.721775835522</td>\n",
+       "        <td>3.95111934817e-07</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>0.158459587238</td>\n",
+       "        <td>3.95111934817e-07</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>0.316385413093</td>\n",
+       "        <td>0.189719957843</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>0.405199928761</td>\n",
+       "        <td>0.337944978189</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(0, 8.4387182909347e-07, 0.338306115082665),\n",
+       " (1, 0.158459587238244, 0.527865350448059),\n",
+       " (2, 0.405627969689677, 0.675800764727558),\n",
+       " (3, 0.721775835521825, 3.95111934817447e-07),\n",
+       " (4, 0.158459587238244, 3.95111934817447e-07),\n",
+       " (5, 0.316385413093048, 0.189719957843216),\n",
+       " (6, 0.405199928761102, 0.337944978189241)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
+    "\n",
+    "SELECT madlib.hits(\n",
+    "             'vertex',             -- Vertex table\n",
+    "             'id',                 -- Vertex id column\n",
+    "             'edge',               -- Edge table\n",
+    "             'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
+    "             'hits_out');          -- Output table of HITS\n",
+    "\n",
+    "SELECT * FROM hits_out ORDER BY id;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Look at the summary table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__iterations__</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>17</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(17,)]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT * FROM hits_out_summary;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3. Different number of iterations\n",
+    "Results in different hub and authority scores."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "7 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>authority</th>\n",
+       "        <th>hub</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>0.0865332738778</td>\n",
+       "        <td>0.375721659592</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>0.18388320699</td>\n",
+       "        <td>0.533118571043</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>0.432666369389</td>\n",
+       "        <td>0.654974244425</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>0.703082850257</td>\n",
+       "        <td>0.0406185577938</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>0.18388320699</td>\n",
+       "        <td>0.0406185577938</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>0.302866458572</td>\n",
+       "        <td>0.182783510072</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>0.38939973245</td>\n",
+       "        <td>0.330025782074</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(0, 0.0865332738777835, 0.375721659592363),\n",
+       " (1, 0.18388320699029, 0.533118571043218),\n",
+       " (2, 0.432666369388918, 0.654974244424525),\n",
+       " (3, 0.703082850256991, 0.040618557793769),\n",
+       " (4, 0.18388320699029, 0.040618557793769),\n",
+       " (5, 0.302866458572242, 0.182783510071961),\n",
+       " (6, 0.389399732450026, 0.330025782074373)]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
+    "\n",
+    "SELECT madlib.hits(\n",
+    "             'vertex',             -- Vertex table\n",
+    "             'id',                 -- Vertex id column\n",
+    "             'edge',               -- Edge table\n",
+    "             'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
+    "             'hits_out',           -- Output table of HITS\n",
+    "             3);                   -- Max iteration\n",
+    "\n",
+    "SELECT * FROM hits_out ORDER BY id;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Look at the summary table: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__iterations__</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(3,)]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT * FROM hits_out_summary;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 4. Different theshold\n",
+    "Running HITS with a low threshold of 0.00001 results in more iterations for convergence"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "7 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>authority</th>\n",
+       "        <th>hub</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>0.194028500029</td>\n",
+       "        <td>0.39062401003</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>0.194028500029</td>\n",
+       "        <td>0.528491307688</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>0.436564125065</td>\n",
+       "        <td>0.643380722403</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>0.679099750102</td>\n",
+       "        <td>0.0919115317719</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>0.194028500029</td>\n",
+       "        <td>0.0919115317719</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>0.291042750044</td>\n",
+       "        <td>0.183823063544</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>0.388057000058</td>\n",
+       "        <td>0.321690361202</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(0, 0.194028500029066, 0.390624010030425),\n",
+       " (1, 0.194028500029066, 0.528491307688221),\n",
+       " (2, 0.436564125065399, 0.643380722403052),\n",
+       " (3, 0.679099750101732, 0.0919115317718646),\n",
+       " (4, 0.194028500029066, 0.0919115317718646),\n",
+       " (5, 0.2910427500436, 0.183823063543729),\n",
+       " (6, 0.388057000058133, 0.321690361201526)]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
+    "\n",
+    "SELECT madlib.hits(\n",
+    "             'vertex',             -- Vertex table\n",
+    "             'id',                 -- Vertex id column\n",
+    "             'edge',               -- Edge table\n",
+    "             'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
+    "             'hits_out',           -- Output table of HITS\n",
+    "             NULL,                 -- Default max_iter\n",
+    "             0.5);             -- Threshold\n",
+    "\n",
+    "SELECT * FROM hits_out ORDER BY id;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Look at the summary table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__iterations__</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(2,)]"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT * FROM hits_out_summary;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 5.  Different number of iterations and threshold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "7 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>authority</th>\n",
+       "        <th>hub</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>7.11260011826e-08</td>\n",
+       "        <td>0.33810307986</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>0.158326035588</td>\n",
+       "        <td>0.527815233931</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>0.40546145318</td>\n",
+       "        <td>0.675913495026</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>0.72183534323</td>\n",
+       "        <td>3.33021322089e-08</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>0.158326035588</td>\n",
+       "        <td>3.33021322089e-08</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>0.316459563894</td>\n",
+       "        <td>0.189770119974</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>0.405307074424</td>\n",
+       "        <td>0.337972831786</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(0, 7.11260011825861e-08, 0.33810307986005),\n",
+       " (1, 0.158326035587958, 0.527815233930963),\n",
+       " (2, 0.405461453180491, 0.675913495026452),\n",
+       " (3, 0.721835343230399, 3.3302132208914e-08),\n",
+       " (4, 0.158326035587958, 3.3302132208914e-08),\n",
+       " (5, 0.316459563893809, 0.189770119973925),\n",
+       " (6, 0.405307074424261, 0.337972831786458)]"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
+    "\n",
+    "SELECT madlib.hits(\n",
+    "             'vertex',             -- Vertex table\n",
+    "             'id',                 -- Vertex id column\n",
+    "             'edge',               -- Edge table\n",
+    "             'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
+    "             'hits_out',           -- Output table\n",
+    "             20,                   -- Default max_iter\n",
+    "             0.00001);             -- Threshold\n",
+    "\n",
+    "SELECT * FROM hits_out ORDER BY id;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Look at the summary table.  The algorithm stopped at 20 iterations even though the convergence for threshold of 0.00001 is at 25 iterations. This is because max_iter was set to 20."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__iterations__</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>20</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(20,)]"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT * FROM hits_out_summary;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 6. Grouping\n",
+    "Running HITS with grouping column and default values for max_iter and threshold. Add more rows to the edge table to create different graphs based on the user_id column."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "10 rows affected.\n",
+      "Done.\n",
+      "1 rows affected.\n",
+      "14 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>user_id</th>\n",
+       "        <th>id</th>\n",
+       "        <th>authority</th>\n",
+       "        <th>hub</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>0</td>\n",
+       "        <td>8.43871829093e-07</td>\n",
+       "        <td>0.338306115083</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>1</td>\n",
+       "        <td>0.158459587238</td>\n",
+       "        <td>0.527865350448</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>2</td>\n",
+       "        <td>0.40562796969</td>\n",
+       "        <td>0.675800764728</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>0.721775835522</td>\n",
+       "        <td>3.95111934817e-07</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>0.158459587238</td>\n",
+       "        <td>3.95111934817e-07</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>5</td>\n",
+       "        <td>0.316385413093</td>\n",
+       "        <td>0.189719957843</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>6</td>\n",
+       "        <td>0.405199928761</td>\n",
+       "        <td>0.337944978189</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>0</td>\n",
+       "        <td>1.60841750445e-05</td>\n",
+       "        <td>0.632262085114</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>0.316079985713</td>\n",
+       "        <td>0.6325293909</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>0.632364174872</td>\n",
+       "        <td>0.31634729748</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>0.632694582988</td>\n",
+       "        <td>8.04208767443e-06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>0.316079985713</td>\n",
+       "        <td>8.04208767443e-06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>5</td>\n",
+       "        <td>0.0</td>\n",
+       "        <td>1.22712519446e-10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2.45425034248e-10</td>\n",
+       "        <td>0.31634729748</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(1, 0, 8.4387182909347e-07, 0.338306115082665),\n",
+       " (1, 1, 0.158459587238244, 0.527865350448059),\n",
+       " (1, 2, 0.405627969689677, 0.675800764727558),\n",
+       " (1, 3, 0.721775835521825, 3.95111934817447e-07),\n",
+       " (1, 4, 0.158459587238244, 3.95111934817447e-07),\n",
+       " (1, 5, 0.316385413093048, 0.189719957843216),\n",
+       " (1, 6, 0.405199928761102, 0.337944978189241),\n",
+       " (2, 0, 1.60841750444904e-05, 0.632262085114062),\n",
+       " (2, 1, 0.316079985713431, 0.632529390899584),\n",
+       " (2, 2, 0.632364174872359, 0.316347297480213),\n",
+       " (2, 3, 0.632694582987791, 8.04208767442759e-06),\n",
+       " (2, 4, 0.316079985713431, 8.04208767442759e-06),\n",
+       " (2, 5, 0.0, 1.22712519446222e-10),\n",
+       " (2, 6, 2.45425034248205e-10, 0.316347297480213)]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "INSERT INTO edge VALUES\n",
+    "(0, 1, 2),\n",
+    "(0, 2, 2),\n",
+    "(0, 4, 2),\n",
+    "(1, 2, 2),\n",
+    "(1, 3, 2),\n",
+    "(2, 3, 2),\n",
+    "(3, 0, 2),\n",
+    "(4, 0, 2),\n",
+    "(5, 6, 2),\n",
+    "(6, 3, 2);\n",
+    "\n",
+    "DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
+    "\n",
+    "SELECT madlib.hits(\n",
+    "             'vertex',             -- Vertex table\n",
+    "             'id',                 -- Vertex id column\n",
+    "             'edge',               -- Edge table\n",
+    "             'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
+    "             'hits_out',           -- Output table\n",
+    "             NULL,                 -- Default max_iter\n",
+    "             NULL,                 -- Threshold\n",
+    "             'user_id');           -- Grouping column\n",
+    "\n",
+    "SELECT * FROM hits_out ORDER BY user_id, id;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 7. Other\n",
+    "Let's check against the output from p. 8 of http://www.cis.hut.fi/Opinnot/T-61.6020/2008/pagerank_hits.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "Done.\n",
+      "Done.\n",
+      "4 rows affected.\n",
+      "6 rows affected.\n",
+      "6 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>src</th>\n",
+       "        <th>dest</th>\n",
+       "        <th>user_id</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>1</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>1</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(0, 1, 1), (0, 2, 1), (0, 3, 1), (1, 2, 1), (1, 3, 1), (2, 1, 1)]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS vertex, edge;\n",
+    "\n",
+    "CREATE TABLE vertex(\n",
+    "        id INTEGER\n",
+    "        );\n",
+    "\n",
+    "CREATE TABLE edge(\n",
+    "        src INTEGER,\n",
+    "        dest INTEGER,\n",
+    "        user_id INTEGER\n",
+    "        );\n",
+    "\n",
+    "INSERT INTO vertex VALUES\n",
+    "(0),\n",
+    "(1),\n",
+    "(2),\n",
+    "(3);\n",
+    "\n",
+    "INSERT INTO edge VALUES\n",
+    "(0, 1, 1),\n",
+    "(0, 2, 1),\n",
+    "(0, 3, 1),\n",
+    "(1, 2, 1),\n",
+    "(1, 3, 1),\n",
+    "(2, 1, 1);\n",
+    "\n",
+    "SELECT * from edge ORDER BY src, dest;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>user_id</th>\n",
+       "        <th>__iterations__</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>17</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>16</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(1, 17), (2, 16)]"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT * FROM hits_out_summary order by user_id;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "4 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>authority</th>\n",
+       "        <th>hub</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>0</td>\n",
+       "        <td>0.0</td>\n",
+       "        <td>0.788680749581</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>0.459746429928</td>\n",
+       "        <td>0.577334927798</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>0.627946343316</td>\n",
+       "        <td>0.211345821783</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>0.627946343316</td>\n",
+       "        <td>0.0</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(0, 0.0, 0.788680749580922),\n",
+       " (1, 0.459746429927966, 0.577334927797799),\n",
+       " (2, 0.627946343316246, 0.211345821783123),\n",
+       " (3, 0.627946343316246, 0.0)]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
+    "\n",
+    "SELECT madlib.hits(\n",
+    "             'vertex',             -- Vertex table\n",
+    "             'id',                 -- Vertex id column\n",
+    "             'edge',               -- Edge table\n",
+    "             'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
+    "             'hits_out',           -- Output table of HITS\n",
+    "             100);                   -- Max iteration\n",
+    "\n",
+    "SELECT * FROM hits_out ORDER BY id;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Yes ^^^ matches the results from the reference."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/Linear-regression-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Linear-regression-v1.ipynb b/community-artifacts/Linear-regression-v1.ipynb
new file mode 100644
index 0000000..4ae89cf
--- /dev/null
+++ b/community-artifacts/Linear-regression-v1.ipynb
@@ -0,0 +1,1188 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Linear regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
+      "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+      "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
+      "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "u'Connected: gpdbchina@madlib'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Greenplum 4.3.10.0\n",
+    "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+    "        \n",
+    "# PostgreSQL local\n",
+    "#%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>version</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>MADlib version: 1.13-dev, git revision: rel/v1.12-31-g4b7d9cc, cmake configuration time: Tue Nov 21 22:31:28 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-31-g4b7d9cc, cmake configuration time: Tue Nov 21 22:31:28 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%sql select madlib.version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 1. Load test data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "Done.\n",
+      "15 rows affected.\n",
+      "15 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>tax</th>\n",
+       "        <th>bedroom</th>\n",
+       "        <th>bath</th>\n",
+       "        <th>price</th>\n",
+       "        <th>size</th>\n",
+       "        <th>lot</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>590</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>50000</td>\n",
+       "        <td>770</td>\n",
+       "        <td>22100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>1050</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>85000</td>\n",
+       "        <td>1410</td>\n",
+       "        <td>12000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>20</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>22500</td>\n",
+       "        <td>1060</td>\n",
+       "        <td>3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>870</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>90000</td>\n",
+       "        <td>1300</td>\n",
+       "        <td>17500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>1320</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>133000</td>\n",
+       "        <td>1500</td>\n",
+       "        <td>30000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>1350</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>90500</td>\n",
+       "        <td>820</td>\n",
+       "        <td>25700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>2790</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.5</td>\n",
+       "        <td>260000</td>\n",
+       "        <td>2130</td>\n",
+       "        <td>25000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>680</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>142500</td>\n",
+       "        <td>1170</td>\n",
+       "        <td>22000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>1840</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>160000</td>\n",
+       "        <td>1500</td>\n",
+       "        <td>19000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>3680</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>240000</td>\n",
+       "        <td>2790</td>\n",
+       "        <td>20000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>1660</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>87000</td>\n",
+       "        <td>1030</td>\n",
+       "        <td>17500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>1620</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>118600</td>\n",
+       "        <td>1250</td>\n",
+       "        <td>20000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>13</td>\n",
+       "        <td>3100</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>140000</td>\n",
+       "        <td>1760</td>\n",
+       "        <td>38000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>14</td>\n",
+       "        <td>2070</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3.0</td>\n",
+       "        <td>148000</td>\n",
+       "        <td>1550</td>\n",
+       "        <td>14000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>15</td>\n",
+       "        <td>650</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.5</td>\n",
+       "        <td>65000</td>\n",
+       "        <td>1450</td>\n",
+       "        <td>12000</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(1, 590, 2, 1.0, 50000, 770, 22100),\n",
+       " (2, 1050, 3, 2.0, 85000, 1410, 12000),\n",
+       " (3, 20, 3, 1.0, 22500, 1060, 3500),\n",
+       " (4, 870, 2, 2.0, 90000, 1300, 17500),\n",
+       " (5, 1320, 3, 2.0, 133000, 1500, 30000),\n",
+       " (6, 1350, 2, 1.0, 90500, 820, 25700),\n",
+       " (7, 2790, 3, 2.5, 260000, 2130, 25000),\n",
+       " (8, 680, 2, 1.0, 142500, 1170, 22000),\n",
+       " (9, 1840, 3, 2.0, 160000, 1500, 19000),\n",
+       " (10, 3680, 4, 2.0, 240000, 2790, 20000),\n",
+       " (11, 1660, 3, 1.0, 87000, 1030, 17500),\n",
+       " (12, 1620, 3, 2.0, 118600, 1250, 20000),\n",
+       " (13, 3100, 3, 2.0, 140000, 1760, 38000),\n",
+       " (14, 2070, 2, 3.0, 148000, 1550, 14000),\n",
+       " (15, 650, 3, 1.5, 65000, 1450, 12000)]"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql \n",
+    "DROP TABLE IF EXISTS houses;\n",
+    "\n",
+    "CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,\n",
+    "            size INT, lot INT);\n",
+    "\n",
+    "INSERT INTO houses VALUES   \n",
+    "  (1 ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100),\n",
+    "  (2 , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000),\n",
+    "  (3 ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500),\n",
+    "  (4 ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500),\n",
+    "  (5 , 1320 ,       3 ,    2 , 133000 , 1500 , 30000),\n",
+    "  (6 , 1350 ,       2 ,    1 ,  90500 ,  820 , 25700),\n",
+    "  (7 , 2790 ,       3 ,  2.5 , 260000 , 2130 , 25000),\n",
+    "  (8 ,  680 ,       2 ,    1 , 142500 , 1170 , 22000),\n",
+    "  (9 , 1840 ,       3 ,    2 , 160000 , 1500 , 19000),\n",
+    " (10 , 3680 ,       4 ,    2 , 240000 , 2790 , 20000),\n",
+    " (11 , 1660 ,       3 ,    1 ,  87000 , 1030 , 17500),\n",
+    " (12 , 1620 ,       3 ,    2 , 118600 , 1250 , 20000),\n",
+    " (13 , 3100 ,       3 ,    2 , 140000 , 1760 , 38000),\n",
+    " (14 , 2070 ,       2 ,    3 , 148000 , 1550 , 14000),\n",
+    " (15 ,  650 ,       3 ,  1.5 ,  65000 , 1450 , 12000);\n",
+    "    \n",
+    "SELECT * FROM houses ORDER BY id;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2. Train regression model\n",
+    "First, we generate a single regression for all data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>method</th>\n",
+       "        <th>source_table</th>\n",
+       "        <th>out_table</th>\n",
+       "        <th>dependent_varname</th>\n",
+       "        <th>independent_varname</th>\n",
+       "        <th>num_rows_processed</th>\n",
+       "        <th>num_missing_rows_skipped</th>\n",
+       "        <th>grouping_col</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>linregr</td>\n",
+       "        <td>houses</td>\n",
+       "        <td>houses_linregr</td>\n",
+       "        <td>price</td>\n",
+       "        <td>ARRAY[1, tax, bath, size]</td>\n",
+       "        <td>15</td>\n",
+       "        <td>0</td>\n",
+       "        <td>None</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(u'linregr', u'houses', u'houses_linregr', u'price', u'ARRAY[1, tax, bath, size]', 15, 0, None)]"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS houses_linregr, houses_linregr_summary;\n",
+    "\n",
+    "SELECT madlib.linregr_train( 'houses',\n",
+    "                             'houses_linregr',\n",
+    "                             'price',\n",
+    "                             'ARRAY[1, tax, bath, size]'\n",
+    "                           );\n",
+    "\n",
+    "SELECT * FROM houses_linregr_summary;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3. Train regression model with grouping\n",
+    "Next we generate three output models, one for each value of \"bedroom\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>method</th>\n",
+       "        <th>source_table</th>\n",
+       "        <th>out_table</th>\n",
+       "        <th>dependent_varname</th>\n",
+       "        <th>independent_varname</th>\n",
+       "        <th>num_rows_processed</th>\n",
+       "        <th>num_missing_rows_skipped</th>\n",
+       "        <th>grouping_col</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>linregr</td>\n",
+       "        <td>houses</td>\n",
+       "        <td>houses_linregr_bedroom</td>\n",
+       "        <td>price</td>\n",
+       "        <td>ARRAY[1, tax, bath, size]</td>\n",
+       "        <td>15</td>\n",
+       "        <td>0</td>\n",
+       "        <td>bedroom</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(u'linregr', u'houses', u'houses_linregr_bedroom', u'price', u'ARRAY[1, tax, bath, size]', 15, 0, u'bedroom')]"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql \n",
+    "DROP TABLE IF EXISTS houses_linregr_bedroom, houses_linregr_bedroom_summary;\n",
+    "\n",
+    "SELECT madlib.linregr_train( 'houses',\n",
+    "                             'houses_linregr_bedroom',\n",
+    "                             'price',\n",
+    "                             'ARRAY[1, tax, bath, size]',\n",
+    "                             'bedroom'\n",
+    "                           );\n",
+    "\n",
+    "SELECT * FROM houses_linregr_bedroom_summary;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 4. Review model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>coef</th>\n",
+       "        <th>r2</th>\n",
+       "        <th>std_err</th>\n",
+       "        <th>t_stats</th>\n",
+       "        <th>p_values</th>\n",
+       "        <th>condition_no</th>\n",
+       "        <th>num_rows_processed</th>\n",
+       "        <th>num_missing_rows_skipped</th>\n",
+       "        <th>variance_covariance</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>[-12849.4168959872, 28.9613922651775, 10181.6290712649, 50.516894915353]</td>\n",
+       "        <td>0.768577580597</td>\n",
+       "        <td>[33453.0344331377, 15.8992104963991, 19437.7710925915, 32.9280231740856]</td>\n",
+       "        <td>[-0.384103179688204, 1.82156166004197, 0.523806408809164, 1.53416118083608]</td>\n",
+       "        <td>[0.708223134615411, 0.0958005827189554, 0.610804093526515, 0.153235085548177]</td>\n",
+       "        <td>9002.5045707</td>\n",
+       "        <td>15</td>\n",
+       "        <td>0</td>\n",
+       "        <td>[[1119105512.7847, 217782.067878005, -283344228.394538, -616679.693190829], [217782.067878005, 252.784894408806, -46373.1796964037, -369.864520095145], [-283344228.394538, -46373.1796964038, 377826945.047986, -209088.217319699], [-616679.693190829, -369.864520095145, -209088.217319699, 1084.25471015312]]</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[([-12849.4168959872, 28.9613922651775, 10181.6290712649, 50.516894915353], 0.768577580597462, [33453.0344331377, 15.8992104963991, 19437.7710925915, 32.9280231740856], [-0.384103179688204, 1.82156166004197, 0.523806408809164, 1.53416118083608], [0.708223134615411, 0.0958005827189554, 0.610804093526515, 0.153235085548177], 9002.50457069859, 15L, 0L, [[1119105512.7847, 217782.067878005, -283344228.394538, -616679.693190829], [217782.067878005, 252.784894408806, -46373.1796964037, -369.864520095145], [-283344228.394538, -46373.1796964038, 377826945.047986, -209088.217319699], [-616679.693190829, -369.864520095145, -209088.217319699, 1084.25471015312]])]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT * FROM houses_linregr;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alternatively you can unnest the results for easier reading of output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>attribute</th>\n",
+       "        <th>coefficient</th>\n",
+       "        <th>standard_error</th>\n",
+       "        <th>t_stat</th>\n",
+       "        <th>pvalue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>intercept</td>\n",
+       "        <td>-12849.416896</td>\n",
+       "        <td>33453.0344331</td>\n",
+       "        <td>-0.384103179688</td>\n",
+       "        <td>0.708223134615</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>tax</td>\n",
+       "        <td>28.9613922652</td>\n",
+       "        <td>15.8992104964</td>\n",
+       "        <td>1.82156166004</td>\n",
+       "        <td>0.095800582719</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>bath</td>\n",
+       "        <td>10181.6290713</td>\n",
+       "        <td>19437.7710926</td>\n",
+       "        <td>0.523806408809</td>\n",
+       "        <td>0.610804093527</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>size</td>\n",
+       "        <td>50.5168949154</td>\n",
+       "        <td>32.9280231741</td>\n",
+       "        <td>1.53416118084</td>\n",
+       "        <td>0.153235085548</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(u'intercept', -12849.4168959872, 33453.0344331377, -0.384103179688204, 0.708223134615411),\n",
+       " (u'tax', 28.9613922651775, 15.8992104963991, 1.82156166004197, 0.0958005827189554),\n",
+       " (u'bath', 10181.6290712649, 19437.7710925915, 0.523806408809164, 0.610804093526515),\n",
+       " (u'size', 50.516894915353, 32.9280231740856, 1.53416118083608, 0.153235085548177)]"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT unnest(ARRAY['intercept','tax','bath','size']) as attribute,\n",
+    "       unnest(coef) as coefficient,\n",
+    "       unnest(std_err) as standard_error,\n",
+    "       unnest(t_stats) as t_stat,\n",
+    "       unnest(p_values) as pvalue\n",
+    "FROM houses_linregr;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "# 5. Review model with grouping"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>bedroom</th>\n",
+       "        <th>coef</th>\n",
+       "        <th>r2</th>\n",
+       "        <th>std_err</th>\n",
+       "        <th>t_stats</th>\n",
+       "        <th>p_values</th>\n",
+       "        <th>condition_no</th>\n",
+       "        <th>num_rows_processed</th>\n",
+       "        <th>num_missing_rows_skipped</th>\n",
+       "        <th>variance_covariance</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>[-84242.0345406619, 55.4430144648689, -78966.9753675336, 225.611910021196]</td>\n",
+       "        <td>0.968809546465</td>\n",
+       "        <td>[35018.9991666351, 19.5731125321026, 23036.8071292953, 49.0448678149637]</td>\n",
+       "        <td>[-2.40560942760823, 2.83261103076655, -3.42786111479457, 4.60011251069905]</td>\n",
+       "        <td>[0.250804617665626, 0.21605133377637, 0.180704400437667, 0.136272031474349]</td>\n",
+       "        <td>10086.1048725</td>\n",
+       "        <td>5</td>\n",
+       "        <td>0</td>\n",
+       "        <td>[[1226330302.63279, -300921.595597853, 551696673.399772, -1544160.63236657], [-300921.595597853, 383.106734194352, -304863.397298569, 323.251642470093], [551696673.399772, -304863.397298569, 530694482.712349, -946345.586402425], [-1544160.63236657, 323.251642470093, -946345.586402425, 2405.39905898726]]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>[-88155.8292501588, 27.1966436294421, 41404.0293363614, 62.6375210753234]</td>\n",
+       "        <td>0.841699901311</td>\n",
+       "        <td>[57867.9999702679, 17.8272309154706, 43643.1321511158, 70.8506824864022]</td>\n",
+       "        <td>[-1.52339512848988, 1.52556747362489, 0.948695185143874, 0.884077878675974]</td>\n",
+       "        <td>[0.188161432894911, 0.187636685729916, 0.38634003237497, 0.417132778705835]</td>\n",
+       "        <td>11722.6225642</td>\n",
+       "        <td>9</td>\n",
+       "        <td>0</td>\n",
+       "        <td>[[3348705420.55893, 433697.545104307, -70253017.4577515, -2593488.13800241], [433697.545104307, 317.810162113512, -90019.0797451144, -529.274668274391], [-70253017.4577515, -90019.0797451146, 1904722983.95976, -2183233.19448568], [-2593488.13800241, -529.27466827439, -2183233.19448568, 5019.81920878897]]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>[0.0112536020318379, 41.4132554771633, 0.0225072040636757, 31.3975496688276]</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>[0.0, 0.0, 0.0, 0.0]</td>\n",
+       "        <td>[inf, inf, inf, inf]</td>\n",
+       "        <td>None</td>\n",
+       "        <td>inf</td>\n",
+       "        <td>1</td>\n",
+       "        <td>0</td>\n",
+       "        <td>[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(2, [-84242.0345406619, 55.4430144648689, -78966.9753675336, 225.611910021196], 0.968809546465205, [35018.9991666351, 19.5731125321026, 23036.8071292953, 49.0448678149637], [-2.40560942760823, 2.83261103076655, -3.42786111479457, 4.60011251069905], [0.250804617665626, 0.21605133377637, 0.180704400437667, 0.136272031474349], 10086.1048725104, 5L, 0L, [[1226330302.63279, -300921.595597853, 551696673.399772, -1544160.63236657], [-300921.595597853, 383.106734194352, -304863.397298569, 323.251642470093], [551696673.399772, -304863.397298569, 530694482.712349, -946345.586402425], [-1544160.63236657, 323.251642470093, -946345.586402425, 2405.39905898726]]),\n",
+       " (3, [-88155.8292501588, 27.1966436294421, 41404.0293363614, 62.6375210753234], 0.841699901311221, [57867.9999702679, 17.8272309154706, 43643.1321511158, 70.8506824864022], [-1.52339512848988, 1.52556747362489, 0.948695185143874, 0.884077878675974], [0.188161432894911, 0.187636685729916, 0.38634003237497, 0.417132778705835], 11722.6225642065, 9L, 0L, [[3348705420.55893, 433697.545104307, -70253017.4577515, -2593488.13800241], [433697.545104307, 317.810162113512, -90019.0797451144, -529.274668274391], [-70253017.4577515, -90019.0797451146, 1904722983.95976, -2183233.19448568], [-2593488.13800241, -529.27466827439, -2183233.19448568, 5019.81920878897]]),\n",
+       " (4, [0.0112536020318379, 41.4132554771633, 0.0225072040636757, 31.3975496688276], 1.0, [0.0, 0.0, 0.0, 0.0], [inf, inf, inf, inf], None, inf, 1L, 0L, [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT * FROM houses_linregr_bedroom ORDER BY bedroom;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 6. Prediction\n",
+    "Compare predicted price with actual, in the case where not grouping is used.  That is, there is only one regression model for all data.  (This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "15 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>tax</th>\n",
+       "        <th>bedroom</th>\n",
+       "        <th>bath</th>\n",
+       "        <th>price</th>\n",
+       "        <th>size</th>\n",
+       "        <th>lot</th>\n",
+       "        <th>predict</th>\n",
+       "        <th>residual</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>590</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>50000</td>\n",
+       "        <td>770</td>\n",
+       "        <td>22100</td>\n",
+       "        <td>53317.4426966</td>\n",
+       "        <td>-3317.44269655</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>1050</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>85000</td>\n",
+       "        <td>1410</td>\n",
+       "        <td>12000</td>\n",
+       "        <td>109152.124956</td>\n",
+       "        <td>-24152.1249556</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>20</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>22500</td>\n",
+       "        <td>1060</td>\n",
+       "        <td>3500</td>\n",
+       "        <td>51459.3486309</td>\n",
+       "        <td>-28959.3486309</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>870</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>90000</td>\n",
+       "        <td>1300</td>\n",
+       "        <td>17500</td>\n",
+       "        <td>98382.2159072</td>\n",
+       "        <td>-8382.21590721</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>1320</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>133000</td>\n",
+       "        <td>1500</td>\n",
+       "        <td>30000</td>\n",
+       "        <td>121518.22141</td>\n",
+       "        <td>11481.7785904</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>1350</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>90500</td>\n",
+       "        <td>820</td>\n",
+       "        <td>25700</td>\n",
+       "        <td>77853.9455639</td>\n",
+       "        <td>12646.0544361</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>2790</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.5</td>\n",
+       "        <td>260000</td>\n",
+       "        <td>2130</td>\n",
+       "        <td>25000</td>\n",
+       "        <td>201007.926372</td>\n",
+       "        <td>58992.0736283</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>680</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>142500</td>\n",
+       "        <td>1170</td>\n",
+       "        <td>22000</td>\n",
+       "        <td>76130.7259666</td>\n",
+       "        <td>66369.2740334</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>1840</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>160000</td>\n",
+       "        <td>1500</td>\n",
+       "        <td>19000</td>\n",
+       "        <td>136578.145387</td>\n",
+       "        <td>23421.8546125</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>3680</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>240000</td>\n",
+       "        <td>2790</td>\n",
+       "        <td>20000</td>\n",
+       "        <td>255033.901596</td>\n",
+       "        <td>-15033.9015962</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>1660</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>87000</td>\n",
+       "        <td>1030</td>\n",
+       "        <td>17500</td>\n",
+       "        <td>97440.5250983</td>\n",
+       "        <td>-10440.5250983</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>1620</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>118600</td>\n",
+       "        <td>1250</td>\n",
+       "        <td>20000</td>\n",
+       "        <td>117577.41536</td>\n",
+       "        <td>1022.58463968</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>13</td>\n",
+       "        <td>3100</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>140000</td>\n",
+       "        <td>1760</td>\n",
+       "        <td>38000</td>\n",
+       "        <td>186203.89232</td>\n",
+       "        <td>-46203.8923196</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>14</td>\n",
+       "        <td>2070</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3.0</td>\n",
+       "        <td>148000</td>\n",
+       "        <td>1550</td>\n",
+       "        <td>14000</td>\n",
+       "        <td>155946.739426</td>\n",
+       "        <td>-7946.73942552</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>15</td>\n",
+       "        <td>650</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.5</td>\n",
+       "        <td>65000</td>\n",
+       "        <td>1450</td>\n",
+       "        <td>12000</td>\n",
+       "        <td>94497.4293105</td>\n",
+       "        <td>-29497.4293105</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(1, 590, 2, 1.0, 50000, 770, 22100, 53317.4426965543, -3317.44269655428),\n",
+       " (2, 1050, 3, 2.0, 85000, 1410, 12000, 109152.124955627, -24152.1249556268),\n",
+       " (3, 20, 3, 1.0, 22500, 1060, 3500, 51459.3486308555, -28959.3486308555),\n",
+       " (4, 870, 2, 2.0, 90000, 1300, 17500, 98382.215907206, -8382.21590720599),\n",
+       " (5, 1320, 3, 2.0, 133000, 1500, 30000, 121518.221409606, 11481.7785903935),\n",
+       " (6, 1350, 2, 1.0, 90500, 820, 25700, 77853.9455638568, 12646.0544361432),\n",
+       " (7, 2790, 3, 2.5, 260000, 2130, 25000, 201007.926371722, 58992.0736282778),\n",
+       " (8, 680, 2, 1.0, 142500, 1170, 22000, 76130.7259665615, 66369.2740334385),\n",
+       " (9, 1840, 3, 2.0, 160000, 1500, 19000, 136578.145387499, 23421.8546125013),\n",
+       " (10, 3680, 4, 2.0, 240000, 2790, 20000, 255033.901596231, -15033.9015962306),\n",
+       " (11, 1660, 3, 1.0, 87000, 1030, 17500, 97440.5250982859, -10440.5250982859),\n",
+       " (12, 1620, 3, 2.0, 118600, 1250, 20000, 117577.415360321, 1022.58463967856),\n",
+       " (13, 3100, 3, 2.0, 140000, 1760, 38000, 186203.892319614, -46203.8923196141),\n",
+       " (14, 2070, 2, 3.0, 148000, 1550, 14000, 155946.739425522, -7946.73942552213),\n",
+       " (15, 650, 3, 1.5, 65000, 1450, 12000, 94497.4293105374, -29497.4293105374)]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT houses.*,\n",
+    "       madlib.linregr_predict( m.coef,\n",
+    "                              ARRAY[1,tax,bath,size]\n",
+    "                             ) as predict,\n",
+    "        price -\n",
+    "          madlib.linregr_predict( m.coef,\n",
+    "                                 ARRAY[1,tax,bath,size] \n",
+    "                                ) as residual\n",
+    "FROM houses, houses_linregr m ORDER BY id;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 7. Prediction with grouping\n",
+    "Compare predicted price with actual with grouping.  It means a different model is used depending on the number of bedrooms."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "15 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>tax</th>\n",
+       "        <th>bedroom</th>\n",
+       "        <th>bath</th>\n",
+       "        <th>price</th>\n",
+       "        <th>size</th>\n",
+       "        <th>lot</th>\n",
+       "        <th>predict</th>\n",
+       "        <th>residual</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>590</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>50000</td>\n",
+       "        <td>770</td>\n",
+       "        <td>22100</td>\n",
+       "        <td>43223.5393424</td>\n",
+       "        <td>6776.4606576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>1050</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>85000</td>\n",
+       "        <td>1410</td>\n",
+       "        <td>12000</td>\n",
+       "        <td>111527.60995</td>\n",
+       "        <td>-26527.6099497</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>20</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>22500</td>\n",
+       "        <td>1060</td>\n",
+       "        <td>3500</td>\n",
+       "        <td>20187.9052986</td>\n",
+       "        <td>2312.09470137</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>870</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>90000</td>\n",
+       "        <td>1300</td>\n",
+       "        <td>17500</td>\n",
+       "        <td>99354.9203363</td>\n",
+       "        <td>-9354.92033626</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>1320</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>133000</td>\n",
+       "        <td>1500</td>\n",
+       "        <td>30000</td>\n",
+       "        <td>124508.080626</td>\n",
+       "        <td>8491.91937359</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>1350</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>90500</td>\n",
+       "        <td>820</td>\n",
+       "        <td>25700</td>\n",
+       "        <td>96640.8258368</td>\n",
+       "        <td>-6140.82583676</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>2790</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.5</td>\n",
+       "        <td>260000</td>\n",
+       "        <td>2130</td>\n",
+       "        <td>25000</td>\n",
+       "        <td>224650.799707</td>\n",
+       "        <td>35349.2002927</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>680</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>142500</td>\n",
+       "        <td>1170</td>\n",
+       "        <td>22000</td>\n",
+       "        <td>138458.174653</td>\n",
+       "        <td>4041.82534729</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>1840</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>160000</td>\n",
+       "        <td>1500</td>\n",
+       "        <td>19000</td>\n",
+       "        <td>138650.335314</td>\n",
+       "        <td>21349.6646863</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>3680</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>240000</td>\n",
+       "        <td>2790</td>\n",
+       "        <td>20000</td>\n",
+       "        <td>240000.0</td>\n",
+       "        <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>1660</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.0</td>\n",
+       "        <td>87000</td>\n",
+       "        <td>1030</td>\n",
+       "        <td>17500</td>\n",
+       "        <td>62911.2752187</td>\n",
+       "        <td>24088.7247813</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>1620</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>118600</td>\n",
+       "        <td>1250</td>\n",
+       "        <td>20000</td>\n",
+       "        <td>117007.693446</td>\n",
+       "        <td>1592.30655359</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>13</td>\n",
+       "        <td>3100</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2.0</td>\n",
+       "        <td>140000</td>\n",
+       "        <td>1760</td>\n",
+       "        <td>38000</td>\n",
+       "        <td>189203.861766</td>\n",
+       "        <td>-49203.8617664</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>14</td>\n",
+       "        <td>2070</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3.0</td>\n",
+       "        <td>148000</td>\n",
+       "        <td>1550</td>\n",
+       "        <td>14000</td>\n",
+       "        <td>143322.539832</td>\n",
+       "        <td>4677.46016813</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>15</td>\n",
+       "        <td>650</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1.5</td>\n",
+       "        <td>65000</td>\n",
+       "        <td>1450</td>\n",
+       "        <td>12000</td>\n",
+       "        <td>82452.4386727</td>\n",
+       "        <td>-17452.4386727</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(1, 590, 2, 1.0, 50000, 770, 22100, 43223.5393423978, 6776.46065760222),\n",
+       " (2, 1050, 3, 2.0, 85000, 1410, 12000, 111527.609949684, -26527.609949684),\n",
+       " (3, 20, 3, 1.0, 22500, 1060, 3500, 20187.9052986341, 2312.09470136587),\n",
+       " (4, 870, 2, 2.0, 90000, 1300, 17500, 99354.9203362612, -9354.92033626116),\n",
+       " (5, 1320, 3, 2.0, 133000, 1500, 30000, 124508.080626412, 8491.91937358756),\n",
+       " (6, 1350, 2, 1.0, 90500, 820, 25700, 96640.8258367579, -6140.8258367579),\n",
+       " (7, 2790, 3, 2.5, 260000, 2130, 25000, 224650.799707327, 35349.2002926733),\n",
+       " (8, 680, 2, 1.0, 142500, 1170, 22000, 138458.174652714, 4041.82534728572),\n",
+       " (9, 1840, 3, 2.0, 160000, 1500, 19000, 138650.335313722, 21349.6646862777),\n",
+       " (10, 3680, 4, 2.0, 240000, 2790, 20000, 240000.0, 0.0),\n",
+       " (11, 1660, 3, 1.0, 87000, 1030, 17500, 62911.2752186594, 24088.7247813406),\n",
+       " (12, 1620, 3, 2.0, 118600, 1250, 20000, 117007.693446414, 1592.30655358579),\n",
+       " (13, 3100, 3, 2.0, 140000, 1760, 38000, 189203.861766403, -49203.8617664034),\n",
+       " (14, 2070, 2, 3.0, 148000, 1550, 14000, 143322.539831869, 4677.46016813093),\n",
+       " (15, 650, 3, 1.5, 65000, 1450, 12000, 82452.4386727394, -17452.4386727394)]"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "SELECT houses.*,\n",
+    "       madlib.linregr_predict( m.coef,\n",
+    "                              ARRAY[1,tax,bath,size]\n",
+    "                             ) as predict,\n",
+    "        price -\n",
+    "          madlib.linregr_predict( m.coef,\n",
+    "                                 ARRAY[1,tax,bath,size] \n",
+    "                                ) as residual\n",
+    "FROM houses, houses_linregr_bedroom m\n",
+    "WHERE houses.bedroom = m.bedroom\n",
+    "ORDER BY id;"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}