You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by fm...@apache.org on 2018/04/23 21:57:58 UTC

[14/15] madlib-site git commit: jupyter notebooks for 1.14 release

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Balanced-sampling-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Balanced-sampling-v1.ipynb b/community-artifacts/Balanced-sampling-v1.ipynb
new file mode 100644
index 0000000..5f6ec23
--- /dev/null
+++ b/community-artifacts/Balanced-sampling-v1.ipynb
@@ -0,0 +1,3706 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Balanced sampling\n",
+    "\n",
+    "This module offers a number of re-sampling techniques including under-sampling majority classes, over-sampling minority classes, and combinations of the two.\n",
+    "\n",
+    "Balanced sampling was added in MADlib 1.14."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The sql extension is already loaded. To reload it, use:\n",
+      "  %reload_ext sql\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "u'Connected: gpadmin@madlib'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+    "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+    "        \n",
+    "# PostgreSQL local\n",
+    "#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+    "\n",
+    "# Greenplum Database 4.3.10.0\n",
+    "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>version</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%sql select madlib.version();\n",
+    "#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 1. Load data\n",
+    "Based in part on the flags data set from https://archive.ics.uci.edu/ml/datasets/Flags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "Done.\n",
+      "22 rows affected.\n",
+      "22 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>Argentina</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2777</td>\n",
+       "        <td>28</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>Australia</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>7690</td>\n",
+       "        <td>15</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>Greece</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>132</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>13</td>\n",
+       "        <td>Mexico</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1973</td>\n",
+       "        <td>77</td>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>Austria</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>84</td>\n",
+       "        <td>8</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>Canada</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9976</td>\n",
+       "        <td>24</td>\n",
+       "        <td>1</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>China</td>\n",
+       "        <td>5</td>\n",
+       "        <td>1</td>\n",
+       "        <td>9561</td>\n",
+       "        <td>1008</td>\n",
+       "        <td>7</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>Denmark</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>43</td>\n",
+       "        <td>5</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>Luxembourg</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>0</td>\n",
+       "        <td>4</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>14</td>\n",
+       "        <td>Norway</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>324</td>\n",
+       "        <td>4</td>\n",
+       "        <td>6</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>15</td>\n",
+       "        <td>Portugal</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>92</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>5</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>16</td>\n",
+       "        <td>Spain</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>505</td>\n",
+       "        <td>38</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>18</td>\n",
+       "        <td>Switzerland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>41</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>19</td>\n",
+       "        <td>UK</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>245</td>\n",
+       "        <td>56</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>21</td>\n",
+       "        <td>xElba</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>1</td>\n",
+       "        <td>1</td>\n",
+       "        <td>6</td>\n",
+       "        <td>None</td>\n",
+       "        <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>22</td>\n",
+       "        <td>xPrussia</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>249</td>\n",
+       "        <td>61</td>\n",
+       "        <td>4</td>\n",
+       "        <td>None</td>\n",
+       "        <td>None</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+       " (2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+       " (8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+       " (9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+       " (3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
+       " (5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
+       " (6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+       " (7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+       " (12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+       " (14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+       " (15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
+       " (16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
+       " (18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+       " (19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+       " (10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+       " (21, u'xElba', 3, 1, 1, 1, 6, None, None),\n",
+       " (22, u'xPrussia', 3, 1, 249, 61, 4, None, None)]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS flags;\n",
+    "\n",
+    "CREATE TABLE flags (\n",
+    "    id INTEGER,\n",
+    "    name TEXT,\n",
+    "    landmass INTEGER,\n",
+    "    zone INTEGER,\n",
+    "    area INTEGER,\n",
+    "    population INTEGER,\n",
+    "    language INTEGER,\n",
+    "    colours INTEGER,\n",
+    "    mainhue TEXT\n",
+    ");\n",
+    "\n",
+    "INSERT INTO flags VALUES\n",
+    "(1, 'Argentina', 2, 3, 2777, 28, 2, 2, 'blue'),\n",
+    "(2, 'Australia', 6, 2, 7690, 15, 1, 3, 'blue'),\n",
+    "(3, 'Austria', 3, 1, 84, 8, 4, 2, 'red'),\n",
+    "(4, 'Brazil', 2, 3, 8512, 119, 6, 4, 'green'),\n",
+    "(5, 'Canada', 1, 4, 9976, 24, 1, 2, 'red'),\n",
+    "(6, 'China', 5, 1, 9561, 1008, 7, 2, 'red'),\n",
+    "(7, 'Denmark', 3, 1, 43, 5, 6, 2, 'red'),\n",
+    "(8, 'Greece', 3, 1, 132, 10, 6, 2, 'blue'),\n",
+    "(9, 'Guatemala', 1, 4, 109, 8, 2, 2, 'blue'),\n",
+    "(10, 'Ireland', 3, 4, 70, 3, 1, 3, 'white'),\n",
+    "(11, 'Jamaica', 1, 4, 11, 2, 1, 3, 'green'),\n",
+    "(12, 'Luxembourg', 3, 1, 3, 0, 4, 3, 'red'),\n",
+    "(13, 'Mexico', 1, 4, 1973, 77, 2, 4, 'green'),\n",
+    "(14, 'Norway', 3, 1, 324, 4, 6, 3, 'red'),\n",
+    "(15, 'Portugal', 3, 4, 92, 10, 6, 5, 'red'),\n",
+    "(16, 'Spain', 3, 4, 505, 38, 2, 2, 'red'),\n",
+    "(17, 'Sweden', 3, 1, 450, 8, 6, 2, 'blue'),\n",
+    "(18, 'Switzerland', 3, 1, 41, 6, 4, 2, 'red'),\n",
+    "(19, 'UK', 3, 4, 245, 56, 1, 3, 'red'),\n",
+    "(20, 'USA', 1, 4, 9363, 231, 1, 3, 'white'),\n",
+    "(21, 'xElba', 3, 1, 1, 1, 6, NULL, NULL),\n",
+    "(22, 'xPrussia', 3, 1, 249, 61, 4, NULL, NULL);\n",
+    "\n",
+    "SELECT * FROM flags ORDER BY mainhue, name;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2.  Uniform sampling \n",
+    "\n",
+    "All class values will be resampled so that they have the same number of rows. The output data size will be the same as the input data size, ignoring NULL values.  Uniform sampling is the default for the 'class_size' parameter so we do not need to explicitly set it:  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "20 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__madlib_id__</th>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>19</td>\n",
+       "        <td>1</td>\n",
+       "        <td>Argentina</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2777</td>\n",
+       "        <td>28</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>18</td>\n",
+       "        <td>2</td>\n",
+       "        <td>Australia</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>7690</td>\n",
+       "        <td>15</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>20</td>\n",
+       "        <td>8</td>\n",
+       "        <td>Greece</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>132</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>17</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>16</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>13</td>\n",
+       "        <td>Mexico</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1973</td>\n",
+       "        <td>77</td>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>13</td>\n",
+       "        <td>Mexico</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1973</td>\n",
+       "        <td>77</td>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>13</td>\n",
+       "        <td>Mexico</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1973</td>\n",
+       "        <td>77</td>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>6</td>\n",
+       "        <td>China</td>\n",
+       "        <td>5</td>\n",
+       "        <td>1</td>\n",
+       "        <td>9561</td>\n",
+       "        <td>1008</td>\n",
+       "        <td>7</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>7</td>\n",
+       "        <td>Denmark</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>43</td>\n",
+       "        <td>5</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>12</td>\n",
+       "        <td>Luxembourg</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>0</td>\n",
+       "        <td>4</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>18</td>\n",
+       "        <td>Switzerland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>41</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>19</td>\n",
+       "        <td>UK</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>245</td>\n",
+       "        <td>56</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>15</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>14</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>13</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(19L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+       " (18L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+       " (20L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+       " (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (16L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (6L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (7L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+       " (10L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+       " (8L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+       " (1L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+       " (4L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+       " (2L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+       " (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+       " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+       " (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+       " (13L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+       " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS output_table;\n",
+    "\n",
+    "SELECT madlib.balance_sample(\n",
+    "                              'flags',             -- Source table\n",
+    "                              'output_table',      -- Output table\n",
+    "                              'mainhue');          -- Class column\n",
+    "  \n",
+    "SELECT * FROM output_table ORDER BY mainhue, name;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next we do uniform sampling again, but this time we specify a size for the output table:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "12 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__madlib_id__</th>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>8</td>\n",
+       "        <td>Greece</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>132</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>13</td>\n",
+       "        <td>Mexico</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1973</td>\n",
+       "        <td>77</td>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>5</td>\n",
+       "        <td>Canada</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9976</td>\n",
+       "        <td>24</td>\n",
+       "        <td>1</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>12</td>\n",
+       "        <td>Luxembourg</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>0</td>\n",
+       "        <td>4</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>18</td>\n",
+       "        <td>Switzerland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>41</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(3L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+       " (2L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (1L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (6L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (5L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+       " (9L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
+       " (8L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+       " (7L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+       " (12L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+       " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS output_table;\n",
+    "\n",
+    "SELECT madlib.balance_sample(\n",
+    "                              'flags',             -- Source table\n",
+    "                              'output_table',      -- Output table\n",
+    "                              'mainhue',           -- Class column\n",
+    "                              'uniform',           -- Uniform sample\n",
+    "                               12);                -- Desired output table size\n",
+    "\n",
+    "SELECT * FROM output_table ORDER BY mainhue, name;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "scrolled": true
+   },
+   "source": [
+    "# 3. Oversampling\n",
+    "Oversample with replacement such that all class values except NULLs end up with the same number of observations as the majority class.  Countries with red flags is the majority class with 10 observations, so other class values will be oversampled to 10 observations:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "40 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__madlib_id__</th>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>37</td>\n",
+       "        <td>2</td>\n",
+       "        <td>Australia</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>7690</td>\n",
+       "        <td>15</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>39</td>\n",
+       "        <td>2</td>\n",
+       "        <td>Australia</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>7690</td>\n",
+       "        <td>15</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>38</td>\n",
+       "        <td>2</td>\n",
+       "        <td>Australia</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>7690</td>\n",
+       "        <td>15</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>40</td>\n",
+       "        <td>8</td>\n",
+       "        <td>Greece</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>132</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>32</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>31</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>35</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>33</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>36</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>34</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>18</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>16</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>17</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>15</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>19</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>13</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>14</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>20</td>\n",
+       "        <td>13</td>\n",
+       "        <td>Mexico</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1973</td>\n",
+       "        <td>77</td>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>3</td>\n",
+       "        <td>Austria</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>84</td>\n",
+       "        <td>8</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>5</td>\n",
+       "        <td>Canada</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9976</td>\n",
+       "        <td>24</td>\n",
+       "        <td>1</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>6</td>\n",
+       "        <td>China</td>\n",
+       "        <td>5</td>\n",
+       "        <td>1</td>\n",
+       "        <td>9561</td>\n",
+       "        <td>1008</td>\n",
+       "        <td>7</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>7</td>\n",
+       "        <td>Denmark</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>43</td>\n",
+       "        <td>5</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>12</td>\n",
+       "        <td>Luxembourg</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>0</td>\n",
+       "        <td>4</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>14</td>\n",
+       "        <td>Norway</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>324</td>\n",
+       "        <td>4</td>\n",
+       "        <td>6</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>15</td>\n",
+       "        <td>Portugal</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>92</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>5</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>16</td>\n",
+       "        <td>Spain</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>505</td>\n",
+       "        <td>38</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>18</td>\n",
+       "        <td>Switzerland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>41</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>19</td>\n",
+       "        <td>UK</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>245</td>\n",
+       "        <td>56</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>27</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>29</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>28</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>26</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>30</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>22</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>24</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>23</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>25</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>21</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(37L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+       " (39L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+       " (38L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+       " (40L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+       " (32L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (31L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (35L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (33L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (36L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (34L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (18L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (16L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (15L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (19L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (11L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (13L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (14L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (12L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (20L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+       " (8L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
+       " (6L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
+       " (2L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+       " (5L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+       " (1L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+       " (9L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+       " (4L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
+       " (10L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
+       " (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+       " (7L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+       " (27L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (29L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (28L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (26L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (30L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (22L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+       " (24L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+       " (23L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+       " (25L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+       " (21L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS output_table;\n",
+    "\n",
+    "SELECT madlib.balance_sample(\n",
+    "                              'flags',             -- Source table\n",
+    "                              'output_table',      -- Output table\n",
+    "                              'mainhue',           -- Class column\n",
+    "                              'oversample');       -- Oversample\n",
+    "\n",
+    "SELECT * FROM output_table ORDER BY mainhue, name;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 4. Undersampling\n",
+    "Undersample such that all class values except NULLs end up with the same number of observations as the minority class.  Countries with white flags is the minority class with 2 observations, so other class values will be undersampled to 2 observations:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "8 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__madlib_id__</th>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>Argentina</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2777</td>\n",
+       "        <td>28</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>14</td>\n",
+       "        <td>Norway</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>324</td>\n",
+       "        <td>4</td>\n",
+       "        <td>6</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>19</td>\n",
+       "        <td>UK</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>245</td>\n",
+       "        <td>56</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(2L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+       " (1L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (3L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (6L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+       " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+       " (7L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (8L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS output_table;\n",
+    "\n",
+    "SELECT madlib.balance_sample(\n",
+    "                              'flags',             -- Source table\n",
+    "                              'output_table',      -- Output table\n",
+    "                              'mainhue',           -- Class column\n",
+    "                              'undersample');      -- Undersample\n",
+    "\n",
+    "SELECT * FROM output_table ORDER BY mainhue, name;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the case of bootstrapping, we may want to undersample with replacement, so we set the 'with_replacement' parameter to TRUE:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "8 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__madlib_id__</th>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>3</td>\n",
+       "        <td>Austria</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>84</td>\n",
+       "        <td>8</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>7</td>\n",
+       "        <td>Denmark</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>43</td>\n",
+       "        <td>5</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(7L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (8L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (2L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (1L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (4L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
+       " (3L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+       " (6L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (5L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS output_table;\n",
+    "\n",
+    "SELECT madlib.balance_sample(\n",
+    "                              'flags',             -- Source table\n",
+    "                              'output_table',      -- Output table\n",
+    "                              'mainhue',           -- Class column\n",
+    "                              'undersample',       -- Undersample\n",
+    "                               NULL,               -- Output table size will be calculated\n",
+    "                               NULL,               -- No grouping\n",
+    "                              'TRUE');             -- Sample with replacement\n",
+    "\n",
+    "SELECT * FROM output_table ORDER BY mainhue, name;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note above that some rows may appear multiple times above since we sampled with replacement."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 5. Setting class size by count\n",
+    "\n",
+    "Here we set the number of rows for red and blue flags, and leave green and white flags unchanged:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "19 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__madlib_id__</th>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>1</td>\n",
+       "        <td>Argentina</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2777</td>\n",
+       "        <td>28</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>1</td>\n",
+       "        <td>Argentina</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2777</td>\n",
+       "        <td>28</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>1</td>\n",
+       "        <td>Argentina</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2777</td>\n",
+       "        <td>28</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>16</td>\n",
+       "        <td>8</td>\n",
+       "        <td>Greece</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>132</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>14</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>13</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>15</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>17</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>13</td>\n",
+       "        <td>Mexico</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1973</td>\n",
+       "        <td>77</td>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>Austria</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>84</td>\n",
+       "        <td>8</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>6</td>\n",
+       "        <td>China</td>\n",
+       "        <td>5</td>\n",
+       "        <td>1</td>\n",
+       "        <td>9561</td>\n",
+       "        <td>1008</td>\n",
+       "        <td>7</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>7</td>\n",
+       "        <td>Denmark</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>43</td>\n",
+       "        <td>5</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>12</td>\n",
+       "        <td>Luxembourg</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>0</td>\n",
+       "        <td>4</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>14</td>\n",
+       "        <td>Norway</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>324</td>\n",
+       "        <td>4</td>\n",
+       "        <td>6</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>16</td>\n",
+       "        <td>Spain</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>505</td>\n",
+       "        <td>38</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>19</td>\n",
+       "        <td>UK</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>245</td>\n",
+       "        <td>56</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>18</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>19</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(11L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+       " (12L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+       " (10L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+       " (16L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+       " (14L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (13L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (15L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+       " (2L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
+       " (3L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+       " (1L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+       " (6L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+       " (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+       " (4L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
+       " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+       " (18L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (19L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS output_table;\n",
+    "\n",
+    "SELECT madlib.balance_sample(\n",
+    "                              'flags',             -- Source table\n",
+    "                              'output_table',      -- Output table\n",
+    "                              'mainhue',           -- Class column\n",
+    "                              'red=7, blue=7');    -- Want 7 reds and 7 blues\n",
+    "\n",
+    "SELECT * FROM output_table ORDER BY mainhue, name;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next we set the number of rows for red and blue flags, and also set an output table size.  This means that green and white flags will be uniformly sampled to get to the desired output table size:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "22 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__madlib_id__</th>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>21</td>\n",
+       "        <td>2</td>\n",
+       "        <td>Australia</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>7690</td>\n",
+       "        <td>15</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>22</td>\n",
+       "        <td>8</td>\n",
+       "        <td>Greece</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>132</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>16</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>17</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>19</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>20</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>18</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>11</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>6</td>\n",
+       "        <td>China</td>\n",
+       "        <td>5</td>\n",
+       "        <td>1</td>\n",
+       "        <td>9561</td>\n",
+       "        <td>1008</td>\n",
+       "        <td>7</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>7</td>\n",
+       "        <td>Denmark</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>43</td>\n",
+       "        <td>5</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>4</td>\n",
+       "        <td>14</td>\n",
+       "        <td>Norway</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>324</td>\n",
+       "        <td>4</td>\n",
+       "        <td>6</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>15</td>\n",
+       "        <td>Portugal</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>92</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>5</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>3</td>\n",
+       "        <td>16</td>\n",
+       "        <td>Spain</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>505</td>\n",
+       "        <td>38</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>1</td>\n",
+       "        <td>18</td>\n",
+       "        <td>Switzerland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>41</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>2</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>5</td>\n",
+       "        <td>19</td>\n",
+       "        <td>UK</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>245</td>\n",
+       "        <td>56</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>red</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>13</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>15</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>14</td>\n",
+       "        <td>10</td>\n",
+       "        <td>Ireland</td>\n",
+       "        <td>3</td>\n",
+       "        <td>4</td>\n",
+       "        <td>70</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>12</td>\n",
+       "        <td>20</td>\n",
+       "        <td>USA</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>9363</td>\n",
+       "        <td>231</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>white</td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "[(21L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+       " (22L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+       " (16L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+       " (19L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (20L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (18L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+       " (10L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (9L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (11L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+       " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+       " (7L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+       " (2L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+       " (4L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+       " (6L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
+       " (3L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
+       " (1L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+       " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+       " (13L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+       " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "DROP TABLE IF EXISTS output_table;\n",
+    "\n",
+    "SELECT madlib.balance_sample(\n",
+    "                              'flags',             -- Source table\n",
+    "                              'output_table',      -- Output table\n",
+    "                              'mainhue',           -- Class column\n",
+    "                              'red=7, blue=7',     -- Want 7 reds and 7 blues\n",
+    "                               22);                -- Desired output table size\n",
+    "\n",
+    "SELECT * FROM output_table ORDER BY mainhue, name;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 6.  NULL handling\n",
+    "\n",
+    "To make NULL a valid class value, set the parameter to keep NULLs:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done.\n",
+      "1 rows affected.\n",
+      "25 rows affected.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <th>__madlib_id__</th>\n",
+       "        <th>id</th>\n",
+       "        <th>name</th>\n",
+       "        <th>landmass</th>\n",
+       "        <th>zone</th>\n",
+       "        <th>area</th>\n",
+       "        <th>population</th>\n",
+       "        <th>language</th>\n",
+       "        <th>colours</th>\n",
+       "        <th>mainhue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>24</td>\n",
+       "        <td>1</td>\n",
+       "        <td>Argentina</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>2777</td>\n",
+       "        <td>28</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>25</td>\n",
+       "        <td>2</td>\n",
+       "        <td>Australia</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>7690</td>\n",
+       "        <td>15</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>22</td>\n",
+       "        <td>8</td>\n",
+       "        <td>Greece</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>132</td>\n",
+       "        <td>10</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>21</td>\n",
+       "        <td>9</td>\n",
+       "        <td>Guatemala</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>109</td>\n",
+       "        <td>8</td>\n",
+       "        <td>2</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>23</td>\n",
+       "        <td>17</td>\n",
+       "        <td>Sweden</td>\n",
+       "        <td>3</td>\n",
+       "        <td>1</td>\n",
+       "        <td>450</td>\n",
+       "        <td>8</td>\n",
+       "        <td>6</td>\n",
+       "        <td>2</td>\n",
+       "        <td>blue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>7</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>8</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>Brazil</td>\n",
+       "        <td>2</td>\n",
+       "        <td>3</td>\n",
+       "        <td>8512</td>\n",
+       "        <td>119</td>\n",
+       "        <td>6</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>10</td>\n",
+       "        <td>11</td>\n",
+       "        <td>Jamaica</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>11</td>\n",
+       "        <td>2</td>\n",
+       "        <td>1</td>\n",
+       "        <td>3</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>9</td>\n",
+       "        <td>13</td>\n",
+       "        <td>Mexico</td>\n",
+       "        <td>1</td>\n",
+       "        <td>4</td>\n",
+       "        <td>1973</td>\n",
+       "        <td>77</td>\n",
+       "        <td>2</td>\n",
+       "        <td>4</td>\n",
+       "        <td>green</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "        <td>2</td>\n",
+       "        <td>5</td>\n",
+       "        <

<TRUNCATED>