You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by fm...@apache.org on 2018/04/23 21:57:58 UTC
[14/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Balanced-sampling-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Balanced-sampling-v1.ipynb b/community-artifacts/Balanced-sampling-v1.ipynb
new file mode 100644
index 0000000..5f6ec23
--- /dev/null
+++ b/community-artifacts/Balanced-sampling-v1.ipynb
@@ -0,0 +1,3706 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Balanced sampling\n",
+ "\n",
+ "This module offers a number of re-sampling techniques including under-sampling majority classes, over-sampling minority classes, and combinations of the two.\n",
+ "\n",
+ "Balanced sampling was added in MADlib 1.14."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The sql extension is already loaded. To reload it, use:\n",
+ " %reload_ext sql\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext sql"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "u'Connected: gpadmin@madlib'"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+ "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+ " \n",
+ "# PostgreSQL local\n",
+ "#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+ "\n",
+ "# Greenplum Database 4.3.10.0\n",
+ "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>version</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql select madlib.version();\n",
+ "#%sql select version();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Load data\n",
+ "Based in part on the flags data set from https://archive.ics.uci.edu/ml/datasets/Flags"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "Done.\n",
+ "22 rows affected.\n",
+ "22 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>Argentina</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>2777</td>\n",
+ " <td>28</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>Australia</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>7690</td>\n",
+ " <td>15</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>Greece</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>132</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>Mexico</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>1973</td>\n",
+ " <td>77</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>Austria</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>84</td>\n",
+ " <td>8</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>Canada</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9976</td>\n",
+ " <td>24</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>China</td>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " <td>9561</td>\n",
+ " <td>1008</td>\n",
+ " <td>7</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>Denmark</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>43</td>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>Luxembourg</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>Norway</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>324</td>\n",
+ " <td>4</td>\n",
+ " <td>6</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>Portugal</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>92</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>5</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>Spain</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>505</td>\n",
+ " <td>38</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>Switzerland</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>41</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>UK</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>245</td>\n",
+ " <td>56</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>21</td>\n",
+ " <td>xElba</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>6</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>22</td>\n",
+ " <td>xPrussia</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>249</td>\n",
+ " <td>61</td>\n",
+ " <td>4</td>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+ " (2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+ " (8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+ " (9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+ " (3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
+ " (5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
+ " (6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+ " (7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+ " (12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+ " (14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+ " (15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
+ " (16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
+ " (18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+ " (19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+ " (10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+ " (21, u'xElba', 3, 1, 1, 1, 6, None, None),\n",
+ " (22, u'xPrussia', 3, 1, 249, 61, 4, None, None)]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS flags;\n",
+ "\n",
+ "CREATE TABLE flags (\n",
+ " id INTEGER,\n",
+ " name TEXT,\n",
+ " landmass INTEGER,\n",
+ " zone INTEGER,\n",
+ " area INTEGER,\n",
+ " population INTEGER,\n",
+ " language INTEGER,\n",
+ " colours INTEGER,\n",
+ " mainhue TEXT\n",
+ ");\n",
+ "\n",
+ "INSERT INTO flags VALUES\n",
+ "(1, 'Argentina', 2, 3, 2777, 28, 2, 2, 'blue'),\n",
+ "(2, 'Australia', 6, 2, 7690, 15, 1, 3, 'blue'),\n",
+ "(3, 'Austria', 3, 1, 84, 8, 4, 2, 'red'),\n",
+ "(4, 'Brazil', 2, 3, 8512, 119, 6, 4, 'green'),\n",
+ "(5, 'Canada', 1, 4, 9976, 24, 1, 2, 'red'),\n",
+ "(6, 'China', 5, 1, 9561, 1008, 7, 2, 'red'),\n",
+ "(7, 'Denmark', 3, 1, 43, 5, 6, 2, 'red'),\n",
+ "(8, 'Greece', 3, 1, 132, 10, 6, 2, 'blue'),\n",
+ "(9, 'Guatemala', 1, 4, 109, 8, 2, 2, 'blue'),\n",
+ "(10, 'Ireland', 3, 4, 70, 3, 1, 3, 'white'),\n",
+ "(11, 'Jamaica', 1, 4, 11, 2, 1, 3, 'green'),\n",
+ "(12, 'Luxembourg', 3, 1, 3, 0, 4, 3, 'red'),\n",
+ "(13, 'Mexico', 1, 4, 1973, 77, 2, 4, 'green'),\n",
+ "(14, 'Norway', 3, 1, 324, 4, 6, 3, 'red'),\n",
+ "(15, 'Portugal', 3, 4, 92, 10, 6, 5, 'red'),\n",
+ "(16, 'Spain', 3, 4, 505, 38, 2, 2, 'red'),\n",
+ "(17, 'Sweden', 3, 1, 450, 8, 6, 2, 'blue'),\n",
+ "(18, 'Switzerland', 3, 1, 41, 6, 4, 2, 'red'),\n",
+ "(19, 'UK', 3, 4, 245, 56, 1, 3, 'red'),\n",
+ "(20, 'USA', 1, 4, 9363, 231, 1, 3, 'white'),\n",
+ "(21, 'xElba', 3, 1, 1, 1, 6, NULL, NULL),\n",
+ "(22, 'xPrussia', 3, 1, 249, 61, 4, NULL, NULL);\n",
+ "\n",
+ "SELECT * FROM flags ORDER BY mainhue, name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Uniform sampling \n",
+ "\n",
+ "All class values will be resampled so that they have the same number of rows. The output data size will be the same as the input data size, ignoring NULL values. Uniform sampling is the default for the 'class_size' parameter so we do not need to explicitly set it: "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "20 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__madlib_id__</th>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>1</td>\n",
+ " <td>Argentina</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>2777</td>\n",
+ " <td>28</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>2</td>\n",
+ " <td>Australia</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>7690</td>\n",
+ " <td>15</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>8</td>\n",
+ " <td>Greece</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>132</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>13</td>\n",
+ " <td>Mexico</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>1973</td>\n",
+ " <td>77</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>13</td>\n",
+ " <td>Mexico</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>1973</td>\n",
+ " <td>77</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>13</td>\n",
+ " <td>Mexico</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>1973</td>\n",
+ " <td>77</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>6</td>\n",
+ " <td>China</td>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " <td>9561</td>\n",
+ " <td>1008</td>\n",
+ " <td>7</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>7</td>\n",
+ " <td>Denmark</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>43</td>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>12</td>\n",
+ " <td>Luxembourg</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>18</td>\n",
+ " <td>Switzerland</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>41</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>19</td>\n",
+ " <td>UK</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>245</td>\n",
+ " <td>56</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(19L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+ " (18L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+ " (20L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+ " (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (16L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (6L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (7L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+ " (10L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+ " (8L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+ " (1L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+ " (4L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+ " (2L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+ " (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+ " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+ " (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+ " (13L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+ " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS output_table;\n",
+ "\n",
+ "SELECT madlib.balance_sample(\n",
+ " 'flags', -- Source table\n",
+ " 'output_table', -- Output table\n",
+ " 'mainhue'); -- Class column\n",
+ " \n",
+ "SELECT * FROM output_table ORDER BY mainhue, name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next we do uniform sampling again, but this time we specify a size for the output table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "12 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__madlib_id__</th>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>8</td>\n",
+ " <td>Greece</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>132</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>13</td>\n",
+ " <td>Mexico</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>1973</td>\n",
+ " <td>77</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>5</td>\n",
+ " <td>Canada</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9976</td>\n",
+ " <td>24</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>12</td>\n",
+ " <td>Luxembourg</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>18</td>\n",
+ " <td>Switzerland</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>41</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(3L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+ " (2L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (1L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (6L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (5L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+ " (9L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
+ " (8L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+ " (7L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+ " (12L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+ " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS output_table;\n",
+ "\n",
+ "SELECT madlib.balance_sample(\n",
+ " 'flags', -- Source table\n",
+ " 'output_table', -- Output table\n",
+ " 'mainhue', -- Class column\n",
+ " 'uniform', -- Uniform sample\n",
+ " 12); -- Desired output table size\n",
+ "\n",
+ "SELECT * FROM output_table ORDER BY mainhue, name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "scrolled": true
+ },
+ "source": [
+ "# 3. Oversampling\n",
+ "Oversample with replacement such that all class values except NULLs end up with the same number of observations as the majority class. Countries with red flags is the majority class with 10 observations, so other class values will be oversampled to 10 observations:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "40 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__madlib_id__</th>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>37</td>\n",
+ " <td>2</td>\n",
+ " <td>Australia</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>7690</td>\n",
+ " <td>15</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>39</td>\n",
+ " <td>2</td>\n",
+ " <td>Australia</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>7690</td>\n",
+ " <td>15</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>38</td>\n",
+ " <td>2</td>\n",
+ " <td>Australia</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>7690</td>\n",
+ " <td>15</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>40</td>\n",
+ " <td>8</td>\n",
+ " <td>Greece</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>132</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>32</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>31</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>35</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>33</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>36</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>34</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>13</td>\n",
+ " <td>Mexico</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>1973</td>\n",
+ " <td>77</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>3</td>\n",
+ " <td>Austria</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>84</td>\n",
+ " <td>8</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>5</td>\n",
+ " <td>Canada</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9976</td>\n",
+ " <td>24</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>6</td>\n",
+ " <td>China</td>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " <td>9561</td>\n",
+ " <td>1008</td>\n",
+ " <td>7</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>7</td>\n",
+ " <td>Denmark</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>43</td>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>12</td>\n",
+ " <td>Luxembourg</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>14</td>\n",
+ " <td>Norway</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>324</td>\n",
+ " <td>4</td>\n",
+ " <td>6</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>15</td>\n",
+ " <td>Portugal</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>92</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>5</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>16</td>\n",
+ " <td>Spain</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>505</td>\n",
+ " <td>38</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>18</td>\n",
+ " <td>Switzerland</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>41</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>19</td>\n",
+ " <td>UK</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>245</td>\n",
+ " <td>56</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>27</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>29</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>28</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>26</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>30</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>22</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>24</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>23</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>25</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>21</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(37L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+ " (39L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+ " (38L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+ " (40L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+ " (32L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (31L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (35L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (33L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (36L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (34L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (18L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (16L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (15L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (19L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (11L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (13L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (14L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (12L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (20L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+ " (8L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
+ " (6L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
+ " (2L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+ " (5L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+ " (1L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+ " (9L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+ " (4L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
+ " (10L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
+ " (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+ " (7L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+ " (27L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (29L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (28L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (26L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (30L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (22L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+ " (24L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+ " (23L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+ " (25L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
+ " (21L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS output_table;\n",
+ "\n",
+ "SELECT madlib.balance_sample(\n",
+ " 'flags', -- Source table\n",
+ " 'output_table', -- Output table\n",
+ " 'mainhue', -- Class column\n",
+ " 'oversample'); -- Oversample\n",
+ "\n",
+ "SELECT * FROM output_table ORDER BY mainhue, name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 4. Undersampling\n",
+ "Undersample such that all class values except NULLs end up with the same number of observations as the minority class. Countries with white flags is the minority class with 2 observations, so other class values will be undersampled to 2 observations:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "8 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__madlib_id__</th>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>Argentina</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>2777</td>\n",
+ " <td>28</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>14</td>\n",
+ " <td>Norway</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>324</td>\n",
+ " <td>4</td>\n",
+ " <td>6</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>19</td>\n",
+ " <td>UK</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>245</td>\n",
+ " <td>56</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(2L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+ " (1L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (3L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (6L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+ " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+ " (7L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (8L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS output_table;\n",
+ "\n",
+ "SELECT madlib.balance_sample(\n",
+ " 'flags', -- Source table\n",
+ " 'output_table', -- Output table\n",
+ " 'mainhue', -- Class column\n",
+ " 'undersample'); -- Undersample\n",
+ "\n",
+ "SELECT * FROM output_table ORDER BY mainhue, name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the case of bootstrapping, we may want to undersample with replacement, so we set the 'with_replacement' parameter to TRUE:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "8 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__madlib_id__</th>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>3</td>\n",
+ " <td>Austria</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>84</td>\n",
+ " <td>8</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>7</td>\n",
+ " <td>Denmark</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>43</td>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(7L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (8L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (2L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (1L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (4L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
+ " (3L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+ " (6L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (5L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS output_table;\n",
+ "\n",
+ "SELECT madlib.balance_sample(\n",
+ " 'flags', -- Source table\n",
+ " 'output_table', -- Output table\n",
+ " 'mainhue', -- Class column\n",
+ " 'undersample', -- Undersample\n",
+ " NULL, -- Output table size will be calculated\n",
+ " NULL, -- No grouping\n",
+ " 'TRUE'); -- Sample with replacement\n",
+ "\n",
+ "SELECT * FROM output_table ORDER BY mainhue, name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note above that some rows may appear multiple times above since we sampled with replacement."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 5. Setting class size by count\n",
+ "\n",
+ "Here we set the number of rows for red and blue flags, and leave green and white flags unchanged:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "19 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__madlib_id__</th>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>1</td>\n",
+ " <td>Argentina</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>2777</td>\n",
+ " <td>28</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>1</td>\n",
+ " <td>Argentina</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>2777</td>\n",
+ " <td>28</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>1</td>\n",
+ " <td>Argentina</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>2777</td>\n",
+ " <td>28</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>8</td>\n",
+ " <td>Greece</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>132</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>13</td>\n",
+ " <td>Mexico</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>1973</td>\n",
+ " <td>77</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>Austria</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>84</td>\n",
+ " <td>8</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>6</td>\n",
+ " <td>China</td>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " <td>9561</td>\n",
+ " <td>1008</td>\n",
+ " <td>7</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>7</td>\n",
+ " <td>Denmark</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>43</td>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>12</td>\n",
+ " <td>Luxembourg</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>14</td>\n",
+ " <td>Norway</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>324</td>\n",
+ " <td>4</td>\n",
+ " <td>6</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>16</td>\n",
+ " <td>Spain</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>505</td>\n",
+ " <td>38</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>19</td>\n",
+ " <td>UK</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>245</td>\n",
+ " <td>56</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(11L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+ " (12L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+ " (10L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
+ " (16L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+ " (14L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (13L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (15L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
+ " (2L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
+ " (3L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+ " (1L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+ " (6L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
+ " (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+ " (4L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
+ " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+ " (18L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (19L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS output_table;\n",
+ "\n",
+ "SELECT madlib.balance_sample(\n",
+ " 'flags', -- Source table\n",
+ " 'output_table', -- Output table\n",
+ " 'mainhue', -- Class column\n",
+ " 'red=7, blue=7'); -- Want 7 reds and 7 blues\n",
+ "\n",
+ "SELECT * FROM output_table ORDER BY mainhue, name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next we set the number of rows for red and blue flags, and also set an output table size. This means that green and white flags will be uniformly sampled to get to the desired output table size:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "22 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__madlib_id__</th>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>21</td>\n",
+ " <td>2</td>\n",
+ " <td>Australia</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>7690</td>\n",
+ " <td>15</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>22</td>\n",
+ " <td>8</td>\n",
+ " <td>Greece</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>132</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>6</td>\n",
+ " <td>China</td>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " <td>9561</td>\n",
+ " <td>1008</td>\n",
+ " <td>7</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>7</td>\n",
+ " <td>Denmark</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>43</td>\n",
+ " <td>5</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>14</td>\n",
+ " <td>Norway</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>324</td>\n",
+ " <td>4</td>\n",
+ " <td>6</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>15</td>\n",
+ " <td>Portugal</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>92</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>5</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>16</td>\n",
+ " <td>Spain</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>505</td>\n",
+ " <td>38</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>18</td>\n",
+ " <td>Switzerland</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>41</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>19</td>\n",
+ " <td>UK</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>245</td>\n",
+ " <td>56</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>red</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>10</td>\n",
+ " <td>Ireland</td>\n",
+ " <td>3</td>\n",
+ " <td>4</td>\n",
+ " <td>70</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>20</td>\n",
+ " <td>USA</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>9363</td>\n",
+ " <td>231</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>white</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(21L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
+ " (22L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
+ " (16L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
+ " (19L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (20L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (18L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
+ " (10L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (9L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (11L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
+ " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
+ " (7L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
+ " (2L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
+ " (4L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
+ " (6L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
+ " (3L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
+ " (1L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
+ " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
+ " (13L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
+ " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS output_table;\n",
+ "\n",
+ "SELECT madlib.balance_sample(\n",
+ " 'flags', -- Source table\n",
+ " 'output_table', -- Output table\n",
+ " 'mainhue', -- Class column\n",
+ " 'red=7, blue=7', -- Want 7 reds and 7 blues\n",
+ " 22); -- Desired output table size\n",
+ "\n",
+ "SELECT * FROM output_table ORDER BY mainhue, name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 6. NULL handling\n",
+ "\n",
+ "To make NULL a valid class value, set the parameter to keep NULLs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "25 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__madlib_id__</th>\n",
+ " <th>id</th>\n",
+ " <th>name</th>\n",
+ " <th>landmass</th>\n",
+ " <th>zone</th>\n",
+ " <th>area</th>\n",
+ " <th>population</th>\n",
+ " <th>language</th>\n",
+ " <th>colours</th>\n",
+ " <th>mainhue</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>24</td>\n",
+ " <td>1</td>\n",
+ " <td>Argentina</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>2777</td>\n",
+ " <td>28</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>25</td>\n",
+ " <td>2</td>\n",
+ " <td>Australia</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>7690</td>\n",
+ " <td>15</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>22</td>\n",
+ " <td>8</td>\n",
+ " <td>Greece</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>132</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>21</td>\n",
+ " <td>9</td>\n",
+ " <td>Guatemala</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>109</td>\n",
+ " <td>8</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>23</td>\n",
+ " <td>17</td>\n",
+ " <td>Sweden</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>450</td>\n",
+ " <td>8</td>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " <td>blue</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>Brazil</td>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " <td>8512</td>\n",
+ " <td>119</td>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>11</td>\n",
+ " <td>Jamaica</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>11</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>13</td>\n",
+ " <td>Mexico</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>1973</td>\n",
+ " <td>77</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>green</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>5</td>\n",
+ " <
<TRUNCATED>