You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by fm...@apache.org on 2017/01/04 21:20:58 UTC
incubator-madlib-site git commit: notebook for new cat vars encoding
in 1.10
Repository: incubator-madlib-site
Updated Branches:
refs/heads/asf-site 86869127c -> bc296ce9a
notebook for new cat vars encoding in 1.10
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/commit/bc296ce9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/tree/bc296ce9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/diff/bc296ce9
Branch: refs/heads/asf-site
Commit: bc296ce9a27b42c0ab429926ace9e4575e9c9ccf
Parents: 8686912
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Wed Jan 4 13:20:05 2017 -0800
Committer: Frank McQuillan <fm...@pivotal.io>
Committed: Wed Jan 4 13:20:05 2017 -0800
----------------------------------------------------------------------
...coding-categorical-variables-1dot10-v1.ipynb | 1243 ++++++++++++++++++
1 file changed, 1243 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bc296ce9/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb b/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb
new file mode 100644
index 0000000..611b5ff
--- /dev/null
+++ b/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb
@@ -0,0 +1,1243 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Encoding categorical variables (MADlib v1.10+)\n",
+ "This is the new module that replaces create_indicator_variables() which has been deprecated as of MADlib v1.10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
+ " \"You should import from traitlets.config instead.\", ShimWarning)\n",
+ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
+ " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext sql"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "u'Connected: gpdbchina@madlib'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n",
+ "#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+ "#%sql postgresql://gpadmin@54.197.30.46:10432/gpadmin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>version</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>MADlib version: 1.10.0-dev, git revision: v1.7.1-203-g51f7a94, cmake configuration time: Tue Jan 3 23:26:27 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'MADlib version: 1.10.0-dev, git revision: v1.7.1-203-g51f7a94, cmake configuration time: Tue Jan 3 23:26:27 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql select madlib.version();\n",
+ "#%sql select version();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Load data set\n",
+ "Use a subset of the abalone dataset:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "Done.\n",
+ "20 rows affected.\n",
+ "20 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>sex</th>\n",
+ " <th>length</th>\n",
+ " <th>diameter</th>\n",
+ " <th>height</th>\n",
+ " <th>rings</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>M</td>\n",
+ " <td>0.455</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.095</td>\n",
+ " <td>15</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>M</td>\n",
+ " <td>0.35</td>\n",
+ " <td>0.265</td>\n",
+ " <td>0.09</td>\n",
+ " <td>7</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>F</td>\n",
+ " <td>0.53</td>\n",
+ " <td>0.42</td>\n",
+ " <td>0.135</td>\n",
+ " <td>9</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>M</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.125</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>I</td>\n",
+ " <td>0.33</td>\n",
+ " <td>0.255</td>\n",
+ " <td>0.08</td>\n",
+ " <td>7</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>I</td>\n",
+ " <td>0.425</td>\n",
+ " <td>0.3</td>\n",
+ " <td>0.095</td>\n",
+ " <td>8</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>F</td>\n",
+ " <td>0.53</td>\n",
+ " <td>0.415</td>\n",
+ " <td>0.15</td>\n",
+ " <td>20</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>F</td>\n",
+ " <td>0.545</td>\n",
+ " <td>0.425</td>\n",
+ " <td>0.125</td>\n",
+ " <td>16</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>M</td>\n",
+ " <td>0.475</td>\n",
+ " <td>0.37</td>\n",
+ " <td>0.125</td>\n",
+ " <td>9</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>None</td>\n",
+ " <td>0.55</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.15</td>\n",
+ " <td>19</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>F</td>\n",
+ " <td>0.525</td>\n",
+ " <td>0.38</td>\n",
+ " <td>0.14</td>\n",
+ " <td>14</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>M</td>\n",
+ " <td>0.43</td>\n",
+ " <td>0.35</td>\n",
+ " <td>0.11</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>M</td>\n",
+ " <td>0.49</td>\n",
+ " <td>0.38</td>\n",
+ " <td>0.135</td>\n",
+ " <td>11</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>F</td>\n",
+ " <td>0.535</td>\n",
+ " <td>0.405</td>\n",
+ " <td>0.145</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>F</td>\n",
+ " <td>0.47</td>\n",
+ " <td>0.355</td>\n",
+ " <td>0.1</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>M</td>\n",
+ " <td>0.5</td>\n",
+ " <td>0.4</td>\n",
+ " <td>0.13</td>\n",
+ " <td>12</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>I</td>\n",
+ " <td>0.355</td>\n",
+ " <td>0.28</td>\n",
+ " <td>0.085</td>\n",
+ " <td>7</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>F</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.34</td>\n",
+ " <td>0.1</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>M</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.295</td>\n",
+ " <td>0.08</td>\n",
+ " <td>7</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>None</td>\n",
+ " <td>0.45</td>\n",
+ " <td>0.32</td>\n",
+ " <td>0.1</td>\n",
+ " <td>9</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, u'M', 0.455, 0.365, 0.095, 15),\n",
+ " (2, u'M', 0.35, 0.265, 0.09, 7),\n",
+ " (3, u'F', 0.53, 0.42, 0.135, 9),\n",
+ " (4, u'M', 0.44, 0.365, 0.125, 10),\n",
+ " (5, u'I', 0.33, 0.255, 0.08, 7),\n",
+ " (6, u'I', 0.425, 0.3, 0.095, 8),\n",
+ " (7, u'F', 0.53, 0.415, 0.15, 20),\n",
+ " (8, u'F', 0.545, 0.425, 0.125, 16),\n",
+ " (9, u'M', 0.475, 0.37, 0.125, 9),\n",
+ " (10, None, 0.55, 0.44, 0.15, 19),\n",
+ " (11, u'F', 0.525, 0.38, 0.14, 14),\n",
+ " (12, u'M', 0.43, 0.35, 0.11, 10),\n",
+ " (13, u'M', 0.49, 0.38, 0.135, 11),\n",
+ " (14, u'F', 0.535, 0.405, 0.145, 10),\n",
+ " (15, u'F', 0.47, 0.355, 0.1, 10),\n",
+ " (16, u'M', 0.5, 0.4, 0.13, 12),\n",
+ " (17, u'I', 0.355, 0.28, 0.085, 7),\n",
+ " (18, u'F', 0.44, 0.34, 0.1, 10),\n",
+ " (19, u'M', 0.365, 0.295, 0.08, 7),\n",
+ " (20, None, 0.45, 0.32, 0.1, 9)]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql \n",
+ "DROP TABLE IF EXISTS abalone;\n",
+ "\n",
+ "CREATE TABLE abalone (\n",
+ " id serial,\n",
+ " sex character varying,\n",
+ " length double precision,\n",
+ " diameter double precision,\n",
+ " height double precision,\n",
+ " rings int\n",
+ ");\n",
+ "\n",
+ "INSERT INTO abalone (sex, length, diameter, height, rings) VALUES\n",
+ "('M', 0.455, 0.365, 0.095, 15),\n",
+ "('M', 0.35, 0.265, 0.09, 7),\n",
+ "('F', 0.53, 0.42, 0.135, 9),\n",
+ "('M', 0.44, 0.365, 0.125, 10),\n",
+ "('I', 0.33, 0.255, 0.08, 7),\n",
+ "('I', 0.425, 0.3, 0.095, 8),\n",
+ "('F', 0.53, 0.415, 0.15, 20),\n",
+ "('F', 0.545, 0.425, 0.125, 16),\n",
+ "('M', 0.475, 0.37, 0.125, 9),\n",
+ "(NULL, 0.55, 0.44, 0.15, 19),\n",
+ "('F', 0.525, 0.38, 0.14, 14),\n",
+ "('M', 0.43, 0.35, 0.11, 10),\n",
+ "('M', 0.49, 0.38, 0.135, 11),\n",
+ "('F', 0.535, 0.405, 0.145, 10),\n",
+ "('F', 0.47, 0.355, 0.1, 10),\n",
+ "('M', 0.5, 0.4, 0.13, 12),\n",
+ "('I', 0.355, 0.28, 0.085, 7),\n",
+ "('F', 0.44, 0.34, 0.1, 10),\n",
+ "('M', 0.365, 0.295, 0.08, 7),\n",
+ "(NULL, 0.45, 0.32, 0.1, 9);\n",
+ "\n",
+ "SELECT * FROM abalone ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Create new table with one-hot encoding.\n",
+ "The column 'sex' is replaced by three columns encoding the values 'F', 'M' and 'I'. Null values are not encoded by default:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "20 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>length</th>\n",
+ " <th>diameter</th>\n",
+ " <th>height</th>\n",
+ " <th>rings</th>\n",
+ " <th>sex_F</th>\n",
+ " <th>sex_I</th>\n",
+ " <th>sex_M</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0.455</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.095</td>\n",
+ " <td>15</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>0.35</td>\n",
+ " <td>0.265</td>\n",
+ " <td>0.09</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>0.53</td>\n",
+ " <td>0.42</td>\n",
+ " <td>0.135</td>\n",
+ " <td>9</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.125</td>\n",
+ " <td>10</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>0.33</td>\n",
+ " <td>0.255</td>\n",
+ " <td>0.08</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>0.425</td>\n",
+ " <td>0.3</td>\n",
+ " <td>0.095</td>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>0.53</td>\n",
+ " <td>0.415</td>\n",
+ " <td>0.15</td>\n",
+ " <td>20</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>0.545</td>\n",
+ " <td>0.425</td>\n",
+ " <td>0.125</td>\n",
+ " <td>16</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0.475</td>\n",
+ " <td>0.37</td>\n",
+ " <td>0.125</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>0.55</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.15</td>\n",
+ " <td>19</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>0.525</td>\n",
+ " <td>0.38</td>\n",
+ " <td>0.14</td>\n",
+ " <td>14</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>0.43</td>\n",
+ " <td>0.35</td>\n",
+ " <td>0.11</td>\n",
+ " <td>10</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>0.49</td>\n",
+ " <td>0.38</td>\n",
+ " <td>0.135</td>\n",
+ " <td>11</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>0.535</td>\n",
+ " <td>0.405</td>\n",
+ " <td>0.145</td>\n",
+ " <td>10</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>0.47</td>\n",
+ " <td>0.355</td>\n",
+ " <td>0.1</td>\n",
+ " <td>10</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>0.5</td>\n",
+ " <td>0.4</td>\n",
+ " <td>0.13</td>\n",
+ " <td>12</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>0.355</td>\n",
+ " <td>0.28</td>\n",
+ " <td>0.085</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.34</td>\n",
+ " <td>0.1</td>\n",
+ " <td>10</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.295</td>\n",
+ " <td>0.08</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>0.45</td>\n",
+ " <td>0.32</td>\n",
+ " <td>0.1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 0.455, 0.365, 0.095, 15, 0, 0, 1),\n",
+ " (2, 0.35, 0.265, 0.09, 7, 0, 0, 1),\n",
+ " (3, 0.53, 0.42, 0.135, 9, 1, 0, 0),\n",
+ " (4, 0.44, 0.365, 0.125, 10, 0, 0, 1),\n",
+ " (5, 0.33, 0.255, 0.08, 7, 0, 1, 0),\n",
+ " (6, 0.425, 0.3, 0.095, 8, 0, 1, 0),\n",
+ " (7, 0.53, 0.415, 0.15, 20, 1, 0, 0),\n",
+ " (8, 0.545, 0.425, 0.125, 16, 1, 0, 0),\n",
+ " (9, 0.475, 0.37, 0.125, 9, 0, 0, 1),\n",
+ " (10, 0.55, 0.44, 0.15, 19, 0, 0, 0),\n",
+ " (11, 0.525, 0.38, 0.14, 14, 1, 0, 0),\n",
+ " (12, 0.43, 0.35, 0.11, 10, 0, 0, 1),\n",
+ " (13, 0.49, 0.38, 0.135, 11, 0, 0, 1),\n",
+ " (14, 0.535, 0.405, 0.145, 10, 1, 0, 0),\n",
+ " (15, 0.47, 0.355, 0.1, 10, 1, 0, 0),\n",
+ " (16, 0.5, 0.4, 0.13, 12, 0, 0, 1),\n",
+ " (17, 0.355, 0.28, 0.085, 7, 0, 1, 0),\n",
+ " (18, 0.44, 0.34, 0.1, 10, 1, 0, 0),\n",
+ " (19, 0.365, 0.295, 0.08, 7, 0, 0, 1),\n",
+ " (20, 0.45, 0.32, 0.1, 9, 0, 0, 0)]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " 'sex' -- Categorical columns\n",
+ " );\n",
+ "SELECT * FROM abalone_out ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Encode null values\n",
+ "Now include NULL values in encoding (note the additional column 'sex_NULL'):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "20 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>length</th>\n",
+ " <th>diameter</th>\n",
+ " <th>height</th>\n",
+ " <th>rings</th>\n",
+ " <th>sex_F</th>\n",
+ " <th>sex_I</th>\n",
+ " <th>sex_M</th>\n",
+ " <th>sex_NULL</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0.455</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.095</td>\n",
+ " <td>15</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>0.35</td>\n",
+ " <td>0.265</td>\n",
+ " <td>0.09</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>0.53</td>\n",
+ " <td>0.42</td>\n",
+ " <td>0.135</td>\n",
+ " <td>9</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.125</td>\n",
+ " <td>10</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>0.33</td>\n",
+ " <td>0.255</td>\n",
+ " <td>0.08</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>0.425</td>\n",
+ " <td>0.3</td>\n",
+ " <td>0.095</td>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>0.53</td>\n",
+ " <td>0.415</td>\n",
+ " <td>0.15</td>\n",
+ " <td>20</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>0.545</td>\n",
+ " <td>0.425</td>\n",
+ " <td>0.125</td>\n",
+ " <td>16</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0.475</td>\n",
+ " <td>0.37</td>\n",
+ " <td>0.125</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>0.55</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.15</td>\n",
+ " <td>19</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>0.525</td>\n",
+ " <td>0.38</td>\n",
+ " <td>0.14</td>\n",
+ " <td>14</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>0.43</td>\n",
+ " <td>0.35</td>\n",
+ " <td>0.11</td>\n",
+ " <td>10</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>0.49</td>\n",
+ " <td>0.38</td>\n",
+ " <td>0.135</td>\n",
+ " <td>11</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>0.535</td>\n",
+ " <td>0.405</td>\n",
+ " <td>0.145</td>\n",
+ " <td>10</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>0.47</td>\n",
+ " <td>0.355</td>\n",
+ " <td>0.1</td>\n",
+ " <td>10</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>0.5</td>\n",
+ " <td>0.4</td>\n",
+ " <td>0.13</td>\n",
+ " <td>12</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>0.355</td>\n",
+ " <td>0.28</td>\n",
+ " <td>0.085</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>0.44</td>\n",
+ " <td>0.34</td>\n",
+ " <td>0.1</td>\n",
+ " <td>10</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>0.365</td>\n",
+ " <td>0.295</td>\n",
+ " <td>0.08</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>0.45</td>\n",
+ " <td>0.32</td>\n",
+ " <td>0.1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 0.455, 0.365, 0.095, 15, 0, 0, 1, 0),\n",
+ " (2, 0.35, 0.265, 0.09, 7, 0, 0, 1, 0),\n",
+ " (3, 0.53, 0.42, 0.135, 9, 1, 0, 0, 0),\n",
+ " (4, 0.44, 0.365, 0.125, 10, 0, 0, 1, 0),\n",
+ " (5, 0.33, 0.255, 0.08, 7, 0, 1, 0, 0),\n",
+ " (6, 0.425, 0.3, 0.095, 8, 0, 1, 0, 0),\n",
+ " (7, 0.53, 0.415, 0.15, 20, 1, 0, 0, 0),\n",
+ " (8, 0.545, 0.425, 0.125, 16, 1, 0, 0, 0),\n",
+ " (9, 0.475, 0.37, 0.125, 9, 0, 0, 1, 0),\n",
+ " (10, 0.55, 0.44, 0.15, 19, 0, 0, 0, 1),\n",
+ " (11, 0.525, 0.38, 0.14, 14, 1, 0, 0, 0),\n",
+ " (12, 0.43, 0.35, 0.11, 10, 0, 0, 1, 0),\n",
+ " (13, 0.49, 0.38, 0.135, 11, 0, 0, 1, 0),\n",
+ " (14, 0.535, 0.405, 0.145, 10, 1, 0, 0, 0),\n",
+ " (15, 0.47, 0.355, 0.1, 10, 1, 0, 0, 0),\n",
+ " (16, 0.5, 0.4, 0.13, 12, 0, 0, 1, 0),\n",
+ " (17, 0.355, 0.28, 0.085, 7, 0, 1, 0, 0),\n",
+ " (18, 0.44, 0.34, 0.1, 10, 1, 0, 0, 0),\n",
+ " (19, 0.365, 0.295, 0.08, 7, 0, 0, 1, 0),\n",
+ " (20, 0.45, 0.32, 0.1, 9, 0, 0, 0, 1)]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " 'sex', -- Categorical columns\n",
+ " NULL, -- Categorical columns to exclude\n",
+ " NULL, -- Index columns\n",
+ " NULL, -- Top values\n",
+ " NULL, -- Value to drop for dummy encoding\n",
+ " TRUE -- Encode nulls\n",
+ " );\n",
+ "SELECT * FROM abalone_out ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Encode all categorical variables and specify an index\n",
+ "Encode all categorical variables in the source table. Also, specify the column 'id' as the index (primary key) - this changes the output table to only include the index and the encoded variables:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " '*', -- Categorical columns\n",
+ " NULL, -- Categorical columns to exclude\n",
+ " 'id' -- Index columns\n",
+ " );\n",
+ "SELECT * FROM abalone_out ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Encode top values\n",
+ "Now let's encode only the top values and group others into a miscellaneous bucket column. Top values can be global across all columns or specified by column. As an example of the latter, here are the top 2 'sex' values and the top 50% of 'rings' values:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " '*', -- Categorical columns\n",
+ " NULL, -- Categorical columns to exclude\n",
+ " 'id', -- Index columns\n",
+ " 'sex=2, rings=0.5' -- Top values\n",
+ " );\n",
+ "SELECT * FROM abalone_out ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Show raw variables and encoded variables together\n",
+ "If you want to see both the raw categorical variable and its encoded form in the output_table, then include the categorical variables 'sex' and 'rings' in the index parameter. (Remember that this will not work if you specify '*' for the parameter 'categorical_cols', because in this case 'row_id' columns will not be encoded at all.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " 'sex, rings', -- Categorical columns\n",
+ " NULL, -- Categorical columns to exclude\n",
+ " 'id, sex, rings' -- Index columns\n",
+ " );\n",
+ "SELECT * FROM abalone_out ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. Dummy encoding\n",
+ "For dummy encoding, let's make the 'I' value from the 'sex' variable as the reference.\n",
+ "Here we use the 'value_to_drop' parameter:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " '*', -- Categorical columns\n",
+ " 'rings', -- Categorical columns to exclude\n",
+ " 'id', -- Index columns\n",
+ " NULL, -- Top value\n",
+ " 'sex=I' -- Value to drop for dummy encoding \n",
+ " );\n",
+ "SELECT * FROM abalone_out ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 8. Array output\n",
+ "Create an array output for the two categorical variables in the source table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " '*', -- Categorical columns\n",
+ " NULL, -- Categorical columns to exclude\n",
+ " 'id', -- Index columns\n",
+ " NULL, -- Top values\n",
+ " NULL, -- Value to drop for dummy encoding\n",
+ " NULL, -- Encode nulls\n",
+ " TRUE -- Array output\n",
+ " );\n",
+ "SELECT * FROM abalone_out ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And here is the dictionary table that specifies the index into the array:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%sql SELECT * FROM abalone_out_dictionary;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 9. Dictionary output\n",
+ "Create a dictionary:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " '*', -- Categorical columns\n",
+ " NULL, -- Categorical columns to exclude\n",
+ " 'id', -- Index columns\n",
+ " NULL, -- Top values\n",
+ " NULL, -- Value to drop for dummy encoding\n",
+ " NULL, -- Encode nulls\n",
+ " FALSE, -- Array output\n",
+ " TRUE -- Dictionary output \n",
+ " );\n",
+ "SELECT * FROM abalone_out ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And here is the dictionary table that defines the columns in the output table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%sql SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 10. Distribution policy\n",
+ "We can chose for various distribution policies, for examply RANDOMLY:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " '*', -- Categorical columns\n",
+ " NULL, -- Categorical columns to exclude\n",
+ " 'id', -- Index columns\n",
+ " NULL, -- Top values\n",
+ " NULL, -- Value to drop for dummy encoding\n",
+ " NULL, -- Encode nulls\n",
+ " NULL, -- Array output\n",
+ " NULL, -- Dictionary output\n",
+ " 'RANDOMLY' -- Distribution policy\n",
+ " );"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 11. Encoding floats\n",
+ "If you have a float that you want to encode, you can cast it in the following way within the function call:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n",
+ "SELECT madlib.encode_categorical_variables (\n",
+ " 'abalone', -- Source table\n",
+ " 'abalone_out', -- Output table\n",
+ " 'height::TEXT' -- Categorical columns\n",
+ " );"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}