You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by fm...@apache.org on 2017/12/08 20:55:14 UTC
[4/7] madlib-site git commit: add new workbooks for 1dot13
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/Logistic-regression-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Logistic-regression-v1.ipynb b/community-artifacts/Logistic-regression-v1.ipynb
new file mode 100644
index 0000000..226049d
--- /dev/null
+++ b/community-artifacts/Logistic-regression-v1.ipynb
@@ -0,0 +1,892 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Logistic regression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
+ " \"You should import from traitlets.config instead.\", ShimWarning)\n",
+ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
+ " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext sql"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "u'Connected: fmcquillan@madlib'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Greenplum 4.3.10.0\n",
+ "# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+ " \n",
+ "# PostgreSQL local\n",
+ "%sql postgresql://fmcquillan@localhost:5432/madlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>version</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql select madlib.version();\n",
+ "#%sql select version();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Load data\n",
+ "This data set is related to predicting a second heart attack given treatment and health factors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "Done.\n",
+ "20 rows affected.\n",
+ "20 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>second_attack</th>\n",
+ " <th>treatment</th>\n",
+ " <th>trait_anxiety</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>70</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>80</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>50</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>60</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>40</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>65</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>75</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>80</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>70</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>60</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>65</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>50</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>45</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>35</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>40</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>50</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>55</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>45</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>50</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>60</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 1, 1, 70),\n",
+ " (2, 1, 1, 80),\n",
+ " (3, 1, 1, 50),\n",
+ " (4, 1, 0, 60),\n",
+ " (5, 1, 0, 40),\n",
+ " (6, 1, 0, 65),\n",
+ " (7, 1, 0, 75),\n",
+ " (8, 1, 0, 80),\n",
+ " (9, 1, 0, 70),\n",
+ " (10, 1, 0, 60),\n",
+ " (11, 0, 1, 65),\n",
+ " (12, 0, 1, 50),\n",
+ " (13, 0, 1, 45),\n",
+ " (14, 0, 1, 35),\n",
+ " (15, 0, 1, 40),\n",
+ " (16, 0, 1, 50),\n",
+ " (17, 0, 0, 55),\n",
+ " (18, 0, 0, 45),\n",
+ " (19, 0, 0, 50),\n",
+ " (20, 0, 0, 60)]"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS patients;\n",
+ "\n",
+ "CREATE TABLE patients( id INTEGER NOT NULL,\n",
+ " second_attack INTEGER,\n",
+ " treatment INTEGER,\n",
+ " trait_anxiety INTEGER);\n",
+ " \n",
+ "INSERT INTO patients VALUES \n",
+ "(1, 1, 1, 70),\n",
+ "(2, 1, 1, 80),\n",
+ "(3, 1, 1, 50),\n",
+ "(4, 1, 0, 60),\n",
+ "(5, 1, 0, 40),\n",
+ "(6, 1, 0, 65),\n",
+ "(7, 1, 0, 75),\n",
+ "(8, 1, 0, 80),\n",
+ "(9, 1, 0, 70),\n",
+ "(10, 1, 0, 60),\n",
+ "(11, 0, 1, 65),\n",
+ "(12, 0, 1, 50),\n",
+ "(13, 0, 1, 45),\n",
+ "(14, 0, 1, 35),\n",
+ "(15, 0, 1, 40),\n",
+ "(16, 0, 1, 50),\n",
+ "(17, 0, 0, 55),\n",
+ "(18, 0, 0, 45),\n",
+ "(19, 0, 0, 50),\n",
+ "(20, 0, 0, 60);\n",
+ "\n",
+ "SELECT * FROM patients ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Train regression model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>method</th>\n",
+ " <th>source_table</th>\n",
+ " <th>out_table</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " <th>optimizer_params</th>\n",
+ " <th>num_all_groups</th>\n",
+ " <th>num_failed_groups</th>\n",
+ " <th>num_rows_processed</th>\n",
+ " <th>num_missing_rows_skipped</th>\n",
+ " <th>grouping_col</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>logregr</td>\n",
+ " <td>patients</td>\n",
+ " <td>patients_logregr</td>\n",
+ " <td>second_attack</td>\n",
+ " <td>ARRAY[1, treatment, trait_anxiety]</td>\n",
+ " <td>optimizer=irls, max_iter=20, tolerance=0.0001</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>20</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'logregr', u'patients', u'patients_logregr', u'second_attack', u'ARRAY[1, treatment, trait_anxiety]', u'optimizer=irls, max_iter=20, tolerance=0.0001', 1, 0, 20, 0, None)]"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS patients_logregr, patients_logregr_summary;\n",
+ "\n",
+ "SELECT madlib.logregr_train( 'patients', -- Source table\n",
+ " 'patients_logregr', -- Output table\n",
+ " 'second_attack', -- Dependent variable\n",
+ " 'ARRAY[1, treatment, trait_anxiety]', -- Feature vector\n",
+ " NULL, -- Grouping\n",
+ " 20, -- Max iterations\n",
+ " 'irls' -- Optimizer to use\n",
+ " );\n",
+ "\n",
+ "SELECT * FROM patients_logregr_summary;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. View regression results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>coef</th>\n",
+ " <th>log_likelihood</th>\n",
+ " <th>std_err</th>\n",
+ " <th>z_stats</th>\n",
+ " <th>p_values</th>\n",
+ " <th>odds_ratios</th>\n",
+ " <th>condition_no</th>\n",
+ " <th>num_rows_processed</th>\n",
+ " <th>num_missing_rows_skipped</th>\n",
+ " <th>num_iterations</th>\n",
+ " <th>variance_covariance</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>[-6.36346994178192, -1.02410605239327, 0.119044916668607]</td>\n",
+ " <td>-9.41018298389</td>\n",
+ " <td>[3.21389766375099, 1.17107844860319, 0.0549790458269317]</td>\n",
+ " <td>[-1.97998524145757, -0.874498248699539, 2.16527796868916]</td>\n",
+ " <td>[0.0477051870698145, 0.381846973530455, 0.0303664045046183]</td>\n",
+ " <td>[0.00172337630923221, 0.359117354054956, 1.12642051220895]</td>\n",
+ " <td>326.081922792</td>\n",
+ " <td>20</td>\n",
+ " <td>0</td>\n",
+ " <td>5</td>\n",
+ " <td>[[10.329138193064, -0.474304665195738, -0.171995901260057], [-0.474304665195738, 1.37142473278286, -0.00119520703381591], [-0.171995901260057, -0.00119520703381591, 0.00302269548003986]]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[([-6.36346994178192, -1.02410605239327, 0.119044916668607], -9.41018298388876, [3.21389766375099, 1.17107844860319, 0.0549790458269317], [-1.97998524145757, -0.874498248699539, 2.16527796868916], [0.0477051870698145, 0.381846973530455, 0.0303664045046183], [0.00172337630923221, 0.359117354054956, 1.12642051220895], 326.081922791575, 20L, 0L, 5, [[10.329138193064, -0.474304665195738, -0.171995901260057], [-0.474304665195738, 1.37142473278286, -0.00119520703381591], [-0.171995901260057, -0.00119520703381591, 0.00302269548003986]])]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * from patients_logregr;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 4. Output formatting\n",
+ "Alternatively, unnest the arrays in the results for easier reading of output:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>attribute</th>\n",
+ " <th>coefficient</th>\n",
+ " <th>standard_error</th>\n",
+ " <th>z_stat</th>\n",
+ " <th>pvalue</th>\n",
+ " <th>odds_ratio</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>intercept</td>\n",
+ " <td>-6.36346994178</td>\n",
+ " <td>3.21389766375</td>\n",
+ " <td>-1.97998524146</td>\n",
+ " <td>0.0477051870698</td>\n",
+ " <td>0.00172337630923</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>treatment</td>\n",
+ " <td>-1.02410605239</td>\n",
+ " <td>1.1710784486</td>\n",
+ " <td>-0.8744982487</td>\n",
+ " <td>0.38184697353</td>\n",
+ " <td>0.359117354055</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>trait_anxiety</td>\n",
+ " <td>0.119044916669</td>\n",
+ " <td>0.0549790458269</td>\n",
+ " <td>2.16527796869</td>\n",
+ " <td>0.0303664045046</td>\n",
+ " <td>1.12642051221</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'intercept', -6.36346994178192, 3.21389766375099, -1.97998524145757, 0.0477051870698145, 0.00172337630923221),\n",
+ " (u'treatment', -1.02410605239327, 1.17107844860319, -0.874498248699539, 0.381846973530455, 0.359117354054956),\n",
+ " (u'trait_anxiety', 0.119044916668607, 0.0549790458269317, 2.16527796868916, 0.0303664045046183, 1.12642051220895)]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT unnest(array['intercept', 'treatment', 'trait_anxiety']) as attribute,\n",
+ " unnest(coef) as coefficient,\n",
+ " unnest(std_err) as standard_error,\n",
+ " unnest(z_stats) as z_stat,\n",
+ " unnest(p_values) as pvalue,\n",
+ " unnest(odds_ratios) as odds_ratio\n",
+ " FROM patients_logregr;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "scrolled": true
+ },
+ "source": [
+ "# 5. Predict outcome\n",
+ "This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "20 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>logregr_predict</th>\n",
+ " <th>second_attack</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>True</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>True</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>False</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>True</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>False</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>True</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>True</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>True</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>True</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>True</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>True</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>False</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>False</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>False</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>False</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>False</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>True</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>False</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>False</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>True</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, True, True),\n",
+ " (2, True, True),\n",
+ " (3, False, True),\n",
+ " (4, True, True),\n",
+ " (5, False, True),\n",
+ " (6, True, True),\n",
+ " (7, True, True),\n",
+ " (8, True, True),\n",
+ " (9, True, True),\n",
+ " (10, True, True),\n",
+ " (11, True, False),\n",
+ " (12, False, False),\n",
+ " (13, False, False),\n",
+ " (14, False, False),\n",
+ " (15, False, False),\n",
+ " (16, False, False),\n",
+ " (17, True, False),\n",
+ " (18, False, False),\n",
+ " (19, False, False),\n",
+ " (20, True, False)]"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "-- Display prediction value along with the original value\n",
+ "SELECT p.id, madlib.logregr_predict(coef, ARRAY[1, treatment, trait_anxiety]),\n",
+ " p.second_attack::BOOLEAN\n",
+ "FROM patients p, patients_logregr m\n",
+ "ORDER BY p.id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 6. Predict probability\n",
+ "Predicting the probability of the dependent variable being TRUE."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "20 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>logregr_predict_prob</th>\n",
+ " <th>second_attack</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0.720223028942</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>0.894354902502</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>0.192269541755</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>0.685513072239</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>0.167747881509</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>0.798098108915</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>0.928568075753</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>0.959305763694</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0.877576117431</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>0.685513072239</td>\n",
+ " <td>True</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>0.586700895943</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>0.192269541755</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>0.116032010633</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>0.0383829143135</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>0.0674976224148</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>0.192269541755</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>0.545870774303</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>0.267675422387</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>0.398618639285</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>0.685513072239</td>\n",
+ " <td>False</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 0.720223028941527, True),\n",
+ " (2, 0.894354902502048, True),\n",
+ " (3, 0.192269541755171, True),\n",
+ " (4, 0.685513072239347, True),\n",
+ " (5, 0.167747881508857, True),\n",
+ " (6, 0.79809810891514, True),\n",
+ " (7, 0.928568075752503, True),\n",
+ " (8, 0.959305763693571, True),\n",
+ " (9, 0.877576117431452, True),\n",
+ " (10, 0.685513072239347, True),\n",
+ " (11, 0.586700895943317, False),\n",
+ " (12, 0.192269541755171, False),\n",
+ " (13, 0.116032010632994, False),\n",
+ " (14, 0.0383829143134982, False),\n",
+ " (15, 0.0674976224147597, False),\n",
+ " (16, 0.192269541755171, False),\n",
+ " (17, 0.545870774302621, False),\n",
+ " (18, 0.267675422387132, False),\n",
+ " (19, 0.398618639285111, False),\n",
+ " (20, 0.685513072239347, False)]"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "-- Display prediction value along with the original value\n",
+ "SELECT p.id, madlib.logregr_predict_prob(coef, ARRAY[1, treatment, trait_anxiety]),\n",
+ " p.second_attack::BOOLEAN\n",
+ "FROM patients p, patients_logregr m\n",
+ "ORDER BY p.id;"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/Summary-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Summary-v1.ipynb b/community-artifacts/Summary-v1.ipynb
new file mode 100644
index 0000000..57c3611
--- /dev/null
+++ b/community-artifacts/Summary-v1.ipynb
@@ -0,0 +1,1026 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The sql extension is already loaded. To reload it, use:\n",
+ " %reload_ext sql\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext sql"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "u'Connected: fmcquillan@madlib'"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Greenplum 4.3.10.0\n",
+ "# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+ " \n",
+ "# PostgreSQL local\n",
+ "%sql postgresql://fmcquillan@localhost:5432/madlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>version</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql select madlib.version();\n",
+ "#%sql select version();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. On-line help"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>summary</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td><br> 'summary' is a generic function used to produce summary statistics<br> of any data table. The function invokes particular 'methods' from<br> the MADlib library to provide an overview of the data.<br> -------<br> For an overview on usage, run:<br> SELECT madlib.summary('usage');<br> -------<br> For an example, run:<br> SELECT madlib.summary('example')<br> </td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u\"\\n 'summary' is a generic function used to produce summary statistics\\n of any data table. The function invokes particular 'methods' from\\n the MADlib library to provide an overview of the data.\\n -------\\n For an overview on usage, run:\\n SELECT madlib.summary('usage');\\n -------\\n For an example, run:\\n SELECT madlib.summary('example')\\n \",)]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql SELECT * FROM madlib.summary();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Load data\n",
+ "Using part of the popular iris data set from https://archive.ics.uci.edu/ml/datasets/iris"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "Done.\n",
+ "30 rows affected.\n",
+ "30 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>sepal_length</th>\n",
+ " <th>sepal_width</th>\n",
+ " <th>petal_length</th>\n",
+ " <th>petal_width</th>\n",
+ " <th>class_name</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>5.1</td>\n",
+ " <td>3.5</td>\n",
+ " <td>1.4</td>\n",
+ " <td>0.2</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>4.9</td>\n",
+ " <td>3.0</td>\n",
+ " <td>1.4</td>\n",
+ " <td>0.2</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>4.7</td>\n",
+ " <td>3.2</td>\n",
+ " <td>1.3</td>\n",
+ " <td>0.2</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>4.6</td>\n",
+ " <td>3.1</td>\n",
+ " <td>1.5</td>\n",
+ " <td>0.2</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>5.0</td>\n",
+ " <td>3.6</td>\n",
+ " <td>1.4</td>\n",
+ " <td>0.2</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>5.4</td>\n",
+ " <td>3.9</td>\n",
+ " <td>1.7</td>\n",
+ " <td>0.4</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>4.6</td>\n",
+ " <td>3.4</td>\n",
+ " <td>1.4</td>\n",
+ " <td>0.3</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>5.0</td>\n",
+ " <td>3.4</td>\n",
+ " <td>1.5</td>\n",
+ " <td>0.2</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>4.4</td>\n",
+ " <td>2.9</td>\n",
+ " <td>1.4</td>\n",
+ " <td>0.2</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>4.9</td>\n",
+ " <td>3.1</td>\n",
+ " <td>1.5</td>\n",
+ " <td>0.1</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>7.0</td>\n",
+ " <td>3.2</td>\n",
+ " <td>4.7</td>\n",
+ " <td>1.4</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>6.4</td>\n",
+ " <td>3.2</td>\n",
+ " <td>4.5</td>\n",
+ " <td>1.5</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>6.9</td>\n",
+ " <td>3.1</td>\n",
+ " <td>4.9</td>\n",
+ " <td>1.5</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>5.5</td>\n",
+ " <td>2.3</td>\n",
+ " <td>4.0</td>\n",
+ " <td>1.3</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>6.5</td>\n",
+ " <td>2.8</td>\n",
+ " <td>4.6</td>\n",
+ " <td>1.5</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>5.7</td>\n",
+ " <td>2.8</td>\n",
+ " <td>4.5</td>\n",
+ " <td>1.3</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>6.3</td>\n",
+ " <td>3.3</td>\n",
+ " <td>4.7</td>\n",
+ " <td>1.6</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>4.9</td>\n",
+ " <td>2.4</td>\n",
+ " <td>3.3</td>\n",
+ " <td>1.0</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>6.6</td>\n",
+ " <td>2.9</td>\n",
+ " <td>4.6</td>\n",
+ " <td>1.3</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>5.2</td>\n",
+ " <td>2.7</td>\n",
+ " <td>3.9</td>\n",
+ " <td>1.4</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>21</td>\n",
+ " <td>6.3</td>\n",
+ " <td>3.3</td>\n",
+ " <td>6.0</td>\n",
+ " <td>2.5</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>22</td>\n",
+ " <td>5.8</td>\n",
+ " <td>2.7</td>\n",
+ " <td>5.1</td>\n",
+ " <td>1.9</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>23</td>\n",
+ " <td>7.1</td>\n",
+ " <td>3.0</td>\n",
+ " <td>5.9</td>\n",
+ " <td>2.1</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>24</td>\n",
+ " <td>6.3</td>\n",
+ " <td>2.9</td>\n",
+ " <td>5.6</td>\n",
+ " <td>1.8</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>25</td>\n",
+ " <td>6.5</td>\n",
+ " <td>3.0</td>\n",
+ " <td>5.8</td>\n",
+ " <td>2.2</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>26</td>\n",
+ " <td>7.6</td>\n",
+ " <td>3.0</td>\n",
+ " <td>6.6</td>\n",
+ " <td>2.1</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>27</td>\n",
+ " <td>4.9</td>\n",
+ " <td>2.5</td>\n",
+ " <td>4.5</td>\n",
+ " <td>1.7</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>28</td>\n",
+ " <td>7.3</td>\n",
+ " <td>2.9</td>\n",
+ " <td>6.3</td>\n",
+ " <td>1.8</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>29</td>\n",
+ " <td>6.7</td>\n",
+ " <td>2.5</td>\n",
+ " <td>5.8</td>\n",
+ " <td>1.8</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>30</td>\n",
+ " <td>7.2</td>\n",
+ " <td>3.6</td>\n",
+ " <td>6.1</td>\n",
+ " <td>2.5</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 5.1, 3.5, 1.4, 0.2, u'Iris-setosa'),\n",
+ " (2, 4.9, 3.0, 1.4, 0.2, u'Iris-setosa'),\n",
+ " (3, 4.7, 3.2, 1.3, 0.2, u'Iris-setosa'),\n",
+ " (4, 4.6, 3.1, 1.5, 0.2, u'Iris-setosa'),\n",
+ " (5, 5.0, 3.6, 1.4, 0.2, u'Iris-setosa'),\n",
+ " (6, 5.4, 3.9, 1.7, 0.4, u'Iris-setosa'),\n",
+ " (7, 4.6, 3.4, 1.4, 0.3, u'Iris-setosa'),\n",
+ " (8, 5.0, 3.4, 1.5, 0.2, u'Iris-setosa'),\n",
+ " (9, 4.4, 2.9, 1.4, 0.2, u'Iris-setosa'),\n",
+ " (10, 4.9, 3.1, 1.5, 0.1, u'Iris-setosa'),\n",
+ " (11, 7.0, 3.2, 4.7, 1.4, u'Iris-versicolor'),\n",
+ " (12, 6.4, 3.2, 4.5, 1.5, u'Iris-versicolor'),\n",
+ " (13, 6.9, 3.1, 4.9, 1.5, u'Iris-versicolor'),\n",
+ " (14, 5.5, 2.3, 4.0, 1.3, u'Iris-versicolor'),\n",
+ " (15, 6.5, 2.8, 4.6, 1.5, u'Iris-versicolor'),\n",
+ " (16, 5.7, 2.8, 4.5, 1.3, u'Iris-versicolor'),\n",
+ " (17, 6.3, 3.3, 4.7, 1.6, u'Iris-versicolor'),\n",
+ " (18, 4.9, 2.4, 3.3, 1.0, u'Iris-versicolor'),\n",
+ " (19, 6.6, 2.9, 4.6, 1.3, u'Iris-versicolor'),\n",
+ " (20, 5.2, 2.7, 3.9, 1.4, u'Iris-versicolor'),\n",
+ " (21, 6.3, 3.3, 6.0, 2.5, u'Iris-virginica'),\n",
+ " (22, 5.8, 2.7, 5.1, 1.9, u'Iris-virginica'),\n",
+ " (23, 7.1, 3.0, 5.9, 2.1, u'Iris-virginica'),\n",
+ " (24, 6.3, 2.9, 5.6, 1.8, u'Iris-virginica'),\n",
+ " (25, 6.5, 3.0, 5.8, 2.2, u'Iris-virginica'),\n",
+ " (26, 7.6, 3.0, 6.6, 2.1, u'Iris-virginica'),\n",
+ " (27, 4.9, 2.5, 4.5, 1.7, u'Iris-virginica'),\n",
+ " (28, 7.3, 2.9, 6.3, 1.8, u'Iris-virginica'),\n",
+ " (29, 6.7, 2.5, 5.8, 1.8, u'Iris-virginica'),\n",
+ " (30, 7.2, 3.6, 6.1, 2.5, u'Iris-virginica')]"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris;\n",
+ "\n",
+ "CREATE TABLE iris (id INT, sepal_length FLOAT, sepal_width FLOAT,\n",
+ " petal_length FLOAT, petal_width FLOAT, \n",
+ " class_name text);\n",
+ " \n",
+ "INSERT INTO iris VALUES \n",
+ "(1,5.1,3.5,1.4,0.2,'Iris-setosa'),\n",
+ "(2,4.9,3.0,1.4,0.2,'Iris-setosa'),\n",
+ "(3,4.7,3.2,1.3,0.2,'Iris-setosa'),\n",
+ "(4,4.6,3.1,1.5,0.2,'Iris-setosa'),\n",
+ "(5,5.0,3.6,1.4,0.2,'Iris-setosa'),\n",
+ "(6,5.4,3.9,1.7,0.4,'Iris-setosa'),\n",
+ "(7,4.6,3.4,1.4,0.3,'Iris-setosa'),\n",
+ "(8,5.0,3.4,1.5,0.2,'Iris-setosa'),\n",
+ "(9,4.4,2.9,1.4,0.2,'Iris-setosa'),\n",
+ "(10,4.9,3.1,1.5,0.1,'Iris-setosa'),\n",
+ "(11,7.0,3.2,4.7,1.4,'Iris-versicolor'),\n",
+ "(12,6.4,3.2,4.5,1.5,'Iris-versicolor'),\n",
+ "(13,6.9,3.1,4.9,1.5,'Iris-versicolor'),\n",
+ "(14,5.5,2.3,4.0,1.3,'Iris-versicolor'),\n",
+ "(15,6.5,2.8,4.6,1.5,'Iris-versicolor'),\n",
+ "(16,5.7,2.8,4.5,1.3,'Iris-versicolor'),\n",
+ "(17,6.3,3.3,4.7,1.6,'Iris-versicolor'),\n",
+ "(18,4.9,2.4,3.3,1.0,'Iris-versicolor'),\n",
+ "(19,6.6,2.9,4.6,1.3,'Iris-versicolor'),\n",
+ "(20,5.2,2.7,3.9,1.4,'Iris-versicolor'),\n",
+ "(21,6.3,3.3,6.0,2.5,'Iris-virginica'),\n",
+ "(22,5.8,2.7,5.1,1.9,'Iris-virginica'),\n",
+ "(23,7.1,3.0,5.9,2.1,'Iris-virginica'),\n",
+ "(24,6.3,2.9,5.6,1.8,'Iris-virginica'),\n",
+ "(25,6.5,3.0,5.8,2.2,'Iris-virginica'),\n",
+ "(26,7.6,3.0,6.6,2.1,'Iris-virginica'),\n",
+ "(27,4.9,2.5,4.5,1.7,'Iris-virginica'),\n",
+ "(28,7.3,2.9,6.3,1.8,'Iris-virginica'),\n",
+ "(29,6.7,2.5,5.8,1.8,'Iris-virginica'),\n",
+ "(30,7.2,3.6,6.1,2.5,'Iris-virginica');\n",
+ "\n",
+ "SELECT * FROM iris ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. Get summary data \n",
+ "Basic summary data using all defaults."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>output_table</th>\n",
+ " <th>row_count</th>\n",
+ " <th>duration</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>iris_summary</td>\n",
+ " <td>6</td>\n",
+ " <td>0.0161118507385</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'iris_summary', 6, 0.0161118507385254)]"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris_summary;\n",
+ "\n",
+ "SELECT * FROM madlib.summary( 'iris', -- Source table\n",
+ " 'iris_summary' -- Output table\n",
+ " );"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>group_by</th>\n",
+ " <th>group_by_value</th>\n",
+ " <th>target_column</th>\n",
+ " <th>column_number</th>\n",
+ " <th>data_type</th>\n",
+ " <th>row_count</th>\n",
+ " <th>distinct_values</th>\n",
+ " <th>missing_values</th>\n",
+ " <th>blank_values</th>\n",
+ " <th>fraction_missing</th>\n",
+ " <th>fraction_blank</th>\n",
+ " <th>mean</th>\n",
+ " <th>variance</th>\n",
+ " <th>min</th>\n",
+ " <th>max</th>\n",
+ " <th>quantile_array</th>\n",
+ " <th>most_frequent_values</th>\n",
+ " <th>mfv_frequencies</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>sepal_length</td>\n",
+ " <td>2</td>\n",
+ " <td>float8</td>\n",
+ " <td>30</td>\n",
+ " <td>22</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>5.84333333333</td>\n",
+ " <td>0.929436781609</td>\n",
+ " <td>4.4</td>\n",
+ " <td>7.6</td>\n",
+ " <td>[5.057, 6.414]</td>\n",
+ " <td>[u'4.9', u'6.3', u'5']</td>\n",
+ " <td>[4L, 3L, 2L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>sepal_width</td>\n",
+ " <td>3</td>\n",
+ " <td>float8</td>\n",
+ " <td>30</td>\n",
+ " <td>14</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>3.04</td>\n",
+ " <td>0.139034482759</td>\n",
+ " <td>2.3</td>\n",
+ " <td>3.9</td>\n",
+ " <td>[2.9, 3.2]</td>\n",
+ " <td>[u'3', u'2.9', u'3.2']</td>\n",
+ " <td>[4L, 4L, 3L]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(None, None, u'sepal_length', 2, u'float8', 30L, 22L, 0L, None, 0.0, None, 5.84333333333333, 0.9294367816092, 4.4, 7.6, [5.057, 6.414], [u'4.9', u'6.3', u'5'], [4L, 3L, 2L]),\n",
+ " (None, None, u'sepal_width', 3, u'float8', 30L, 14L, 0L, None, 0.0, None, 3.04, 0.13903448275862, 2.3, 3.9, [2.9, 3.2], [u'3', u'2.9', u'3.2'], [4L, 4L, 3L])]"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * FROM iris_summary;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 4. Grouping"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "8 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>group_by</th>\n",
+ " <th>group_by_value</th>\n",
+ " <th>target_column</th>\n",
+ " <th>column_number</th>\n",
+ " <th>data_type</th>\n",
+ " <th>row_count</th>\n",
+ " <th>distinct_values</th>\n",
+ " <th>missing_values</th>\n",
+ " <th>blank_values</th>\n",
+ " <th>fraction_missing</th>\n",
+ " <th>fraction_blank</th>\n",
+ " <th>mean</th>\n",
+ " <th>variance</th>\n",
+ " <th>min</th>\n",
+ " <th>max</th>\n",
+ " <th>first_quartile</th>\n",
+ " <th>median</th>\n",
+ " <th>third_quartile</th>\n",
+ " <th>most_frequent_values</th>\n",
+ " <th>mfv_frequencies</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>class_name</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " <td>sepal_length</td>\n",
+ " <td>2</td>\n",
+ " <td>float8</td>\n",
+ " <td>10</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>4.86</td>\n",
+ " <td>0.0848888888889</td>\n",
+ " <td>4.4</td>\n",
+ " <td>5.4</td>\n",
+ " <td>4.625</td>\n",
+ " <td>4.9</td>\n",
+ " <td>5.0</td>\n",
+ " <td>[u'4.6', u'4.9', u'5', u'5.1', u'4.4', u'5.4', u'4.7']</td>\n",
+ " <td>[2L, 2L, 2L, 1L, 1L, 1L, 1L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>class_name</td>\n",
+ " <td>Iris-setosa</td>\n",
+ " <td>sepal_width</td>\n",
+ " <td>3</td>\n",
+ " <td>float8</td>\n",
+ " <td>10</td>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>3.31</td>\n",
+ " <td>0.0943333333333</td>\n",
+ " <td>2.9</td>\n",
+ " <td>3.9</td>\n",
+ " <td>3.1</td>\n",
+ " <td>3.3</td>\n",
+ " <td>3.475</td>\n",
+ " <td>[u'3.4', u'3.1', u'3.2', u'3.6', u'3.9', u'3.5', u'2.9', u'3']</td>\n",
+ " <td>[2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>class_name</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " <td>sepal_length</td>\n",
+ " <td>2</td>\n",
+ " <td>float8</td>\n",
+ " <td>10</td>\n",
+ " <td>10</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>6.1</td>\n",
+ " <td>0.528888888889</td>\n",
+ " <td>4.9</td>\n",
+ " <td>7.0</td>\n",
+ " <td>5.55</td>\n",
+ " <td>6.35</td>\n",
+ " <td>6.575</td>\n",
+ " <td>[u'7', u'6.4', u'6.9', u'5.5', u'6.5', u'5.7', u'6.3', u'4.9', u'6.6', u'5.2']</td>\n",
+ " <td>[1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>class_name</td>\n",
+ " <td>Iris-versicolor</td>\n",
+ " <td>sepal_width</td>\n",
+ " <td>3</td>\n",
+ " <td>float8</td>\n",
+ " <td>10</td>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>2.87</td>\n",
+ " <td>0.115666666667</td>\n",
+ " <td>2.3</td>\n",
+ " <td>3.3</td>\n",
+ " <td>2.725</td>\n",
+ " <td>2.85</td>\n",
+ " <td>3.175</td>\n",
+ " <td>[u'2.8', u'3.2', u'2.3', u'3.3', u'2.4', u'2.9', u'2.7', u'3.1']</td>\n",
+ " <td>[2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>class_name</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " <td>sepal_length</td>\n",
+ " <td>2</td>\n",
+ " <td>float8</td>\n",
+ " <td>10</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>6.57</td>\n",
+ " <td>0.646777777778</td>\n",
+ " <td>4.9</td>\n",
+ " <td>7.6</td>\n",
+ " <td>6.3</td>\n",
+ " <td>6.6</td>\n",
+ " <td>7.175</td>\n",
+ " <td>[u'6.3', u'5.8', u'7.1', u'6.5', u'7.6', u'4.9', u'7.3', u'6.7', u'7.2']</td>\n",
+ " <td>[2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>class_name</td>\n",
+ " <td>Iris-virginica</td>\n",
+ " <td>sepal_width</td>\n",
+ " <td>3</td>\n",
+ " <td>float8</td>\n",
+ " <td>10</td>\n",
+ " <td>6</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>2.94</td>\n",
+ " <td>0.113777777778</td>\n",
+ " <td>2.5</td>\n",
+ " <td>3.6</td>\n",
+ " <td>2.75</td>\n",
+ " <td>2.95</td>\n",
+ " <td>3.0</td>\n",
+ " <td>[u'3', u'2.9', u'2.5', u'3.3', u'2.7', u'3.6']</td>\n",
+ " <td>[3L, 2L, 2L, 1L, 1L, 1L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>sepal_length</td>\n",
+ " <td>2</td>\n",
+ " <td>float8</td>\n",
+ " <td>30</td>\n",
+ " <td>22</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>5.84333333333</td>\n",
+ " <td>0.929436781609</td>\n",
+ " <td>4.4</td>\n",
+ " <td>7.6</td>\n",
+ " <td>4.925</td>\n",
+ " <td>5.75</td>\n",
+ " <td>6.575</td>\n",
+ " <td>[u'4.9', u'6.3', u'6.5', u'4.6', u'5', u'6.9', u'5.4', u'4.4', u'7', u'6.4']</td>\n",
+ " <td>[4L, 3L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>sepal_width</td>\n",
+ " <td>3</td>\n",
+ " <td>float8</td>\n",
+ " <td>30</td>\n",
+ " <td>14</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>3.04</td>\n",
+ " <td>0.139034482759</td>\n",
+ " <td>2.3</td>\n",
+ " <td>3.9</td>\n",
+ " <td>2.825</td>\n",
+ " <td>3.0</td>\n",
+ " <td>3.275</td>\n",
+ " <td>[u'2.9', u'3', u'3.1', u'3.2', u'3.6', u'3.3', u'2.8', u'2.7', u'2.5', u'3.4']</td>\n",
+ " <td>[4L, 4L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'class_name', u'Iris-setosa', u'sepal_length', 2, u'float8', 10L, 7L, 0L, None, 0.0, None, 4.86, 0.0848888888888976, 4.4, 5.4, 4.625, 4.9, 5.0, [u'4.6', u'4.9', u'5', u'5.1', u'4.4', u'5.4', u'4.7'], [2L, 2L, 2L, 1L, 1L, 1L, 1L]),\n",
+ " (u'class_name', u'Iris-setosa', u'sepal_width', 3, u'float8', 10L, 8L, 0L, None, 0.0, None, 3.31, 0.094333333333336, 2.9, 3.9, 3.1, 3.3, 3.475, [u'3.4', u'3.1', u'3.2', u'3.6', u'3.9', u'3.5', u'2.9', u'3'], [2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L]),\n",
+ " (u'class_name', u'Iris-versicolor', u'sepal_length', 2, u'float8', 10L, 10L, 0L, None, 0.0, None, 6.1, 0.528888888888893, 4.9, 7.0, 5.55, 6.35, 6.575, [u'7', u'6.4', u'6.9', u'5.5', u'6.5', u'5.7', u'6.3', u'4.9', u'6.6', u'5.2'], [1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L]),\n",
+ " (u'class_name', u'Iris-versicolor', u'sepal_width', 3, u'float8', 10L, 8L, 0L, None, 0.0, None, 2.87, 0.115666666666666, 2.3, 3.3, 2.725, 2.85, 3.175, [u'2.8', u'3.2', u'2.3', u'3.3', u'2.4', u'2.9', u'2.7', u'3.1'], [2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L]),\n",
+ " (u'class_name', u'Iris-virginica', u'sepal_length', 2, u'float8', 10L, 9L, 0L, None, 0.0, None, 6.57, 0.646777777777778, 4.9, 7.6, 6.3, 6.6, 7.175, [u'6.3', u'5.8', u'7.1', u'6.5', u'7.6', u'4.9', u'7.3', u'6.7', u'7.2'], [2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L]),\n",
+ " (u'class_name', u'Iris-virginica', u'sepal_width', 3, u'float8', 10L, 6L, 0L, None, 0.0, None, 2.94, 0.11377777777778, 2.5, 3.6, 2.75, 2.95, 3.0, [u'3', u'2.9', u'2.5', u'3.3', u'2.7', u'3.6'], [3L, 2L, 2L, 1L, 1L, 1L]),\n",
+ " (None, None, u'sepal_length', 2, u'float8', 30L, 22L, 0L, None, 0.0, None, 5.84333333333333, 0.9294367816092, 4.4, 7.6, 4.925, 5.75, 6.575, [u'4.9', u'6.3', u'6.5', u'4.6', u'5', u'6.9', u'5.4', u'4.4', u'7', u'6.4'], [4L, 3L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L]),\n",
+ " (None, None, u'sepal_width', 3, u'float8', 30L, 14L, 0L, None, 0.0, None, 3.04, 0.13903448275862, 2.3, 3.9, 2.825, 3.0, 3.275, [u'2.9', u'3', u'3.1', u'3.2', u'3.6', u'3.3', u'2.8', u'2.7', u'2.5', u'3.4'], [4L, 4L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L])]"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris_summary;\n",
+ "\n",
+ "SELECT * FROM madlib.summary( 'iris', -- Source table\n",
+ " 'iris_summary', -- Output table\n",
+ " 'sepal_length, sepal_width', -- Columns to summarize\n",
+ " 'class_name' -- Grouping column\n",
+ " );\n",
+ "\n",
+ "SELECT * FROM iris_summary;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 5. Other parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>group_by</th>\n",
+ " <th>group_by_value</th>\n",
+ " <th>target_column</th>\n",
+ " <th>column_number</th>\n",
+ " <th>data_type</th>\n",
+ " <th>row_count</th>\n",
+ " <th>distinct_values</th>\n",
+ " <th>missing_values</th>\n",
+ " <th>blank_values</th>\n",
+ " <th>fraction_missing</th>\n",
+ " <th>fraction_blank</th>\n",
+ " <th>mean</th>\n",
+ " <th>variance</th>\n",
+ " <th>min</th>\n",
+ " <th>max</th>\n",
+ " <th>quantile_array</th>\n",
+ " <th>most_frequent_values</th>\n",
+ " <th>mfv_frequencies</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>sepal_length</td>\n",
+ " <td>2</td>\n",
+ " <td>float8</td>\n",
+ " <td>30</td>\n",
+ " <td>22</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>5.84333333333</td>\n",
+ " <td>0.929436781609</td>\n",
+ " <td>4.4</td>\n",
+ " <td>7.6</td>\n",
+ " <td>[5.057, 6.414]</td>\n",
+ " <td>[u'4.9', u'6.3', u'5']</td>\n",
+ " <td>[4L, 3L, 2L]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>None</td>\n",
+ " <td>None</td>\n",
+ " <td>sepal_width</td>\n",
+ " <td>3</td>\n",
+ " <td>float8</td>\n",
+ " <td>30</td>\n",
+ " <td>14</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " <td>0.0</td>\n",
+ " <td>None</td>\n",
+ " <td>3.04</td>\n",
+ " <td>0.139034482759</td>\n",
+ " <td>2.3</td>\n",
+ " <td>3.9</td>\n",
+ " <td>[2.9, 3.2]</td>\n",
+ " <td>[u'3', u'2.9', u'3.2']</td>\n",
+ " <td>[4L, 4L, 3L]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(None, None, u'sepal_length', 2, u'float8', 30L, 22L, 0L, None, 0.0, None, 5.84333333333333, 0.9294367816092, 4.4, 7.6, [5.057, 6.414], [u'4.9', u'6.3', u'5'], [4L, 3L, 2L]),\n",
+ " (None, None, u'sepal_width', 3, u'float8', 30L, 14L, 0L, None, 0.0, None, 3.04, 0.13903448275862, 2.3, 3.9, [2.9, 3.2], [u'3', u'2.9', u'3.2'], [4L, 4L, 3L])]"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris_summary;\n",
+ "\n",
+ "SELECT * FROM madlib.summary( 'iris', -- Source table\n",
+ " 'iris_summary', -- Output table\n",
+ " 'sepal_length, sepal_width', -- Columns to summarize\n",
+ " NULL, -- No grouping\n",
+ " TRUE, -- Get distinct values\n",
+ " FALSE, -- Dont get quartiles\n",
+ " ARRAY[0.33, 0.66], -- Get ntiles\n",
+ " 3, -- Number of MFV to compute\n",
+ " FALSE -- Get exact values\n",
+ " );\n",
+ "\n",
+ "SELECT * FROM iris_summary;"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/kNN-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/kNN-v1.ipynb b/community-artifacts/kNN-v1.ipynb
deleted file mode 100644
index cca3251..0000000
--- a/community-artifacts/kNN-v1.ipynb
+++ /dev/null
@@ -1,497 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# k-Nearest Neighbors (MADlib v1.10+)\n",
- "Finds k nearest data points to a given data point and outputs majority vote value of output classes in case of classification, and average value of target values in case of regression."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The sql extension is already loaded. To reload it, use:\n",
- " %reload_ext sql\n"
- ]
- }
- ],
- "source": [
- "%load_ext sql"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "u'Connected: gpdbchina@madlib'"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n",
- "#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
- "#%sql postgresql://gpadmin@54.197.30.46:10432/gpadmin"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>version</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>MADlib version: 1.10.0-dev, git revision: rel/v1.9.1-47-g2d5a5ed, cmake configuration time: Tue Feb 7 19:45:19 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(u'MADlib version: 1.10.0-dev, git revision: rel/v1.9.1-47-g2d5a5ed, cmake configuration time: Tue Feb 7 19:45:19 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%sql select madlib.version();\n",
- "#%sql select version();"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 1. Misc testing"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done.\n",
- "Done.\n",
- "9 rows affected.\n",
- "9 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>id</th>\n",
- " <th>data</th>\n",
- " <th>label</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>1</td>\n",
- " <td>[1, 1]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>[2, 2]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>3</td>\n",
- " <td>[3, 3]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>4</td>\n",
- " <td>[4, 4]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>5</td>\n",
- " <td>[4, 5]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>6</td>\n",
- " <td>[20, 50]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>7</td>\n",
- " <td>[10, 31]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>8</td>\n",
- " <td>[81, 13]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>9</td>\n",
- " <td>[1, 111]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(1, [1, 1], 1.0),\n",
- " (2, [2, 2], 1.0),\n",
- " (3, [3, 3], 1.0),\n",
- " (4, [4, 4], 1.0),\n",
- " (5, [4, 5], 1.0),\n",
- " (6, [20, 50], 0.0),\n",
- " (7, [10, 31], 0.0),\n",
- " (8, [81, 13], 0.0),\n",
- " (9, [1, 111], 0.0)]"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%%sql \n",
- "DROP TABLE IF EXISTS knn_train_data;\n",
- "CREATE TABLE knn_train_data (\n",
- " id integer, \n",
- " data integer[], \n",
- " label float\n",
- " );\n",
- "\n",
- "INSERT INTO knn_train_data VALUES\n",
- "(1, '{1,1}', 1.0),\n",
- "(2, '{2,2}', 1.0),\n",
- "(3, '{3,3}', 1.0),\n",
- "(4, '{4,4}', 1.0),\n",
- "(5, '{4,5}', 1.0),\n",
- "(6, '{20,50}', 0.0),\n",
- "(7, '{10,31}', 0.0),\n",
- "(8, '{81,13}', 0.0),\n",
- "(9, '{1,111}', 0.0);\n",
- "\n",
- "SELECT * from knn_train_data ORDER BY id;"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done.\n",
- "Done.\n",
- "6 rows affected.\n",
- "6 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>id</th>\n",
- " <th>data</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>1</td>\n",
- " <td>[2, 1]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>[2, 6]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>3</td>\n",
- " <td>[15, 40]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>4</td>\n",
- " <td>[12, 1]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>5</td>\n",
- " <td>[2, 90]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>6</td>\n",
- " <td>[50, 45]</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(1, [2, 1]),\n",
- " (2, [2, 6]),\n",
- " (3, [15, 40]),\n",
- " (4, [12, 1]),\n",
- " (5, [2, 90]),\n",
- " (6, [50, 45])]"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%%sql \n",
- "DROP TABLE IF EXISTS knn_test_data;\n",
- "CREATE TABLE knn_test_data (\n",
- " id integer, \n",
- " data integer[]\n",
- " );\n",
- "\n",
- "INSERT INTO knn_test_data VALUES\n",
- "(1, '{2,1}'),\n",
- "(2, '{2,6}'),\n",
- "(3, '{15,40}'),\n",
- "(4, '{12,1}'),\n",
- "(5, '{2,90}'),\n",
- "(6, '{50,45}');\n",
- "\n",
- "SELECT * from knn_test_data ORDER BY id;"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done.\n",
- "1 rows affected.\n",
- "6 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>id</th>\n",
- " <th>data</th>\n",
- " <th>prediction</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>1</td>\n",
- " <td>[2, 1]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>[2, 6]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>3</td>\n",
- " <td>[15, 40]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>4</td>\n",
- " <td>[12, 1]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>5</td>\n",
- " <td>[2, 90]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>6</td>\n",
- " <td>[50, 45]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(1, [2, 1], 1.0),\n",
- " (2, [2, 6], 1.0),\n",
- " (3, [15, 40], 0.0),\n",
- " (4, [12, 1], 1.0),\n",
- " (5, [2, 90], 0.0),\n",
- " (6, [50, 45], 0.0)]"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%%sql\n",
- "DROP TABLE IF EXISTS madlib_knn_result_classification;\n",
- "SELECT * FROM madlib.knn( \n",
- " 'knn_train_data', -- Table of training data\n",
- " 'data', -- Col name of training data\n",
- " 'label', -- Training labels\n",
- " 'knn_test_data', -- Table of test data\n",
- " 'data', -- Col name of test data\n",
- " 'id', -- Col name of id in test data \n",
- " 'madlib_knn_result_classification', -- Output table\n",
- " 'c', -- Classification\n",
- " 3 -- Number of nearest neighbours\n",
- " );\n",
- "SELECT * from madlib_knn_result_classification ORDER BY id;"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done.\n",
- "1 rows affected.\n",
- "6 rows affected.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<table>\n",
- " <tr>\n",
- " <th>id</th>\n",
- " <th>data</th>\n",
- " <th>prediction</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>1</td>\n",
- " <td>[2, 1]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>2</td>\n",
- " <td>[2, 6]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>3</td>\n",
- " <td>[15, 40]</td>\n",
- " <td>0.333333333333</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>4</td>\n",
- " <td>[12, 1]</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>5</td>\n",
- " <td>[2, 90]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>6</td>\n",
- " <td>[50, 45]</td>\n",
- " <td>0.0</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "text/plain": [
- "[(1, [2, 1], 1.0),\n",
- " (2, [2, 6], 1.0),\n",
- " (3, [15, 40], 0.333333333333333),\n",
- " (4, [12, 1], 1.0),\n",
- " (5, [2, 90], 0.0),\n",
- " (6, [50, 45], 0.0)]"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%%sql\n",
- "DROP TABLE IF EXISTS madlib_knn_result_regression;\n",
- "SELECT * FROM madlib.knn( \n",
- " 'knn_train_data', -- Table of training data\n",
- " 'data', -- Col name of training data\n",
- " 'label', -- Training labels\n",
- " 'knn_test_data', -- Table of test data\n",
- " 'data', -- Col name of test data\n",
- " 'id', -- Col name of id in test data \n",
- " 'madlib_knn_result_regression', -- Output table\n",
- " 'r', -- Regressions\n",
- " 3 -- Number of nearest neighbours\n",
- " );\n",
- "SELECT * from madlib_knn_result_regression ORDER BY id;"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}