You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/02/08 23:11:18 UTC
incubator-madlib git commit: Adds k-means doc example for array input
Updates KNN and cat vars docs
Repository: incubator-madlib
Updated Branches:
refs/heads/master 2d5a5edb9 -> 8e7c6ebfe
Adds k-means doc example for array input
Updates KNN and cat vars docs
Closes #99
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/8e7c6ebf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/8e7c6ebf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/8e7c6ebf
Branch: refs/heads/master
Commit: 8e7c6ebfeefede6f64c7f449f8f2f1a75a8dbe73
Parents: 2d5a5ed
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Wed Feb 8 15:08:05 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Feb 8 15:10:29 2017 -0800
----------------------------------------------------------------------
src/ports/postgres/modules/kmeans/kmeans.sql_in | 154 +++++++++----
src/ports/postgres/modules/knn/knn.sql_in | 190 ++++++++--------
.../modules/utilities/encode_categorical.sql_in | 228 +++++++++----------
3 files changed, 324 insertions(+), 248 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/kmeans/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index 16014ef..f689dd6 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -218,43 +218,45 @@ closest_column( m, x )
@anchor examples
@examp
--# Prepare some input data.
+Note: Your results may not be exactly the same as below due to the nature of the
+k-means algorithm.
+
+-# Prepare some input data:
<pre class="example">
-CREATE TABLE public.km_sample(pid int, points double precision[]);
-COPY km_sample (pid, points) FROM stdin DELIMITER '|';
-1 | {14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}
-2 | {13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}
-3 | {13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}
-4 | {14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}
-5 | {13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}
-6 | {14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}
-7 | {14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}
-8 | {14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}
-9 | {14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}
-10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045}
-\\.
+DROP TABLE IF EXISTS km_sample;
+CREATE TABLE km_sample(pid int, points double precision[]);
+INSERT INTO km_sample VALUES
+(1, '{14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}'),
+(2, '{13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}'),
+(3, '{13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}'),
+(4, '{14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}'),
+(5, '{13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}'),
+(6, '{14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}'),
+(7, '{14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}'),
+(8, '{14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}'),
+(9, '{14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}'),
+(10, '{13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045}');
</pre>
-
-# Run k-means clustering using kmeans++ for centroid seeding:
<pre class="example">
\\x on;
-SELECT * FROM madlib.kmeanspp( 'km_sample',
- 'points',
- 2,
- 'madlib.squared_dist_norm2',
- 'madlib.avg',
- 20,
- 0.001
+SELECT * FROM madlib.kmeanspp( 'km_sample', -- Table of source data
+ 'points', -- Column containing point co-ordinates
+ 2, -- Number of centroids to calculate
+ 'madlib.squared_dist_norm2', -- Distance function
+ 'madlib.avg', -- Aggregate function
+ 20, -- Number of iterations
+ 0.001 -- Fraction of centroids reassigned to keep iterating
);
</pre>
Result:
<pre class="result">
-centroids | {{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988},
- {14.036,2.018,2.536,16.56,108.6,3.004,3.03,0.298,2.038,6.10598,1.004,3.326,1340}}
-cluster_variance | {90512.324426408,60672.638245208}
-objective_fn | 151184.962672
+centroids | {{13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667, 3.31833333333333,1020.83333333333},
+ {14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75}}
+cluster_variance | {122999.110416013,30561.74805}
+objective_fn | 153560.858466013
frac_reassigned | 0
-num_iterations | 2
+num_iterations | 3
</pre>
-# Calculate the simplified silhouette coefficient:
<pre class="example">
@@ -273,33 +275,103 @@ SELECT * FROM madlib.simple_silhouette( 'km_sample',
</pre>
Result:
<pre class="result">
-simple_silhouette | 0.68978804882941
+simple_silhouette | 0.686314347664694
</pre>
--# Find the cluster assignment for each point
+-# Find the cluster assignment for each point:
<pre class="example">
\\x off;
-SELECT data.*, (madlib.closest_column(centroids, points)).column_id as cluster_id
-FROM public.km_sample as data,
- (SELECT centroids
- FROM madlib.kmeanspp('km_sample', 'points', 2,
+DROP TABLE IF EXISTS km_result;
+-- Run kmeans algorithm
+CREATE TABLE km_result AS
+SELECT * FROM madlib.kmeanspp('km_sample', 'points', 2,
'madlib.squared_dist_norm2',
- 'madlib.avg', 20, 0.001)) as centroids
+ 'madlib.avg', 20, 0.001);
+-- Get point assignment
+SELECT data.*, (madlib.closest_column(centroids, points)).column_id as cluster_id
+FROM km_sample as data, km_result
ORDER BY data.pid;
</pre>
<pre class="result">
- pid | points | cluster_id
+ pid | points | cluster_id
-----+--------------------------------------------------------------------+------------
1 | {14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065} | 0
2 | {13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050} | 0
- 3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} | 1
- 4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480} | 1
- 5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735} | 0
- 6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450} | 1
- 7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290} | 1
- 8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295} | 1
+ 3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} | 0
+ 4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480} | 0
+ 5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735} | 1
+ 6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450} | 0
+ 7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290} | 0
+ 8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295} | 0
9 | {14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045} | 0
10 | {13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.2199,1.01,3.55,1045} | 0
+(10 rows)
+</pre>
+
+-# Run the same example as above, but using array input. Create the input table:
+<pre class="example">
+DROP TABLE IF EXISTS km_arrayin CASCADE;
+CREATE TABLE km_arrayin(pid int,
+ p1 float,
+ p2 float,
+ p3 float,
+ p4 float,
+ p5 float,
+ p6 float,
+ p7 float,
+ p8 float,
+ p9 float,
+ p10 float,
+ p11 float,
+ p12 float,
+ p13 float);
+INSERT INTO km_arrayin VALUES
+(1, 14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065),
+(2, 13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050),
+(3, 13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185),
+(4, 14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480),
+(5, 13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735),
+(6, 14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450),
+(7, 14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290),
+(8, 14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295),
+(9, 14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045),
+(10, 13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045);
+</pre>
+Now find the cluster assignment for each point:
+<pre class="example">
+DROP TABLE IF EXISTS km_result;
+-- Run kmeans algorithm
+CREATE TABLE km_result AS
+SELECT * FROM madlib.kmeans_random('km_arrayin',
+ 'ARRAY[p1, p2, p3, p4, p5, p6,
+ p7, p8, p9, p10, p11, p12, p13]',
+ 2,
+ 'madlib.squared_dist_norm2',
+ 'madlib.avg',
+ 20,
+ 0.001);
+-- Get point assignment
+SELECT data.*, (madlib.closest_column(centroids,
+ ARRAY[p1, p2, p3, p4, p5, p6,
+ p7, p8, p9, p10, p11, p12, p13])).column_id as cluster_id
+FROM km_arrayin as data, km_result
+ORDER BY data.pid;
+</pre>
+This produces the result in column format:
+<pre class="result">
+ pid | p1 | p2 | p3 | p4 | p5 | p6 | p7 | p8 | p9 | p10 | p11 | p12 | p13 | cluster_id
+-----+-------+------+------+------+-----+------+------+------+------+--------+------+------+------+------------
+ 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.8 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 0
+ 2 | 13.2 | 1.78 | 2.14 | 11.2 | 1 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.49 | 1050 | 0
+ 3 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.8 | 3.24 | 0.3 | 2.81 | 5.6799 | 1.03 | 3.17 | 1185 | 0
+ 4 | 14.37 | 1.95 | 2.5 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.8 | 0.86 | 3.45 | 1480 | 1
+ 5 | 13.24 | 2.59 | 2.87 | 21 | 118 | 2.8 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 0
+ 6 | 14.2 | 1.76 | 2.45 | 15.2 | 112 | 3.27 | 3.39 | 0.34 | 1.97 | 6.75 | 1.05 | 2.85 | 1450 | 1
+ 7 | 14.39 | 1.87 | 2.45 | 14.6 | 96 | 2.5 | 2.52 | 0.3 | 1.98 | 5.25 | 1.02 | 3.58 | 1290 | 1
+ 8 | 14.06 | 2.15 | 2.61 | 17.6 | 121 | 2.6 | 2.51 | 0.31 | 1.25 | 5.05 | 1.06 | 3.58 | 1295 | 1
+ 9 | 14.83 | 1.64 | 2.17 | 14 | 97 | 2.8 | 2.98 | 0.29 | 1.98 | 5.2 | 1.08 | 2.85 | 1045 | 0
+ 10 | 13.86 | 1.35 | 2.27 | 16 | 98 | 2.98 | 3.15 | 0.22 | 1.85 | 7.2199 | 1.01 | 3.55 | 1045 | 0
+(10 rows)
</pre>
@anchor notes
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index 526c8dd..d3c1929 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -44,29 +44,27 @@ m4_include(`SQLCommon.m4')
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
-<li class="level1"><a href="#related">Related Topics</a></li>
</ul>
</div>
-@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes in case of classification and average value of target values for regression task.
+@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes for classification, and average value of target values for regression.
\warning <em> This MADlib method is still in early stage development. There may be some
issues that will be addressed in a future version. Interface and implementation
-is subject to change. </em>
+are subject to change. </em>
@anchor knn
-k-Nearest Neighbors is a method for finding k closest points to a
-given data point in terms of a given metric. Its input consist of
-data points as features from testing examples. For a given k, it
-looks for k closest points in training set for each of the data
-points in test set. Algorithm generates one output per testing example.
-The output of KNN depends on the type of task:
-For Classification, the output is majority vote of the classes of
-the k nearest data points. The testing example gets assigned the
-most popular class among nearest neighbors.
-For Regression, the output is average of the values of k nearest
-neighbors of the given testing example.
+K-nearest neighbors is a method for finding the k closest points to a
+given data point in terms of a given metric. Its input consists of
+data points as features from testing examples, and it
+looks for k closest points in the training set for each of the data
+points in the test set. The output of KNN depends on the type of task.
+For classification, the output is the majority vote of the classes of
+the k nearest data points. That is, the testing example gets assigned the
+most popular class from the nearest neighbors.
+For regression, the output is the average of the values of k nearest
+neighbors of the given test point.
@anchor usage
@par Usage
@@ -86,27 +84,27 @@ knn( point_source,
\b Arguments
<dl class="arglist">
<dt>point_source</dt>
-<dd>TEXT. The name of the table containing the training data points.
+<dd>TEXT. Name of the table containing the training data points.
-Training data points are expected to be stored row-wise,
+Training data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
<dt>point_column_name</dt>
-<dd>TEXT. The name of the column with training data points.</dd>
+<dd>TEXT. Name of the column with training data points.</dd>
<dt>label_column_name</dt>
-<dd>TEXT. The name of the column with labels/values of training data points.</dd>
+<dd>TEXT. Name of the column with labels/values of training data points.</dd>
<dt>test_source</dt>
-<dd>TEXT. The name of the table containing the test data points.
+<dd>TEXT. Name of the table containing the test data points.
-Testing data points are expected to be stored row-wise,
+Testing data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
<dt>test_column_name</dt>
-<dd>TEXT. The name of the column with testing data points.</dd>
+<dd>TEXT. Name of the column with testing data points.</dd>
<dt>id_column_name</dt>
<dd>TEXT. Name of the column having ids of data points in test data table.</dd>
@@ -115,10 +113,11 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
<dd>TEXT. Name of the table to store final results.</dd>
<dt>operation</dt>
-<dd>TEXT. the type of task; r for regression and c for classification.</dd>
+<dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd>
<dt>k (optional)</dt>
-<dd>INTEGER. default: 1. The number of nearest neighbors to consider.</dd>
+<dd>INTEGER. default: 1. Number of nearest neighbors to consider.
+For classification, should be an odd number to break ties.</dd>
</dl>
@@ -138,7 +137,7 @@ The output of the KNN module is a table with the following columns:
</tr>
<tr>
<th>prediction</th>
- <td>INTEGER. The output of KNN- label in case of classification, average value in case of regression.</td>
+ <td>INTEGER. Label in case of classification, average value in case of regression.</td>
</tr>
</table>
@@ -146,89 +145,100 @@ The output of the KNN module is a table with the following columns:
@anchor examples
@examp
--# Prepare some training data.
+-# Prepare some training data:
<pre class="example">
-CREATE TABLE knn_train_data (id integer, data integer[], label float);
-COPY knn_train_data (id, data, label) from stdin delimiter '|';
-1|{1,1}|1.0
-2|{2,2}|1.0
-3|{3,3}|1.0
-4|{4,4}|1.0
-5|{4,5}|1.0
-6|{20,50}|0.0
-7|{10,31}|0.0
-8|{81,13}|0.0
-9|{1,111}|0.0
-\\.
+DROP TABLE IF EXISTS knn_train_data;
+CREATE TABLE knn_train_data (
+ id integer,
+ data integer[],
+ label float
+ );
+INSERT INTO knn_train_data VALUES
+(1, '{1,1}', 1.0),
+(2, '{2,2}', 1.0),
+(3, '{3,3}', 1.0),
+(4, '{4,4}', 1.0),
+(5, '{4,5}', 1.0),
+(6, '{20,50}', 0.0),
+(7, '{10,31}', 0.0),
+(8, '{81,13}', 0.0),
+(9, '{1,111}', 0.0);
</pre>
--# Prepare some testing data.
+-# Prepare some testing data:
<pre class="example">
-CREATE TABLE knn_test_data (id integer, data integer[]);
-COPY knn_test_data (id, data) from stdin delimiter '|';
-1|{2,1}
-2|{2,6}
-3|{15,40}
-4|{12,1}
-5|{2,90}
-6|{50,45}
-\\.
+DROP TABLE IF EXISTS knn_test_data;
+CREATE TABLE knn_test_data (
+ id integer,
+ data integer[]
+ );
+INSERT INTO knn_test_data VALUES
+(1, '{2,1}'),
+(2, '{2,6}'),
+(3, '{15,40}'),
+(4, '{12,1}'),
+(5, '{2,90}'),
+(6, '{50,45}');
</pre>
-# Run KNN for classification:
<pre class="example">
-SELECT * FROM madlib.knn( 'knn_train_data',
- 'data',
- 'label',
- 'knn_test_data',
- 'data',
- 'id',
- 'madlib_knn_result_classification',
- 'c',
- 3
- );
-SELECT * from madlib_knn_result_classification;
+DROP TABLE IF EXISTS madlib_knn_result_classification;
+SELECT * FROM madlib.knn(
+ 'knn_train_data', -- Table of training data
+ 'data', -- Col name of training data
+ 'label', -- Training labels
+ 'knn_test_data', -- Table of test data
+ 'data', -- Col name of test data
+ 'id', -- Col name of id in test data
+ 'madlib_knn_result_classification', -- Output table
+ 'c', -- Classification
+ 3 -- Number of nearest neighbours
+ );
+SELECT * from madlib_knn_result_classification ORDER BY id;
</pre>
Result:
<pre class="result">
- id | data | prediction
------+----------+-----------
- 1 | {2,1} | 1
- 2 | {2,6} | 1
- 3 | {15,40} | 0
- 4 | {12,1} | 1
- 5 | {2,90} | 0
- 6 | {50,45} | 0
+ id | data | prediction
+----+---------+------------
+ 1 | {2,1} | 1
+ 2 | {2,6} | 1
+ 3 | {15,40} | 0
+ 4 | {12,1} | 1
+ 5 | {2,90} | 0
+ 6 | {50,45} | 0
+(6 rows)
</pre>
-# Run KNN for regression:
<pre class="example">
-SELECT * FROM madlib.knn( 'knn_train_data',
- 'data',
- 'label',
- 'knn_test_data',
- 'data',
- 'id',
- 'madlib_knn_result_regression',
- 'r',
- 3
- );
-SELECT * from madlib_knn_result_regression;
+DROP TABLE IF EXISTS madlib_knn_result_regression;
+SELECT * FROM madlib.knn(
+ 'knn_train_data', -- Table of training data
+ 'data', -- Col name of training data
+ 'label', -- Training labels
+ 'knn_test_data', -- Table of test data
+ 'data', -- Col name of test data
+ 'id', -- Col name of id in test data
+ 'madlib_knn_result_regression', -- Output table
+    'r',                                -- Regression
+ 3 -- Number of nearest neighbours
+ );
+SELECT * from madlib_knn_result_regression ORDER BY id;
</pre>
Result:
<pre class="result">
- id | data | prediction
------+----------+-----------
- 1 | {2,1} | 1
- 2 | {2,6} | 1
- 3 | {15,40} | 0.5
- 4 | {12,1} | 1
- 5 | {2,90} | 0.25
- 6 | {50,45} | 0.25
+ id | data | prediction
+----+---------+-------------------
+ 1 | {2,1} | 1
+ 2 | {2,6} | 1
+ 3 | {15,40} | 0.333333333333333
+ 4 | {12,1} | 1
+ 5 | {2,90} | 0
+ 6 | {50,45} | 0
+(6 rows)
</pre>
-
-
@anchor background
@par Technical Background
@@ -260,12 +270,6 @@ Other distance metrics will be added in a future release of this module.
[3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf
-
-@anchor related
-@par Related Topics
-
-File knn.sql_in documenting the knn SQL functions
-
@internal
@sa namespace knn (documenting the implementation in Python)
@endinternal
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index a14337c..d36ef2b 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -337,28 +337,28 @@ SELECT madlib.encode_categorical_variables (
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
-----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
- 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
- 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
- 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
- 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
- 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
- 11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ id | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20
+----+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
+ 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
+ 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 4 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 5 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 6 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
+ 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
+ 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
+ 12 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
+ 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
+ 17 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 19 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
(20 rows)
</pre>
@@ -379,7 +379,7 @@ SELECT madlib.encode_categorical_variables (
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__
+ id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__
----+-------+-------+-------------+----------+---------+---------+---------------
1 | 1 | 0 | 0 | 0 | 0 | 0 | 1
2 | 1 | 0 | 0 | 0 | 1 | 0 | 0
@@ -421,28 +421,28 @@ SELECT madlib.encode_categorical_variables (
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | sex | rings | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
-----+-----+-------+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
- 1 | M | 15 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
- 2 | M | 7 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 3 | F | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 4 | M | 10 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 5 | I | 7 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 6 | I | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
- 7 | F | 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
- 8 | F | 16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
- 9 | M | 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 10 | | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
- 11 | F | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 12 | M | 10 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 13 | M | 11 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 14 | F | 10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 15 | F | 10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 16 | M | 12 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 17 | I | 7 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 18 | F | 10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 19 | M | 7 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 20 | | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ id | sex | rings | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20
+----+-----+-------+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
+ 1 | M | 15 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
+ 2 | M | 7 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 3 | F | 9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 4 | M | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 5 | I | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 6 | I | 8 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 7 | F | 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ 8 | F | 16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
+ 9 | M | 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 10 | | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
+ 11 | F | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
+ 12 | M | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 13 | M | 11 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
+ 14 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 15 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 16 | M | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
+ 17 | I | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 18 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 19 | M | 7 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 20 | | 9 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
(20 rows)
</pre>
@@ -499,33 +499,33 @@ SELECT madlib.encode_categorical_variables (
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
- TRUE -- Array output
+ 'array' -- Array output type
);
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | __encoded_variables__
+ id | __encoded_variables__
----+-------------------------------
- 1 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0}
- 2 | {0,0,1,0,0,0,0,0,0,0,0,1,0,0}
- 3 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1}
- 4 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
- 5 | {0,1,0,0,0,0,0,0,0,0,0,1,0,0}
- 6 | {0,1,0,0,0,0,0,0,0,0,0,0,1,0}
- 7 | {1,0,0,0,0,0,0,0,0,0,1,0,0,0}
- 8 | {1,0,0,0,0,0,0,0,1,0,0,0,0,0}
- 9 | {0,0,1,0,0,0,0,0,0,0,0,0,0,1}
- 10 | {0,0,0,0,0,0,0,0,0,1,0,0,0,0}
- 11 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
- 12 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
- 13 | {0,0,1,0,1,0,0,0,0,0,0,0,0,0}
- 14 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 15 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 16 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0}
- 17 | {0,1,0,0,0,0,0,0,0,0,0,1,0,0}
- 18 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 19 | {0,0,1,0,0,0,0,0,0,0,0,1,0,0}
- 20 | {0,0,0,0,0,0,0,0,0,0,0,0,0,1}
+ 1 | {0,0,1,0,0,0,0,0,0,0,1,0,0,0}
+ 2 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
+ 3 | {1,0,0,0,0,1,0,0,0,0,0,0,0,0}
+ 4 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
+ 5 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
+ 6 | {0,1,0,0,1,0,0,0,0,0,0,0,0,0}
+ 7 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1}
+ 8 | {1,0,0,0,0,0,0,0,0,0,0,1,0,0}
+ 9 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0}
+ 10 | {0,0,0,0,0,0,0,0,0,0,0,0,1,0}
+ 11 | {1,0,0,0,0,0,0,0,0,1,0,0,0,0}
+ 12 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
+ 13 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0}
+ 14 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 15 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 16 | {0,0,1,0,0,0,0,0,1,0,0,0,0,0}
+ 17 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
+ 18 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 19 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
+ 20 | {0,0,0,0,0,1,0,0,0,0,0,0,0,0}
(20 rows)
</pre>
View the dictionary table that gives the index into the array:
@@ -533,22 +533,22 @@ View the dictionary table that gives the index into the array:
SELECT * FROM abalone_out_dictionary;
</pre>
<pre class="result">
- encoded_column_name | index | variable | value
+ encoded_column_name | index | variable | value
-----------------------+-------+----------+-------
__encoded_variables__ | 1 | sex | F
__encoded_variables__ | 2 | sex | I
__encoded_variables__ | 3 | sex | M
- __encoded_variables__ | 4 | rings | 10
- __encoded_variables__ | 5 | rings | 11
- __encoded_variables__ | 6 | rings | 12
- __encoded_variables__ | 7 | rings | 14
- __encoded_variables__ | 8 | rings | 15
- __encoded_variables__ | 9 | rings | 16
- __encoded_variables__ | 10 | rings | 19
- __encoded_variables__ | 11 | rings | 20
- __encoded_variables__ | 12 | rings | 7
- __encoded_variables__ | 13 | rings | 8
- __encoded_variables__ | 14 | rings | 9
+ __encoded_variables__ | 4 | rings | 7
+ __encoded_variables__ | 5 | rings | 8
+ __encoded_variables__ | 6 | rings | 9
+ __encoded_variables__ | 7 | rings | 10
+ __encoded_variables__ | 8 | rings | 11
+ __encoded_variables__ | 9 | rings | 12
+ __encoded_variables__ | 10 | rings | 14
+ __encoded_variables__ | 11 | rings | 15
+ __encoded_variables__ | 12 | rings | 16
+ __encoded_variables__ | 13 | rings | 19
+ __encoded_variables__ | 14 | rings | 20
(14 rows)
</pre>
-# Create a dictionary output:
@@ -563,34 +563,34 @@ SELECT madlib.encode_categorical_variables (
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
- FALSE, -- Array output
+ NULL, -- Output type
TRUE -- Dictionary output
);
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11
+ id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11
----+-------+-------+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------
- 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
- 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
- 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
- 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
- 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
- 11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
+ 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 4 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 5 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 6 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
+ 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
+ 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
+ 12 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
+ 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
+ 17 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 19 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
(20 rows)
</pre>
View the dictionary table that defines the numerical columns in the output table:
@@ -598,19 +598,19 @@ View the dictionary table that defines the numerical columns in the output table
SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name;
</pre>
<pre class="result">
- encoded_column_name | index | variable | value
+ encoded_column_name | index | variable | value
---------------------+-------+----------+-------
- "rings_1" | 1 | rings | 10
- "rings_10" | 10 | rings | 8
- "rings_11" | 11 | rings | 9
- "rings_2" | 2 | rings | 11
- "rings_3" | 3 | rings | 12
- "rings_4" | 4 | rings | 14
- "rings_5" | 5 | rings | 15
- "rings_6" | 6 | rings | 16
- "rings_7" | 7 | rings | 19
- "rings_8" | 8 | rings | 20
- "rings_9" | 9 | rings | 7
+ "rings_1" | 1 | rings | 7
+ "rings_10" | 10 | rings | 19
+ "rings_11" | 11 | rings | 20
+ "rings_2" | 2 | rings | 8
+ "rings_3" | 3 | rings | 9
+ "rings_4" | 4 | rings | 10
+ "rings_5" | 5 | rings | 11
+ "rings_6" | 6 | rings | 12
+ "rings_7" | 7 | rings | 14
+ "rings_8" | 8 | rings | 15
+ "rings_9" | 9 | rings | 16
"sex_1" | 1 | sex | F
"sex_2" | 2 | sex | I
"sex_3" | 3 | sex | M
@@ -629,7 +629,7 @@ SELECT madlib.encode_categorical_variables (
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
- NULL, -- Array output
+ NULL, -- Output type
NULL, -- Dictionary output
'RANDOMLY' -- Distribution policy
);