You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/02/08 23:11:18 UTC
incubator-madlib git commit: Adds k-means doc example for array input
Updates KNN and cat vars docs
Repository: incubator-madlib
Updated Branches:
refs/heads/master 2d5a5edb9 -> 8e7c6ebfe
Adds k-means doc example for array input
Updates KNN and cat vars docs
Closes #99
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/8e7c6ebf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/8e7c6ebf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/8e7c6ebf
Branch: refs/heads/master
Commit: 8e7c6ebfeefede6f64c7f449f8f2f1a75a8dbe73
Parents: 2d5a5ed
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Wed Feb 8 15:08:05 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Feb 8 15:10:29 2017 -0800
----------------------------------------------------------------------
src/ports/postgres/modules/kmeans/kmeans.sql_in | 154 +++++++++----
src/ports/postgres/modules/knn/knn.sql_in | 190 ++++++++--------
.../modules/utilities/encode_categorical.sql_in | 228 +++++++++----------
3 files changed, 324 insertions(+), 248 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/kmeans/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index 16014ef..f689dd6 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -218,43 +218,45 @@ closest_column( m, x )
@anchor examples
@examp
--# Prepare some input data.
+Note: Your results may not be exactly the same as below due to the nature of the
+k-means algorithm.
+
+-# Prepare some input data:
<pre class="example">
-CREATE TABLE public.km_sample(pid int, points double precision[]);
-COPY km_sample (pid, points) FROM stdin DELIMITER '|';
-1 | {14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}
-2 | {13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}
-3 | {13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}
-4 | {14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}
-5 | {13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}
-6 | {14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}
-7 | {14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}
-8 | {14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}
-9 | {14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}
-10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045}
-\\.
+DROP TABLE IF EXISTS km_sample;
+CREATE TABLE km_sample(pid int, points double precision[]);
+INSERT INTO km_sample VALUES
+(1, '{14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}'),
+(2, '{13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}'),
+(3, '{13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}'),
+(4, '{14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}'),
+(5, '{13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}'),
+(6, '{14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}'),
+(7, '{14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}'),
+(8, '{14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}'),
+(9, '{14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}'),
+(10, '{13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045}');
</pre>
-
-# Run k-means clustering using kmeans++ for centroid seeding:
<pre class="example">
\\x on;
-SELECT * FROM madlib.kmeanspp( 'km_sample',
- 'points',
- 2,
- 'madlib.squared_dist_norm2',
- 'madlib.avg',
- 20,
- 0.001
+SELECT * FROM madlib.kmeanspp( 'km_sample', -- Table of source data
+ 'points', -- Column containing point co-ordinates
+ 2, -- Number of centroids to calculate
+ 'madlib.squared_dist_norm2', -- Distance function
+ 'madlib.avg', -- Aggregate function
+ 20, -- Number of iterations
+ 0.001 -- Fraction of centroids reassigned to keep iterating
);
</pre>
Result:
<pre class="result">
-centroids | {{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988},
- {14.036,2.018,2.536,16.56,108.6,3.004,3.03,0.298,2.038,6.10598,1.004,3.326,1340}}
-cluster_variance | {90512.324426408,60672.638245208}
-objective_fn | 151184.962672
+centroids | {{13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667, 3.31833333333333,1020.83333333333},
+ {14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75}}
+cluster_variance | {122999.110416013,30561.74805}
+objective_fn | 153560.858466013
frac_reassigned | 0
-num_iterations | 2
+num_iterations | 3
</pre>
-# Calculate the simplified silhouette coefficient:
<pre class="example">
@@ -273,33 +275,103 @@ SELECT * FROM madlib.simple_silhouette( 'km_sample',
</pre>
Result:
<pre class="result">
-simple_silhouette | 0.68978804882941
+simple_silhouette | 0.686314347664694
</pre>
--# Find the cluster assignment for each point
+-# Find the cluster assignment for each point:
<pre class="example">
\\x off;
-SELECT data.*, (madlib.closest_column(centroids, points)).column_id as cluster_id
-FROM public.km_sample as data,
- (SELECT centroids
- FROM madlib.kmeanspp('km_sample', 'points', 2,
+DROP TABLE IF EXISTS km_result;
+-- Run kmeans algorithm
+CREATE TABLE km_result AS
+SELECT * FROM madlib.kmeanspp('km_sample', 'points', 2,
'madlib.squared_dist_norm2',
- 'madlib.avg', 20, 0.001)) as centroids
+ 'madlib.avg', 20, 0.001);
+-- Get point assignment
+SELECT data.*, (madlib.closest_column(centroids, points)).column_id as cluster_id
+FROM km_sample as data, km_result
ORDER BY data.pid;
</pre>
<pre class="result">
- pid | points | cluster_id
+ pid | points | cluster_id
-----+--------------------------------------------------------------------+------------
1 | {14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065} | 0
2 | {13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050} | 0
- 3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} | 1
- 4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480} | 1
- 5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735} | 0
- 6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450} | 1
- 7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290} | 1
- 8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295} | 1
+ 3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} | 0
+ 4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480} | 0
+ 5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735} | 1
+ 6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450} | 0
+ 7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290} | 0
+ 8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295} | 0
9 | {14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045} | 0
10 | {13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.2199,1.01,3.55,1045} | 0
+(10 rows)
+</pre>
+
+-# Run the same example as above, but using array input. Create the input table:
+<pre class="example">
+DROP TABLE IF EXISTS km_arrayin CASCADE;
+CREATE TABLE km_arrayin(pid int,
+ p1 float,
+ p2 float,
+ p3 float,
+ p4 float,
+ p5 float,
+ p6 float,
+ p7 float,
+ p8 float,
+ p9 float,
+ p10 float,
+ p11 float,
+ p12 float,
+ p13 float);
+INSERT INTO km_arrayin VALUES
+(1, 14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065),
+(2, 13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050),
+(3, 13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185),
+(4, 14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480),
+(5, 13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735),
+(6, 14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450),
+(7, 14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290),
+(8, 14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295),
+(9, 14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045),
+(10, 13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045);
+</pre>
+Now find the cluster assignment for each point:
+<pre class="example">
+DROP TABLE IF EXISTS km_result;
+-- Run kmeans algorithm
+CREATE TABLE km_result AS
+SELECT * FROM madlib.kmeans_random('km_arrayin',
+ 'ARRAY[p1, p2, p3, p4, p5, p6,
+ p7, p8, p9, p10, p11, p12, p13]',
+ 2,
+ 'madlib.squared_dist_norm2',
+ 'madlib.avg',
+ 20,
+ 0.001);
+-- Get point assignment
+SELECT data.*, (madlib.closest_column(centroids,
+ ARRAY[p1, p2, p3, p4, p5, p6,
+ p7, p8, p9, p10, p11, p12, p13])).column_id as cluster_id
+FROM km_arrayin as data, km_result
+ORDER BY data.pid;
+</pre>
+This produces the result in column format:
+<pre class="result">
+ pid | p1 | p2 | p3 | p4 | p5 | p6 | p7 | p8 | p9 | p10 | p11 | p12 | p13 | cluster_id
+-----+-------+------+------+------+-----+------+------+------+------+--------+------+------+------+------------
+ 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.8 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 0
+ 2 | 13.2 | 1.78 | 2.14 | 11.2 | 1 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.49 | 1050 | 0
+ 3 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.8 | 3.24 | 0.3 | 2.81 | 5.6799 | 1.03 | 3.17 | 1185 | 0
+ 4 | 14.37 | 1.95 | 2.5 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.8 | 0.86 | 3.45 | 1480 | 1
+ 5 | 13.24 | 2.59 | 2.87 | 21 | 118 | 2.8 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 0
+ 6 | 14.2 | 1.76 | 2.45 | 15.2 | 112 | 3.27 | 3.39 | 0.34 | 1.97 | 6.75 | 1.05 | 2.85 | 1450 | 1
+ 7 | 14.39 | 1.87 | 2.45 | 14.6 | 96 | 2.5 | 2.52 | 0.3 | 1.98 | 5.25 | 1.02 | 3.58 | 1290 | 1
+ 8 | 14.06 | 2.15 | 2.61 | 17.6 | 121 | 2.6 | 2.51 | 0.31 | 1.25 | 5.05 | 1.06 | 3.58 | 1295 | 1
+ 9 | 14.83 | 1.64 | 2.17 | 14 | 97 | 2.8 | 2.98 | 0.29 | 1.98 | 5.2 | 1.08 | 2.85 | 1045 | 0
+ 10 | 13.86 | 1.35 | 2.27 | 16 | 98 | 2.98 | 3.15 | 0.22 | 1.85 | 7.2199 | 1.01 | 3.55 | 1045 | 0
+(10 rows)
</pre>
@anchor notes
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index 526c8dd..d3c1929 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -44,29 +44,27 @@ m4_include(`SQLCommon.m4')
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
-<li class="level1"><a href="#related">Related Topics</a></li>
</ul>
</div>
-@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes in case of classification and average value of target values for regression task.
+@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes for classification, and average value of target values for regression.
\warning <em> This MADlib method is still in early stage development. There may be some
issues that will be addressed in a future version. Interface and implementation
-is subject to change. </em>
+are subject to change. </em>
@anchor knn
-k-Nearest Neighbors is a method for finding k closest points to a
-given data point in terms of a given metric. Its input consist of
-data points as features from testing examples. For a given k, it
-looks for k closest points in training set for each of the data
-points in test set. Algorithm generates one output per testing example.
-The output of KNN depends on the type of task:
-For Classification, the output is majority vote of the classes of
-the k nearest data points. The testing example gets assigned the
-most popular class among nearest neighbors.
-For Regression, the output is average of the values of k nearest
-neighbors of the given testing example.
+K-nearest neighbors is a method for finding the k closest points to a
+given data point in terms of a given metric. Its input consists of
+data points as features from testing examples, and it
+looks for k closest points in the training set for each of the data
+points in the test set. The output of KNN depends on the type of task.
+For classification, the output is the majority vote of the classes of
+the k nearest data points. That is, the testing example gets assigned the
+most popular class from the nearest neighbors.
+For regression, the output is the average of the values of k nearest
+neighbors of the given test point.
@anchor usage
@par Usage
@@ -86,27 +84,27 @@ knn( point_source,
\b Arguments
<dl class="arglist">
<dt>point_source</dt>
-<dd>TEXT. The name of the table containing the training data points.
+<dd>TEXT. Name of the table containing the training data points.
-Training data points are expected to be stored row-wise,
+Training data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
<dt>point_column_name</dt>
-<dd>TEXT. The name of the column with training data points.</dd>
+<dd>TEXT. Name of the column with training data points.</dd>
<dt>label_column_name</dt>
-<dd>TEXT. The name of the column with labels/values of training data points.</dd>
+<dd>TEXT. Name of the column with labels/values of training data points.</dd>
<dt>test_source</dt>
-<dd>TEXT. The name of the table containing the test data points.
+<dd>TEXT. Name of the table containing the test data points.
-Testing data points are expected to be stored row-wise,
+Testing data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
<dt>test_column_name</dt>
-<dd>TEXT. The name of the column with testing data points.</dd>
+<dd>TEXT. Name of the column with testing data points.</dd>
<dt>id_column_name</dt>
<dd>TEXT. Name of the column having ids of data points in test data table.</dd>
@@ -115,10 +113,11 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
<dd>TEXT. Name of the table to store final results.</dd>
<dt>operation</dt>
-<dd>TEXT. the type of task; r for regression and c for classification.</dd>
+<dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd>
<dt>k (optional)</dt>
-<dd>INTEGER. default: 1. The number of nearest neighbors to consider.</dd>
+<dd>INTEGER. default: 1. Number of nearest neighbors to consider.
+For classification, should be an odd number to break ties.</dd>
</dl>
@@ -138,7 +137,7 @@ The output of the KNN module is a table with the following columns:
</tr>
<tr>
<th>prediction</th>
- <td>INTEGER. The output of KNN- label in case of classification, average value in case of regression.</td>
+ <td>INTEGER. Label in case of classification, average value in case of regression.</td>
</tr>
</table>
@@ -146,89 +145,100 @@ The output of the KNN module is a table with the following columns:
@anchor examples
@examp
--# Prepare some training data.
+-# Prepare some training data:
<pre class="example">
-CREATE TABLE knn_train_data (id integer, data integer[], label float);
-COPY knn_train_data (id, data, label) from stdin delimiter '|';
-1|{1,1}|1.0
-2|{2,2}|1.0
-3|{3,3}|1.0
-4|{4,4}|1.0
-5|{4,5}|1.0
-6|{20,50}|0.0
-7|{10,31}|0.0
-8|{81,13}|0.0
-9|{1,111}|0.0
-\\.
+DROP TABLE IF EXISTS knn_train_data;
+CREATE TABLE knn_train_data (
+ id integer,
+ data integer[],
+ label float
+ );
+INSERT INTO knn_train_data VALUES
+(1, '{1,1}', 1.0),
+(2, '{2,2}', 1.0),
+(3, '{3,3}', 1.0),
+(4, '{4,4}', 1.0),
+(5, '{4,5}', 1.0),
+(6, '{20,50}', 0.0),
+(7, '{10,31}', 0.0),
+(8, '{81,13}', 0.0),
+(9, '{1,111}', 0.0);
</pre>
--# Prepare some testing data.
+-# Prepare some testing data:
<pre class="example">
-CREATE TABLE knn_test_data (id integer, data integer[]);
-COPY knn_test_data (id, data) from stdin delimiter '|';
-1|{2,1}
-2|{2,6}
-3|{15,40}
-4|{12,1}
-5|{2,90}
-6|{50,45}
-\\.
+DROP TABLE IF EXISTS knn_test_data;
+CREATE TABLE knn_test_data (
+ id integer,
+ data integer[]
+ );
+INSERT INTO knn_test_data VALUES
+(1, '{2,1}'),
+(2, '{2,6}'),
+(3, '{15,40}'),
+(4, '{12,1}'),
+(5, '{2,90}'),
+(6, '{50,45}');
</pre>
-# Run KNN for classification:
<pre class="example">
-SELECT * FROM madlib.knn( 'knn_train_data',
- 'data',
- 'label',
- 'knn_test_data',
- 'data',
- 'id',
- 'madlib_knn_result_classification',
- 'c',
- 3
- );
-SELECT * from madlib_knn_result_classification;
+DROP TABLE IF EXISTS madlib_knn_result_classification;
+SELECT * FROM madlib.knn(
+ 'knn_train_data', -- Table of training data
+ 'data', -- Col name of training data
+ 'label', -- Training labels
+ 'knn_test_data', -- Table of test data
+ 'data', -- Col name of test data
+ 'id', -- Col name of id in test data
+ 'madlib_knn_result_classification', -- Output table
+ 'c', -- Classification
+ 3 -- Number of nearest neighbours
+ );
+SELECT * from madlib_knn_result_classification ORDER BY id;
</pre>
Result:
<pre class="result">
- id | data | prediction
------+----------+-----------
- 1 | {2,1} | 1
- 2 | {2,6} | 1
- 3 | {15,40} | 0
- 4 | {12,1} | 1
- 5 | {2,90} | 0
- 6 | {50,45} | 0
+ id | data | prediction
+----+---------+------------
+ 1 | {2,1} | 1
+ 2 | {2,6} | 1
+ 3 | {15,40} | 0
+ 4 | {12,1} | 1
+ 5 | {2,90} | 0
+ 6 | {50,45} | 0
+(6 rows)
</pre>
-# Run KNN for regression:
<pre class="example">
-SELECT * FROM madlib.knn( 'knn_train_data',
- 'data',
- 'label',
- 'knn_test_data',
- 'data',
- 'id',
- 'madlib_knn_result_regression',
- 'r',
- 3
- );
-SELECT * from madlib_knn_result_regression;
+DROP TABLE IF EXISTS madlib_knn_result_regression;
+SELECT * FROM madlib.knn(
+ 'knn_train_data', -- Table of training data
+ 'data', -- Col name of training data
+ 'label', -- Training labels
+ 'knn_test_data', -- Table of test data
+ 'data', -- Col name of test data
+ 'id', -- Col name of id in test data
+ 'madlib_knn_result_regression', -- Output table
+    'r',                                -- Regression
+ 3 -- Number of nearest neighbours
+ );
+SELECT * from madlib_knn_result_regression ORDER BY id;
</pre>
Result:
<pre class="result">
- id | data | prediction
------+----------+-----------
- 1 | {2,1} | 1
- 2 | {2,6} | 1
- 3 | {15,40} | 0.5
- 4 | {12,1} | 1
- 5 | {2,90} | 0.25
- 6 | {50,45} | 0.25
+ id | data | prediction
+----+---------+-------------------
+ 1 | {2,1} | 1
+ 2 | {2,6} | 1
+ 3 | {15,40} | 0.333333333333333
+ 4 | {12,1} | 1
+ 5 | {2,90} | 0
+ 6 | {50,45} | 0
+(6 rows)
</pre>
-
-
@anchor background
@par Technical Background
@@ -260,12 +270,6 @@ Other distance metrics will be added in a future release of this module.
[3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf
-
-@anchor related
-@par Related Topics
-
-File knn.sql_in documenting the knn SQL functions
-
@internal
@sa namespace knn (documenting the implementation in Python)
@endinternal
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index a14337c..d36ef2b 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -337,28 +337,28 @@ SELECT madlib.encode_categorical_variables (
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
-----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
- 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
- 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
- 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
- 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
- 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
- 11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ id | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20
+----+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
+ 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
+ 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 4 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 5 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 6 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
+ 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
+ 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
+ 12 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
+ 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
+ 17 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 19 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
(20 rows)
</pre>
@@ -379,7 +379,7 @@ SELECT madlib.encode_categorical_variables (
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__
+ id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__
----+-------+-------+-------------+----------+---------+---------+---------------
1 | 1 | 0 | 0 | 0 | 0 | 0 | 1
2 | 1 | 0 | 0 | 0 | 1 | 0 | 0
@@ -421,28 +421,28 @@ SELECT madlib.encode_categorical_variables (
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | sex | rings | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
-----+-----+-------+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
- 1 | M | 15 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
- 2 | M | 7 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 3 | F | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 4 | M | 10 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 5 | I | 7 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 6 | I | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
- 7 | F | 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
- 8 | F | 16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
- 9 | M | 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 10 | | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
- 11 | F | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 12 | M | 10 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 13 | M | 11 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 14 | F | 10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 15 | F | 10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 16 | M | 12 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 17 | I | 7 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 18 | F | 10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 19 | M | 7 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 20 | | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ id | sex | rings | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20
+----+-----+-------+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
+ 1 | M | 15 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
+ 2 | M | 7 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 3 | F | 9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 4 | M | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 5 | I | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 6 | I | 8 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 7 | F | 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ 8 | F | 16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
+ 9 | M | 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 10 | | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
+ 11 | F | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
+ 12 | M | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 13 | M | 11 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
+ 14 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 15 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 16 | M | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
+ 17 | I | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 18 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 19 | M | 7 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 20 | | 9 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
(20 rows)
</pre>
@@ -499,33 +499,33 @@ SELECT madlib.encode_categorical_variables (
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
- TRUE -- Array output
+ 'array' -- Array output type
);
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | __encoded_variables__
+ id | __encoded_variables__
----+-------------------------------
- 1 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0}
- 2 | {0,0,1,0,0,0,0,0,0,0,0,1,0,0}
- 3 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1}
- 4 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
- 5 | {0,1,0,0,0,0,0,0,0,0,0,1,0,0}
- 6 | {0,1,0,0,0,0,0,0,0,0,0,0,1,0}
- 7 | {1,0,0,0,0,0,0,0,0,0,1,0,0,0}
- 8 | {1,0,0,0,0,0,0,0,1,0,0,0,0,0}
- 9 | {0,0,1,0,0,0,0,0,0,0,0,0,0,1}
- 10 | {0,0,0,0,0,0,0,0,0,1,0,0,0,0}
- 11 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
- 12 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
- 13 | {0,0,1,0,1,0,0,0,0,0,0,0,0,0}
- 14 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 15 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 16 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0}
- 17 | {0,1,0,0,0,0,0,0,0,0,0,1,0,0}
- 18 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 19 | {0,0,1,0,0,0,0,0,0,0,0,1,0,0}
- 20 | {0,0,0,0,0,0,0,0,0,0,0,0,0,1}
+ 1 | {0,0,1,0,0,0,0,0,0,0,1,0,0,0}
+ 2 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
+ 3 | {1,0,0,0,0,1,0,0,0,0,0,0,0,0}
+ 4 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
+ 5 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
+ 6 | {0,1,0,0,1,0,0,0,0,0,0,0,0,0}
+ 7 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1}
+ 8 | {1,0,0,0,0,0,0,0,0,0,0,1,0,0}
+ 9 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0}
+ 10 | {0,0,0,0,0,0,0,0,0,0,0,0,1,0}
+ 11 | {1,0,0,0,0,0,0,0,0,1,0,0,0,0}
+ 12 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
+ 13 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0}
+ 14 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 15 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 16 | {0,0,1,0,0,0,0,0,1,0,0,0,0,0}
+ 17 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
+ 18 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 19 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
+ 20 | {0,0,0,0,0,1,0,0,0,0,0,0,0,0}
(20 rows)
</pre>
View the dictionary table that gives the index into the array:
@@ -533,22 +533,22 @@ View the dictionary table that gives the index into the array:
SELECT * FROM abalone_out_dictionary;
</pre>
<pre class="result">
- encoded_column_name | index | variable | value
+ encoded_column_name | index | variable | value
-----------------------+-------+----------+-------
__encoded_variables__ | 1 | sex | F
__encoded_variables__ | 2 | sex | I
__encoded_variables__ | 3 | sex | M
- __encoded_variables__ | 4 | rings | 10
- __encoded_variables__ | 5 | rings | 11
- __encoded_variables__ | 6 | rings | 12
- __encoded_variables__ | 7 | rings | 14
- __encoded_variables__ | 8 | rings | 15
- __encoded_variables__ | 9 | rings | 16
- __encoded_variables__ | 10 | rings | 19
- __encoded_variables__ | 11 | rings | 20
- __encoded_variables__ | 12 | rings | 7
- __encoded_variables__ | 13 | rings | 8
- __encoded_variables__ | 14 | rings | 9
+ __encoded_variables__ | 4 | rings | 7
+ __encoded_variables__ | 5 | rings | 8
+ __encoded_variables__ | 6 | rings | 9
+ __encoded_variables__ | 7 | rings | 10
+ __encoded_variables__ | 8 | rings | 11
+ __encoded_variables__ | 9 | rings | 12
+ __encoded_variables__ | 10 | rings | 14
+ __encoded_variables__ | 11 | rings | 15
+ __encoded_variables__ | 12 | rings | 16
+ __encoded_variables__ | 13 | rings | 19
+ __encoded_variables__ | 14 | rings | 20
(14 rows)
</pre>
-# Create a dictionary output:
@@ -563,34 +563,34 @@ SELECT madlib.encode_categorical_variables (
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
- FALSE, -- Array output
+ NULL, -- Output type
TRUE -- Dictionary output
);
SELECT * FROM abalone_out ORDER BY id;
</pre>
<pre class="result">
- id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11
+ id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11
----+-------+-------+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------
- 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
- 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
- 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
- 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
- 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
- 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
- 11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
- 19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
- 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0
+ 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 4 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 5 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 6 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
+ 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0
+ 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0
+ 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0
+ 12 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0
+ 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0
+ 17 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 19 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
+ 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
(20 rows)
</pre>
View the dictionary table that defines the numerical columns in the output table:
@@ -598,19 +598,19 @@ View the dictionary table that defines the numerical columns in the output table
SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name;
</pre>
<pre class="result">
- encoded_column_name | index | variable | value
+ encoded_column_name | index | variable | value
---------------------+-------+----------+-------
- "rings_1" | 1 | rings | 10
- "rings_10" | 10 | rings | 8
- "rings_11" | 11 | rings | 9
- "rings_2" | 2 | rings | 11
- "rings_3" | 3 | rings | 12
- "rings_4" | 4 | rings | 14
- "rings_5" | 5 | rings | 15
- "rings_6" | 6 | rings | 16
- "rings_7" | 7 | rings | 19
- "rings_8" | 8 | rings | 20
- "rings_9" | 9 | rings | 7
+ "rings_1" | 1 | rings | 7
+ "rings_10" | 10 | rings | 19
+ "rings_11" | 11 | rings | 20
+ "rings_2" | 2 | rings | 8
+ "rings_3" | 3 | rings | 9
+ "rings_4" | 4 | rings | 10
+ "rings_5" | 5 | rings | 11
+ "rings_6" | 6 | rings | 12
+ "rings_7" | 7 | rings | 14
+ "rings_8" | 8 | rings | 15
+ "rings_9" | 9 | rings | 16
"sex_1" | 1 | sex | F
"sex_2" | 2 | sex | I
"sex_3" | 3 | sex | M
@@ -629,7 +629,7 @@ SELECT madlib.encode_categorical_variables (
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
- NULL, -- Array output
+ NULL, -- Output type
NULL, -- Dictionary output
'RANDOMLY' -- Distribution policy
);