You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/10/27 04:15:31 UTC
madlib git commit: KNN: Fix optional parameters and ordering
Repository: madlib
Updated Branches:
refs/heads/master 2c01ec971 -> e6e7c3903
KNN: Fix optional parameters and ordering
Additional author: Frank McQuillan <fm...@pivotal.io>
JIRA: MADLIB-1129
Add the necessary interfaces for optional parameters.
Consolidate two code paths for easy maintenance.
Add ordering for the nearest neighbors.
Change the default for output_neighbors to True.
Update the documentation to reflect the changes.
Closes #191
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/e6e7c390
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/e6e7c390
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/e6e7c390
Branch: refs/heads/master
Commit: e6e7c3903a3deb9bdf26a497aef6b710b6684192
Parents: 2c01ec9
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Thu Oct 26 21:12:49 2017 -0700
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Thu Oct 26 21:12:49 2017 -0700
----------------------------------------------------------------------
src/ports/postgres/modules/knn/knn.py_in | 141 +++++++------------
src/ports/postgres/modules/knn/knn.sql_in | 146 ++++++++++++++------
src/ports/postgres/modules/knn/test/knn.sql_in | 6 +-
3 files changed, 162 insertions(+), 131 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/e6e7c390/src/ports/postgres/modules/knn/knn.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index dec3f1f..0e21cdd 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -113,15 +113,10 @@ def knn(schema_madlib, point_source, point_column_name, point_id, label_column_n
points in test data table.
@param output_table Name of the table to store final
results.
- @param operation Flag for the operation:
- 'c' for classification and
- 'r' for regression
@param k default: 1. Number of nearest
neighbors to consider
@output_neighbours Outputs the list of k-nearest neighbors
that were used in the voting/averaging.
- Returns:
- VARCHAR Name of the output table.
"""
with MinWarning('warning'):
k_val = knn_validate_src(schema_madlib, point_source,
@@ -134,106 +129,74 @@ def knn(schema_madlib, point_source, point_column_name, point_id, label_column_n
label_col_temp = unique_string(desp='label_col_temp')
test_id_temp = unique_string(desp='test_id_temp')
- if output_neighbors is None or '':
- output_neighbors = False
+ if output_neighbors is None:
+ output_neighbors = True
interim_table = unique_string(desp='interim_table')
- if label_column_name is None or label_column_name == '':
- plpy.execute(
- """
- CREATE TEMP TABLE {interim_table} AS
- SELECT * FROM
- (
+ pred_out = ""
+ knn_neighbors = ""
+ label_out = ""
+ cast_to_int = ""
+
+ if output_neighbors:
+ knn_neighbors = (", array_agg(knn_temp.train_id ORDER BY "
+ "knn_temp.dist ASC) AS k_nearest_neighbours ")
+ if label_column_name:
+ is_classification = False
+ label_column_type = get_expr_type(
+ label_column_name, point_source).lower()
+ if label_column_type in ['boolean', 'integer', 'text']:
+ is_classification = True
+ cast_to_int = '::INTEGER'
+
+ pred_out = ", avg({label_col_temp})".format(**locals())
+ if is_classification:
+ pred_out = (", {schema_madlib}.mode({label_col_temp})"
+ ).format(**locals())
+ pred_out += " AS prediction"
+ label_out = (", train.{label_column_name}{cast_to_int}"
+ " AS {label_col_temp}").format(**locals())
+
+ if not label_column_name and not output_neighbors:
+
+ plpy.error("kNN error: Either label_column_name or "
+ "output_neighbors has to be non-NULL.")
+
+ plpy.execute("""
+ CREATE TEMP TABLE {interim_table} AS
+ SELECT * FROM (
SELECT row_number() over
(partition by {test_id_temp} order by dist) AS r,
{x_temp_table}.*
- FROM
- (
+ FROM (
SELECT test.{test_id} AS {test_id_temp} ,
train.{point_id} as train_id ,
{schema_madlib}.squared_dist_norm2(
train.{point_column_name},
test.{test_column_name})
- AS dist
- FROM {point_source} AS train, {test_source} AS test
+ AS dist {label_out}
+ FROM {point_source} AS train,
+ {test_source} AS test
) {x_temp_table}
) {y_temp_table}
- WHERE {y_temp_table}.r <= {k_val}
- """.format(**locals()))
- plpy.execute(
- """
- CREATE TABLE {output_table} AS
- SELECT {test_id_temp} AS id, {test_column_name} ,
- CASE WHEN {output_neighbors}
- THEN array_agg(knn_temp.train_id)
- ELSE NULL END AS k_nearest_neighbours
- FROM pg_temp.{interim_table} AS knn_temp
- join
- {test_source} AS knn_test ON
- knn_temp.{test_id_temp} = knn_test.{test_id}
- GROUP BY {test_id_temp} , {test_column_name}
- """.format(**locals()))
- return
-
- is_classification = False
- label_column_type = get_expr_type(
- label_column_name, point_source).lower()
- if label_column_type in ['boolean', 'integer', 'text']:
- is_classification = True
- convert_boolean_to_int = '::INTEGER'
- else:
- is_classification = False
+ WHERE {y_temp_table}.r <= {k_val}
+ """.format(**locals()))
plpy.execute(
"""
- CREATE TEMP TABLE {interim_table} AS
- SELECT * FROM
- (
- SELECT row_number() over
- (partition by {test_id_temp} order by dist) AS r,
- {x_temp_table}.*
- FROM
- (
- SELECT test.{test_id} AS {test_id_temp} ,
- train.{point_id} as train_id ,
- {schema_madlib}.squared_dist_norm2(
- train.{point_column_name},
- test.{test_column_name})
- AS dist,
- train.{label_column_name}{cast_to_int}
- AS {label_col_temp}
- FROM {point_source} AS train, {test_source} AS test
- ) {x_temp_table}
- ) {y_temp_table}
- WHERE {y_temp_table}.r <= {k_val}
- """.format(cast_to_int='::INTEGER' if is_classification else '',
- **locals()))
-
- knn_create_table = 'CREATE TABLE ' + output_table + ' AS ' \
- 'SELECT ' + test_id_temp + ' AS id,' + test_column_name + ','
- knn_pred_class = schema_madlib + \
- '.mode(' + label_col_temp + ') AS prediction'
- knn_pred_reg = 'avg(' + label_col_temp + ') AS prediction'
- knn_neighbours = ', array_agg(knn_temp.train_id) AS k_nearest_neighbours '
- knn_group_by = 'FROM pg_temp.' + interim_table + ' AS knn_temp join ' \
- + test_source + ' AS knn_test ON knn_temp.' + test_id_temp + '= knn_test.' \
- + test_id + ' GROUP BY ' + test_id_temp + ', ' + test_column_name
-
- if is_classification:
- if output_neighbors:
- plpy.execute("""{knn_create_table}{knn_pred_class}
- {knn_neighbours}{knn_group_by}""".format(**locals()))
- else:
- plpy.execute(""" {knn_create_table}{knn_pred_class}
- {knn_group_by}""".format(**locals()))
- else:
- if output_neighbors:
- plpy.execute(""" {knn_create_table}{knn_pred_reg}
- {knn_neighbours}{knn_group_by}""".format(**locals()))
- else:
- plpy.execute("""{knn_create_table}{knn_pred_reg}
- {knn_group_by}""".format(**locals()))
+ CREATE TABLE {output_table} AS
+ SELECT {test_id_temp} AS id, {test_column_name}
+ {pred_out}
+ {knn_neighbors}
+ FROM pg_temp.{interim_table} AS knn_temp
+ JOIN
+ {test_source} AS knn_test ON
+ knn_temp.{test_id_temp} = knn_test.{test_id}
+ GROUP BY {test_id_temp} , {test_column_name}
+ """.format(**locals()))
+
plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table))
+ return
# ------------------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/e6e7c390/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index dfd2374..befb768 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -86,7 +86,6 @@ knn( point_source,
<dl class="arglist">
<dt>point_source</dt>
<dd>TEXT. Name of the table containing the training data points.
-
Training data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
@@ -96,19 +95,19 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
<dt>point_id</dt>
<dd>TEXT. Name of the column in 'point_source’ containing source data ids.
-The ids are of type INTEGER with no duplicates. They do not need to be contiguous.
-This parameter must be used if the list of nearest neighbors are to be output, i.e.,
+The ids are of type INTEGER with no duplicates. They do not need to be contiguous.
+This parameter must be used if the list of nearest neighbors are to be output, i.e.,
if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL.
<dt>label_column_name</dt>
<dd>TEXT. Name of the column with labels/values of training data points.
-If Boolean, integer or text types will run knn classification, else if
-double precision values will run knn regression.
-If you set this to NULL will return neighbors only without doing classification or regression.</dd>
+If this column is a Boolean, integer or text, it will run KNN classification;
+else if it contains double precision values, it will run KNN regression.
+If you set this to NULL, it will only return the set of neighbors without
+actually doing classification or regression.</dd>
<dt>test_source</dt>
<dd>TEXT. Name of the table containing the test data points.
-
Testing data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
@@ -122,17 +121,15 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
<dt>output_table</dt>
<dd>TEXT. Name of the table to store final results.</dd>
-<dt>operation</dt>
-<dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd>
-
<dt>k (optional)</dt>
<dd>INTEGER. default: 1. Number of nearest neighbors to consider.
-For classification, should be an odd number to break ties.
-otherwise result may depend on ordering of the input data.</dd>
+For classification, should be an odd number to break ties,
+otherwise the result may depend on ordering of the input data.</dd>
<dt>output_neighbors (optional) </dt>
-<dd>BOOLEAN default: FALSE. Outputs the list of k-nearest
-neighbors that were used in the voting/averaging.</dd>
+<dd>BOOLEAN default: TRUE. Outputs the list of k-nearest
+neighbors that were used in the voting/averaging, sorted
+from closest to furthest.</dd>
</dl>
@@ -164,9 +161,9 @@ The output of the KNN module is a table with the following columns:
<pre class="example">
DROP TABLE IF EXISTS knn_train_data;
CREATE TABLE knn_train_data (
- id integer,
- data integer[],
- label integer
+ id integer,
+ data integer[],
+ label integer -- Integer label means for classification
);
INSERT INTO knn_train_data VALUES
(1, '{1,1}', 1),
@@ -184,9 +181,9 @@ INSERT INTO knn_train_data VALUES
<pre class="example">
DROP TABLE IF EXISTS knn_train_data_reg;
CREATE TABLE knn_train_data_reg (
- id integer,
- data integer[],
- label float
+ id integer,
+ data integer[],
+ label float -- Float label means for regression
);
INSERT INTO knn_train_data_reg VALUES
(1, '{1,1}', 1.0),
@@ -204,7 +201,7 @@ INSERT INTO knn_train_data_reg VALUES
<pre class="example">
DROP TABLE IF EXISTS knn_test_data;
CREATE TABLE knn_test_data (
- id integer,
+ id integer,
data integer[]
);
INSERT INTO knn_test_data VALUES
@@ -218,26 +215,26 @@ INSERT INTO knn_test_data VALUES
-# Run KNN for classification:
<pre class="example">
-DROP TABLE IF EXISTS madlib_knn_result_classification;
-SELECT * FROM madlib.knn(
+DROP TABLE IF EXISTS knn_result_classification;
+SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
- 'id', -- Col Name of id in train data
+ 'id', -- Col Name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
- 'id', -- Col name of id in test data
- 'madlib_knn_result_classification', -- Output table
- 3 -- Number of nearest neighbours
- True -- True if you want to show Nearest-Neighbors, False otherwise
+ 'id', -- Col name of id in test data
+ 'knn_result_classification', -- Output table
+ 3, -- Number of nearest neighbors
+ True -- True if you want to show Nearest-Neighbors by id, False otherwise
);
-SELECT * from madlib_knn_result_classification ORDER BY id;
+SELECT * from knn_result_classification ORDER BY id;
</pre>
Result:
<pre class="result">
- id | data | prediction | k_nearest_neighbours
+ id | data | prediction | k_nearest_neighbours
----+---------+------------+----------------------
- 1 | {2,1} | 1 | {1,2,3}
+ 1 | {2,1} | 1 | {2,1,3}
2 | {2,6} | 1 | {5,4,3}
3 | {15,40} | 0 | {7,6,5}
4 | {12,1} | 1 | {4,5,3}
@@ -245,29 +242,30 @@ Result:
6 | {50,45} | 0 | {6,7,8}
(6 rows)
</pre>
+Note that the nearest neighbors are sorted from closest to furthest from the corresponding test point.
-# Run KNN for regression:
<pre class="example">
-DROP TABLE IF EXISTS madlib_knn_result_regression;
-SELECT * FROM madlib.knn(
+DROP TABLE IF EXISTS knn_result_regression;
+SELECT * FROM madlib.knn(
'knn_train_data_reg', -- Table of training data
'data', -- Col name of training data
- 'id', -- Col Name of id in train data
+ 'id', -- Col Name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
- 'id', -- Col name of id in test data
- 'madlib_knn_result_regression', -- Output table
- 3, -- Number of nearest neighbours
+ 'id', -- Col name of id in test data
+ 'knn_result_regression', -- Output table
+ 3, -- Number of nearest neighbors
True -- True if you want to show Nearest-Neighbors, False otherwise
);
-SELECT * from madlib_knn_result_regression ORDER BY id;
+SELECT * FROM knn_result_regression ORDER BY id;
</pre>
Result:
<pre class="result">
- id | data | prediction | k_nearest_neighbours
+ id | data | prediction | k_nearest_neighbours
----+---------+-------------------+----------------------
- 1 | {2,1} | 1 | {1,2,3}
+ 1 | {2,1} | 1 | {2,1,3}
2 | {2,6} | 1 | {5,4,3}
3 | {15,40} | 0.333333333333333 | {7,6,5}
4 | {12,1} | 1 | {4,5,3}
@@ -276,6 +274,35 @@ Result:
(6 rows)
</pre>
+-# List nearest neighbors only, without doing classification or regression:
+<pre class="example">
+DROP TABLE IF EXISTS knn_result_list_neighbors;
+SELECT * FROM madlib.knn(
+ 'knn_train_data_reg', -- Table of training data
+ 'data', -- Col name of training data
+ 'id', -- Col Name of id in train data
+ NULL, -- NULL training labels means just list neighbors
+ 'knn_test_data', -- Table of test data
+ 'data', -- Col name of test data
+ 'id', -- Col name of id in test data
+ 'knn_result_list_neighbors', -- Output table
+ 3 -- Number of nearest neighbors
+ );
+SELECT * FROM knn_result_list_neighbors ORDER BY id;
+</pre>
+Result, with neighbors sorted from closest to furthest:
+<pre class="result">
+ id | data | k_nearest_neighbours
+----+---------+----------------------
+ 1 | {2,1} | {2,1,3}
+ 2 | {2,6} | {5,4,3}
+ 3 | {15,40} | {7,6,5}
+ 4 | {12,1} | {4,5,3}
+ 5 | {2,90} | {9,6,7}
+ 6 | {50,45} | {6,7,8}
+(6 rows)
+</pre>
+
@anchor background
@par Technical Background
@@ -447,3 +474,42 @@ BEGIN
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+ point_source VARCHAR,
+ point_column_name VARCHAR,
+ point_id VARCHAR,
+ label_column_name VARCHAR,
+ test_source VARCHAR,
+ test_column_name VARCHAR,
+ test_id VARCHAR,
+ output_table VARCHAR,
+ k INTEGER
+) RETURNS VARCHAR AS $$
+DECLARE
+ returnstring VARCHAR;
+BEGIN
+ returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,TRUE);
+ RETURN returnstring;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+ point_source VARCHAR,
+ point_column_name VARCHAR,
+ point_id VARCHAR,
+ label_column_name VARCHAR,
+ test_source VARCHAR,
+ test_column_name VARCHAR,
+ test_id VARCHAR,
+ output_table VARCHAR
+) RETURNS VARCHAR AS $$
+DECLARE
+ returnstring VARCHAR;
+BEGIN
+ returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,TRUE);
+ RETURN returnstring;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
http://git-wip-us.apache.org/repos/asf/madlib/blob/e6e7c390/src/ports/postgres/modules/knn/test/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in
index fa38751..1e71a0e 100644
--- a/src/ports/postgres/modules/knn/test/knn.sql_in
+++ b/src/ports/postgres/modules/knn/test/knn.sql_in
@@ -44,8 +44,8 @@ copy knn_train_data (id, data, label) from stdin delimiter '|';
\.
DROP TABLE IF EXISTS knn_train_data_reg;
CREATE TABLE knn_train_data_reg (
- id integer,
- data integer[],
+ id integer,
+ data integer[],
label float
);
COPY knn_train_data_reg (id, data, label) from stdin delimiter '|';
@@ -59,6 +59,7 @@ COPY knn_train_data_reg (id, data, label) from stdin delimiter '|';
8|{81,13}|0.0
9|{1,111}|0.0
\.
+DROP TABLE IF EXISTS knn_test_data;
create table knn_test_data (
id integer,
data integer[]);
@@ -70,6 +71,7 @@ copy knn_test_data (id, data) from stdin delimiter '|';
5|{2,90}
6|{50,45}
\.
+
drop table if exists madlib_knn_result_classification;
select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,False);
select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification;