You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/03/13 20:57:47 UTC
[25/50] [abbrv] incubator-madlib git commit: New Module: k-Nearest
Neighbors (k-NN)
New Module: k-Nearest Neighbors (k-NN)
JIRA: MADLIB-927
- Add a new module for k-NN, as early stage development.
This version of k-NN considers only one distance metric, which is
MADlib's squared_dist_norm2. There are multiple JIRAs opened to
address and improve this and other such limitations (JIRA IDs:
MADLIB-1059, MADLIB-1060, MADLIB-1061).
Additional authors: Orhan Kislal, Nandish Jayaram
Closes #81
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/61f3c5f0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/61f3c5f0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/61f3c5f0
Branch: refs/heads/latest_release
Commit: 61f3c5f038d96cff851a72f4e97b926fc5a02726
Parents: 9162271
Author: Auon Haidar Kazmi <ka...@gmail.com>
Authored: Wed Feb 1 16:54:57 2017 -0800
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Wed Feb 1 16:54:57 2017 -0800
----------------------------------------------------------------------
doc/mainpage.dox.in | 7 +
src/config/Modules.yml | 2 +
src/ports/postgres/modules/knn/__init__.py_in | 0
src/ports/postgres/modules/knn/knn.py_in | 129 ++++++
src/ports/postgres/modules/knn/knn.sql_in | 449 ++++++++++++++++++++
src/ports/postgres/modules/knn/test/knn.sql_in | 70 +++
6 files changed, 657 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 0e846e1..85a5d8d 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -253,6 +253,13 @@ Interface and implementation are subject to change.
@defgroup grp_cg Conjugate Gradient
@defgroup grp_bayes Naive Bayes Classification
@defgroup grp_sample Random Sampling
+
+ @defgroup grp_nene Nearest Neighbors
+ @ingroup grp_super
+ @{A collection of methods to create nearest neigbor based models.@}
+
+ @defgroup grp_knn k-Nearest Neighbors
+ @ingroup grp_nene
@}
@defgroup grp_deprecated Deprecated Modules
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/config/Modules.yml
----------------------------------------------------------------------
diff --git a/src/config/Modules.yml b/src/config/Modules.yml
index c3315b6..d5c336e 100644
--- a/src/config/Modules.yml
+++ b/src/config/Modules.yml
@@ -17,6 +17,8 @@ modules:
- name: graph
- name: kmeans
depends: ['array_ops', 'svec_util', 'sample']
+ - name: knn
+ depends: ['array_ops']
- name: lda
depends: ['array_ops']
- name: linalg
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/ports/postgres/modules/knn/__init__.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/__init__.py_in b/src/ports/postgres/modules/knn/__init__.py_in
new file mode 100644
index 0000000..e69de29
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/ports/postgres/modules/knn/knn.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
new file mode 100644
index 0000000..da7f9d6
--- /dev/null
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -0,0 +1,129 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`<!', `!>')
+
+"""
+@file knn.py_in
+
+@brief knn: Driver functions
+
+@namespace knn
+
+@brief knn: Driver functions
+"""
+
+import plpy
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import is_col_array
+from utilities.validate_args import array_col_has_no_null
+from utilities.validate_args import get_cols_and_types
+
+STATE_IN_MEM = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
+HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>, <!False!>)
+UDF_ON_SEGMENT_NOT_ALLOWED = m4_ifdef(<!__UDF_ON_SEGMENT_NOT_ALLOWED__!>, <!True!>, <!False!>)
+# ----------------------------------------------------------------------
+
+
+def knn_validate_src(schema_madlib, **kwargs):
+ trainingSource = kwargs['trainingSource']
+ if not trainingSource:
+ plpy.error("knn error: Invalid training table name!")
+ if not table_exists(trainingSource):
+ plpy.error("knn error: Training table {0} does not exist!".format(trainingSource))
+ if table_is_empty(trainingSource):
+ plpy.error("knn error: Training table {0} is empty!".format(trainingSource))
+
+ testSource = kwargs['testSource']
+ if not testSource:
+ plpy.error("knn error: Invalid test table name!")
+ if not table_exists(testSource):
+ plpy.error("knn error: Test table {0} does not exist!".format(testSource))
+ if table_is_empty(testSource):
+ plpy.error("knn error: Test table {0} is empty!".format(testSource))
+
+ trainingClassColumn = kwargs['trainingClassColumn']
+ trainingFeatureColumn = kwargs['trainingFeatureColumn']
+ for c in (trainingClassColumn, trainingFeatureColumn):
+ if not c:
+ plpy.error("knn error: Invalid column name in training table!")
+ if not columns_exist_in_table(trainingSource, [c]):
+ plpy.error("knn error: " + \
+ "Column '{0}' does not exist in {1}!".format(c, trainingSource))
+
+ testingFeatureColumn = kwargs['testingFeatureColumn']
+ testingIdColumn = kwargs['testingIdColumn']
+ for c in (testingFeatureColumn, testingIdColumn):
+ if not c:
+ plpy.error("knn error: Invalid column name in test table!")
+ if not columns_exist_in_table(testSource, [c]):
+ plpy.error("knn error: " + \
+ "Column '{0}' does not exist in {1}!".format(c, testSource))
+
+ if not is_col_array(trainingSource, trainingFeatureColumn):
+ plpy.error("knn error:" + \
+ "'Feature column {0} in train table is not an array!".format(str(trainingFeatureColumn)))
+ if not is_col_array(testSource, testingFeatureColumn):
+ plpy.error("knn error:" + \
+ "'Feature column {0} in test table is not an array!".format(str(testingFeatureColumn)))
+
+ if not array_col_has_no_null(trainingSource, trainingFeatureColumn):
+ plpy.error("knn error:" + \
+ "'Feature column {0} in train table has some NULL values!".format(str(trainingFeatureColumn)))
+ if not array_col_has_no_null(testSource, testingFeatureColumn):
+ plpy.error("knn error:" + \
+ "'Feature column {0} in test table has some NULL values!".format(str(testingFeatureColumn)))
+
+ k = int(kwargs['K'])
+ if k<=0:
+ plpy.error("knn error:" + \
+ "'k' {0} is not valid for knn!".format(str(k)))
+ bound = plpy.execute("""SELECT {k} <= count(*)
+ AS bound FROM {tbl}""".format(k=str(k),
+ trainingFeatureColumn=trainingFeatureColumn, tbl=trainingSource))[0]['bound']
+ if not bound:
+ plpy.error("knn error:" + \
+ "'k' {0} is greater than number of rows in training table!".format(str(k)))
+
+ colTypesList = get_cols_and_types(trainingSource)
+ colType = ''
+ for type in colTypesList:
+ if type[0] == trainingClassColumn:
+ colType = type[1]
+ break
+ if colType not in ['INTEGER','integer','double precision','DOUBLE PRECISION','float','FLOAT','boolean','BOOLEAN'] :
+ plpy.error("knn error:" + \
+ "Data type {0} is not valid as label for scope of knn!".format(str(colType)))
+
+ colTypesTestList = get_cols_and_types(testSource)
+ colType = ''
+ for type in colTypesTestList:
+ if type[0] == testingIdColumn:
+ colType = type[1]
+ break
+ if colType not in ['INTEGER','integer'] :
+ plpy.error("knn error:" + \
+ "Data type {0} is not valid as Id in test table!".format(str(colType)))
+
+
+# ----------------------------------------------------------------------
+m4_changequote(<!`!>, <!'!>)
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
new file mode 100644
index 0000000..7ee736b
--- /dev/null
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -0,0 +1,449 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- *//**
+ *
+ * @file knn.sql_in
+ * @brief Set of functions for k-nearest neighbors.
+ * @sa For a brief introduction to k-nearest neighbors algorithm for regression and classification,
+ * see the module description \ref grp_knn.
+ *
+ *
+ *//* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+
+
+/**
+@addtogroup grp_knn
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li class="level1"><a href="#knn">K-Nearest Neighbors</a></li>
+<li class="level1"><a href="#usage">Usage</a></li>
+<li class="level1"><a href="#output">Output Format</a></li>
+<li class="level1"><a href="#examples">Examples</a></li>
+<li class="level1"><a href="#background">Technical Background</a></li>
+<li class="level1"><a href="#literature">Literature</a></li>
+<li class="level1"><a href="#related">Related Topics</a></li>
+</ul>
+</div>
+
+@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes in case of classification and average value of target values for regression task.
+
+\warning <em> This MADlib method is still in early stage development. There may be some
+issues that will be addressed in a future version. Interface and implementation
+is subject to change. </em>
+
+@anchor knn
+
+k-Nearest Neighbors is a method for finding k closest points to a
+given data point in terms of a given metric. Its input consist of
+data points as features from testing examples. For a given k, it
+looks for k closest points in training set for each of the data
+points in test set. Algorithm generates one output per testing example.
+The output of KNN depends on the type of task:
+For Classification, the output is majority vote of the classes of
+the k nearest data points. The testing example gets assigned the
+most popular class among nearest neighbors.
+For Regression, the output is average of the values of k nearest
+neighbors of the given testing example.
+
+@anchor usage
+@par Usage
+<pre class="syntax">
+knn( point_source,
+ point_column_name,
+ label_column_name,
+ test_source,
+ test_column_name,
+ id_column_name,
+ output_table,
+ operation,
+ k
+ )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>point_source</dt>
+<dd>TEXT. The name of the table containing the training data points.
+
+Training data points are expected to be stored row-wise,
+in a column of type <tt>DOUBLE PRECISION[]</tt>.
+</dd>
+
+<dt>point_column_name</dt>
+<dd>TEXT. The name of the column with training data points.</dd>
+
+<dt>label_column_name</dt>
+<dd>TEXT. The name of the column with labels/values of training data points.</dd>
+
+<dt>test_source</dt>
+<dd>TEXT. The name of the table containing the test data points.
+
+Testing data points are expected to be stored row-wise,
+in a column of type <tt>DOUBLE PRECISION[]</tt>.
+</dd>
+
+<dt>test_column_name</dt>
+<dd>TEXT. The name of the column with testing data points.</dd>
+
+<dt>id_column_name</dt>
+<dd>TEXT. Name of the column having ids of data points in test data table.</dd>
+
+<dt>output_table</dt>
+<dd>TEXT. Name of the table to store final results.</dd>
+
+<dt>operation</dt>
+<dd>TEXT. the type of task; r for regression and c for classification.</dd>
+
+<dt>k (optional)</dt>
+<dd>INTEGER. default: 1. The number of nearest neighbors to consider.</dd>
+
+</dl>
+
+
+@anchor output
+@par Output Format
+
+The output of the KNN module is a table with the following columns:
+<table class="output">
+ <tr>
+ <th>id</th>
+ <td>INTEGER. The ids of test data points.</td>
+ </tr>
+ <tr>
+ <th>test_column_name</th>
+ <td>DOUBLE PRECISION[]. The test data points.</td>
+ </tr>
+ <tr>
+ <th>prediction</th>
+ <td>INTEGER. The output of KNN- label in case of classification, average value in case of regression.</td>
+ </tr>
+</table>
+
+
+@anchor examples
+@examp
+
+-# Prepare some training data.
+<pre class="example">
+CREATE TABLE knn_train_data (id integer, data integer[], label float);
+COPY knn_train_data (id, data, label) from stdin delimiter '|';
+1|{1,1}|1.0
+2|{2,2}|1.0
+3|{3,3}|1.0
+4|{4,4}|1.0
+5|{4,5}|1.0
+6|{20,50}|0.0
+7|{10,31}|0.0
+8|{81,13}|0.0
+9|{1,111}|0.0
+\\.
+</pre>
+
+-# Prepare some testing data.
+<pre class="example">
+CREATE TABLE knn_test_data (id integer, data integer[]);
+COPY knn_test_data (id, data) from stdin delimiter '|';
+1|{2,1}
+2|{2,6}
+3|{15,40}
+4|{12,1}
+5|{2,90}
+6|{50,45}
+\\.
+</pre>
+
+-# Run KNN for classification:
+<pre class="example">
+SELECT * FROM madlib.knn( 'knn_train_data',
+ 'data',
+ 'label',
+ 'knn_test_data',
+ 'data',
+ 'id',
+ 'madlib_knn_result_classification',
+ 'c',
+ 3
+ );
+SELECT * from madlib_knn_result_classification;
+</pre>
+Result:
+<pre class="result">
+ id | data | prediction
+-----+----------+-----------
+ 1 | {2,1} | 1
+ 2 | {2,6} | 1
+ 3 | {15,40} | 0
+ 4 | {12,1} | 1
+ 5 | {2,90} | 0
+ 6 | {50,45} | 0
+</pre>
+
+-# Run KNN for regression:
+<pre class="example">
+SELECT * FROM madlib.knn( 'knn_train_data',
+ 'data',
+ 'label',
+ 'knn_test_data',
+ 'data',
+ 'id',
+ 'madlib_knn_result_regression',
+ 'r',
+ 3
+ );
+SELECT * from madlib_knn_result_regression;
+</pre>
+Result:
+<pre class="result">
+ id | data | prediction
+-----+----------+-----------
+ 1 | {2,1} | 1
+ 2 | {2,6} | 1
+ 3 | {15,40} | 0.5
+ 4 | {12,1} | 1
+ 5 | {2,90} | 0.25
+ 6 | {50,45} | 0.25
+</pre>
+
+
+
+@anchor background
+@par Technical Background
+
+The training data points are vectors in a multidimensional feature space,
+each with a class label. The training phase of the algorithm consists
+only of storing the feature vectors and class labels of the training points.
+
+In the classification phase, k is a user-defined constant, and an unlabeled
+vector (a test point) is classified by assigning the label which is most
+frequent among the k training samples nearest to that test point.
+In case of regression, average of the values of these k training samples
+is assigned to the test point.
+The only distance metric supported in this version is MADlib's squared_dist_norm2.
+Other distance metrics will be added in a future release of this module.
+
+
+@anchor literature
+@literature
+
+@anchor knn-lit-1
+[1] Wikipedia, k-nearest neighbors algorithm,
+ https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
+
+@anchor knn-lit-2
+[2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression
+ http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf
+
+@anchor knn-lit-3
+[3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
+ https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf
+
+
+@anchor related
+@par Related Topics
+
+File knn.sql_in documenting the knn SQL functions
+
+@internal
+@sa namespace knn (documenting the implementation in Python)
+@endinternal
+*/
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
+"trainingSource" VARCHAR,
+"trainingClassColumn" VARCHAR,
+"trainingFeatureColumn" VARCHAR,
+"testSource" VARCHAR,
+"testingIdColumn" VARCHAR,
+"testingFeatureColumn" VARCHAR,
+"K" INTEGER
+) RETURNS VOID AS $$
+ PythonFunction(knn, knn, knn_validate_src)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+ arg1 VARCHAR
+) RETURNS VOID AS $$
+BEGIN
+ IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN
+ RAISE NOTICE
+'
+-----------------------------------------------------------------------
+ USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.knn(
+ point_source, -- Training data table having training features as vector column and labels
+ point_column_name, -- Name of column having feature vectors in training data table
+ label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table
+ test_source, -- Test data table having features as vector column. Id of features is mandatory
+ test_column_name, -- Name of column having feature vectors in test data table
+ id_column_name, -- Name of column having feature vector Ids in test data table
+ output_table, -- Name of output table
+ operation, -- c for classification task, r for regression task
+ k -- value of k. Default will go as 1
+ );
+
+-----------------------------------------------------------------------
+ OUTPUT
+-----------------------------------------------------------------------
+The output of the KNN module is a table with the following columns:
+
+id The ids of test data points.
+test_column_name The test data points.
+prediction The output of KNN- label in case of classification, average value in case of regression.
+';
+ END IF;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+) RETURNS VOID AS $$
+BEGIN
+ RAISE NOTICE '
+k-Nearest Neighbors is a method for finding k closest points to a given data
+point in terms of a given metric. Its input consist of data points as features
+from testing examples. For a given k, it looks for k closest points in
+training set for each of the data points in test set. Algorithm generates one
+output per testing example. The output of KNN depends on the type of task:
+For Classification, the output is majority vote of the classes of the k
+nearest data points. The testing example gets assigned the most popular class
+among nearest neighbors. For Regression, the output is average of the values
+of k nearest neighbors of the given testing example.
+ ';
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+ point_source VARCHAR,
+ point_column_name VARCHAR,
+ label_column_name VARCHAR,
+ test_source VARCHAR,
+ test_column_name VARCHAR,
+ id_column_name VARCHAR,
+ output_table VARCHAR,
+ operation VARCHAR,
+ k INTEGER
+) RETURNS VARCHAR AS $$
+DECLARE
+ class_test_source REGCLASS;
+ class_point_source REGCLASS;
+ l FLOAT;
+ outputTableFlag INTEGER;
+ id INTEGER;
+ vector DOUBLE PRECISION[];
+ cur_pid integer;
+ oldClientMinMessages VARCHAR;
+ returnstring VARCHAR;
+ x_temp_table VARCHAR;
+ y_temp_table VARCHAR;
+BEGIN
+ oldClientMinMessages :=
+ (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
+ EXECUTE 'SET client_min_messages TO warning';
+ PERFORM MADLIB_SCHEMA.__knn_validate_src(point_source, label_column_name, point_column_name, test_source, id_column_name, test_column_name,k);
+ class_test_source := test_source;
+ class_point_source := point_source;
+ --checks
+ IF (k <= 0) THEN
+ RAISE EXCEPTION 'KNN error: Number of neighbors k must be a positive integer.';
+ END IF;
+ IF (operation != 'c' AND operation != 'r') THEN
+ RAISE EXCEPTION 'KNN error: The operation has to be r for regression OR c for classification.';
+ END IF;
+ PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
+ x_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
+ y_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
+
+ EXECUTE
+ $sql$
+ SELECT count(*) FROM information_schema.tables WHERE table_name = '$sql$ || output_table || $sql$'$sql$ into outputTableFlag;
+ IF (outputTableFlag != 0) THEN
+ RAISE Exception 'KNN error: Output table % already exists.', output_table;
+ END IF;
+
+ EXECUTE
+ $sql$
+ DROP TABLE IF EXISTS pg_temp.madlib_knn_interm;
+ CREATE TABLE pg_temp.madlib_knn_interm AS
+ SELECT *
+ FROM
+ (
+ SELECT row_number() over (partition by test_id order by dist) as r, $sql$ || x_temp_table || $sql$.*
+ FROM
+ (
+ SELECT test. $sql$ || id_column_name || $sql$ as test_id, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) as dist, $sql$ || label_column_name || $sql$ from $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test
+ )$sql$ || x_temp_table || $sql$
+ )$sql$ || y_temp_table || $sql$
+ WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k;
+ IF (operation = 'c') THEN
+ EXECUTE
+ $sql$
+ CREATE TABLE $sql$ || output_table || $sql$ AS
+ SELECT test_id as id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name || $sql$) as prediction from pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ on test_id=$sql$ || id_column_name || $sql$ group by test_id, $sql$ || test_column_name;
+ ELSE
+ EXECUTE
+ $sql$
+ CREATE TABLE $sql$ || output_table || $sql$ AS
+ SELECT test_id as id, $sql$ || test_column_name || $sql$ ,avg($sql$ || label_column_name || $sql$) as prediction from pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ on test_id=$sql$ || id_column_name || $sql$ group by test_id, $sql$ || test_column_name || $sql$ order by test_id $sql$;
+ END IF;
+
+ EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
+ IF (operation = 'c') THEN
+ returnstring := 'The classification results have been written to table';
+ ELSE
+ returnstring := 'The regression results have been written to table';
+ END IF;
+ DROP TABLE pg_temp.madlib_knn_interm;
+ RETURN returnstring;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+ point_source VARCHAR,
+ point_column_name VARCHAR,
+ label_column_name VARCHAR,
+ test_source VARCHAR,
+ test_column_name VARCHAR,
+ id_column_name VARCHAR,
+ output_table VARCHAR,
+ operation VARCHAR
+) RETURNS VARCHAR AS $$
+DECLARE
+ returnstring VARCHAR;
+BEGIN
+ returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1);
+ RETURN returnstring;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/ports/postgres/modules/knn/test/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in
new file mode 100644
index 0000000..3c730ee
--- /dev/null
+++ b/src/ports/postgres/modules/knn/test/knn.sql_in
@@ -0,0 +1,70 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+/* -----------------------------------------------------------------------------
+ * Test knn.
+ *
+ * FIXME: Verify results
+ * -------------------------------------------------------------------------- */
+
+drop table if exists knn_train_data;
+create table knn_train_data (
+id integer,
+data integer[],
+label float);
+copy knn_train_data (id, data, label) from stdin delimiter '|';
+1|{1,1}|1.0
+2|{2,2}|1.0
+3|{3,3}|1.0
+4|{4,4}|1.0
+5|{4,5}|1.0
+6|{20,50}|0.0
+7|{10,31}|0.0
+8|{81,13}|0.0
+9|{1,111}|0.0
+\.
+drop table if exists knn_test_data;
+create table knn_test_data (
+id integer,
+data integer[]);
+copy knn_test_data (id, data) from stdin delimiter '|';
+1|{2,1}
+2|{2,6}
+3|{15,40}
+4|{12,1}
+5|{2,90}
+6|{50,45}
+\.
+drop table if exists madlib_knn_result_classification;
+select madlib.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_classification','c',3);
+select madlib.assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
+
+drop table if exists madlib_knn_result_regression;
+select madlib.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_regression','r',4);
+select madlib.assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression;
+
+drop table if exists madlib_knn_result_classification;
+select madlib.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_classification','c');
+select madlib.assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=1') from madlib_knn_result_classification;
+
+select madlib.knn();
+select madlib.knn('help');