Posted to commits@madlib.apache.org by ok...@apache.org on 2017/10/27 04:15:31 UTC

madlib git commit: KNN: Fix optional parameters and ordering

Repository: madlib
Updated Branches:
  refs/heads/master 2c01ec971 -> e6e7c3903


KNN: Fix optional parameters and ordering

Additional Author : Frank McQuillan <fm...@pivotal.io>

JIRA: MADLIB-1129

Add the necessary interfaces for optional parameters.
Consolidate two code paths for easy maintenance.
Add ordering for the nearest neighbors.
Change the default for output_neighbors to True.
Update the documentation to reflect the changes.

Closes #191
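
For illustration, the shortest of the new call forms defaults both optional
parameters (k=1, output_neighbors=TRUE). A minimal sketch, assuming the example
tables from the documentation below (the output table name is illustrative):

    DROP TABLE IF EXISTS knn_result_defaults;
    SELECT * FROM madlib.knn(
                    'knn_train_data',      -- Table of training data
                    'data',                -- Col name of training data
                    'id',                  -- Col name of id in train data
                    'label',               -- Training labels
                    'knn_test_data',       -- Table of test data
                    'data',                -- Col name of test data
                    'id',                  -- Col name of id in test data
                    'knn_result_defaults'  -- Output table; k defaults to 1 and
                                           -- output_neighbors defaults to TRUE
                    );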


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/e6e7c390
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/e6e7c390
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/e6e7c390

Branch: refs/heads/master
Commit: e6e7c3903a3deb9bdf26a497aef6b710b6684192
Parents: 2c01ec9
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Thu Oct 26 21:12:49 2017 -0700
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Thu Oct 26 21:12:49 2017 -0700

----------------------------------------------------------------------
 src/ports/postgres/modules/knn/knn.py_in       | 141 +++++++------------
 src/ports/postgres/modules/knn/knn.sql_in      | 146 ++++++++++++++------
 src/ports/postgres/modules/knn/test/knn.sql_in |   6 +-
 3 files changed, 162 insertions(+), 131 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/e6e7c390/src/ports/postgres/modules/knn/knn.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index dec3f1f..0e21cdd 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -113,15 +113,10 @@ def knn(schema_madlib, point_source, point_column_name, point_id, label_column_n
                                         points in test data table.
             @param output_table         Name of the table to store final
                                         results.
-            @param operation            Flag for the operation:
-                                        'c' for classification and
-                                        'r' for regression
             @param k                    default: 1. Number of nearest
                                         neighbors to consider
             @output_neighbours          Outputs the list of k-nearest neighbors
                                         that were used in the voting/averaging.
-        Returns:
-            VARCHAR                     Name of the output table.
     """
     with MinWarning('warning'):
         k_val = knn_validate_src(schema_madlib, point_source,
@@ -134,106 +129,74 @@ def knn(schema_madlib, point_source, point_column_name, point_id, label_column_n
         label_col_temp = unique_string(desp='label_col_temp')
         test_id_temp = unique_string(desp='test_id_temp')
 
-        if output_neighbors is None or '':
-            output_neighbors = False
+        if output_neighbors is None:
+            output_neighbors = True
 
         interim_table = unique_string(desp='interim_table')
 
-        if label_column_name is None or label_column_name == '':
-            plpy.execute(
-                """
-                CREATE TEMP TABLE {interim_table} AS
-                SELECT * FROM
-                    (
+        pred_out = ""
+        knn_neighbors = ""
+        label_out = ""
+        cast_to_int = ""
+
+        if output_neighbors:
+            knn_neighbors = (", array_agg(knn_temp.train_id ORDER BY "
+                                "knn_temp.dist ASC) AS k_nearest_neighbours ")
+        if label_column_name:
+            is_classification = False
+            label_column_type = get_expr_type(
+                label_column_name, point_source).lower()
+            if label_column_type in ['boolean', 'integer', 'text']:
+                is_classification = True
+                cast_to_int = '::INTEGER'
+
+            pred_out = ", avg({label_col_temp})".format(**locals())
+            if is_classification:
+                pred_out = (", {schema_madlib}.mode({label_col_temp})"
+                            ).format(**locals())
+            pred_out += " AS prediction"
+            label_out = (", train.{label_column_name}{cast_to_int}"
+                            " AS {label_col_temp}").format(**locals())
+
+        if not label_column_name and not output_neighbors:
+
+            plpy.error("kNN error: Either label_column_name or "
+                   "output_neighbors has to be non-NULL.")
+
+        plpy.execute("""
+            CREATE TEMP TABLE {interim_table} AS
+                SELECT * FROM (
                     SELECT row_number() over
                             (partition by {test_id_temp} order by dist) AS r,
                             {x_temp_table}.*
-                    FROM
-                        (
+                    FROM (
                         SELECT test.{test_id} AS {test_id_temp} ,
                             train.{point_id} as train_id ,
                             {schema_madlib}.squared_dist_norm2(
                                 train.{point_column_name},
                                 test.{test_column_name})
-                            AS dist
-                            FROM {point_source} AS train, {test_source} AS test
+                            AS dist {label_out}
+                            FROM {point_source} AS train,
+                                {test_source} AS test
                         ) {x_temp_table}
                     ) {y_temp_table}
-                WHERE {y_temp_table}.r <= {k_val}
-                """.format(**locals()))
-            plpy.execute(
-                """
-                CREATE TABLE {output_table} AS
-                    SELECT {test_id_temp} AS id, {test_column_name} ,
-                        CASE WHEN {output_neighbors}
-                        THEN array_agg(knn_temp.train_id)
-                        ELSE NULL END  AS k_nearest_neighbours
-                    FROM pg_temp.{interim_table} AS knn_temp
-                    join
-                        {test_source} AS knn_test ON
-                        knn_temp.{test_id_temp} = knn_test.{test_id}
-                    GROUP BY {test_id_temp}  ,  {test_column_name}
-                """.format(**locals()))
-            return
-
-        is_classification = False
-        label_column_type = get_expr_type(
-            label_column_name, point_source).lower()
-        if label_column_type in ['boolean', 'integer', 'text']:
-            is_classification = True
-            convert_boolean_to_int = '::INTEGER'
-        else:
-            is_classification = False
+            WHERE {y_temp_table}.r <= {k_val}
+            """.format(**locals()))
 
         plpy.execute(
             """
-            CREATE TEMP TABLE {interim_table} AS
-            SELECT * FROM
-                (
-                SELECT row_number() over
-                        (partition by {test_id_temp} order by dist) AS r,
-                        {x_temp_table}.*
-                FROM
-                    (
-                    SELECT test.{test_id} AS {test_id_temp} ,
-                        train.{point_id} as train_id ,
-                        {schema_madlib}.squared_dist_norm2(
-                            train.{point_column_name},
-                            test.{test_column_name})
-                        AS dist,
-                        train.{label_column_name}{cast_to_int}
-                            AS {label_col_temp}
-                        FROM {point_source} AS train, {test_source} AS test
-                    ) {x_temp_table}
-                ) {y_temp_table}
-            WHERE {y_temp_table}.r <= {k_val}
-            """.format(cast_to_int='::INTEGER' if is_classification else '',
-                       **locals()))
-
-        knn_create_table = 'CREATE TABLE ' + output_table + ' AS '  \
-            'SELECT ' + test_id_temp + ' AS id,' + test_column_name + ','
-        knn_pred_class = schema_madlib + \
-            '.mode(' + label_col_temp + ') AS prediction'
-        knn_pred_reg = 'avg(' + label_col_temp + ') AS prediction'
-        knn_neighbours = ', array_agg(knn_temp.train_id) AS k_nearest_neighbours '
-        knn_group_by = 'FROM pg_temp.' + interim_table + ' AS knn_temp join ' \
-            + test_source + ' AS knn_test ON  knn_temp.' + test_id_temp + '= knn_test.' \
-            + test_id + ' GROUP BY ' + test_id_temp + ', ' + test_column_name
-
-        if is_classification:
-            if output_neighbors:
-                plpy.execute("""{knn_create_table}{knn_pred_class}
-                    {knn_neighbours}{knn_group_by}""".format(**locals()))
-            else:
-                plpy.execute(""" {knn_create_table}{knn_pred_class}
-                    {knn_group_by}""".format(**locals()))
-        else:
-            if output_neighbors:
-                plpy.execute(""" {knn_create_table}{knn_pred_reg}
-                    {knn_neighbours}{knn_group_by}""".format(**locals()))
-            else:
-                plpy.execute("""{knn_create_table}{knn_pred_reg}
-                    {knn_group_by}""".format(**locals()))
+            CREATE TABLE {output_table} AS
+                SELECT {test_id_temp} AS id, {test_column_name}
+                    {pred_out}
+                    {knn_neighbors}
+                FROM pg_temp.{interim_table} AS knn_temp
+                    JOIN
+                    {test_source} AS knn_test ON
+                    knn_temp.{test_id_temp} = knn_test.{test_id}
+                GROUP BY {test_id_temp} , {test_column_name}
+            """.format(**locals()))
+
 
         plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table))
+        return
 # ------------------------------------------------------------------------------
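
Roughly, for a classification run with output_neighbors enabled, the
consolidated code path above assembles an output query of the following shape
(<interim>, <test_id_temp> and <label_col_temp> stand in for the unique names
generated at run time; the table and column names are illustrative):

    CREATE TABLE output_table AS
        SELECT <test_id_temp> AS id, test_column,
               madlib.mode(<label_col_temp>) AS prediction,  -- avg() for regression
               array_agg(knn_temp.train_id ORDER BY knn_temp.dist ASC)
                   AS k_nearest_neighbours
        FROM pg_temp.<interim> AS knn_temp
        JOIN test_table AS knn_test
          ON knn_temp.<test_id_temp> = knn_test.test_id
        GROUP BY <test_id_temp>, test_column;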

http://git-wip-us.apache.org/repos/asf/madlib/blob/e6e7c390/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index dfd2374..befb768 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -86,7 +86,6 @@ knn( point_source,
 <dl class="arglist">
 <dt>point_source</dt>
 <dd>TEXT. Name of the table containing the training data points.
-
 Training data points are expected to be stored row-wise
 in a column of type <tt>DOUBLE PRECISION[]</tt>.
 </dd>
@@ -96,19 +95,19 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 
 <dt>point_id</dt>
 <dd>TEXT. Name of the column in 'point_source' containing source data ids.
-The ids are of type INTEGER with no duplicates. They do not need to be contiguous. 
-This parameter must be used if the list of nearest neighbors are to be output, i.e., 
+The ids are of type INTEGER with no duplicates. They do not need to be contiguous.
+This parameter must be used if the list of nearest neighbors is to be output, i.e.,
 if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL.
 
 <dt>label_column_name</dt>
 <dd>TEXT. Name of the column with labels/values of training data points.
-If Boolean, integer or text types will run knn classification, else if 
-double precision values will run knn regression.  
-If you set this to NULL will return neighbors only without doing classification or regression.</dd>
+If this column is of Boolean, integer or text type, it will run KNN classification,
+else if it is double precision, it will run KNN regression.
+If you set this to NULL, it will only return the set of neighbors without
+actually doing classification or regression.</dd>
 
 <dt>test_source</dt>
 <dd>TEXT. Name of the table containing the test data points.
-
 Testing data points are expected to be stored row-wise
 in a column of type <tt>DOUBLE PRECISION[]</tt>.
 </dd>
@@ -122,17 +121,15 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 <dt>output_table</dt>
 <dd>TEXT. Name of the table to store final results.</dd>
 
-<dt>operation</dt>
-<dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd>
-
 <dt>k (optional)</dt>
 <dd>INTEGER. default: 1. Number of nearest neighbors to consider.
-For classification, should be an odd number to break ties.
-otherwise result may depend on ordering of the input data.</dd>
+For classification, should be an odd number to break ties,
+otherwise the result may depend on ordering of the input data.</dd>
 
 <dt>output_neighbors (optional) </dt>
-<dd>BOOLEAN default: FALSE. Outputs the list of k-nearest 
-neighbors that were used in the voting/averaging.</dd>
+<dd>BOOLEAN default: TRUE. Outputs the list of k-nearest
+neighbors that were used in the voting/averaging, sorted
+from closest to furthest.</dd>
 
 </dl>
 
@@ -164,9 +161,9 @@ The output of the KNN module is a table with the following columns:
 <pre class="example">
 DROP TABLE IF EXISTS knn_train_data;
 CREATE TABLE knn_train_data (
-                    id integer, 
-                    data integer[], 
-                    label integer
+                    id integer,
+                    data integer[],
+                    label integer  -- Integer label is used for classification
                     );
 INSERT INTO knn_train_data VALUES
 (1, '{1,1}', 1),
@@ -184,9 +181,9 @@ INSERT INTO knn_train_data VALUES
 <pre class="example">
 DROP TABLE IF EXISTS knn_train_data_reg;
 CREATE TABLE knn_train_data_reg (
-                    id integer, 
-                    data integer[], 
-                    label float
+                    id integer,
+                    data integer[],
+                    label float  -- Float label is used for regression
                     );
 INSERT INTO knn_train_data_reg VALUES
 (1, '{1,1}', 1.0),
@@ -204,7 +201,7 @@ INSERT INTO knn_train_data_reg VALUES
 <pre class="example">
 DROP TABLE IF EXISTS knn_test_data;
 CREATE TABLE knn_test_data (
-                    id integer, 
+                    id integer,
                     data integer[]
                     );
 INSERT INTO knn_test_data VALUES
@@ -218,26 +215,26 @@ INSERT INTO knn_test_data VALUES
 
 -#  Run KNN for classification:
 <pre class="example">
-DROP TABLE IF EXISTS madlib_knn_result_classification;
-SELECT * FROM madlib.knn( 
+DROP TABLE IF EXISTS knn_result_classification;
+SELECT * FROM madlib.knn(
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
-                'id',                  -- Col Name of id in train data 
+                'id',                  -- Col Name of id in train data
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
-                'id',                  -- Col name of id in test data 
-                'madlib_knn_result_classification',  -- Output table
-                 3                     -- Number of nearest neighbours
-                 True                  -- True if you want to show Nearest-Neighbors, False otherwise
+                'id',                  -- Col name of id in test data
+                'knn_result_classification',  -- Output table
+                 3,                    -- Number of nearest neighbors
+                 True                  -- True if you want to show Nearest-Neighbors by id, False otherwise
                 );
-SELECT * from madlib_knn_result_classification ORDER BY id;
+SELECT * from knn_result_classification ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id |  data   | prediction | k_nearest_neighbours 
+ id |  data   | prediction | k_nearest_neighbours
 ----+---------+------------+----------------------
-  1 | {2,1}   |          1 | {1,2,3}
+  1 | {2,1}   |          1 | {2,1,3}
   2 | {2,6}   |          1 | {5,4,3}
   3 | {15,40} |          0 | {7,6,5}
   4 | {12,1}  |          1 | {4,5,3}
@@ -245,29 +242,30 @@ Result:
   6 | {50,45} |          0 | {6,7,8}
 (6 rows)
 </pre>
+Note that the nearest neighbors are sorted from closest to furthest from the corresponding test point.
 
 -#  Run KNN for regression:
 <pre class="example">
-DROP TABLE IF EXISTS madlib_knn_result_regression;
-SELECT * FROM madlib.knn( 
+DROP TABLE IF EXISTS knn_result_regression;
+SELECT * FROM madlib.knn(
                 'knn_train_data_reg',  -- Table of training data
                 'data',                -- Col name of training data
-                'id',                  -- Col Name of id in train data 
+                'id',                  -- Col Name of id in train data
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
-                'id',                  -- Col name of id in test data 
-                'madlib_knn_result_regression',  -- Output table
-                 3,                    -- Number of nearest neighbours
+                'id',                  -- Col name of id in test data
+                'knn_result_regression',  -- Output table
+                 3,                    -- Number of nearest neighbors
                 True                   -- True if you want to show Nearest-Neighbors, False otherwise
                 );
-SELECT * from madlib_knn_result_regression ORDER BY id;
+SELECT * FROM knn_result_regression ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id |  data   |    prediction     | k_nearest_neighbours 
+ id |  data   |    prediction     | k_nearest_neighbours
 ----+---------+-------------------+----------------------
-  1 | {2,1}   |                 1 | {1,2,3}
+  1 | {2,1}   |                 1 | {2,1,3}
   2 | {2,6}   |                 1 | {5,4,3}
   3 | {15,40} | 0.333333333333333 | {7,6,5}
   4 | {12,1}  |                 1 | {4,5,3}
@@ -276,6 +274,35 @@ Result:
 (6 rows)
 </pre>
 
+-#  List nearest neighbors only, without doing classification or regression:
+<pre class="example">
+DROP TABLE IF EXISTS knn_result_list_neighbors;
+SELECT * FROM madlib.knn(
+                'knn_train_data_reg',  -- Table of training data
+                'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data
+                NULL,                  -- NULL training labels means just list neighbors
+                'knn_test_data',       -- Table of test data
+                'data',                -- Col name of test data
+                'id',                  -- Col name of id in test data
+                'knn_result_list_neighbors', -- Output table
+                3                      -- Number of nearest neighbors
+                );
+SELECT * FROM knn_result_list_neighbors ORDER BY id;
+</pre>
+Result, with neighbors sorted from closest to furthest:
+<pre class="result">
+ id |  data   | k_nearest_neighbours
+----+---------+----------------------
+  1 | {2,1}   | {2,1,3}
+  2 | {2,6}   | {5,4,3}
+  3 | {15,40} | {7,6,5}
+  4 | {12,1}  | {4,5,3}
+  5 | {2,90}  | {9,6,7}
+  6 | {50,45} | {6,7,8}
+(6 rows)
+</pre>
+
 @anchor background
 @par Technical Background
 
@@ -447,3 +474,42 @@ BEGIN
 END;
 $$ LANGUAGE plpgsql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+    point_source VARCHAR,
+    point_column_name VARCHAR,
+    point_id VARCHAR,
+    label_column_name VARCHAR,
+    test_source VARCHAR,
+    test_column_name VARCHAR,
+    test_id VARCHAR,
+    output_table VARCHAR,
+    k INTEGER
+) RETURNS VARCHAR AS $$
+DECLARE
+    returnstring VARCHAR;
+BEGIN
+    returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,TRUE);
+    RETURN returnstring;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+    point_source VARCHAR,
+    point_column_name VARCHAR,
+    point_id VARCHAR,
+    label_column_name VARCHAR,
+    test_source VARCHAR,
+    test_column_name VARCHAR,
+    test_id VARCHAR,
+    output_table VARCHAR
+) RETURNS VARCHAR AS $$
+DECLARE
+    returnstring VARCHAR;
+BEGIN
+    returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,TRUE);
+    RETURN returnstring;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
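
Both overloads above delegate to the full signature, with output_neighbors
defaulting to TRUE (and k additionally defaulting to 1 in the 8-argument form).
As a minimal sketch using the example tables from this file's documentation
(the output table name is illustrative), a call that passes k but omits
output_neighbors now returns the sorted k_nearest_neighbours column by default:

    DROP TABLE IF EXISTS knn_result_default_neighbors;
    SELECT * FROM madlib.knn(
                    'knn_train_data', 'data', 'id', 'label',
                    'knn_test_data',  'data', 'id',
                    'knn_result_default_neighbors',
                    3   -- k; output_neighbors is omitted and defaults to TRUE
                    );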

http://git-wip-us.apache.org/repos/asf/madlib/blob/e6e7c390/src/ports/postgres/modules/knn/test/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in
index fa38751..1e71a0e 100644
--- a/src/ports/postgres/modules/knn/test/knn.sql_in
+++ b/src/ports/postgres/modules/knn/test/knn.sql_in
@@ -44,8 +44,8 @@ copy knn_train_data (id, data, label) from stdin delimiter '|';
 \.
 DROP TABLE IF EXISTS knn_train_data_reg;
 CREATE TABLE knn_train_data_reg (
-                    id integer, 
-                    data integer[], 
+                    id integer,
+                    data integer[],
                     label float
                     );
 COPY knn_train_data_reg (id, data, label) from stdin delimiter '|';
@@ -59,6 +59,7 @@ COPY knn_train_data_reg (id, data, label) from stdin delimiter '|';
 8|{81,13}|0.0
 9|{1,111}|0.0
 \.
+DROP TABLE IF EXISTS knn_test_data;
 create table knn_test_data (
 id  integer,
 data integer[]);
@@ -70,6 +71,7 @@ copy knn_test_data (id, data) from stdin delimiter '|';
 5|{2,90}
 6|{50,45}
 \.
+
 drop table if exists madlib_knn_result_classification;
 select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,False);
 select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
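
A possible extra assertion to exercise the new ordering guarantee, assuming the
test tables here hold the same rows as the documentation examples (the result
table name and the expected first neighbors are taken from the documented
example output, so this is a sketch rather than part of the committed test):

    drop table if exists madlib_knn_result_ordering;
    select knn('knn_train_data_reg','data','id',NULL,'knn_test_data','data','id','madlib_knn_result_ordering',3);
    select assert(array_agg(k_nearest_neighbours[1] order by id)='{2,5,7,4,9,6}', 'Wrong neighbor ordering with k=3') from madlib_knn_result_ordering;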