You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/02/01 22:06:26 UTC

incubator-madlib git commit: K-means: support for array input

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 071128d7c -> 08294791f


K-means: support for array input

JIRA: MADLIB-1018

Adds support for array expressions as data points. When expr_point is an
array expression rather than a single column name, the function collates
the columns into a single array column in a temporary view.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/08294791
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/08294791
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/08294791

Branch: refs/heads/master
Commit: 08294791fbfcfef053c7a752bc87a42aeba117e1
Parents: 071128d
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Feb 1 14:04:38 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Feb 1 14:04:38 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/kmeans/kmeans.py_in  | 58 ++++++++++++++++++++
 src/ports/postgres/modules/kmeans/kmeans.sql_in | 32 +++++++----
 .../postgres/modules/kmeans/test/kmeans.sql_in  | 10 +++-
 3 files changed, 86 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/kmeans.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.py_in b/src/ports/postgres/modules/kmeans/kmeans.py_in
index d99ddd8..da75d78 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.py_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.py_in
@@ -12,11 +12,15 @@ m4_changequote(`<!', `!>')
 """
 
 import plpy
+import re
 
 from utilities.control import IterationController2D
 from utilities.control_composite import IterationControllerComposite
 from utilities.validate_args import table_exists
+from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_is_empty
+from utilities.validate_args import get_expr_type
+from utilities.utilities import unique_string
 
 STATE_IN_MEM = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
 HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>, <!False!>)
@@ -34,6 +38,31 @@ def kmeans_validate_src(schema_madlib, rel_source, **kwargs):
 
 # ----------------------------------------------------------------------
 
+def kmeans_validate_expr(schema_madlib, rel_source, expr_point, **kwargs):
+    """
+    Validation function for the expr_point parameter
+    expr_point accepts 2 formats:
+        - A single column name of a numeric array
+        - A numeric array expression
+    """
+
+    expr_type = get_expr_type(expr_point,rel_source).lower()
+
+    # Both formats should return a numeric array type
+    if expr_type in ['smallint[]', 'integer[]', 'bigint[]', 'decimal[]',
+                        'numeric[]', 'real[]', 'double precision[]',
+                        'serial[]', 'bigserial[]', 'float8[]']:
+
+        # An array expression should fail this check
+        if columns_exist_in_table(rel_source, [expr_point]):
+            return False
+        return True
+    else:
+        plpy.error(
+            """Kmeans error: {expr_point} is not a valid column or array!
+            """.format(**locals()))
+
+# ----------------------------------------------------------------------
 
 def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
                              expr_point, **kwargs):
@@ -54,6 +83,16 @@ def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
     @return The iteration number (i.e., the key) with which to look up the
         result in \c rel_state
     """
+
+    if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+        view_name = unique_string('km_view')
+
+        plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+            SELECT {expr_point} AS expr FROM {rel_source}
+            """.format(**locals()))
+        rel_source = view_name
+        expr_point = 'expr'
+
     fn_dist_name = plpy.execute("SELECT fn_dist_name FROM " + rel_args)[0]['fn_dist_name']
     iterationCtrl = IterationController2D(
         rel_args=rel_args,
@@ -139,6 +178,15 @@ def compute_kmeans_random_seeding(schema_madlib, rel_args, rel_state,
     @return The iteration number (i.e., the key) with which to look up the
         result in \c rel_state
     """
+    if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+        view_name = unique_string('km_view')
+
+        plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+            SELECT {expr_point} AS expr FROM {rel_source}
+            """.format(**locals()))
+        rel_source = view_name
+        expr_point = 'expr'
+
     iterationCtrl = IterationController2D(
         rel_args=rel_args,
         rel_state=rel_state,
@@ -211,6 +259,16 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source,
     @return The iteration number (i.e., the key) with which to look up the
         result in \c rel_state
     """
+
+    if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+        view_name = unique_string('km_view')
+
+        plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+            SELECT {expr_point} AS expr FROM {rel_source}
+            """.format(**locals()))
+        rel_source = view_name
+        expr_point = 'expr'
+
     fn_dist_name = plpy.execute("SELECT fn_dist_name FROM " +
                                 rel_args)[0]['fn_dist_name']
     iterationCtrl = IterationControllerComposite(

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index 2352bc1..16014ef 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -105,7 +105,7 @@ are skipped during analysis.
 </dd>
 
 <dt>expr_point</dt>
-<dd>TEXT. The name of the column with point coordinates.</dd>
+<dd>TEXT. The name of the column with point coordinates or an array expression.</dd>
 
 <dt>k</dt>
 <dd>INTEGER. The number of centroids to calculate.</dd>
@@ -148,20 +148,11 @@ Note: the final K-means algorithm is run on the complete dataset. This parameter
 only builds a subsample for the seeding and is only available for kmeans++.
 
 <dt>rel_initial_centroids</dt>
-<dd>TEXT. The set of initial centroids. The centroid relation is
-expected to be of the following form:
-<pre>
-{TABLE|VIEW} rel_initial_centroids (
-    ...
-    expr_centroid DOUBLE PRECISION[],
-    ...
-)
-</pre>
-where <em>expr_centroid</em> is the name of a column with coordinates.
+<dd>TEXT. The name of the table or view that contains the set of initial centroids.
 </dd>
 
 <dt>expr_centroid</dt>
-<dd>TEXT. The name of the column in the <em>rel_initial_centroids</em> relation that contains the centroid coordinates.</dd>
+<dd>TEXT. The name of the column in the <em>rel_initial_centroids</em> relation that contains the centroid coordinates, or an array expression over that relation's columns that evaluates to them.</dd>
 
 <dt>initial_centroids</dt>
 <dd>TEXT. A string containing a DOUBLE PRECISION array expression with the initial centroid coordinates.</dd>
@@ -501,6 +492,13 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_src(
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_expr(
+    rel_source      VARCHAR,
+    expr_point      VARCHAR
+) RETURNS BOOLEAN AS $$
+    PythonFunction(kmeans, kmeans, kmeans_validate_expr)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__seeding_validate_args(
     rel_source VARCHAR,
@@ -516,6 +514,11 @@ DECLARE
   rel_source_regclass REGCLASS;
   rel_filtered VARCHAR;
 BEGIN
+
+    -- Validate the expr_point input. Since we don't need a view at this
+    -- point, the output is safe to ignore.
+    PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point);
+
     rel_source_regclass := rel_source;
 
     IF (initial_centroids IS NOT NULL) THEN
@@ -532,6 +535,7 @@ BEGIN
         Number of clusters k must be <= 32767 (for results to be returned in a
         reasonable amount of time).';
     END IF;
+
     EXECUTE $sql$ SELECT count(*)
                   FROM $sql$ || textin(regclassout(rel_source_regclass)) || $sql$
                   WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 $sql$
@@ -1571,6 +1575,10 @@ BEGIN
 
     PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source);
 
+    -- Validate the expr_point input. Since we don't need a view at this
+    -- point, the output is safe to ignore.
+    PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point);
+
     class_rel_source := rel_source;
 
     proc_fn_dist := fn_dist

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/test/kmeans.sql_in b/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
index b95693b..072ecb3 100644
--- a/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
@@ -28,7 +28,7 @@ FROM (
 ) AS centroids, generate_series(1,100) i;
 
 CREATE TABLE centroids AS
-SELECT position
+SELECT x,y,position
 FROM kmeans_2d
 ORDER BY random()
 LIMIT 10;
@@ -82,8 +82,14 @@ COPY km_sample (pid, points) FROM stdin DELIMITER '|';
 10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, NULL, 1045}
 \.
 
-DROP TABLE IF EXISTS centroids;
 
 SELECT * FROM kmeanspp('km_sample', 'points', 2,
                        'MADLIB_SCHEMA.squared_dist_norm2',
                        'MADLIB_SCHEMA.avg', 20, 0.001);
+
+
+SELECT * FROM kmeans('kmeans_2d', 'array[x,y]', 'centroids', 'array[x,y]');
+SELECT * FROM kmeanspp('kmeans_2d', 'array[x,y]', 10);
+SELECT * FROM kmeans_random('kmeans_2d', 'arRAy [ x,y]', 10);
+
+DROP TABLE IF EXISTS centroids;