You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/02/01 22:06:26 UTC
incubator-madlib git commit: K-means: support for array input
Repository: incubator-madlib
Updated Branches:
refs/heads/master 071128d7c -> 08294791f
K-means: support for array input
JIRA: MADLIB-1018
Adds support for array expressions as data points. The function collates
the columns into a single array column exposed through a temporary view.
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/08294791
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/08294791
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/08294791
Branch: refs/heads/master
Commit: 08294791fbfcfef053c7a752bc87a42aeba117e1
Parents: 071128d
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Feb 1 14:04:38 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Feb 1 14:04:38 2017 -0800
----------------------------------------------------------------------
src/ports/postgres/modules/kmeans/kmeans.py_in | 58 ++++++++++++++++++++
src/ports/postgres/modules/kmeans/kmeans.sql_in | 32 +++++++----
.../postgres/modules/kmeans/test/kmeans.sql_in | 10 +++-
3 files changed, 86 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/kmeans.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.py_in b/src/ports/postgres/modules/kmeans/kmeans.py_in
index d99ddd8..da75d78 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.py_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.py_in
@@ -12,11 +12,15 @@ m4_changequote(`<!', `!>')
"""
import plpy
+import re
from utilities.control import IterationController2D
from utilities.control_composite import IterationControllerComposite
from utilities.validate_args import table_exists
+from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_is_empty
+from utilities.validate_args import get_expr_type
+from utilities.utilities import unique_string
STATE_IN_MEM = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>, <!False!>)
@@ -34,6 +38,31 @@ def kmeans_validate_src(schema_madlib, rel_source, **kwargs):
# ----------------------------------------------------------------------
+def kmeans_validate_expr(schema_madlib, rel_source, expr_point, **kwargs):
+ """
+ Validate the expr_point parameter, which accepts 2 formats:
+ - The name of a single column holding a numeric array
+ - A numeric array expression (e.g. array[x,y])
+ Returns True for an array expression, False for a plain column name.
+ """
+
+ expr_type = get_expr_type(expr_point,rel_source).lower()
+
+ # Both formats should return a numeric array type
+ if expr_type in ['smallint[]', 'integer[]', 'bigint[]', 'decimal[]',
+ 'numeric[]', 'real[]', 'double precision[]',
+ 'serial[]', 'bigserial[]', 'float8[]']:
+
+ # An array expression should fail this check
+ if columns_exist_in_table(rel_source, [expr_point]):
+ return False
+ return True
+ else:
+ plpy.error(
+ """Kmeans error: {expr_point} is not a valid column or array!
+ """.format(**locals()))
+
+# ----------------------------------------------------------------------
def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
expr_point, **kwargs):
@@ -54,6 +83,16 @@ def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
@return The iteration number (i.e., the key) with which to look up the
result in \c rel_state
"""
+
+ if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+ view_name = unique_string('km_view')
+
+ plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+ SELECT {expr_point} AS expr FROM {rel_source}
+ """.format(**locals()))
+ rel_source = view_name
+ expr_point = 'expr'
+
fn_dist_name = plpy.execute("SELECT fn_dist_name FROM " + rel_args)[0]['fn_dist_name']
iterationCtrl = IterationController2D(
rel_args=rel_args,
@@ -139,6 +178,15 @@ def compute_kmeans_random_seeding(schema_madlib, rel_args, rel_state,
@return The iteration number (i.e., the key) with which to look up the
result in \c rel_state
"""
+ if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+ view_name = unique_string('km_view')
+
+ plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+ SELECT {expr_point} AS expr FROM {rel_source}
+ """.format(**locals()))
+ rel_source = view_name
+ expr_point = 'expr'
+
iterationCtrl = IterationController2D(
rel_args=rel_args,
rel_state=rel_state,
@@ -211,6 +259,16 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source,
@return The iteration number (i.e., the key) with which to look up the
result in \c rel_state
"""
+
+ if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+ view_name = unique_string('km_view')
+
+ plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+ SELECT {expr_point} AS expr FROM {rel_source}
+ """.format(**locals()))
+ rel_source = view_name
+ expr_point = 'expr'
+
fn_dist_name = plpy.execute("SELECT fn_dist_name FROM " +
rel_args)[0]['fn_dist_name']
iterationCtrl = IterationControllerComposite(
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index 2352bc1..16014ef 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -105,7 +105,7 @@ are skipped during analysis.
</dd>
<dt>expr_point</dt>
-<dd>TEXT. The name of the column with point coordinates.</dd>
+<dd>TEXT. The name of the column with point coordinates or an array expression.</dd>
<dt>k</dt>
<dd>INTEGER. The number of centroids to calculate.</dd>
@@ -148,20 +148,11 @@ Note: the final K-means algorithm is run on the complete dataset. This parameter
only builds a subsample for the seeding and is only available for kmeans++.
<dt>rel_initial_centroids</dt>
-<dd>TEXT. The set of initial centroids. The centroid relation is
-expected to be of the following form:
-<pre>
-{TABLE|VIEW} rel_initial_centroids (
- ...
- expr_centroid DOUBLE PRECISION[],
- ...
-)
-</pre>
-where <em>expr_centroid</em> is the name of a column with coordinates.
+<dd>TEXT. The name of the table or view containing the set of initial centroids.
</dd>
<dt>expr_centroid</dt>
-<dd>TEXT. The name of the column in the <em>rel_initial_centroids</em> relation that contains the centroid coordinates.</dd>
+<dd>TEXT. The name of the column (or the array expression) in the <em>rel_initial_centroids</em> relation that contains the centroid coordinates.</dd>
<dt>initial_centroids</dt>
<dd>TEXT. A string containing a DOUBLE PRECISION array expression with the initial centroid coordinates.</dd>
@@ -501,6 +492,13 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_src(
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_expr(
+ rel_source VARCHAR,
+ expr_point VARCHAR
+) RETURNS BOOLEAN AS $$
+ PythonFunction(kmeans, kmeans, kmeans_validate_expr)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__seeding_validate_args(
rel_source VARCHAR,
@@ -516,6 +514,11 @@ DECLARE
rel_source_regclass REGCLASS;
rel_filtered VARCHAR;
BEGIN
+
+ -- Validate the expr_point input. Since we don't need a view at this
+ -- point, the output is safe to ignore.
+ PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point);
+
rel_source_regclass := rel_source;
IF (initial_centroids IS NOT NULL) THEN
@@ -532,6 +535,7 @@ BEGIN
Number of clusters k must be <= 32767 (for results to be returned in a
reasonable amount of time).';
END IF;
+
EXECUTE $sql$ SELECT count(*)
FROM $sql$ || textin(regclassout(rel_source_regclass)) || $sql$
WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 $sql$
@@ -1571,6 +1575,10 @@ BEGIN
PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source);
+ -- Validate the expr_point input. Since we don't need a view at this
+ -- point, the output is safe to ignore.
+ PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point);
+
class_rel_source := rel_source;
proc_fn_dist := fn_dist
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/test/kmeans.sql_in b/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
index b95693b..072ecb3 100644
--- a/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
@@ -28,7 +28,7 @@ FROM (
) AS centroids, generate_series(1,100) i;
CREATE TABLE centroids AS
-SELECT position
+SELECT x,y,position
FROM kmeans_2d
ORDER BY random()
LIMIT 10;
@@ -82,8 +82,14 @@ COPY km_sample (pid, points) FROM stdin DELIMITER '|';
10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, NULL, 1045}
\.
-DROP TABLE IF EXISTS centroids;
SELECT * FROM kmeanspp('km_sample', 'points', 2,
'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001);
+
+
+SELECT * FROM kmeans('kmeans_2d', 'array[x,y]', 'centroids', 'array[x,y]');
+SELECT * FROM kmeanspp('kmeans_2d', 'array[x,y]', 10);
+SELECT * FROM kmeans_random('kmeans_2d', 'arRAy [ x,y]', 10);
+
+DROP TABLE IF EXISTS centroids;