You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/27 02:38:14 UTC

incubator-madlib git commit: SVM: Add polynomial kernel transformation

Repository: incubator-madlib
Updated Branches:
  refs/heads/master dfeffb654 -> 3e576c3ba


SVM: Add polynomial kernel transformation

JIRA: MADLIB-938

The features are transformed using Kar and Karnick's approach, which
applies to any positive definite kernel where the kernel function
admits a Maclaurin expansion. This commit implements the special
case when the kernel function is polynomial.

As part of this work, we also add support for the Bernoulli distribution
to matrix_random.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/3e576c3b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/3e576c3b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/3e576c3b

Branch: refs/heads/master
Commit: 3e576c3baf3f38e6ba69ecb6152a7bfb784fd493
Parents: dfeffb6
Author: Xiaocheng Tang <xi...@gmail.com>
Authored: Fri Feb 26 17:19:42 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Fri Feb 26 17:21:57 2016 -0800

----------------------------------------------------------------------
 src/modules/linalg/matrix_ops.cpp               |  60 +++-
 src/modules/linalg/matrix_ops.hpp               |   2 +
 .../postgres/modules/linalg/matrix_ops.py_in    |  65 ++--
 .../postgres/modules/linalg/matrix_ops.sql_in   |  57 +++-
 .../modules/svm/kernel_approximation.py_in      | 313 ++++++++++++++++++-
 src/ports/postgres/modules/svm/svm.py_in        |   3 +-
 src/ports/postgres/modules/svm/test/svm.sql_in  |  61 +++-
 7 files changed, 514 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/modules/linalg/matrix_ops.cpp
----------------------------------------------------------------------
diff --git a/src/modules/linalg/matrix_ops.cpp b/src/modules/linalg/matrix_ops.cpp
index 307fa52..c671416 100644
--- a/src/modules/linalg/matrix_ops.cpp
+++ b/src/modules/linalg/matrix_ops.cpp
@@ -14,6 +14,7 @@
 #include <numeric>
 #include <boost/random/uniform_real.hpp>
 #include <boost/random/normal_distribution.hpp>
+#include <boost/random/bernoulli_distribution.hpp>
 #include <boost/random/variate_generator.hpp>
 #include <boost/generator_iterator.hpp>
 #include <boost/random/linear_congruential.hpp>
@@ -238,15 +239,41 @@ AnyType normal_vector::run(AnyType & args)
     if (dim < 1) {
         throw std::invalid_argument("invalid argument - dim should be positive");
     }
-    ColumnVector r(dim);
+    ColumnVector res(dim);
     boost::minstd_rand generator(seed);
     boost::normal_distribution<> nd_dist(mu, sigma);
     boost::variate_generator<boost::minstd_rand&, boost::normal_distribution<> > nd(generator, nd_dist);
 
     for (int i = 0; i < dim; i++){
-        r(i) = (double)nd();
+        res(i) = (double)nd();
     }
-    return r;
+    return res;
+}
+
+AnyType bernoulli_vector::run(AnyType & args)
+{
+    int dim = args[0].getAs<int>();
+    double upper_val = args[1].getAs<double>();
+    double lower_val = args[2].getAs<double>();
+    double prob = args[3].getAs<double>();
+    int seed = args[4].getAs<int>();
+
+    if (dim < 1) {
+        throw std::invalid_argument("invalid argument - dim should be positive");
+    }
+    if (prob > 1 || prob < 0) {
+        throw std::invalid_argument("invalid argument - probability should be in [0,1]");
+    }
+
+    ColumnVector res(dim);
+    boost::minstd_rand generator(seed);
+    boost::bernoulli_distribution<> bn_dist(prob);
+    boost::variate_generator<boost::minstd_rand&, boost::bernoulli_distribution<> > bn(generator, bn_dist);
+
+    for (int i = 0; i < dim; i++) {
+        res(i) = bn() ? upper_val : lower_val;
+    }
+    return res;
 }
 
 AnyType uniform_vector::run(AnyType & args)
@@ -259,14 +286,14 @@ AnyType uniform_vector::run(AnyType & args)
     if (dim < 1) {
         throw std::invalid_argument("invalid argument - dim should be positive");
     }
-    ColumnVector r(dim);
+    ColumnVector res(dim);
     boost::minstd_rand generator(seed);
-    boost::uniform_real<> uni_dist(min_,max_);
+    boost::uniform_real<> uni_dist(min_, max_);
     boost::variate_generator<boost::minstd_rand&, boost::uniform_real<> > uni(generator, uni_dist);
     for (int i = 0; i < dim; i++){
-        r(i) = (double)uni();
+        res(i) = (double)uni();
     }
-    return r;
+    return res;
 }
 
 AnyType matrix_vec_mult_in_mem_2d::run(AnyType & args){
@@ -304,7 +331,24 @@ AnyType matrix_vec_mult_in_mem_1d::run(AnyType & args){
     return v;
 }
 
-AnyType rand_block::run(AnyType & args) {
+AnyType row_fold::run(AnyType & args){
+    MappedColumnVector vec = args[0].getAs<MappedColumnVector>();
+    MappedIntegerVector pat = args[1].getAs<MappedIntegerVector>();
+
+    if (vec.size() != pat.sum()) {
+        throw std::invalid_argument(
+            "dimensions mismatch: row_in.size() != pattern.sum()");
+    }
+
+    ColumnVector r(pat.size());
+    for (int i = 0, j = 0; i < pat.size(); j += pat[i++])
+        r[i] = vec.segment(j, pat[i]).prod();
+
+    return r;
+}
+
+AnyType rand_block::run(AnyType & args)
+{
     int row_dim = args[0].getAs<int>();
     int col_dim = args[1].getAs<int>();
     if (row_dim < 1 || col_dim < 1) {

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/modules/linalg/matrix_ops.hpp
----------------------------------------------------------------------
diff --git a/src/modules/linalg/matrix_ops.hpp b/src/modules/linalg/matrix_ops.hpp
index 2fe8452..085e51f 100644
--- a/src/modules/linalg/matrix_ops.hpp
+++ b/src/modules/linalg/matrix_ops.hpp
@@ -16,8 +16,10 @@ DECLARE_UDF(linalg, matrix_mem_sum_sfunc)
 DECLARE_UDF(linalg, rand_block)
 DECLARE_UDF(linalg, rand_vector)
 DECLARE_UDF(linalg, uniform_vector)
+DECLARE_UDF(linalg, bernoulli_vector)
 DECLARE_UDF(linalg, normal_vector)
 DECLARE_UDF(linalg, matrix_vec_mult_in_mem_2d)
 DECLARE_UDF(linalg, matrix_vec_mult_in_mem_1d)
+DECLARE_UDF(linalg, row_fold)
 DECLARE_SR_UDF(linalg, row_split)
 DECLARE_SR_UDF(linalg, unnest_block)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/linalg/matrix_ops.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/matrix_ops.py_in b/src/ports/postgres/modules/linalg/matrix_ops.py_in
index 53b5e59..d11dcdb 100644
--- a/src/ports/postgres/modules/linalg/matrix_ops.py_in
+++ b/src/ports/postgres/modules/linalg/matrix_ops.py_in
@@ -3090,7 +3090,19 @@ def matrix_ones(schema_madlib, row_dim, col_dim, matrix_out, out_args):
 
 def matrix_random(schema_madlib, distribution, row_dim, col_dim,
                   in_args, matrix_out, out_args):
-    """ Create a row_dim by col_dim random matrix from distribution.
+    """ Generate a row_dim x col_dim random matrix sampled from given distribution
+
+        Args:
+            schema_madlib: MADlib schema name
+            distribution: Str, Name of the sampling distribution.
+                            Supported names: uniform, normal, bernoulli
+            row_dim: int, Target matrix row dimensionality
+            col_dim: int, Target matrix col dimensionality
+            in_args: str, Distribution parameters in key=value pairs.
+                        Supported parameters:
+                            Normal: mu, sigma
+                            Uniform: min, max
+                            Bernoulli: lower, upper, prob
     """
     _validate_output_table(matrix_out)
     _assert(row_dim > 0 and col_dim > 0,
@@ -3100,8 +3112,8 @@ def matrix_random(schema_madlib, distribution, row_dim, col_dim,
     out_args = parse_matrix_args(
         out_args, in_default_args={'row': 'row', 'col': 'col', 'val': 'val'})
 
+    supported_dist = ['uniform', 'normal', 'bernoulli']
     if distribution:
-        supported_dist = ['uniform', 'normal']
         try:
             distribution = next(x for x in supported_dist
                                 if x.startswith(distribution))
@@ -3113,44 +3125,51 @@ def matrix_random(schema_madlib, distribution, row_dim, col_dim,
     else:
         distribution = 'uniform'
 
-    in_args_default = {
-        'seed': randint(1, 1000),
-        'mu': 0,
-        'sigma': 1,
-        'min': 0,
-        'max': 1}
-
-    in_args_types = {
-        'seed': int,
-        'mu': float,
-        'sigma': float,
-        'min': float,
-        'max': float}
-
-    in_args_vals = extract_keyvalue_params(in_args, in_args_types, in_args_default)
-
+    in_args_default = {'seed': randint(0, 1000), 'table_type': ''}
+    in_args_types = {'seed': int, 'table_type': str}
     if distribution == 'normal':
-        random_row = ("""{schema_madlib}.__normal_vector(
+        in_args_default.update({'mu': 0, 'sigma': 1})
+        in_args_types.update({'mu': float, 'sigma': float})
+        in_args_vals = extract_keyvalue_params(in_args, in_args_types, in_args_default)
+        random_vector = ("""{schema_madlib}.__normal_vector(
                                {col_dim},
                                {in_args_vals[mu]},
                                {in_args_vals[sigma]},
                                {in_args_vals[seed]} + row)
                       """.format(**locals()))
-    else:
-        random_row = ("""{schema_madlib}.__uniform_vector(
+    elif distribution == 'bernoulli':
+        in_args_default.update({'upper': 1., 'lower': 0., 'prob': 0.5})
+        in_args_types.update({'upper': float, 'lower': float, 'prob': float})
+        in_args_vals = extract_keyvalue_params(in_args, in_args_types, in_args_default)
+        random_vector = ("""{schema_madlib}.__bernoulli_vector(
+                               {col_dim},
+                               {in_args_vals[upper]},
+                               {in_args_vals[lower]},
+                               {in_args_vals[prob]},
+                               {in_args_vals[seed]} + row)
+                      """.format(**locals()))
+    elif distribution == 'uniform':
+        in_args_default.update({'min': 0, 'max': 1})
+        in_args_types.update({'min': float, 'max': float})
+        in_args_vals = extract_keyvalue_params(in_args, in_args_types, in_args_default)
+        random_vector = ("""{schema_madlib}.__uniform_vector(
                                {col_dim},
                                {in_args_vals[min]},
                                {in_args_vals[max]},
                                {in_args_vals[seed]} + row)
                       """.format(**locals()))
+    else:
+        plpy.error("Matrix Error: Invalid distribution: "
+                   "{0}. Supported distributions: ({1})"
+                   .format(distribution, ', '.join(sorted(supported_dist))))
 
     plpy.execute("""
-        CREATE TABLE {matrix_out}
+        CREATE {in_args_vals[table_type]} TABLE {matrix_out}
         m4_ifdef(`__POSTGRESQL__', `',
            `WITH (APPENDONLY=TRUE,COMPRESSTYPE=QUICKLZ)') AS
             SELECT
                 row as {out_args[row]},
-                {random_row} as {out_args[val]}
+                {random_vector} as {out_args[val]}
             FROM
                 generate_series(1, {row_dim}) as row
             m4_ifdef(`__POSTGRESQL__', `',

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/linalg/matrix_ops.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/matrix_ops.sql_in b/src/ports/postgres/modules/linalg/matrix_ops.sql_in
index 65f4fdb..2b15169 100644
--- a/src/ports/postgres/modules/linalg/matrix_ops.sql_in
+++ b/src/ports/postgres/modules/linalg/matrix_ops.sql_in
@@ -174,7 +174,7 @@ in the glossary.
 -- Create a diag matrix initialized with given diagonal elements
 &nbsp; <b>matrix_diag</b>( diag_elements, matrix_out, out_args)
 -- Create a matrix initialized with values sampled from a distribution
--- Supported distributions: normal and uniform
+-- Supported distributions: normal, uniform, bernoulli
 &nbsp; <b>matrix_random</b>( distribution, row_dim, col_dim, in_args, matrix_out, out_args )
 </pre>
 
@@ -2393,12 +2393,25 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.matrix_random
 (
+    row_id        INTEGER,
+    col_id        INTEGER,
+    distribution  TEXT,
+    matrix_out    TEXT
+)
+RETURNS MADLIB_SCHEMA.matrix_result AS $$
+    SELECT MADLIB_SCHEMA.matrix_random($1, $2, ''::TEXT, $3, $4, ''::TEXT)
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.matrix_random
+(
     row_id     INTEGER,
     col_id     INTEGER,
     matrix_out TEXT
 )
 RETURNS MADLIB_SCHEMA.matrix_result AS $$
-    SELECT MADLIB_SCHEMA.matrix_random($1, $2, ''::TEXT, 'uniform', $3, ''::TEXT)
+    SELECT MADLIB_SCHEMA.matrix_random($1, $2, 'uniform', $3)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -2969,6 +2982,46 @@ AS 'MODULE_PATHNAME', 'matrix_vec_mult_in_mem_1d'
 LANGUAGE C STRICT
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
+/**
+ * @brief The function folds (through multiplication) array x according to
+ *        the pattern in array y, producing an array of the same length as array y.
+ * @param row_in Array x
+ * @param pattern Array y
+ * @return Folded array
+ */
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__row_fold(
+    row_in  FLOAT8[],
+    pattern  INTEGER[]
+)
+RETURNS FLOAT8[]
+AS 'MODULE_PATHNAME', 'row_fold'
+LANGUAGE C STRICT
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.__bernoulli_vector
+(
+    dim INTEGER,
+    pos_val FLOAT8,
+    neg_val FLOAT8,
+    prob FLOAT8,
+    seed INTEGER
+)
+RETURNS FLOAT8[]
+AS 'MODULE_PATHNAME', 'bernoulli_vector'
+LANGUAGE C STRICT
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.__bernoulli_vector
+(
+    dim INTEGER
+)
+RETURNS FLOAT8[] AS $$
+    SELECT MADLIB_SCHEMA.__bernoulli_vector($1, 1::FLOAT8, 0::FLOAT8, 0.5::FLOAT8, 42::INTEGER)
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.__uniform_vector
 (

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/svm/kernel_approximation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/kernel_approximation.py_in b/src/ports/postgres/modules/svm/kernel_approximation.py_in
index 04aa018..4e19bcb 100644
--- a/src/ports/postgres/modules/svm/kernel_approximation.py_in
+++ b/src/ports/postgres/modules/svm/kernel_approximation.py_in
@@ -8,6 +8,295 @@ from utilities.utilities import num_features
 
 from math import sqrt
 from math import pi
+from math import log
+from math import factorial
+
+from random import random
+from random import seed
+
+from operator import mul
+from collections import namedtuple
+
+
+PolyRandOperator = namedtuple('PolyRandOperator',
+                              'weights, coefs, reps, '
+                              'other_features, rd_id, rd_val')
+
+
+class PolyKernel(object):
+    """docstring for PolyKernel"""
+    def __init__(self, schema_madlib, degree=2, coef0=1, n_components=100,
+                 random_state=1, poly_operator=None, orig_data=None):
+        self.schema_madlib = schema_madlib
+        self.kernel_func = 'polynomial'
+        self.degree = degree
+        self.coef0 = coef0
+        self.n_components = n_components
+        self.random_state = random_state
+        # polynomial random mapping operator
+        self.pro = poly_operator
+        self.orig_data = orig_data
+        if self.pro is not None:
+            pro = self.pro
+            self.n_components = num_features(pro.coefs, pro.rd_val)
+            self.n_components += num_features(pro.other_features, pro.rd_val)
+
+    def clear(self):
+        data_type = 'view' if self.orig_data else 'table'
+        if self.pro:
+            run_sql = """
+                drop {data_type} if exists {pro.weights};
+                drop {data_type} if exists {pro.coefs};
+                drop {data_type} if exists {pro.reps};
+                drop {data_type} if exists {pro.other_features};
+            """.format(pro=self.pro, data_type=data_type)
+            plpy.execute(run_sql)
+
+    def __del__(self):
+        self.clear()
+
+    def save_as(self, name):
+        if self.orig_data:
+            plpy.warning("Polynomial Kernel Warning: no need to save."
+                         "Original data table exists: {0}"
+                         .format(self.orig_data))
+            return
+
+        run_sql = """
+            create table {name} as
+                select {pro.rd_id} as id, {pro.rd_val} as val,
+                       'coefs' as desp
+                from {pro.coefs}
+                union
+                select {pro.rd_id} as id, {pro.rd_val} as val,
+                       'weights' as desp
+                from {pro.weights}
+                union
+                select {pro.rd_id} as id, {pro.rd_val} as val,
+                       'reps' as desp
+                from {pro.reps}
+                union
+                select {pro.rd_id} as id, {pro.rd_val} as val,
+                       'other_features' as desp
+                from {pro.other_features}
+        """.format(name=name, pro=self.pro)
+        plpy.execute(run_sql)
+
+    @classmethod
+    def create(cls, schema_madlib, n_features, kernel_params):
+        params = cls.parse_params(kernel_params, n_features)
+        return cls(schema_madlib, **params)
+
+    @classmethod
+    def load_from(cls, schema_madlib, data, kernel_params=''):
+        other_features = unique_string(desp='other_features')
+        rd_weights = unique_string(desp='random_weights')
+        rd_coefs = unique_string(desp='random_coefs')
+        rd_reps = unique_string(desp='random_reps')
+        rd_val = unique_string(desp='val')
+        rd_id = unique_string(desp='id')
+        plpy.execute("""
+                drop view if exists {rd_weights};
+                create temp view {rd_weights} as
+                    select id as {rd_id}, val as {rd_val} from {data}
+                    where desp = 'weights';
+
+                drop view if exists {rd_coefs};
+                create temp view {rd_coefs} as
+                    select id as {rd_id}, val as {rd_val} from {data}
+                    where desp = 'coefs';
+
+                drop view if exists {rd_reps};
+                create temp view {rd_reps} as
+                    select id as {rd_id}, val as {rd_val} from {data}
+                    where desp = 'reps';
+
+                drop view if exists {other_features};
+                create temp view {other_features} as
+                    select id as {rd_id}, val as {rd_val} from {data}
+                    where desp = 'other_features';
+                     """.format(**locals()))
+        params = cls.parse_params(kernel_params)
+        pro = PolyRandOperator(weights=rd_weights, coefs=rd_coefs,
+                               reps=rd_reps, other_features=other_features,
+                               rd_id=rd_id, rd_val=rd_val)
+
+        return cls(schema_madlib, poly_operator=pro, orig_data=data, **params)
+
+    @property
+    def kernel_params(self):
+        return ('degree={degree}, coef0={coef0}, '
+                'n_components={n_components}, '
+                'random_state={random_state}'
+                .format(degree=self.degree, coef0=self.coef0,
+                        n_components=self.n_components,
+                        random_state=self.random_state))
+
+    @classmethod
+    def parse_params(cls, kernel_params='', n_features=10):
+        params_default = {
+            'degree': 3,
+            'n_components': 2*n_features,
+            'coef0': 1,
+            'random_state': 1}
+        params_types = {
+            'degree': int,
+            'n_components': int,
+            'coef0': float,
+            'random_state': int}
+        return extract_keyvalue_params(kernel_params, params_types, params_default)
+
+    def fit(self, n_features):
+        # fast way to compute nCr
+        # combinations and permutations
+        def ncr(n, r):
+            r = min(r, n-r)
+            if r == 0:
+                return 1
+            numer = reduce(mul, range(n, n-r, -1))
+            denom = factorial(r + 1)
+            return numer // denom
+
+        # Maclaurin expansion of f = (q + x)**r
+        def maclaurin_coefs(r, q, k):
+            if q == 0:
+                return 0.
+            return ncr(r, k)*(q**(r-k))
+
+        self.clear()
+        self.orig_data = None
+        coefs_ = [sqrt(maclaurin_coefs(self.degree, self.coef0, k)*(2**(k+1)))
+                  for k in range(self.degree+1)]
+        seed(self.random_state)
+        reps_ = [int(log((1./random()), 2)) for _ in range(self.n_components)]
+        reps_nz_ = [x for x in reps_ if (x > 0) and (x <= self.degree)]
+        rd_val_ = unique_string(desp='val')
+        rd_id_ = unique_string(desp='id')
+        rd_weights_ = unique_string(desp='random_weights')
+        run_sql = """
+            drop table if exists {rd_weights};
+            select {schema_madlib}.matrix_random(
+                        1, {dim},
+                        'pos_val=1, neg_val=-1, seed={seed}, table_type=temp',
+                        'bernoulli', '{rd_weights}',
+                        'row={id}, val={val}')
+        """.format(rd_weights=rd_weights_,
+                   dim=sum(reps_nz_)*n_features,
+                   seed=self.random_state,
+                   schema_madlib=self.schema_madlib,
+                   val=rd_val_, id=rd_id_)
+        plpy.execute(run_sql)
+
+        vals_ = [coefs_[k] for k in reps_nz_]
+        rd_coefs_ = unique_string(desp='rd_coefs')
+        run_sql = """
+            drop table if exists {data};
+            create temp table {data} as
+                select
+                    $1 as {val}, id as {id}
+                from generate_series(1, 1) as id
+        """.format(data=rd_coefs_,
+                   val=rd_val_, id=rd_id_)
+        plpy.execute(plpy.prepare(run_sql, ["float[]"]), [vals_])
+
+        rd_reps_ = unique_string(desp='reps_nz')
+        run_sql = """
+            drop table if exists {data};
+            create temp table {data} as
+                select
+                    $1 as {val}, id as {id}
+                from generate_series(1, 1) as id
+        """.format(data=rd_reps_,
+                   val=rd_val_, id=rd_id_)
+        plpy.execute(plpy.prepare(run_sql, ["float[]"]), [reps_nz_])
+
+        vals_ = ([coefs_[0]]*len([_ for _ in reps_ if _ == 0]) +
+                 [0]*len([_ for _ in reps_ if _ > self.degree]))
+        other_features_ = unique_string(desp='other_features')
+        run_sql = """
+            drop table if exists {data};
+            create temp table {data} as
+                select
+                    $1 as {val}, id as {id}
+                from generate_series(1, 1) as id
+        """.format(data=other_features_,
+                   val=rd_val_, id=rd_id_)
+        plpy.execute(plpy.prepare(run_sql, ["float[]"]), [vals_])
+
+        self.pro = PolyRandOperator(weights=rd_weights_,
+                                    coefs=rd_coefs_, reps=rd_reps_,
+                                    other_features=other_features_,
+                                    rd_id=rd_id_, rd_val=rd_val_)
+        return self
+
+    def transform(self, source_table, independent_varname,
+                  dependent_varname=None, grouping_col=None, id_col=None,
+                  transformed_name='poly_transformed'):
+        if not self.pro:
+            return self
+        self.original_table = dict(source_table=source_table,
+                                   independent_varname=independent_varname,
+                                   dependent_varname=dependent_varname)
+        schema_madlib = self.schema_madlib
+
+        def _cast_if_null(input, alias):
+            null_str = "NULL::integer"
+            if input:
+                return str(input)
+            else:
+                return null_str + " as " + alias if alias else null_str
+
+        grouping_col = _cast_if_null(grouping_col, unique_string('grp_col'))
+        dependent_varname = _cast_if_null(dependent_varname, '')
+        id_col = _cast_if_null(id_col, unique_string('id_col'))
+
+        features_col = unique_string(desp='features_col')
+        target_col = unique_string(desp='target_col')
+        transformed = unique_string(desp=transformed_name)
+
+        # X = a * cos (X*C + b)
+        pro, multiplier = self.pro, sqrt(1. / self.n_components)
+        run_sql = """
+        drop table if exists {transformed};
+        create temp table {transformed} as
+            select
+                {schema_madlib}.array_scalar_mult(
+                    array_cat(
+                        {schema_madlib}.array_mult(
+                            {schema_madlib}.__row_fold(
+                                {schema_madlib}.__matrix_vec_mult_in_mem(
+                                    q.{features_col}::float[],
+                                    weights.{pro.rd_val}::float[]
+                                )::float[],
+                                reps.{pro.rd_val}::integer[]
+                            )::float[],
+                            coefs.{pro.rd_val}::float[]
+                        )::float[],
+                        of.{pro.rd_val}::float[]
+                    )::float[],
+                    {multiplier}::float
+                ) as {features_col},
+                q.{target_col} as {target_col},
+                {id_col},
+                {grouping_col}
+            from (
+                select
+                    {dependent_varname} as {target_col},
+                    {independent_varname} as {features_col},
+                    {id_col},
+                    {grouping_col}
+                from {source_table}
+            ) q cross join (select {pro.rd_val} from {pro.weights}) as weights
+                cross join (select {pro.rd_val} from {pro.coefs}) as coefs
+                cross join (select {pro.rd_val} from {pro.reps}) as reps
+                cross join (select {pro.rd_val} from {pro.other_features}) as of
+        """.format(**locals())
+        plpy.execute(run_sql)
+        # assert(self.n_components == num_features(transformed, features_col))
+        self.transformed_table = dict(source_table=transformed,
+                                      dependent_varname=target_col,
+                                      independent_varname=features_col)
+        return self
 
 
 class GaussianKernelBase(object):
@@ -196,23 +485,21 @@ class GaussianKernel(GaussianKernelBase):
                   transformed_name='gaussian_transformed'):
         if not self.rd_offset or not self.rd_weights:
             return self
-
         self.original_table = dict(source_table=source_table,
                                    independent_varname=independent_varname,
                                    dependent_varname=dependent_varname)
-
         schema_madlib = self.schema_madlib
 
-        def _verify(x, s):
+        def _cast_if_null(input, alias):
             null_str = "NULL::integer"
-            if x:
-                return str(x)
+            if input:
+                return str(input)
             else:
-                return null_str + " as " + s if s else null_str
+                return null_str + " as " + alias if alias else null_str
 
-        grouping_col = _verify(grouping_col, unique_string('grp_col'))
-        dependent_varname = _verify(dependent_varname, '')
-        id_col = _verify(id_col, unique_string('id_col'))
+        grouping_col = _cast_if_null(grouping_col, unique_string('grp_col'))
+        dependent_varname = _cast_if_null(dependent_varname, '')
+        id_col = _cast_if_null(id_col, unique_string('id_col'))
 
         # copy data to the temporary table with id column
         # id_col is different from index_col
@@ -399,9 +686,9 @@ def create_kernel(schema_madlib, n_features, kernel_func, kernel_params):
     if kernel_func == 'linear':
         return None
     elif kernel_func == 'gaussian':
-        return GaussianKernelBase.create(schema_madlib,
-                                         n_features,
-                                         kernel_params)
+        return GaussianKernelBase.create(schema_madlib, n_features, kernel_params)
+    elif kernel_func == 'polynomial':
+        return PolyKernel.create(schema_madlib, n_features, kernel_params)
 
 
 def load_kernel(schema_madlib, data, kernel_func, kernel_params):
@@ -409,3 +696,5 @@ def load_kernel(schema_madlib, data, kernel_func, kernel_params):
         return None
     elif kernel_func == 'gaussian':
         return GaussianKernelBase.load_from(schema_madlib, data, kernel_params)
+    elif kernel_func == 'polynomial':
+        return PolyKernel.load_from(schema_madlib, data, kernel_params)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/svm/svm.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in
index 9b16f83..7b7208b 100644
--- a/src/ports/postgres/modules/svm/svm.py_in
+++ b/src/ports/postgres/modules/svm/svm.py_in
@@ -744,6 +744,7 @@ def _cross_validate_svm(args):
     val_res.output_tbl(params_dict['validation_result'])
     params_dict.update(val_res.first('sub_args')['params_dict'])
     args.update(dict(transformer=transformer))
+# ------------------------------------------------------------------------------
 
 
 def _get_kernel_name(kernel_func):
@@ -751,7 +752,7 @@ def _get_kernel_name(kernel_func):
         kernel_func = 'linear'
     else:
         # Add non-linear kernels below after implementing them.
-        supported_kernels = ['linear', 'gaussian']
+        supported_kernels = ['linear', 'gaussian', 'polynomial']
         try:
             # allow user to specify a prefix substring of
             # supported kernels. This works because the supported

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/svm/test/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/test/svm.sql_in b/src/ports/postgres/modules/svm/test/svm.sql_in
index 48b8bab..31b996d 100644
--- a/src/ports/postgres/modules/svm/test/svm.sql_in
+++ b/src/ports/postgres/modules/svm/test/svm.sql_in
@@ -798,6 +798,65 @@ INSERT INTO kernel_data (index, x1, x2, y) VALUES (13, 0.200000000000000011, -2.
 INSERT INTO kernel_data (index, x1, x2, y) VALUES (14, 0, -2.70000000000000018, 1);
 INSERT INTO kernel_data (index, x1, x2, y) VALUES (15, 1.30000000000000004, 2.10000000000000009, 1);
 
+-- verify poly kernel mapping dimensions
+DROP TABLE IF EXISTS m1, m1_summary, m1_random CASCADE;
+SELECT svm_classification(
+     'svm_train_data',
+     'm1',
+     'label',
+     'ind',
+     'poly',
+     'n_components=3',
+     NULL,
+     'max_iter=2');
+DROP TABLE IF EXISTS svm_test_predict CASCADE;
+SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT
+    assert(
+        array_upper(coef, 1) = 3,
+        'The dimension of the coefficients must be equal to n_components (3)!')
+FROM m1;
+
+-- verify poly kernel with grouping
+-- verify partial string support in kernel specification
+DROP TABLE IF EXISTS svr_mdl_m, svr_mdl_m_summary, svr_mdl_m_random CASCADE;
+SELECT svm_regression(
+        'abalone_train_small',
+        'svr_mdl_m',
+        'rings',
+        'ARRAY[1,diameter,shell,shucked,length]',
+        'po',
+        'degree=2, n_components=10',
+        'sex',
+        'max_iter=2, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.05',
+        false);
+DROP TABLE IF EXISTS svm_test_predict CASCADE;
+SELECT svm_predict('svr_mdl_m','abalone_train_small', 'id', 'svm_test_predict');
+SELECT
+    assert(
+        array_upper(coef, 1) = 10,
+        'The dimension of the coefficients must be equal to n_components (10)!')
+FROM svr_mdl_m;
+
+-- verify poly kernel with cross validation
+DROP TABLE IF EXISTS m1, m1_summary, m1_random CASCADE;
+SELECT svm_classification(
+     'svm_train_data',
+     'm1',
+     'label',
+     'ind',
+     'poly',
+     'n_components=3',
+     NULL,
+     'max_iter=2, n_folds=3, lambda=[0.01, 0.1, 0.5]');
+DROP TABLE IF EXISTS svm_test_predict CASCADE;
+SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT
+    assert(
+        array_upper(coef, 1) = 3,
+        'The dimension of the coefficients must be equal to n_components (3)!')
+FROM m1;
+
 
 DROP TABLE IF EXISTS m1, m1_summary, m1_random;
 SELECT svm_classification(
@@ -817,7 +876,7 @@ SELECT
 FROM kernel_predict NATURAL JOIN kernel_data
 WHERE prediction <> y;
 SELECT
-    assert(count(*) = 0, 'Using kernel should perfectly fit the data!')
+    assert(count(*) = 0, 'Using gaussian kernel should perfectly fit the data!')
 FROM kernel_predict NATURAL JOIN kernel_data
 WHERE prediction <> y;