You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/27 02:38:14 UTC
incubator-madlib git commit: SVM: Add polynomial kernel transformation
Repository: incubator-madlib
Updated Branches:
refs/heads/master dfeffb654 -> 3e576c3ba
SVM: Add polynomial kernel transformation
JIRA: MADLIB-938
The features are transformed using Kar and Karnick's approach, which
applies to any positive definite kernel where the kernel function
admits a Maclaurin expansion. This commit implements the special
case when the kernel function is polynomial.
As part of this work, we also add support for the Bernoulli distribution
to matrix_random.
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/3e576c3b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/3e576c3b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/3e576c3b
Branch: refs/heads/master
Commit: 3e576c3baf3f38e6ba69ecb6152a7bfb784fd493
Parents: dfeffb6
Author: Xiaocheng Tang <xi...@gmail.com>
Authored: Fri Feb 26 17:19:42 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Fri Feb 26 17:21:57 2016 -0800
----------------------------------------------------------------------
src/modules/linalg/matrix_ops.cpp | 60 +++-
src/modules/linalg/matrix_ops.hpp | 2 +
.../postgres/modules/linalg/matrix_ops.py_in | 65 ++--
.../postgres/modules/linalg/matrix_ops.sql_in | 57 +++-
.../modules/svm/kernel_approximation.py_in | 313 ++++++++++++++++++-
src/ports/postgres/modules/svm/svm.py_in | 3 +-
src/ports/postgres/modules/svm/test/svm.sql_in | 61 +++-
7 files changed, 514 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/modules/linalg/matrix_ops.cpp
----------------------------------------------------------------------
diff --git a/src/modules/linalg/matrix_ops.cpp b/src/modules/linalg/matrix_ops.cpp
index 307fa52..c671416 100644
--- a/src/modules/linalg/matrix_ops.cpp
+++ b/src/modules/linalg/matrix_ops.cpp
@@ -14,6 +14,7 @@
#include <numeric>
#include <boost/random/uniform_real.hpp>
#include <boost/random/normal_distribution.hpp>
+#include <boost/random/bernoulli_distribution.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/generator_iterator.hpp>
#include <boost/random/linear_congruential.hpp>
@@ -238,15 +239,41 @@ AnyType normal_vector::run(AnyType & args)
if (dim < 1) {
throw std::invalid_argument("invalid argument - dim should be positive");
}
- ColumnVector r(dim);
+ ColumnVector res(dim);
boost::minstd_rand generator(seed);
boost::normal_distribution<> nd_dist(mu, sigma);
boost::variate_generator<boost::minstd_rand&, boost::normal_distribution<> > nd(generator, nd_dist);
for (int i = 0; i < dim; i++){
- r(i) = (double)nd();
+ res(i) = (double)nd();
}
- return r;
+ return res;
+}
+
+AnyType bernoulli_vector::run(AnyType & args)
+{
+ int dim = args[0].getAs<int>();
+ double upper_val = args[1].getAs<double>();
+ double lower_val = args[2].getAs<double>();
+ double prob = args[3].getAs<double>();
+ int seed = args[4].getAs<int>();
+
+ if (dim < 1) {
+ throw std::invalid_argument("invalid argument - dim should be positive");
+ }
+ if (prob > 1 || prob < 0) {
+ throw std::invalid_argument("invalid argument - probability should be in [0,1]");
+ }
+
+ ColumnVector res(dim);
+ boost::minstd_rand generator(seed);
+ boost::bernoulli_distribution<> bn_dist(prob);
+ boost::variate_generator<boost::minstd_rand&, boost::bernoulli_distribution<> > bn(generator, bn_dist);
+
+ for (int i = 0; i < dim; i++) {
+ res(i) = bn() ? upper_val : lower_val;
+ }
+ return res;
}
AnyType uniform_vector::run(AnyType & args)
@@ -259,14 +286,14 @@ AnyType uniform_vector::run(AnyType & args)
if (dim < 1) {
throw std::invalid_argument("invalid argument - dim should be positive");
}
- ColumnVector r(dim);
+ ColumnVector res(dim);
boost::minstd_rand generator(seed);
- boost::uniform_real<> uni_dist(min_,max_);
+ boost::uniform_real<> uni_dist(min_, max_);
boost::variate_generator<boost::minstd_rand&, boost::uniform_real<> > uni(generator, uni_dist);
for (int i = 0; i < dim; i++){
- r(i) = (double)uni();
+ res(i) = (double)uni();
}
- return r;
+ return res;
}
AnyType matrix_vec_mult_in_mem_2d::run(AnyType & args){
@@ -304,7 +331,24 @@ AnyType matrix_vec_mult_in_mem_1d::run(AnyType & args){
return v;
}
-AnyType rand_block::run(AnyType & args) {
+AnyType row_fold::run(AnyType & args){
+ MappedColumnVector vec = args[0].getAs<MappedColumnVector>();
+ MappedIntegerVector pat = args[1].getAs<MappedIntegerVector>();
+
+ if (vec.size() != pat.sum()) {
+ throw std::invalid_argument(
+ "dimensions mismatch: row_in.size() != pattern.sum()");
+ }
+
+ ColumnVector r(pat.size());
+ for (int i = 0, j = 0; i < pat.size(); j += pat[i++])
+ r[i] = vec.segment(j, pat[i]).prod();
+
+ return r;
+}
+
+AnyType rand_block::run(AnyType & args)
+{
int row_dim = args[0].getAs<int>();
int col_dim = args[1].getAs<int>();
if (row_dim < 1 || col_dim < 1) {
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/modules/linalg/matrix_ops.hpp
----------------------------------------------------------------------
diff --git a/src/modules/linalg/matrix_ops.hpp b/src/modules/linalg/matrix_ops.hpp
index 2fe8452..085e51f 100644
--- a/src/modules/linalg/matrix_ops.hpp
+++ b/src/modules/linalg/matrix_ops.hpp
@@ -16,8 +16,10 @@ DECLARE_UDF(linalg, matrix_mem_sum_sfunc)
DECLARE_UDF(linalg, rand_block)
DECLARE_UDF(linalg, rand_vector)
DECLARE_UDF(linalg, uniform_vector)
+DECLARE_UDF(linalg, bernoulli_vector)
DECLARE_UDF(linalg, normal_vector)
DECLARE_UDF(linalg, matrix_vec_mult_in_mem_2d)
DECLARE_UDF(linalg, matrix_vec_mult_in_mem_1d)
+DECLARE_UDF(linalg, row_fold)
DECLARE_SR_UDF(linalg, row_split)
DECLARE_SR_UDF(linalg, unnest_block)
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/linalg/matrix_ops.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/matrix_ops.py_in b/src/ports/postgres/modules/linalg/matrix_ops.py_in
index 53b5e59..d11dcdb 100644
--- a/src/ports/postgres/modules/linalg/matrix_ops.py_in
+++ b/src/ports/postgres/modules/linalg/matrix_ops.py_in
@@ -3090,7 +3090,19 @@ def matrix_ones(schema_madlib, row_dim, col_dim, matrix_out, out_args):
def matrix_random(schema_madlib, distribution, row_dim, col_dim,
in_args, matrix_out, out_args):
- """ Create a row_dim by col_dim random matrix from distribution.
+ """ Generate a row_dim x col_dim random matrix sampled from given distribution
+
+ Args:
+ schema_madlib: MADlib schema name
+ distribution: Str, Name of the sampling distribution.
+ Supported names: uniform, normal, bernoulli
+ row_dim: int, Target matrix row dimensionality
+ col_dim: int, Target matrix col dimensionality
+ in_args: str, Distribution parameters in key=value pairs.
+ Supported parameters:
+ Normal: mu, sigma
+ Uniform: min, max
+ Bernoulli: lower, upper, prob
"""
_validate_output_table(matrix_out)
_assert(row_dim > 0 and col_dim > 0,
@@ -3100,8 +3112,8 @@ def matrix_random(schema_madlib, distribution, row_dim, col_dim,
out_args = parse_matrix_args(
out_args, in_default_args={'row': 'row', 'col': 'col', 'val': 'val'})
+ supported_dist = ['uniform', 'normal', 'bernoulli']
if distribution:
- supported_dist = ['uniform', 'normal']
try:
distribution = next(x for x in supported_dist
if x.startswith(distribution))
@@ -3113,44 +3125,51 @@ def matrix_random(schema_madlib, distribution, row_dim, col_dim,
else:
distribution = 'uniform'
- in_args_default = {
- 'seed': randint(1, 1000),
- 'mu': 0,
- 'sigma': 1,
- 'min': 0,
- 'max': 1}
-
- in_args_types = {
- 'seed': int,
- 'mu': float,
- 'sigma': float,
- 'min': float,
- 'max': float}
-
- in_args_vals = extract_keyvalue_params(in_args, in_args_types, in_args_default)
-
+ in_args_default = {'seed': randint(0, 1000), 'table_type': ''}
+ in_args_types = {'seed': int, 'table_type': str}
if distribution == 'normal':
- random_row = ("""{schema_madlib}.__normal_vector(
+ in_args_default.update({'mu': 0, 'sigma': 1})
+ in_args_types.update({'mu': float, 'sigma': float})
+ in_args_vals = extract_keyvalue_params(in_args, in_args_types, in_args_default)
+ random_vector = ("""{schema_madlib}.__normal_vector(
{col_dim},
{in_args_vals[mu]},
{in_args_vals[sigma]},
{in_args_vals[seed]} + row)
""".format(**locals()))
- else:
- random_row = ("""{schema_madlib}.__uniform_vector(
+ elif distribution == 'bernoulli':
+ in_args_default.update({'upper': 1., 'lower': 0., 'prob': 0.5})
+ in_args_types.update({'upper': float, 'lower': float, 'prob': float})
+ in_args_vals = extract_keyvalue_params(in_args, in_args_types, in_args_default)
+ random_vector = ("""{schema_madlib}.__bernoulli_vector(
+ {col_dim},
+ {in_args_vals[upper]},
+ {in_args_vals[lower]},
+ {in_args_vals[prob]},
+ {in_args_vals[seed]} + row)
+ """.format(**locals()))
+ elif distribution == 'uniform':
+ in_args_default.update({'min': 0, 'max': 1})
+ in_args_types.update({'min': float, 'max': float})
+ in_args_vals = extract_keyvalue_params(in_args, in_args_types, in_args_default)
+ random_vector = ("""{schema_madlib}.__uniform_vector(
{col_dim},
{in_args_vals[min]},
{in_args_vals[max]},
{in_args_vals[seed]} + row)
""".format(**locals()))
+ else:
+ plpy.error("Matrix Error: Invalid distribution: "
+ "{0}. Supported distributions: ({1})"
+ .format(distribution, ', '.join(sorted(supported_dist))))
plpy.execute("""
- CREATE TABLE {matrix_out}
+ CREATE {in_args_vals[table_type]} TABLE {matrix_out}
m4_ifdef(`__POSTGRESQL__', `',
`WITH (APPENDONLY=TRUE,COMPRESSTYPE=QUICKLZ)') AS
SELECT
row as {out_args[row]},
- {random_row} as {out_args[val]}
+ {random_vector} as {out_args[val]}
FROM
generate_series(1, {row_dim}) as row
m4_ifdef(`__POSTGRESQL__', `',
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/linalg/matrix_ops.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/matrix_ops.sql_in b/src/ports/postgres/modules/linalg/matrix_ops.sql_in
index 65f4fdb..2b15169 100644
--- a/src/ports/postgres/modules/linalg/matrix_ops.sql_in
+++ b/src/ports/postgres/modules/linalg/matrix_ops.sql_in
@@ -174,7 +174,7 @@ in the glossary.
-- Create a diag matrix initialized with given diagonal elements
<b>matrix_diag</b>( diag_elements, matrix_out, out_args)
-- Create a matrix initialized with values sampled from a distribution
--- Supported distributions: normal and uniform
+-- Supported distributions: normal, uniform, bernoulli
<b>matrix_random</b>( distribution, row_dim, col_dim, in_args, matrix_out, out_args )
</pre>
@@ -2393,12 +2393,25 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.matrix_random
(
+ row_id INTEGER,
+ col_id INTEGER,
+ distribution TEXT,
+ matrix_out TEXT
+)
+RETURNS MADLIB_SCHEMA.matrix_result AS $$
+ SELECT MADLIB_SCHEMA.matrix_random($1, $2, ''::TEXT, $3, $4, ''::TEXT)
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.matrix_random
+(
row_id INTEGER,
col_id INTEGER,
matrix_out TEXT
)
RETURNS MADLIB_SCHEMA.matrix_result AS $$
- SELECT MADLIB_SCHEMA.matrix_random($1, $2, ''::TEXT, 'uniform', $3, ''::TEXT)
+ SELECT MADLIB_SCHEMA.matrix_random($1, $2, 'uniform', $3)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -2969,6 +2982,46 @@ AS 'MODULE_PATHNAME', 'matrix_vec_mult_in_mem_1d'
LANGUAGE C STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+/**
+ * @brief The function folds (through multiplication) array x according to
+ * the pattern in array y, producing an array of the same length as array y.
+ * @param row Array x
+ * @param pattern Array y
+ * @param folded array
+ */
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__row_fold(
+ row_in FLOAT8[],
+ pattern INTEGER[]
+)
+RETURNS FLOAT8[]
+AS 'MODULE_PATHNAME', 'row_fold'
+LANGUAGE C STRICT
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.__bernoulli_vector
+(
+ dim INTEGER,
+ pos_val FLOAT8,
+ neg_val FLOAT8,
+ prob FLOAT8,
+ seed INTEGER
+)
+RETURNS FLOAT8[]
+AS 'MODULE_PATHNAME', 'bernoulli_vector'
+LANGUAGE C STRICT
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.__bernoulli_vector
+(
+ dim INTEGER
+)
+RETURNS FLOAT8[] AS $$
+ SELECT MADLIB_SCHEMA.__bernoulli_vector($1, 1::FLOAT8, 0::FLOAT8, 0.5::FLOAT8, 42::INTEGER)
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.__uniform_vector
(
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/svm/kernel_approximation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/kernel_approximation.py_in b/src/ports/postgres/modules/svm/kernel_approximation.py_in
index 04aa018..4e19bcb 100644
--- a/src/ports/postgres/modules/svm/kernel_approximation.py_in
+++ b/src/ports/postgres/modules/svm/kernel_approximation.py_in
@@ -8,6 +8,295 @@ from utilities.utilities import num_features
from math import sqrt
from math import pi
+from math import log
+from math import factorial
+
+from random import random
+from random import seed
+
+from operator import mul
+from collections import namedtuple
+
+
+PolyRandOperator = namedtuple('PolyRandOperator',
+ 'weights, coefs, reps, '
+ 'other_features, rd_id, rd_val')
+
+
+class PolyKernel(object):
+ """docstring for PolyKernel"""
+ def __init__(self, schema_madlib, degree=2, coef0=1, n_components=100,
+ random_state=1, poly_operator=None, orig_data=None):
+ self.schema_madlib = schema_madlib
+ self.kernel_func = 'polynomial'
+ self.degree = degree
+ self.coef0 = coef0
+ self.n_components = n_components
+ self.random_state = random_state
+ # polynomial random mapping operator
+ self.pro = poly_operator
+ self.orig_data = orig_data
+ if self.pro is not None:
+ pro = self.pro
+ self.n_components = num_features(pro.coefs, pro.rd_val)
+ self.n_components += num_features(pro.other_features, pro.rd_val)
+
+ def clear(self):
+ data_type = 'view' if self.orig_data else 'table'
+ if self.pro:
+ run_sql = """
+ drop {data_type} if exists {pro.weights};
+ drop {data_type} if exists {pro.coefs};
+ drop {data_type} if exists {pro.reps};
+ drop {data_type} if exists {pro.other_features};
+ """.format(pro=self.pro, data_type=data_type)
+ plpy.execute(run_sql)
+
+ def __del__(self):
+ self.clear()
+
+ def save_as(self, name):
+ if self.orig_data:
+ plpy.warning("Polynomial Kernel Warning: no need to save."
+ "Original data table exists: {0}"
+ .format(self.orig_data))
+ return
+
+ run_sql = """
+ create table {name} as
+ select {pro.rd_id} as id, {pro.rd_val} as val,
+ 'coefs' as desp
+ from {pro.coefs}
+ union
+ select {pro.rd_id} as id, {pro.rd_val} as val,
+ 'weights' as desp
+ from {pro.weights}
+ union
+ select {pro.rd_id} as id, {pro.rd_val} as val,
+ 'reps' as desp
+ from {pro.reps}
+ union
+ select {pro.rd_id} as id, {pro.rd_val} as val,
+ 'other_features' as desp
+ from {pro.other_features}
+ """.format(name=name, pro=self.pro)
+ plpy.execute(run_sql)
+
+ @classmethod
+ def create(cls, schema_madlib, n_features, kernel_params):
+ params = cls.parse_params(kernel_params, n_features)
+ return cls(schema_madlib, **params)
+
+ @classmethod
+ def load_from(cls, schema_madlib, data, kernel_params=''):
+ other_features = unique_string(desp='other_features')
+ rd_weights = unique_string(desp='random_weights')
+ rd_coefs = unique_string(desp='random_coefs')
+ rd_reps = unique_string(desp='random_reps')
+ rd_val = unique_string(desp='val')
+ rd_id = unique_string(desp='id')
+ plpy.execute("""
+ drop view if exists {rd_weights};
+ create temp view {rd_weights} as
+ select id as {rd_id}, val as {rd_val} from {data}
+ where desp = 'weights';
+
+ drop view if exists {rd_coefs};
+ create temp view {rd_coefs} as
+ select id as {rd_id}, val as {rd_val} from {data}
+ where desp = 'coefs';
+
+ drop view if exists {rd_reps};
+ create temp view {rd_reps} as
+ select id as {rd_id}, val as {rd_val} from {data}
+ where desp = 'reps';
+
+ drop view if exists {other_features};
+ create temp view {other_features} as
+ select id as {rd_id}, val as {rd_val} from {data}
+ where desp = 'other_features';
+ """.format(**locals()))
+ params = cls.parse_params(kernel_params)
+ pro = PolyRandOperator(weights=rd_weights, coefs=rd_coefs,
+ reps=rd_reps, other_features=other_features,
+ rd_id=rd_id, rd_val=rd_val)
+
+ return cls(schema_madlib, poly_operator=pro, orig_data=data, **params)
+
+ @property
+ def kernel_params(self):
+ return ('degree={degree}, coef0={coef0}, '
+ 'n_components={n_components}, '
+ 'random_state={random_state}'
+ .format(degree=self.degree, coef0=self.coef0,
+ n_components=self.n_components,
+ random_state=self.random_state))
+
+ @classmethod
+ def parse_params(cls, kernel_params='', n_features=10):
+ params_default = {
+ 'degree': 3,
+ 'n_components': 2*n_features,
+ 'coef0': 1,
+ 'random_state': 1}
+ params_types = {
+ 'degree': int,
+ 'n_components': int,
+ 'coef0': float,
+ 'random_state': int}
+ return extract_keyvalue_params(kernel_params, params_types, params_default)
+
+ def fit(self, n_features):
+ # fast way to compute nCr
+ # combinations and permutations
+ def ncr(n, r):
+ r = min(r, n-r)
+ if r == 0:
+ return 1
+ numer = reduce(mul, range(n, n-r, -1))
+ denom = factorial(r + 1)
+ return numer // denom
+
+ # Maclaurin expansion of f = (q + x)**r
+ def maclaurin_coefs(r, q, k):
+ if q == 0:
+ return 0.
+ return ncr(r, k)*(q**(r-k))
+
+ self.clear()
+ self.orig_data = None
+ coefs_ = [sqrt(maclaurin_coefs(self.degree, self.coef0, k)*(2**(k+1)))
+ for k in range(self.degree+1)]
+ seed(self.random_state)
+ reps_ = [int(log((1./random()), 2)) for _ in range(self.n_components)]
+ reps_nz_ = [x for x in reps_ if (x > 0) and (x <= self.degree)]
+ rd_val_ = unique_string(desp='val')
+ rd_id_ = unique_string(desp='id')
+ rd_weights_ = unique_string(desp='random_weights')
+ run_sql = """
+ drop table if exists {rd_weights};
+ select {schema_madlib}.matrix_random(
+ 1, {dim},
+ 'pos_val=1, neg_val=-1, seed={seed}, table_type=temp',
+ 'bernoulli', '{rd_weights}',
+ 'row={id}, val={val}')
+ """.format(rd_weights=rd_weights_,
+ dim=sum(reps_nz_)*n_features,
+ seed=self.random_state,
+ schema_madlib=self.schema_madlib,
+ val=rd_val_, id=rd_id_)
+ plpy.execute(run_sql)
+
+ vals_ = [coefs_[k] for k in reps_nz_]
+ rd_coefs_ = unique_string(desp='rd_coefs')
+ run_sql = """
+ drop table if exists {data};
+ create temp table {data} as
+ select
+ $1 as {val}, id as {id}
+ from generate_series(1, 1) as id
+ """.format(data=rd_coefs_,
+ val=rd_val_, id=rd_id_)
+ plpy.execute(plpy.prepare(run_sql, ["float[]"]), [vals_])
+
+ rd_reps_ = unique_string(desp='reps_nz')
+ run_sql = """
+ drop table if exists {data};
+ create temp table {data} as
+ select
+ $1 as {val}, id as {id}
+ from generate_series(1, 1) as id
+ """.format(data=rd_reps_,
+ val=rd_val_, id=rd_id_)
+ plpy.execute(plpy.prepare(run_sql, ["float[]"]), [reps_nz_])
+
+ vals_ = ([coefs_[0]]*len([_ for _ in reps_ if _ == 0]) +
+ [0]*len([_ for _ in reps_ if _ > self.degree]))
+ other_features_ = unique_string(desp='other_features')
+ run_sql = """
+ drop table if exists {data};
+ create temp table {data} as
+ select
+ $1 as {val}, id as {id}
+ from generate_series(1, 1) as id
+ """.format(data=other_features_,
+ val=rd_val_, id=rd_id_)
+ plpy.execute(plpy.prepare(run_sql, ["float[]"]), [vals_])
+
+ self.pro = PolyRandOperator(weights=rd_weights_,
+ coefs=rd_coefs_, reps=rd_reps_,
+ other_features=other_features_,
+ rd_id=rd_id_, rd_val=rd_val_)
+ return self
+
+ def transform(self, source_table, independent_varname,
+ dependent_varname=None, grouping_col=None, id_col=None,
+ transformed_name='poly_transformed'):
+ if not self.pro:
+ return self
+ self.original_table = dict(source_table=source_table,
+ independent_varname=independent_varname,
+ dependent_varname=dependent_varname)
+ schema_madlib = self.schema_madlib
+
+ def _cast_if_null(input, alias):
+ null_str = "NULL::integer"
+ if input:
+ return str(input)
+ else:
+ return null_str + " as " + alias if alias else null_str
+
+ grouping_col = _cast_if_null(grouping_col, unique_string('grp_col'))
+ dependent_varname = _cast_if_null(dependent_varname, '')
+ id_col = _cast_if_null(id_col, unique_string('id_col'))
+
+ features_col = unique_string(desp='features_col')
+ target_col = unique_string(desp='target_col')
+ transformed = unique_string(desp=transformed_name)
+
+ # X = a * cos (X*C + b)
+ pro, multiplier = self.pro, sqrt(1. / self.n_components)
+ run_sql = """
+ drop table if exists {transformed};
+ create temp table {transformed} as
+ select
+ {schema_madlib}.array_scalar_mult(
+ array_cat(
+ {schema_madlib}.array_mult(
+ {schema_madlib}.__row_fold(
+ {schema_madlib}.__matrix_vec_mult_in_mem(
+ q.{features_col}::float[],
+ weights.{pro.rd_val}::float[]
+ )::float[],
+ reps.{pro.rd_val}::integer[]
+ )::float[],
+ coefs.{pro.rd_val}::float[]
+ )::float[],
+ of.{pro.rd_val}::float[]
+ )::float[],
+ {multiplier}::float
+ ) as {features_col},
+ q.{target_col} as {target_col},
+ {id_col},
+ {grouping_col}
+ from (
+ select
+ {dependent_varname} as {target_col},
+ {independent_varname} as {features_col},
+ {id_col},
+ {grouping_col}
+ from {source_table}
+ ) q cross join (select {pro.rd_val} from {pro.weights}) as weights
+ cross join (select {pro.rd_val} from {pro.coefs}) as coefs
+ cross join (select {pro.rd_val} from {pro.reps}) as reps
+ cross join (select {pro.rd_val} from {pro.other_features}) as of
+ """.format(**locals())
+ plpy.execute(run_sql)
+ # assert(self.n_components == num_features(transformed, features_col))
+ self.transformed_table = dict(source_table=transformed,
+ dependent_varname=target_col,
+ independent_varname=features_col)
+ return self
class GaussianKernelBase(object):
@@ -196,23 +485,21 @@ class GaussianKernel(GaussianKernelBase):
transformed_name='gaussian_transformed'):
if not self.rd_offset or not self.rd_weights:
return self
-
self.original_table = dict(source_table=source_table,
independent_varname=independent_varname,
dependent_varname=dependent_varname)
-
schema_madlib = self.schema_madlib
- def _verify(x, s):
+ def _cast_if_null(input, alias):
null_str = "NULL::integer"
- if x:
- return str(x)
+ if input:
+ return str(input)
else:
- return null_str + " as " + s if s else null_str
+ return null_str + " as " + alias if alias else null_str
- grouping_col = _verify(grouping_col, unique_string('grp_col'))
- dependent_varname = _verify(dependent_varname, '')
- id_col = _verify(id_col, unique_string('id_col'))
+ grouping_col = _cast_if_null(grouping_col, unique_string('grp_col'))
+ dependent_varname = _cast_if_null(dependent_varname, '')
+ id_col = _cast_if_null(id_col, unique_string('id_col'))
# copy data to the temporary table with id column
# id_col is different from index_col
@@ -399,9 +686,9 @@ def create_kernel(schema_madlib, n_features, kernel_func, kernel_params):
if kernel_func == 'linear':
return None
elif kernel_func == 'gaussian':
- return GaussianKernelBase.create(schema_madlib,
- n_features,
- kernel_params)
+ return GaussianKernelBase.create(schema_madlib, n_features, kernel_params)
+ elif kernel_func == 'polynomial':
+ return PolyKernel.create(schema_madlib, n_features, kernel_params)
def load_kernel(schema_madlib, data, kernel_func, kernel_params):
@@ -409,3 +696,5 @@ def load_kernel(schema_madlib, data, kernel_func, kernel_params):
return None
elif kernel_func == 'gaussian':
return GaussianKernelBase.load_from(schema_madlib, data, kernel_params)
+ elif kernel_func == 'polynomial':
+ return PolyKernel.load_from(schema_madlib, data, kernel_params)
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/svm/svm.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in
index 9b16f83..7b7208b 100644
--- a/src/ports/postgres/modules/svm/svm.py_in
+++ b/src/ports/postgres/modules/svm/svm.py_in
@@ -744,6 +744,7 @@ def _cross_validate_svm(args):
val_res.output_tbl(params_dict['validation_result'])
params_dict.update(val_res.first('sub_args')['params_dict'])
args.update(dict(transformer=transformer))
+# ------------------------------------------------------------------------------
def _get_kernel_name(kernel_func):
@@ -751,7 +752,7 @@ def _get_kernel_name(kernel_func):
kernel_func = 'linear'
else:
# Add non-linear kernels below after implementing them.
- supported_kernels = ['linear', 'gaussian']
+ supported_kernels = ['linear', 'gaussian', 'polynomial']
try:
# allow user to specify a prefix substring of
# supported kernels. This works because the supported
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3e576c3b/src/ports/postgres/modules/svm/test/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/test/svm.sql_in b/src/ports/postgres/modules/svm/test/svm.sql_in
index 48b8bab..31b996d 100644
--- a/src/ports/postgres/modules/svm/test/svm.sql_in
+++ b/src/ports/postgres/modules/svm/test/svm.sql_in
@@ -798,6 +798,65 @@ INSERT INTO kernel_data (index, x1, x2, y) VALUES (13, 0.200000000000000011, -2.
INSERT INTO kernel_data (index, x1, x2, y) VALUES (14, 0, -2.70000000000000018, 1);
INSERT INTO kernel_data (index, x1, x2, y) VALUES (15, 1.30000000000000004, 2.10000000000000009, 1);
+-- verify poly kernel mapping dimensions
+DROP TABLE IF EXISTS m1, m1_summary, m1_random CASCADE;
+SELECT svm_classification(
+ 'svm_train_data',
+ 'm1',
+ 'label',
+ 'ind',
+ 'poly',
+ 'n_components=3',
+ NULL,
+ 'max_iter=2');
+DROP TABLE IF EXISTS svm_test_predict CASCADE;
+SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT
+ assert(
+ array_upper(coef, 1) = 3,
+ 'The dimension of the coefficients must be equal to n_components (3)!')
+FROM m1;
+
+-- verify poly kernel with grouping
+-- verify partial string support in kernel specification
+DROP TABLE IF EXISTS svr_mdl_m, svr_mdl_m_summary, svr_mdl_m_random CASCADE;
+SELECT svm_regression(
+ 'abalone_train_small',
+ 'svr_mdl_m',
+ 'rings',
+ 'ARRAY[1,diameter,shell,shucked,length]',
+ 'po',
+ 'degree=2, n_components=10',
+ 'sex',
+ 'max_iter=2, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.05',
+ false);
+DROP TABLE IF EXISTS svm_test_predict CASCADE;
+SELECT svm_predict('svr_mdl_m','abalone_train_small', 'id', 'svm_test_predict');
+SELECT
+ assert(
+ array_upper(coef, 1) = 10,
+ 'The dimension of the coefficients must be equal to n_components (10)!')
+FROM svr_mdl_m;
+
+-- verify poly kernel with cross validation
+DROP TABLE IF EXISTS m1, m1_summary, m1_random CASCADE;
+SELECT svm_classification(
+ 'svm_train_data',
+ 'm1',
+ 'label',
+ 'ind',
+ 'poly',
+ 'n_components=3',
+ NULL,
+ 'max_iter=2, n_folds=3, lambda=[0.01, 0.1, 0.5]');
+DROP TABLE IF EXISTS svm_test_predict CASCADE;
+SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict');
+SELECT
+ assert(
+ array_upper(coef, 1) = 3,
+ 'The dimension of the coefficients must be equal to n_components (3)!')
+FROM m1;
+
DROP TABLE IF EXISTS m1, m1_summary, m1_random;
SELECT svm_classification(
@@ -817,7 +876,7 @@ SELECT
FROM kernel_predict NATURAL JOIN kernel_data
WHERE prediction <> y;
SELECT
- assert(count(*) = 0, 'Using kernel should perfectly fit the data!')
+ assert(count(*) = 0, 'Using gaussian kernel should perfectly fit the data!')
FROM kernel_predict NATURAL JOIN kernel_data
WHERE prediction <> y;