Posted to commits@madlib.apache.org by ri...@apache.org on 2018/02/03 00:47:20 UTC
madlib git commit: SVM: Add minibatch as a new solver [Forced Update!]
Repository: madlib
Updated Branches:
refs/heads/master 657cf4aa4 -> a8bbe082c (forced update)
SVM: Add minibatch as a new solver
This is based on original work by
Xiaocheng Tang <xi...@gmail.com> in #75.
This PR adds two main features:
- A Minibatch solver that takes as input a batch of data
- SVM code that takes advantage of the minibatch solver
Closes #229
Co-authored-by: Nikhil Kak <nk...@pivotal.io>
Co-authored-by: Xiaocheng Tang <xi...@gmail.com>
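For reference, a minimal usage sketch of the new solver follows. The parameter
values mirror the tests added in this commit; the table and column names are
hypothetical. With the minibatch path, each row of the source table packs one
batch: the independent variable is a 2-D DOUBLE PRECISION array and the
dependent variable is an array of labels.

    SELECT madlib.svm_classification(
        'my_train_table',   -- hypothetical: x FLOAT8[][], label TEXT[]
        'my_model',         -- output model table
        'label',            -- dependent variable (one array per batch row)
        'x',                -- independent variables (one matrix per batch row)
        'linear',
        NULL,               -- kernel_params
        NULL,               -- grouping_col
        'max_iter=10, init_stepsize=0.2, batch_size=3, n_epochs=2'
    );

Leaving batch_size at its default of 1 retains the original per-tuple IGD
solver.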
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/a8bbe082
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/a8bbe082
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/a8bbe082
Branch: refs/heads/master
Commit: a8bbe082ca60f87e006eba164ea69b159e1875fc
Parents: 53db736
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Feb 2 14:15:01 2018 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Feb 2 16:46:48 2018 -0800
----------------------------------------------------------------------
.gitignore | 1 +
src/dbal/EigenIntegration/HandleMap_proto.hpp | 3 +-
src/modules/convex/algo/igd.hpp | 81 ++++++++-
src/modules/convex/linear_svm_igd.cpp | 159 +++++++++++++++++
src/modules/convex/linear_svm_igd.hpp | 6 +
src/modules/convex/task/linear_svm.hpp | 67 +++++++-
src/modules/convex/type/model.hpp | 71 +++++---
src/modules/convex/type/state.hpp | 79 +++++++++
src/modules/convex/type/tuple.hpp | 3 +
src/ports/postgres/modules/svm/svm.py_in | 170 +++++++++++++------
src/ports/postgres/modules/svm/svm.sql_in | 117 +++++++++++--
src/ports/postgres/modules/svm/test/svm.sql_in | 104 ++++++++++++
.../modules/utilities/validate_args.py_in | 19 +--
13 files changed, 774 insertions(+), 106 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 00dc016..a073fbd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ auto
*.swp
*.fdb_latexmk
*.swo # vim swap file
+\#*\# # emacs backup file
# Biblatex temporary files
*-blx.bib
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/dbal/EigenIntegration/HandleMap_proto.hpp
----------------------------------------------------------------------
diff --git a/src/dbal/EigenIntegration/HandleMap_proto.hpp b/src/dbal/EigenIntegration/HandleMap_proto.hpp
index 4bfe7c5..c7b6cb2 100644
--- a/src/dbal/EigenIntegration/HandleMap_proto.hpp
+++ b/src/dbal/EigenIntegration/HandleMap_proto.hpp
@@ -19,6 +19,7 @@ namespace eigen_integration {
template <class EigenType, class Handle, int MapOptions = Eigen::Unaligned>
class HandleMap : public Eigen::Map<EigenType, MapOptions> {
public:
+ typedef EigenType PlainEigenType;
typedef Eigen::Map<EigenType, MapOptions> Base;
typedef typename Base::Scalar Scalar;
typedef typename Base::Index Index;
@@ -57,7 +58,7 @@ public:
*
* For example, this allows construction of MappedColumnVector from
* MappedMatrix::col(int) or NativeColumnVector, etc.
- */
+ */
template <class Derived>
HandleMap(const Eigen::MapBase<Derived>& inMappedData,
typename boost::enable_if_c<Derived::IsVectorAtCompileTime>::type* = 0)
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/modules/convex/algo/igd.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/algo/igd.hpp b/src/modules/convex/algo/igd.hpp
index cd17e64..3ae4c13 100644
--- a/src/modules/convex/algo/igd.hpp
+++ b/src/modules/convex/algo/igd.hpp
@@ -34,7 +34,9 @@ public:
typedef typename Task::model_type model_type;
static void transition(state_type &state, const tuple_type &tuple);
+ static void transitionInMiniBatch(state_type &state, const tuple_type &tuple);
static void merge(state_type &state, const_state_type &otherState);
+ static void mergeInPlace(state_type &state, const_state_type &otherState);
static void final(state_type &state);
};
@@ -56,6 +58,62 @@ IGD<State, ConstState, Task>::transition(state_type &state,
state.task.stepsize * tuple.weight);
}
+/**
+ * @brief Update the transition state in mini-batches
+ *
+ * Note: We assume that
+ * 1. Task defines a model_eigen_type
+ * 2. A batch of tuple.indVar is a Matrix
+ * 3. A batch of tuple.depVar is a ColumnVector
+ * 4. Task defines a getLossAndUpdateModel method
+ *
+ */
+ template <class State, class ConstState, class Task>
+ void
+ IGD<State, ConstState, Task>::transitionInMiniBatch(
+ state_type &state,
+ const tuple_type &tuple) {
+
+ madlib_assert(tuple.indVar.rows() == tuple.depVar.rows(),
+ std::runtime_error("Invalid data. Independent and dependent "
+ "batches don't have same number of rows."));
+
+ int batch_size = state.algo.batchSize;
+ int n_epochs = state.algo.nEpochs;
+
+ // n_rows/n_ind_cols are the rows/cols in a transition tuple.
+ int n_rows = tuple.indVar.rows();
+ int n_ind_cols = tuple.indVar.cols();
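+ // Compute the number of batches; the last one may be smaller than
+ // batch_size. E.g., n_rows = 10 and batch_size = 3 give n_batches = 4,
+ // with batch sizes 3, 3, 3 and 1.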
+ int n_batches = n_rows < batch_size ? 1 :
+ n_rows / batch_size +
+ int(n_rows%batch_size > 0);
+
+ for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) {
+ double loss = 0.0;
+ for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches;
+ curr_batch++, curr_batch_row_index += batch_size) {
+ Matrix X_batch;
+ ColumnVector y_batch;
+ if (curr_batch == n_batches-1) {
+ // last batch
+ X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index);
+ y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index);
+ } else {
+ X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols);
+ y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size);
+ }
+ loss += Task::getLossAndUpdateModel(
+ state.task.model, X_batch, y_batch, state.task.stepsize);
+ }
+
+ // The first epoch will most likely have the highest loss.
+ // Being pessimistic, use the total loss only from the first epoch.
+ if (curr_epoch==0) state.algo.loss += loss;
+ }
+ return;
+ }
+
+
template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::merge(state_type &state,
@@ -86,11 +144,32 @@ IGD<State, ConstState, Task>::merge(state_type &state,
template <class State, class ConstState, class Task>
void
+IGD<State, ConstState, Task>::mergeInPlace(state_type &state,
+ const_state_type &otherState) {
+ // avoid division by zero
+ if (state.algo.numRows == 0) {
+ state.task.model = otherState.task.model;
+ return;
+ } else if (otherState.algo.numRows == 0) {
+ return;
+ }
+
+ // model averaging, weighted by rows seen
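+ // With L = rows seen by this state and R = rows seen by the other, the
+ // three operations below compute the row-weighted average
+ // w = (L*w_left + R*w_right) / (L + R) in place.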
+ double leftRows = static_cast<double>(state.algo.numRows);
+ double rightRows = static_cast<double>(otherState.algo.numRows);
+ double totalNumRows = leftRows + rightRows;
+ state.task.model *= leftRows / rightRows;
+ state.task.model += otherState.task.model;
+ state.task.model *= rightRows / totalNumRows;
+}
+
+template <class State, class ConstState, class Task>
+void
IGD<State, ConstState, Task>::final(state_type &state) {
// The reason that we have to keep the task.model untouched in the transition
// function: the loss computation needs the model from the last iteration cleanly
-
state.task.model = state.algo.incrModel;
+
}
} // namespace convex
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/modules/convex/linear_svm_igd.cpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/linear_svm_igd.cpp b/src/modules/convex/linear_svm_igd.cpp
index f396250..90882a3 100644
--- a/src/modules/convex/linear_svm_igd.cpp
+++ b/src/modules/convex/linear_svm_igd.cpp
@@ -32,6 +32,10 @@ typedef IGD<GLMIGDState<MutableArrayHandle<double> >,
GLMIGDState<ArrayHandle<double> >,
LinearSVM<GLMModel, GLMTuple > > LinearSVMIGDAlgorithm;
+typedef IGD<SVMMinibatchState<MutableArrayHandle<double> >,
+ SVMMinibatchState<ArrayHandle<double> >,
+ LinearSVM<GLMModel, SVMMiniBatchTuple > > LinearSVMIGDAlgoMiniBatch;
+
typedef Loss<GLMIGDState<MutableArrayHandle<double> >,
GLMIGDState<ArrayHandle<double> >,
LinearSVM<GLMModel, GLMTuple > > LinearSVMLossAlgorithm;
@@ -121,6 +125,98 @@ linear_svm_igd_transition::run(AnyType &args) {
}
/**
+ * @brief Perform the linear support vector machine minibatch transition step
+ *
+ * Called for each tuple.
+ */
+AnyType
+linear_svm_igd_minibatch_transition::run(AnyType &args) {
+ // The real state.
+ // For the first tuple: args[0] is nothing more than a marker that
+ // indicates that we should do some initial operations.
+ // For other tuples: args[0] holds the computation state until last tuple
+ SVMMinibatchState<MutableArrayHandle<double> > state = args[0];
+
+ // initialize the state if first tuple
+ if (state.algo.numRows == 0) {
+
+ LinearSVM<GLMModel, GLMTuple >::epsilon = args[9].getAs<double>();
+ LinearSVM<GLMModel, GLMTuple >::is_svc = args[10].getAs<bool>();
+ if (!args[3].isNull()) {
+ SVMMinibatchState<ArrayHandle<double> > previousState = args[3];
+ state.allocate(*this, previousState.task.nFeatures);
+ state = previousState;
+ } else {
+ // configuration parameters
+ uint32_t dimension = args[4].getAs<uint32_t>();
+ state.allocate(*this, dimension); // with zeros
+ }
+ // resetting in either case
+ // state.reset();
+ state.task.stepsize = args[5].getAs<double>();
+ const double lambda = args[6].getAs<double>();
+ const bool isL2 = args[7].getAs<bool>();
+ const int nTuples = args[8].getAs<int>();
+
+ // The regularization operations called below (scaling and clipping)
+ // need these class variables to be set.
+ L1<GLMModel>::n_tuples = nTuples;
+ L2<GLMModel>::n_tuples = nTuples;
+ if (isL2)
+ L2<GLMModel>::lambda = lambda;
+ else
+ L1<GLMModel>::lambda = lambda;
+ }
+
+ state.algo.nEpochs = args[12].getAs<int>();
+ state.algo.batchSize = args[13].getAs<int>();
+
+ // Skip the current record if args[1] (features) contains NULL values,
+ // or args[2] is NULL
+ try {
+ args[1].getAs<MappedMatrix>();
+ } catch (const ArrayWithNullException &e) {
+ return args[0];
+ }
+ if (args[2].isNull())
+ return args[0];
+
+ // tuple
+ using madlib::dbal::eigen_integration::MappedColumnVector;
+
+ MappedMatrix x(NULL);
+ MappedColumnVector y(NULL);
+ try {
+ new (&x) MappedMatrix(args[1].getAs<MappedMatrix>());
+ new (&y) MappedColumnVector(args[2].getAs<MappedColumnVector>());
+ } catch (const ArrayWithNullException &e) {
+ return args[0];
+ }
+ SVMMiniBatchTuple tuple;
+ tuple.indVar = trans(x);
+ tuple.depVar = y;
+
+ // Each tuple can be weighted - this can be a combination of the sample weight
+ // and the class weight. The calling function is responsible for combining the
+ // two into a single tuple weight. The default value for this parameter is 1,
+ // set in the definition of "tuple".
+ // The weight is used to increase the value of a particular tuple for the
+ // online learning. The weight is not used for the loss computation.
+ tuple.weight = args[11].getAs<double>();
+
+
+ // Now do the transition step: L2 scaling, then the mini-batch updates,
+ // then L1 clipping
+ L2<GLMModel>::scaling(state.task.model, state.task.stepsize);
+ LinearSVMIGDAlgoMiniBatch::transitionInMiniBatch(state, tuple);
+ L1<GLMModel>::clipping(state.task.model, state.task.stepsize);
+
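+ // tuple.indVar is trans(x), so the number of data points contributed by
+ // this tuple's batch is x.cols().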
+ state.algo.numRows += x.cols();
+ return state;
+}
+
+
+/**
* @brief Perform the preliminary aggregation function: Merge transition states
*/
AnyType
@@ -146,6 +242,30 @@ linear_svm_igd_merge::run(AnyType &args) {
}
/**
+ * @brief Perform the preliminary aggregation function: Merge minibatch transition states
+ */
+AnyType
+linear_svm_igd_minibatch_merge::run(AnyType &args) {
+ SVMMinibatchState<MutableArrayHandle<double> > stateLeft = args[0];
+ SVMMinibatchState<ArrayHandle<double> > stateRight = args[1];
+
+ // We first handle the trivial case where this function is called with one
+ // of the states being the initial state
+ if (stateLeft.algo.numRows == 0) { return stateRight; }
+ else if (stateRight.algo.numRows == 0) { return stateLeft; }
+
+ // Merge states together
+ LinearSVMIGDAlgoMiniBatch::mergeInPlace(stateLeft, stateRight);
+
+ // The following numRows update cannot be put above, because the model
+ // averaging depends on the original values
+ stateLeft.algo.numRows += stateRight.algo.numRows;
+ stateLeft.algo.loss += stateRight.algo.loss;
+
+ return stateLeft;
+}
+
+/**
* @brief Perform the linear support vector machine final step
*/
AnyType
@@ -172,6 +292,29 @@ linear_svm_igd_final::run(AnyType &args) {
}
/**
+ * @brief Perform the linear support vector machine minibatch final step
+ */
+AnyType
+linear_svm_igd_minibatch_final::run(AnyType &args) {
+ // We request a mutable object. Depending on the backend, this might perform
+ // a deep copy.
+ SVMMinibatchState<MutableArrayHandle<double> > state = args[0];
+ // Aggregates that haven't seen any data just return Null.
+ if (state.algo.numRows == 0) { return Null(); }
+ state.algo.loss = state.algo.loss / state.algo.numRows;
+ return state;
+}
+
+AnyType
+internal_linear_svm_igd_minibatch_distance::run(AnyType &args) {
+ SVMMinibatchState<ArrayHandle<double> > stateLeft = args[0];
+ SVMMinibatchState<ArrayHandle<double> > stateRight = args[1];
+
+ return std::abs((stateLeft.algo.loss - stateRight.algo.loss)
+ / stateLeft.algo.loss);
+}
+
+/**
* @brief Return the difference in RMSE between two states
*/
AnyType
@@ -199,6 +342,22 @@ internal_linear_svm_igd_result::run(AnyType &args) {
return tuple;
}
+/**
+ * @brief Return the coefficients and diagnostic statistics of the state
+ */
+AnyType
+internal_linear_svm_igd_minibatch_result::run(AnyType &args) {
+ SVMMinibatchState<ArrayHandle<double> > state = args[0];
+
+ AnyType tuple;
+ tuple << state.task.model
+ << static_cast<double>(state.algo.loss)
+ << 0.
+ << static_cast<int64_t>(state.algo.numRows);
+
+ return tuple;
+}
+
} // namespace convex
} // namespace modules
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/modules/convex/linear_svm_igd.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/linear_svm_igd.hpp b/src/modules/convex/linear_svm_igd.hpp
index afe169e..0958b0f 100644
--- a/src/modules/convex/linear_svm_igd.hpp
+++ b/src/modules/convex/linear_svm_igd.hpp
@@ -8,26 +8,32 @@
* @brief Linear support vector machine (incremental gradient): Transition function
*/
DECLARE_UDF(convex, linear_svm_igd_transition)
+DECLARE_UDF(convex, linear_svm_igd_minibatch_transition)
/**
* @brief Linear support vector machine (incremental gradient): State merge function
*/
DECLARE_UDF(convex, linear_svm_igd_merge)
+DECLARE_UDF(convex, linear_svm_igd_minibatch_merge)
/**
* @brief Linear support vector machine (incremental gradient): Final function
*/
DECLARE_UDF(convex, linear_svm_igd_final)
+DECLARE_UDF(convex, linear_svm_igd_minibatch_final)
/**
* @brief Linear support vector machine (incremental gradient): Difference in
* log-likelihood between two transition states
*/
DECLARE_UDF(convex, internal_linear_svm_igd_distance)
+DECLARE_UDF(convex, internal_linear_svm_igd_minibatch_distance)
+
/**
* @brief Linear support vector machine (incremental gradient): Convert
* transition state to result tuple
*/
DECLARE_UDF(convex, internal_linear_svm_igd_result)
+DECLARE_UDF(convex, internal_linear_svm_igd_minibatch_result)
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/modules/convex/task/linear_svm.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/task/linear_svm.hpp b/src/modules/convex/task/linear_svm.hpp
index 136d73b..7146432 100644
--- a/src/modules/convex/task/linear_svm.hpp
+++ b/src/modules/convex/task/linear_svm.hpp
@@ -22,10 +22,16 @@ class LinearSVM {
public:
typedef Model model_type;
typedef Tuple tuple_type;
- typedef typename Tuple::independent_variables_type
- independent_variables_type;
+
+ typedef typename Tuple::independent_variables_type independent_variables_type;
typedef typename Tuple::dependent_variable_type dependent_variable_type;
+ // Model is assumed to be a plain Eigen type or an Eigen map; the
+ // 'PlainEigenType' typedef infers the actual type from the Model definition.
+ // E.g., SVMModel is defined as a ColumnVectorTransparentHandleMap, which
+ // has ColumnVector as its PlainEigenType.
+ typedef typename model_type::PlainEigenType coefficient_type;
+
static double epsilon;
static bool is_svc;
@@ -41,6 +47,12 @@ public:
const dependent_variable_type &y,
const double &stepsize);
+ static double getLossAndUpdateModel(
+ model_type &model,
+ const independent_variables_type &x,
+ const dependent_variable_type &y,
+ const double &stepsize);
+
static double loss(
const model_type &model,
const independent_variables_type &x,
@@ -101,6 +113,57 @@ LinearSVM<Model, Tuple>::gradientInPlace(
}
}
+/**
+* @brief This function will update the model for a single batch and return the loss
+* @param model Model to update
+* @param x Batch of independent variables
+* @param y Batch of dependent variables
+* @param stepsize Learning rate for model update
+* @return Total loss in the batch
+*/
+template <class Model, class Tuple>
+double
+LinearSVM<Model, Tuple>::getLossAndUpdateModel(
+ model_type &model,
+ const independent_variables_type &x,
+ const dependent_variable_type &y,
+ const double &stepsize){
+
+ // This function is called by the minibatch transition function to update
+ // the model for each batch. x and y in the function signature are defined
+ // as generic variables to ensure a consistent interface across all modules.
+
+ // ASSUMPTION: 'gradient' will always be of the same type as the
+ // coefficients. In SVM, the model is just the coefficients, but can be
+ // more complex with other modules like MLP.
+ coefficient_type gradient = model;
+ gradient.setZero();
+ coefficient_type w_transpose_x = x * model;
+ double loss = 0.0;
+ int batch_size = x.rows();
+ double dist_from_hyperplane = 0.0;
+ double c = 0.0;
+ int n_points_with_positive_dist = 0;
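+ // Per point i: for classification (is_svc), the hinge loss
+ // max(0, 1 - y_i * w^T x_i) with subgradient -y_i * x_i on margin
+ // violation; for regression, the epsilon-insensitive loss
+ // max(0, |w^T x_i - y_i| - epsilon) with subgradient sign(w^T x_i - y_i) * x_i.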
+ for (int i = 0; i < batch_size; i++) {
+ if (is_svc) {
+ c = -y(i); // minus for "-loglik"
+ dist_from_hyperplane = 1.0 - w_transpose_x(i) * y(i);
+ } else {
+ double wx_y = w_transpose_x(i) - y(i);
+ c = wx_y > 0 ? 1.0 : -1.0;
+ dist_from_hyperplane = c * wx_y - epsilon;
+ }
+ if (dist_from_hyperplane > 0.) {
+ gradient += c * x.row(i);
+ loss += dist_from_hyperplane;
+ n_points_with_positive_dist++;
+ }
+ }
+ if (n_points_with_positive_dist > 0) { // avoid 0/0 when no point violates the margin
+ gradient.array() /= n_points_with_positive_dist;
+ model -= stepsize * gradient;
+ }
+ return loss;
+}
+
template <class Model, class Tuple>
double
LinearSVM<Model, Tuple>::loss(
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/modules/convex/type/model.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/model.hpp b/src/modules/convex/type/model.hpp
index 679dab4..4f534e4 100644
--- a/src/modules/convex/type/model.hpp
+++ b/src/modules/convex/type/model.hpp
@@ -93,17 +93,20 @@ struct LMFModel {
}
};
-// Generalized Linear Models (GLMs): Logistic regression, Linear SVM
typedef HandleTraits<MutableArrayHandle<double> >::ColumnVectorTransparentHandleMap
GLMModel;
+typedef HandleTraits<MutableArrayHandle<double> >::ColumnVectorTransparentHandleMap
+ SVMModel;
+
// The necessity of this wrapper is to allow classes in algo/ and task/ to
// have a type that they can template over
template <class Handle>
struct MLPModel {
- typename HandleTraits<Handle>::ReferenceToUInt16 is_classification;
- typename HandleTraits<Handle>::ReferenceToUInt16 activation;
- std::vector<Eigen::Map<Matrix > > u;
+ typename HandleTraits<Handle>::ReferenceToDouble is_classification;
+ typename HandleTraits<Handle>::ReferenceToDouble activation;
+ std::vector<MutableMappedMatrix> u;
/**
* @brief Space needed.
@@ -120,8 +123,8 @@ struct MLPModel {
size_t N = inNumberOfStages;
const double *n = inNumbersOfUnits;
size_t k;
- for (k = 1; k <= N; k ++) {
- size += (n[k-1] + 1) * (n[k]);
+ for (k = 0; k < N; k ++) {
+ size += (n[k] + 1) * (n[k+1]);
}
return size; // weights (u)
}
@@ -140,71 +143,87 @@ struct MLPModel {
uint32_t sizeOfU = 0;
u.clear();
- for (k = 1; k <= N; k ++) {
- u.push_back(Eigen::Map<Matrix >(
- const_cast<double*>(data + sizeOfU),
- n[k-1] + 1, n[k]));
- sizeOfU += (n[k-1] + 1) * (n[k]);
+ for (k = 0; k < N; k ++) {
+ u.push_back(MutableMappedMatrix());
+ u[k].rebind(const_cast<double *>(data + sizeOfU), n[k] + 1, n[k+1]);
+ sizeOfU += (n[k] + 1) * (n[k+1]);
}
return sizeOfU;
}
+ void initialize(const uint16_t &inNumberOfStages,
+ const double *inNumbersOfUnits){
+ size_t N = inNumberOfStages;
+ const double *n = inNumbersOfUnits;
+ size_t k;
+ double span;
+ for (k = 0; k < N; ++k) {
+ // Initialize according to Glorot and Bengio (2010)
+ // See design doc for more info
+ span = sqrt(6.0 / (n[k] + n[k+1]));
+ u[k] << span * Matrix::Random(u[k].rows(), u[k].cols());
+ }
+ }
+
double norm() const {
double norm = 0.;
size_t k;
for (k = 0; k < u.size(); k ++) {
- norm+=u[k].bottomRows(u[k].rows()-1).squaredNorm();
+ norm += u[k].bottomRows(u[k].rows()-1).squaredNorm();
}
return std::sqrt(norm);
}
void setZero(){
size_t k;
- for (k = 1; k <= u.size(); k ++) {
- u[k-1].setZero();
+ for (k = 0; k < u.size(); k ++) {
+ u[k].setZero();
}
}
/*
* Some operator wrappers for u.
*/
- MLPModel &operator*=(const double &c) {
+ MLPModel& operator*=(const double &c) {
// Note that when scaling the model, you should
// not update the bias.
size_t k;
- for (k = 1; k <= u.size(); k ++) {
- u[k-1] *= c;
+ for (k = 0; k < u.size(); k ++) {
+ u[k] *= c;
}
return *this;
}
template<class OtherHandle>
- MLPModel &operator-=(const MLPModel<OtherHandle> &inOtherModel) {
+ MLPModel& operator-=(const MLPModel<OtherHandle> &inOtherModel) {
size_t k;
- for (k = 1; k <= u.size() && k <= inOtherModel.u.size(); k ++) {
- u[k-1] -= inOtherModel.u[k-1];
+ for (k = 0; k < u.size() && k < inOtherModel.u.size(); k ++) {
+ u[k] -= inOtherModel.u[k];
}
return *this;
}
template<class OtherHandle>
- MLPModel &operator+=(const MLPModel<OtherHandle> &inOtherModel) {
+ MLPModel& operator+=(const MLPModel<OtherHandle> &inOtherModel) {
size_t k;
- for (k = 1; k <= u.size() && k <= inOtherModel.u.size(); k ++) {
- u[k-1] += inOtherModel.u[k-1];
+ for (k = 0; k < u.size() && k < inOtherModel.u.size(); k ++) {
+ u[k] += inOtherModel.u[k];
}
return *this;
}
template<class OtherHandle>
- MLPModel &operator=(const MLPModel<OtherHandle> &inOtherModel) {
+ MLPModel& operator=(const MLPModel<OtherHandle> &inOtherModel) {
size_t k;
- for (k = 1; k <= u.size() && k <= inOtherModel.u.size(); k ++) {
- u[k-1] = inOtherModel.u[k-1];
+ for (k = 0; k < u.size() && k < inOtherModel.u.size(); k ++) {
+ u[k] = inOtherModel.u[k];
}
is_classification = inOtherModel.is_classification;
activation = inOtherModel.activation;
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/modules/convex/type/state.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/state.hpp b/src/modules/convex/type/state.hpp
index 2cb2643..f846e8f 100644
--- a/src/modules/convex/type/state.hpp
+++ b/src/modules/convex/type/state.hpp
@@ -295,6 +295,85 @@ public:
} algo;
};
+template <class Handle>
+class SVMMinibatchState {
+ template <class OtherHandle>
+ friend class SVMMinibatchState;
+
+public:
+ SVMMinibatchState(const AnyType &inArray) : mStorage(inArray.getAs<Handle>()) {
+ rebind();
+ }
+
+ /**
+ * @brief Convert to backend representation
+ *
+ * We define this function so that we can use State in the
+ * argument list and as a return type.
+ */
+ inline operator AnyType() const {
+ return mStorage;
+ }
+
+ /**
+ * @brief Allocating the state.
+ */
+ inline void allocate(const Allocator &inAllocator, uint32_t nFeatures) {
+ mStorage = inAllocator.allocateArray<double, dbal::AggregateContext,
+ dbal::DoZero, dbal::ThrowBadAlloc>(arraySize(nFeatures));
+
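+ // Rebind so that task.nFeatures points into the new storage, set it,
+ // then rebind again so that task.model is mapped with the correct length.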
+ rebind();
+ task.nFeatures = nFeatures;
+ rebind();
+ }
+
+ /**
+ * @brief We need to support assigning the previous state
+ */
+ template <class OtherHandle>
+ SVMMinibatchState &operator=(const SVMMinibatchState<OtherHandle> &inOtherState) {
+ for (size_t i = 0; i < mStorage.size(); i++) {
+ mStorage[i] = inOtherState.mStorage[i];
+ }
+
+ return *this;
+ }
+
+ static inline uint32_t arraySize(const uint32_t nFeatures) {
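+ // Layout: 8 header doubles (indices 0-7; index 7 is currently unused,
+ // see rebind() below) followed by the nFeatures model coefficients.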
+ return 8 + nFeatures;
+ }
+
+protected:
+ void rebind() {
+ task.nFeatures.rebind(&mStorage[0]);
+ task.stepsize.rebind(&mStorage[1]);
+ algo.numRows.rebind(&mStorage[2]);
+ algo.loss.rebind(&mStorage[3]);
+ task.reg.rebind(&mStorage[4]);
+ algo.batchSize.rebind(&mStorage[5]);
+ algo.nEpochs.rebind(&mStorage[6]);
+ task.model.rebind(&mStorage[8], task.nFeatures);
+ }
+
+ Handle mStorage;
+
+public:
+ struct TaskState {
+ typename HandleTraits<Handle>::ReferenceToUInt32 nFeatures;
+ typename HandleTraits<Handle>::ReferenceToDouble stepsize;
+ typename HandleTraits<Handle>::ReferenceToDouble reg;
+ typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap model;
+ } task;
+
+ struct AlgoState {
+ typename HandleTraits<Handle>::ReferenceToUInt64 numRows;
+ typename HandleTraits<Handle>::ReferenceToDouble loss;
+ typename HandleTraits<Handle>::ReferenceToUInt32 batchSize;
+ typename HandleTraits<Handle>::ReferenceToUInt32 nEpochs;
+ } algo;
+};
+
+
/**
* @brief Inter- (Task State) and intra-iteration (Algo State) state of
* Conjugate Gradient for generalized linear models
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/modules/convex/type/tuple.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/tuple.hpp b/src/modules/convex/type/tuple.hpp
index 824ed90..ac070b6 100644
--- a/src/modules/convex/type/tuple.hpp
+++ b/src/modules/convex/type/tuple.hpp
@@ -61,6 +61,9 @@ using madlib::dbal::eigen_integration::MappedColumnVector;
// Generalized Linear Models (GLMs): Logistic regression, Linear SVM
typedef ExampleTuple<MappedColumnVector, double> GLMTuple;
+typedef ExampleTuple<MappedColumnVector, double> SVMTuple;
+typedef ExampleTuple<Matrix, ColumnVector> SVMMiniBatchTuple;
+
// madlib::modules::convex::MatrixIndex
typedef ExampleTuple<MatrixIndex, double> LMFTuple;
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/ports/postgres/modules/svm/svm.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in
index 4760f36..a57a135 100644
--- a/src/ports/postgres/modules/svm/svm.py_in
+++ b/src/ports/postgres/modules/svm/svm.py_in
@@ -36,36 +36,60 @@ def _compute_svm(args):
"""
init_stepsize = args['init_stepsize']
args['stepsize'] = init_stepsize
+ batch_size = args['batch_size']
+ args['dist_func'] = ('internal_linear_svm_igd_distance' if batch_size == 1
+ else 'internal_linear_svm_igd_minibatch_distance')
iterationCtrl = GroupIterationController(args)
with iterationCtrl as it:
it.iteration = 0
has_converged = False
while not has_converged:
- it.update(
- """
- {schema_madlib}.linear_svm_igd_step(
- ({col_ind_var})::FLOAT8[],
- ({col_dep_var_trans})::FLOAT8,
- {rel_state}.{col_grp_state},
- {n_features}::INT4,
- {stepsize}::FLOAT8,
- {lambda}::FLOAT8,
- {is_l2}::BOOLEAN,
- {col_n_tuples},
- ({select_epsilon})::FLOAT8,
- {is_svc}::BOOLEAN,
- {class_weight_sql}::FLOAT8
- )
- """)
+ if batch_size == 1:
+ it.update(
+ """
+ {schema_madlib}.linear_svm_igd_step(
+ ({col_ind_var})::FLOAT8[],
+ ({col_dep_var_trans})::FLOAT8,
+ {rel_state}.{col_grp_state},
+ {n_features}::INT4,
+ {stepsize}::FLOAT8,
+ {lambda}::FLOAT8,
+ {is_l2}::BOOLEAN,
+ {col_n_tuples},
+ ({select_epsilon})::FLOAT8,
+ {is_svc}::BOOLEAN,
+ {class_weight_sql}::FLOAT8
+ )
+ """)
+ else:
+ it.update(
+ """
+ {schema_madlib}.linear_svm_igd_minibatch_step(
+ ({col_ind_var})::FLOAT8[][],
+ ({col_dep_var_trans})::FLOAT8[],
+ {rel_state}.{col_grp_state},
+ {n_features}::INT4,
+ {stepsize}::FLOAT8,
+ {lambda}::FLOAT8,
+ {is_l2}::BOOLEAN,
+ {col_n_tuples},
+ ({select_epsilon})::FLOAT8,
+ {is_svc}::BOOLEAN,
+ {class_weight_sql}::FLOAT8,
+ {n_epochs}::INTEGER,
+ {batch_size}::INTEGER
+ )
+ """)
it.info()
if it.kwargs['decay_factor'] > 0:
it.kwargs['stepsize'] *= it.kwargs['decay_factor']
else:
it.kwargs['stepsize'] = init_stepsize / (it.iteration + 1)
+
has_converged = it.test(
"""
{iteration} >= {max_iter}
- OR {schema_madlib}.internal_linear_svm_igd_distance(
+ OR {schema_madlib}.{dist_func}(
_state_previous, _state_current) < {tolerance}
""")
it.final()
@@ -89,9 +113,9 @@ def _verify_table(source_table, model_table, dependent_varname,
"('{dependent_varname}') for source_table "
"({source_table})!".format(dependent_varname=dependent_varname,
source_table=source_table))
- dep_type = get_expr_type(dependent_varname, source_table)
- if '[]' in dep_type:
- plpy.error("SVM error: dependent_varname cannot be of array type!")
+ # dep_type = get_expr_type(dependent_varname, source_table)
+ # if '[]' in dep_type:
+ # plpy.error("SVM error: dependent_varname cannot be of array type!")
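+ # (check disabled: with the minibatch solver the dependent variable is
+ # an array per tuple)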
# validate output tables
output_tbl_valid(model_table, 'SVM')
@@ -157,6 +181,8 @@ def _build_output_tables(n_iters_run, args, **kwargs):
else:
groupby_str, grouping_str1, using_str = "", "", "ON TRUE"
# organizing results
+ result_func = ("internal_linear_svm_igd_result" if args['batch_size'] == 1
+ else "internal_linear_svm_igd_minibatch_result")
args.update(locals())
model_table_query = """
CREATE TABLE {model_table} AS
@@ -173,7 +199,7 @@ def _build_output_tables(n_iters_run, args, **kwargs):
FROM
(
SELECT
- {schema_madlib}.internal_linear_svm_igd_result(
+ {schema_madlib}.{result_func}(
{col_grp_state}
) AS result,
{col_grp_key}
@@ -952,9 +978,9 @@ def svm(schema_madlib, source_table, model_table,
_verify_table(source_table, model_table,
dependent_varname, independent_varname)
reserved_cols =['coef', 'random_feature_data',
- 'random_feature_data', 'loss'
- 'num_rows_processed', 'num_rows_skipped',
- 'norm_of_gradient', 'num_iterations']
+ 'random_feature_data', 'loss',
+ 'num_rows_processed', 'num_rows_skipped',
+ 'norm_of_gradient', 'num_iterations']
grouping_str, grouping_col = \
get_grouping_col_str(schema_madlib, 'SVM', reserved_cols,
source_table, grouping_col)
@@ -1151,7 +1177,15 @@ def _svm_parsed_params(schema_madlib, source_table, model_table,
datasets.
"""
- n_features = num_features(source_table, independent_varname)
+ # n_features = num_features(source_table, independent_varname)
+
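+ # For minibatch, the independent variable is a 2-D array per tuple
+ # (points x features), so the feature count is the upper bound of its
+ # second dimension.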
+ upper_dim = 1 if params_dict['batch_size'] == 1 else 2
+ n_features = plpy.execute("SELECT array_upper({0}, {2}) AS dim "
+ "FROM {1} LIMIT 1".
+ format(independent_varname,
+ source_table,
+ upper_dim))[0]['dim']
+
if update_source_for_one_class:
# This block is run only when the caller is svm_one_class
@@ -1204,7 +1238,9 @@ def _svm_parsed_params(schema_madlib, source_table, model_table,
args.update(_verify_get_params_dict(params_dict))
args.update(_process_epsilon(is_svc, args))
- args.update(_svc_or_svr(is_svc, source_table, dependent_varname))
+
+ is_sgd = params_dict['batch_size'] <= 1
+ args.update(_svc_or_svr(is_svc, source_table, dependent_varname, is_sgd))
# place holder for compatibility
plpy.execute("CREATE TABLE pg_temp.{0} AS SELECT 1".format(args['rel_args']))
@@ -1254,13 +1290,13 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name,
input_tbl_valid(new_data_table, 'SVM')
reserved_cols =['coef', 'random_feature_data',
- 'random_feature_data', 'loss'
- 'num_rows_processed', 'num_rows_skipped',
- 'norm_of_gradient', 'num_iterations']
+ 'random_feature_data', 'loss',
+ 'num_rows_processed', 'num_rows_skipped',
+ 'norm_of_gradient', 'num_iterations']
grouping_str, grouping_col = get_grouping_col_str(schema_madlib,
- 'SVM', reserved_cols,
- new_data_table,
- grouping_col)
+ 'SVM', reserved_cols,
+ new_data_table,
+ grouping_col)
_assert(is_var_valid(new_data_table, independent_varname),
"SVM Error: independent_varname ('" + independent_varname +
"') is invalid for new_data_table (" + new_data_table + ")!")
@@ -1341,43 +1377,67 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name,
# -----------------------------------------------------------------------------
-def _svc_or_svr(is_svc, source_table, dependent_varname):
+def _svc_or_svr(is_svc, source_table, dependent_varname, is_sgd):
# transform col_dep_var to binary (1`or -1) if classification
_args = {'col_dep_var_trans': dependent_varname,
'mapping': 'NULL',
'method': 'SVR'}
if is_svc:
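+ # For minibatch, the dependent variable is an array per tuple, so
+ # unnest it when enumerating the distinct class labels.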
+ if is_sgd:
+ src = source_table
+ else:
+ src = "(SELECT unnest({0}) as {0} FROM {1}) q".format(dependent_varname, source_table)
# dependent variable mapping
dep_labels = plpy.execute("""
SELECT {dependent_varname} AS y
- FROM {source_table}
+ FROM {src}
WHERE ({dependent_varname}) IS NOT NULL
GROUP BY ({dependent_varname})
ORDER BY ({dependent_varname})
- """.format(source_table=source_table,
+ """.format(src=src,
dependent_varname=dependent_varname))
- dep_var_mapping = ["'{0}'".format(d['y'])
- if isinstance(d['y'], basestring)
- else str(d['y']) for d in dep_labels]
-
- _assert(1 <= len(dep_var_mapping) <= 2,
+ _assert(1 <= len(dep_labels) <= 2,
"SVM Error: Classification currently "
"only supports unary or binary output!. Found values {0}".
- format(dep_var_mapping))
-
- col_dep_var_trans = ("""
- CASE WHEN ({col_dep_var}) IS NULL THEN NULL
- WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN -1.0
- ELSE 1.0
- END
- """.format(col_dep_var=dependent_varname,
- mapped_value_for_negative=dep_var_mapping[0]))
+ format(dep_labels))
+
+ dep_labels_str = ["'{0}'".format(d['y'])
+ if isinstance(d['y'], basestring)
+ else str(d['y']) for d in dep_labels]
+
+ # map the dependent variable labels to -1 and 1 to represent the two
+ # sides of the hyperplane (only supporting unary/binary for now)
+ if is_sgd:
+ col_dep_var_trans = """
+ CASE WHEN ({dependent_varname}) IS NULL THEN NULL
+ WHEN ({dependent_varname}) = {mapped_value_for_negative} THEN -1.0
+ ELSE 1.0
+ END
+ """
+ else:
+ # For minibatch, the dependent_varname is an array. So unnest the
+ # array before mapping to 1/-1.
+ col_dep_var_trans = """
+ ARRAY(SELECT
+ CASE WHEN ({dependent_varname}) IS NULL THEN NULL
+ WHEN ({dependent_varname}) = {mapped_value_for_negative} THEN -1.0
+ ELSE 1.0
+ END
+ FROM UNNEST({dependent_varname}) as {dependent_varname}
+ )
+ """
+
+ # col_dep_var_trans is used by the update query in _compute_svm to
+ # transform dependent variable labels to -1 and 1.
+ col_dep_var_trans = col_dep_var_trans.format(
+ dependent_varname=dependent_varname,
+ mapped_value_for_negative=dep_labels_str[0])
_args.update({
- 'mapped_value_for_negative': dep_var_mapping[0],
+ 'mapped_value_for_negative': dep_labels_str[0],
'col_dep_var_trans': col_dep_var_trans,
- 'mapping': dep_var_mapping[0] + "," + dep_var_mapping[1],
+ 'mapping': dep_labels_str[0] + "," + dep_labels_str[1],
'method': 'SVC'})
return _args
# -----------------------------------------------------------------------------
@@ -1489,7 +1549,9 @@ def _extract_params(schema_madlib, params, module='SVM'):
'validation_result': '',
'epsilon': [0.01],
'eps_table': '',
- 'class_weight': ''}
+ 'class_weight': '',
+ 'n_epochs': 1,
+ 'batch_size': 1}
params_types = {
'init_stepsize': list,
@@ -1502,7 +1564,9 @@ def _extract_params(schema_madlib, params, module='SVM'):
'validation_result': str,
'epsilon': list,
'eps_table': str,
- 'class_weight': str}
+ 'class_weight': str,
+ 'n_epochs': int,
+ 'batch_size': int}
params_vals = extract_keyvalue_params(params, params_types, params_default)
if params_vals['n_folds'] < 0:
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/ports/postgres/modules/svm/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/svm.sql_in b/src/ports/postgres/modules/svm/svm.sql_in
index f3948a8..78d7b76 100644
--- a/src/ports/postgres/modules/svm/svm.sql_in
+++ b/src/ports/postgres/modules/svm/svm.sql_in
@@ -79,7 +79,7 @@ svm_classification(
<DD>TEXT. Expression list to evaluate for the
independent variables. An intercept variable should not be included as part
of this expression. See 'fit_intercept' in the kernel params for info on
- intercepts. Please note that expression should be able to be cast
+ intercepts. Please note that expression should be able to be cast
to DOUBLE PRECISION[].
<DT>kernel_func (optional)</DT>
@@ -339,7 +339,7 @@ the parameter is ignored.
Hyperparameter optimization can be carried out using the built-in cross
validation mechanism, which is activated by assigning a value greater than 1 to
-the parameter \e n_folds in \e params.
+the parameter \e n_folds in \e params.
Please note that cross validation is not
supported if grouping is used.
@@ -442,7 +442,7 @@ while the other k - 1 folds form the training set.
</DD>
<DT>class_weight</dt>
-<DD>Default: 1 for classification, 'balanced' for one-class novelty detection,
+<DD>Default: 1 for classification, 'balanced' for one-class novelty detection,
n/a for regression.
Set the weight for the positive and negative classes. If not given, all classes
@@ -495,8 +495,8 @@ table name is already in use, then an error is returned. Table contains:</DD>
</tr>
<tr>
<th>prediction</th>
- <td>Provides the prediction for each row in new_data_table.
- For regression this would be the same as decision_function. For classification,
+ <td>Provides the prediction for each row in new_data_table.
+ For regression this would be the same as decision_function. For classification,
this will be one of the dependent variable values.</td>
</tr>
<tr>
@@ -645,9 +645,9 @@ num_rows_skipped | -1
dep_var_mapping | {-1,1}
</pre>
-# Now let's look at the prediction functions. We want to predict if house price
-is less than $100,000. In the following examples we will
+is less than $100,000. In the following examples we will
use the training data set for prediction as well, which is not usual but serves to
-show the syntax. The predicted results are in the \e prediction column and the
+show the syntax. The predicted results are in the \e prediction column and the
actual data is in the \e target column.
For the linear model:
<pre class="example">
@@ -657,7 +657,7 @@ SELECT *, price < 100000 AS target FROM houses JOIN houses_pred USING (id) ORDER
</pre>
Result:
<pre class="result">
- id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target
+ id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target
----+------+---------+------+--------+------+-------+------------+--------------------+--------
1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 104.685894748292 | t
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 200.592436923938 | t
@@ -683,7 +683,7 @@ SELECT *, price < 100000 AS target FROM houses JOIN houses_pred_gaussian USING (
</pre>
This produces a more accurate result than the linear case for this small data set:
<pre class="result">
- id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target
+ id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target
----+------+---------+------+--------+------+-------+------------+-------------------+--------
1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 1.00338548176312 | t
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 1.00000000098154 | t
@@ -709,7 +709,7 @@ SELECT * FROM houses JOIN houses_regr USING (id) ORDER BY id;
</pre>
Result for the linear regression model:
<pre class="result">
- id | tax | bedroom | bath | price | size | lot | prediction | decision_function
+ id | tax | bedroom | bath | price | size | lot | prediction | decision_function
----+------+---------+------+--------+------+-------+------------------+-------------------
1 | 590 | 2 | 1 | 50000 | 770 | 22100 | 55288.6992755623 | 55288.6992755623
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | 99978.8137019119 | 99978.8137019119
@@ -733,7 +733,7 @@ DROP TABLE IF EXISTS houses_gaussian_regr;
SELECT madlib.svm_predict('houses_svm_gaussian_regression', 'houses', 'id', 'houses_gaussian_regr');
SELECT * FROM houses JOIN houses_gaussian_regr USING (id) ORDER BY id;
</pre>
--# For the novelty detection using one-class, let's create a test data set using
+-# For the novelty detection using one-class, let's create a test data set using
the last 3 values from the training set plus an outlier at the end (10x price):
<pre class="example">
DROP TABLE IF EXISTS houses_one_class_test;
@@ -754,7 +754,7 @@ SELECT * FROM houses_one_class_test JOIN houses_one_class_pred USING (id) ORDER
</pre>
Result showing the last row predicted to be novel:
<pre class="result">
- id | tax | bedroom | bath | price | size | lot | prediction | decision_function
+ id | tax | bedroom | bath | price | size | lot | prediction | decision_function
----+------+---------+------+--------+------+-------+------------+---------------------
1 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 1 | 0.111497008121437
2 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 1 | 0.0996021345169148
@@ -938,6 +938,86 @@ CREATE AGGREGATE MADLIB_SCHEMA.linear_svm_igd_step(
INITCOND='{0,0,0,0,0,0,0}'
);
+--------------------------------------------------------------------------
+-- create SQL functions for the minibatch IGD optimizer
+--------------------------------------------------------------------------
+-- cannot be labeled as STRICT because we set previous_state NULL initially
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_minibatch_transition(
+ state double precision[],
+ ind_var double precision[][],
+ dep_var double precision[],
+ previous_state double precision[],
+ dimension integer,
+ stepsize double precision,
+ reg double precision,
+ is_l2 boolean,
+ n_tuples integer,
+ epsilon double precision,
+ is_svc boolean,
+ tuple_weight double precision,
+ n_epochs integer,
+ batch_size integer
+)
+RETURNS double precision[] AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_minibatch_merge(
+ state1 double precision[],
+ state2 double precision[])
+RETURNS double precision[] AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_minibatch_final(
+ state double precision[])
+RETURNS double precision[] AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
+
+/**
+ * @internal
+ * @brief Perform one iteration of the minibatch incremental gradient
+ * method for computing the linear support vector machine
+ */
+DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linear_svm_igd_minibatch_step(
+ /*+ ind_var */ double precision[][],
+ /*+ dep_var */ double precision[],
+ /*+ previous_state */ double precision[],
+ /*+ dimension */ integer,
+ /*+ stepsize */ double precision,
+ /*+ reg */ double precision,
+ /*+ is_l2 */ boolean,
+ /*+ n_tuples */ integer,
+ /*+ epsilon */ double precision,
+ /*+ is_svc */ boolean,
+ /*+ tuple_weight */ double precision,
+ /*+ n_epochs */ integer,
+ /*+ batch_size */ integer
+);
+CREATE AGGREGATE MADLIB_SCHEMA.linear_svm_igd_minibatch_step(
+ /*+ ind_var */ double precision[][],
+ /*+ dep_var */ double precision[],
+ /*+ previous_state */ double precision[],
+ /*+ dimension */ integer,
+ /*+ stepsize */ double precision,
+ /*+ reg */ double precision,
+ /*+ is_l2 */ boolean,
+ /*+ n_tuples */ integer,
+ /*+ epsilon */ double precision,
+ /*+ is_svc */ boolean,
+ /*+ tuple_weight */ double precision,
+ /*+ n_epochs */ integer,
+ /*+ batch_size */ integer
+ ) (
+ STYPE=double precision[],
+ SFUNC=MADLIB_SCHEMA.linear_svm_igd_minibatch_transition,
+ m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.linear_svm_igd_minibatch_merge,')
+ FINALFUNC=MADLIB_SCHEMA.linear_svm_igd_minibatch_final,
+ INITCOND='{0,0,0,0,0,0,0,0,0}'
+);
+
+
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_distance(
/*+ state1 */ double precision[],
/*+ state2 */ double precision[])
@@ -945,12 +1025,25 @@ RETURNS double precision AS 'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_minibatch_distance(
+ /*+ state1 */ double precision[],
+ /*+ state2 */ double precision[])
+RETURNS double precision AS 'MODULE_PATHNAME'
+LANGUAGE c IMMUTABLE STRICT
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
+
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_result(
/*+ state */ double precision[])
RETURNS MADLIB_SCHEMA.linear_svm_result AS 'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_minibatch_result(
+ /*+ state */ double precision[])
+RETURNS MADLIB_SCHEMA.linear_svm_result AS 'MODULE_PATHNAME'
+LANGUAGE c IMMUTABLE STRICT
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
+
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
source_table text,
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/ports/postgres/modules/svm/test/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/test/svm.sql_in b/src/ports/postgres/modules/svm/test/svm.sql_in
index 60d280e..d9e9383 100644
--- a/src/ports/postgres/modules/svm/test/svm.sql_in
+++ b/src/ports/postgres/modules/svm/test/svm.sql_in
@@ -903,3 +903,107 @@ SELECT
'The dimension of the coefficients must be equal to n_components (3)!')
FROM m9;
*/
+
+
+-- minibatch -----------------------------------------------------------
+drop table if exists svm_minibatch_train;
+CREATE TABLE svm_minibatch_train (
+ id integer,
+ x double precision[][],
+ rings integer[],
+ sex text[]
+);
+
+COPY svm_minibatch_train (id, x, rings, sex) FROM stdin DELIMITER '|';
+0|{{0.53,0.42,0.17,0.828,0.41,0.208,0.1505},{0.27,0.195,0.06,0.073,0.0285,0.0235,0.03},{0.31,0.23,0.07,0.1245,0.0505,0.0265,0.038},{0.36,0.27,0.085,0.2185,0.1065,0.038,0.062},{0.32,0.24,0.08,0.18,0.08,0.0385,0.055},{0.3,0.22,0.08,0.121,0.0475,0.042,0.035},{0.5,0.39,0.135,0.6595,0.3145,0.1535,0.1565},{0.295,0.215,0.07,0.121,0.047,0.0155,0.0405},{0.375,0.28,0.08,0.226,0.105,0.047,0.065}}|{6,5,6,6,6,5,6,6,6}|{F,M,F,F,F,M,F,F,F}
+1|{{0.415,0.31,0.105,0.3595,0.167,0.083,0.0915},{0.35,0.25,0.07,0.1605,0.0715,0.0335,0.046},{0.415,0.33,0.09,0.3595,0.17,0.081,0.09},{0.66,0.475,0.18,1.3695,0.641,0.294,0.335},{0.415,0.31,0.09,0.2815,0.1245,0.0615,0.085},{0.35,0.265,0.09,0.2265,0.0995,0.0575,0.065},{0.215,0.155,0.06,0.0525,0.021,0.0165,0.015},{0.35,0.27,0.075,0.215,0.1,0.036,0.065},{0.255,0.18,0.065,0.079,0.034,0.014,0.025},{0.28,0.22,0.08,0.1315,0.066,0.024,0.03}}|{6,6,6,6,6,6,5,6,5,5}|{F,F,F,F,F,F,M,F,M,M}
+2|{{0.27,0.19,0.08,0.081,0.0265,0.0195,0.03},{0.375,0.29,0.095,0.2875,0.123,0.0605,0.08},{0.27,0.2,0.08,0.1205,0.0465,0.028,0.04},{0.235,0.175,0.065,0.0615,0.0205,0.02,0.019},{0.24,0.17,0.05,0.0545,0.0205,0.016,0.0155},{0.34,0.255,0.085,0.204,0.097,0.021,0.05},{0.275,0.22,0.08,0.1365,0.0565,0.0285,0.042},{0.385,0.28,0.09,0.228,0.1025,0.042,0.0655},{0.355,0.27,0.075,0.1775,0.079,0.0315,0.054},{0.27,0.205,0.05,0.084,0.03,0.0185,0.029}}|{6,6,6,6,5,6,6,5,6,6}|{F,F,F,F,M,F,F,M,F,F}
+3|{{0.335,0.26,0.085,0.192,0.097,0.03,0.054},{0.26,0.215,0.08,0.099,0.037,0.0255,0.045},{0.315,0.21,0.06,0.125,0.06,0.0375,0.035},{0.585,0.45,0.125,0.874,0.3545,0.2075,0.225},{0.44,0.345,0.13,0.4495,0.209,0.0835,0.134},{0.315,0.23,0.08,0.1375,0.0545,0.031,0.0445},{0.35,0.25,0.07,0.18,0.0655,0.048,0.054},{0.31,0.24,0.105,0.2885,0.118,0.065,0.083},{0.325,0.23,0.09,0.147,0.06,0.034,0.045},{0.28,0.21,0.075,0.1195,0.053,0.0265,0.03}}|{6,5,5,6,6,5,6,6,4,6}|{F,M,M,F,F,M,F,F,F,F}
+4|{{0.41,0.31,0.09,0.3335,0.1635,0.061,0.091},{0.595,0.475,0.16,1.1405,0.547,0.231,0.271},{0.385,0.3,0.09,0.247,0.1225,0.044,0.0675},{0.455,0.335,0.105,0.422,0.229,0.0865,0.1},{0.245,0.18,0.065,0.0635,0.0245,0.0135,0.02},{0.155,0.115,0.025,0.024,0.009,0.005,0.0075},{0.28,0.215,0.08,0.132,0.072,0.022,0.033},{0.335,0.25,0.08,0.1695,0.0695,0.044,0.0495},{0.175,0.125,0.05,0.0235,0.008,0.0035,0.008},{0.275,0.205,0.075,0.1105,0.045,0.0285,0.035}}|{6,6,5,6,4,5,5,6,5,6}|{F,F,M,F,F,M,M,F,M,F}
+5|{{0.41,0.325,0.1,0.394,0.208,0.0655,0.106},{0.4,0.295,0.095,0.252,0.1105,0.0575,0.066},{0.26,0.2,0.07,0.092,0.037,0.02,0.03},{0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195},{0.255,0.185,0.07,0.075,0.028,0.018,0.025},{0.385,0.3,0.115,0.3435,0.1645,0.085,0.1025},{0.325,0.27,0.1,0.185,0.08,0.0435,0.065},{0.28,0.205,0.1,0.1165,0.0545,0.0285,0.03},{0.275,0.2,0.065,0.092,0.0385,0.0235,0.027},{0.38,0.275,0.095,0.2505,0.0945,0.0655,0.075}}|{6,6,6,6,6,6,6,5,5,6}|{F,F,F,F,F,F,F,M,M,F}
+6|{{0.365,0.255,0.08,0.1985,0.0785,0.0345,0.053},{0.175,0.135,0.04,0.0305,0.011,0.0075,0.01},{0.515,0.375,0.11,0.6065,0.3005,0.131,0.15},{0.23,0.18,0.05,0.064,0.0215,0.0135,0.02},{0.185,0.135,0.04,0.027,0.0105,0.0055,0.009},{0.33,0.24,0.075,0.163,0.0745,0.033,0.048},{0.37,0.265,0.075,0.214,0.09,0.051,0.07},{0.325,0.245,0.07,0.161,0.0755,0.0255,0.045},{0.19,0.13,0.045,0.0265,0.009,0.005,0.009},{0.325,0.245,0.075,0.1495,0.0605,0.033,0.045}}|{5,5,6,5,5,6,6,6,5,5}|{M,M,F,M,M,F,F,F,M,M}
+7|{{0.44,0.34,0.105,0.369,0.164,0.08,0.1015},{0.27,0.195,0.08,0.1,0.0385,0.0195,0.03},{0.32,0.235,0.08,0.1485,0.064,0.031,0.045},{0.53,0.41,0.14,0.681,0.3095,0.1415,0.1835},{0.405,0.285,0.09,0.2645,0.1265,0.0505,0.075},{0.45,0.33,0.11,0.3685,0.16,0.0885,0.102},{0.245,0.175,0.055,0.0785,0.04,0.018,0.02},{0.38,0.275,0.095,0.2425,0.106,0.0485,0.21},{0.47,0.36,0.11,0.4965,0.237,0.127,0.13},{0.37,0.27,0.095,0.2175,0.097,0.046,0.065}}|{5,6,6,6,6,6,5,6,6,6}|{M,F,F,F,F,F,M,F,F,F}
+8|{{0.35,0.265,0.08,0.192,0.081,0.0465,0.053},{0.28,0.2,0.075,0.1225,0.0545,0.0115,0.035},{0.385,0.3,0.1,0.2725,0.1115,0.057,0.08},{0.335,0.245,0.09,0.1665,0.0595,0.04,0.06},{0.285,0.21,0.07,0.109,0.044,0.0265,0.033},{0.415,0.305,0.1,0.325,0.156,0.0505,0.091},{0.43,0.335,0.105,0.378,0.188,0.0785,0.09},{0.175,0.125,0.04,0.024,0.0095,0.006,0.005},{0.4,0.315,0.085,0.2675,0.116,0.0585,0.0765},{0.375,0.285,0.09,0.2545,0.119,0.0595,0.0675}}|{6,5,6,6,5,6,6,4,6,6}|{F,M,F,F,M,F,F,F,F,F}
+9|{{0.365,0.27,0.105,0.2155,0.0915,0.0475,0.063},{0.36,0.27,0.09,0.2075,0.098,0.039,0.062},{0.33,0.23,0.085,0.1695,0.079,0.026,0.0505},{0.34,0.26,0.085,0.1885,0.0815,0.0335,0.06},{0.285,0.215,0.075,0.106,0.0415,0.023,0.035},{0.195,0.145,0.05,0.032,0.01,0.008,0.012},{0.32,0.24,0.07,0.133,0.0585,0.0255,0.041},{0.495,0.4,0.155,0.8085,0.2345,0.1155,0.35},{0.475,0.36,0.12,0.5915,0.3245,0.11,0.127},{0.395,0.27,0.1,0.2985,0.1445,0.061,0.082}}|{6,6,6,6,5,4,6,6,6,5}|{F,F,F,F,M,F,F,F,F,M}
+\.
+
+DROP TABLE IF EXISTS svm_minibatch_test;
+CREATE TABLE svm_minibatch_test AS
+SELECT id,
+ ARRAY[round(length::numeric, 4),
+ round(diameter::numeric, 4),
+ round(height::numeric, 4),
+ round(whole::numeric, 4),
+ round(shucked::numeric, 4),
+ round(viscera::numeric, 4),
+ round(shell::numeric, 4)] as x,
+ rings,
+ CASE WHEN sex = 'I' then 'F' else sex end as sex
+FROM abalone_train_small_tmp;
+
+------ Regression-------
+DROP TABLE IF EXISTS svm_minibatch_reg_out, svm_minibatch_reg_out_summary;
+SELECT svm_regression(
+ 'svm_minibatch_train',
+ 'svm_minibatch_reg_out',
+ 'rings',
+ 'x',
+ 'linear',
+ NULL,
+ NULL,
+ 'max_iter=10, init_stepsize=0.2, batch_size=3, n_epochs=3'
+);
+
+DROP TABLE IF EXISTS svm_predict_reg_minibatch_out;
+SELECT svm_predict('svm_minibatch_reg_out', 'svm_minibatch_test', 'id', 'svm_predict_reg_minibatch_out');
+
+SELECT assert(error < 1, 'Training error ' || error || ' with SVM regression minibatch is too high (>1)')
+FROM (SELECT avg((rings - prediction)^2) AS error
+FROM svm_minibatch_test JOIN svm_predict_reg_minibatch_out
+USING (id)) q;
+
+
+-- testing for batch_size bigger than the number of data points --------------
+-- batch_size = 30 exceeds the number of data points in any tuple of svm_minibatch_train;
+-- this should not error and should treat the whole matrix as a single batch
+DROP TABLE IF EXISTS svm_minibatch_reg_out, svm_minibatch_reg_out_summary;
+SELECT svm_regression(
+ 'svm_minibatch_train',
+ 'svm_minibatch_reg_out',
+ 'rings',
+ 'x',
+ 'linear',
+ NULL,
+ NULL,
+ 'max_iter=10, init_stepsize=0.2, batch_size=30, n_epochs=1'
+);
+
+------ Classification -------
+DROP TABLE IF EXISTS svm_minibatch_classification_out, svm_minibatch_classification_out_summary;
+\timing on
+SELECT svm_classification(
+ 'svm_minibatch_train',
+ 'svm_minibatch_classification_out',
+ 'sex',
+ 'x',
+ 'linear',
+ NULL,
+ NULL,
+ 'max_iter=10, init_stepsize=0.2, batch_size=3, n_epochs=2'
+);
+\timing off
+
+DROP TABLE IF EXISTS svm_predict_classification_minibatch_out;
+SELECT svm_predict('svm_minibatch_classification_out',
+ 'svm_minibatch_test',
+ 'id',
+ 'svm_predict_classification_minibatch_out');
+
+SELECT assert(accuracy >= 0.70,
+ 'Training accuracy '|| accuracy ||
+ ' with SVM classification minibatch is too low (<0.7)')
+FROM (SELECT count(*)/99. as accuracy
+ FROM svm_minibatch_test JOIN svm_predict_classification_minibatch_out
+ USING (id)
+ WHERE sex = prediction
+ ) q;
http://git-wip-us.apache.org/repos/asf/madlib/blob/a8bbe082/src/ports/postgres/modules/utilities/validate_args.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index 2b9c6d7..a2f43fd 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -475,9 +475,9 @@ def array_col_dimension(tbl, col):
if col is None:
plpy.error('Input error: Column name is invalid')
dim = plpy.execute("""
- SELECT max(array_upper({col}, 1)) AS dim
- FROM {tbl}
- """.format(col=col, tbl=tbl))[0]["dim"]
+ SELECT max(array_upper({col}, 1)) AS dim
+ FROM {tbl}
+ """.format(col=col, tbl=tbl))[0]["dim"]
return dim
# ------------------------------------------------------------------------
@@ -491,15 +491,12 @@ def array_col_has_same_dimension(tbl, col):
if col is None or col.lower() == 'null':
plpy.error('Input error: Column name is invalid')
- max_dim = plpy.execute("""
- SELECT max(array_upper({col}, 1)) AS max_dim
- FROM {tbl}
- """.format(col=col, tbl=tbl))[0]["max_dim"]
- min_dim = plpy.execute("""
- SELECT min(array_upper({col}, 1)) AS min_dim
+ results = plpy.execute("""
+ SELECT min(array_upper({col}, 1)) AS min_dim,
+ max(array_upper({col}, 1)) AS max_dim
FROM {tbl}
- """.format(col=col, tbl=tbl))[0]["min_dim"]
- return max_dim == min_dim
+ """.format(col=col, tbl=tbl))[0]
+ return results['max_dim'] == results['min_dim']
# ------------------------------------------------------------------------