Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/13 13:20:33 UTC
[40/50] [abbrv] incubator-singa git commit: SINGA-192 Implement optimization algorithms for v1
SINGA-192 Implement optimization algorithms for v1
Merge branch PR#164 into dev
Fix the bugs in the adagrad and rmsprop tests.
Note: EXPECT_NEAR (with a tolerance of 1e-5) is used instead of exact comparison to avoid
numeric precision issues. The tests still need to be run on more machines.
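
For reference, a minimal self-contained googletest sketch of the EXPECT_NEAR idiom the fixed tests rely on (the test name is hypothetical; link against gtest):

#include "gtest/gtest.h"

// EXPECT_FLOAT_EQ tolerates only ~4 ULPs of error, which can fail when a
// value is recomputed with a different instruction order (e.g. host vs. GPU);
// EXPECT_NEAR takes an explicit absolute tolerance instead.
TEST(Example, NearTolerance) {
  float a = 0.0f;
  for (int i = 0; i < 10; ++i) a += 0.1f;  // accumulates rounding error
  EXPECT_NEAR(a, 1.0f, 1e-5);              // passes within the 1e-5 bound
}

int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}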
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5784bff3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5784bff3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5784bff3
Branch: refs/heads/master
Commit: 5784bff3e5ebfb3a992624d10f03f30cd5e520a3
Parents: 6d69047 178db01
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Jun 12 15:43:53 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 18:03:12 2016 +0800
----------------------------------------------------------------------
include/singa/model/optimizer.h | 43 ++++++++++++++
src/core/tensor/math_kernel.cu | 14 ++---
src/core/tensor/math_kernel.h | 2 +-
src/core/tensor/tensor.cc | 3 +-
src/model/optimizer/adagrad.cc | 36 ++++++++++++
src/model/optimizer/nesterov.cc | 43 ++++++++++++++
src/model/optimizer/rmsprop.cc | 41 ++++++++++++++
src/proto/model.proto | 3 +
test/singa/test_adagrad.cc | 96 +++++++++++++++++++++++++++++++
test/singa/test_nesterov.cc | 101 +++++++++++++++++++++++++++++++++
test/singa/test_rmsprop.cc | 106 +++++++++++++++++++++++++++++++++++
11 files changed, 478 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.cu
index b618f9b,aed6add..484868a
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@@ -236,192 -300,151 +236,192 @@@ __global__ void KernelThreshold(const s
}
}
- __global__ void KernelGE(const int num, const float *in, const float x,
-__global__ void kernel_div(const float *src_data_a, const float *src_data_b,
- float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data_a[index] / src_data_b[index];
++__global__ void KernelGE(const size_t num, const float *in, const float x,
+ float *out) {
+ for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+ idx += blockDim.x * gridDim.x) {
+ out[idx] = in[idx] >= x ? 1.0f : 0.0f;
}
}
- __global__ void KernelGT(const int num, const float *in, const float x,
-
-__global__ static void kernel_set_value(float *data, float value, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- data[index] = value;
++__global__ void KernelGT(const size_t num, const float *in, const float x,
+ float *out) {
+ for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+ idx += blockDim.x * gridDim.x) {
+ out[idx] = in[idx] > x ? 1.0f : 0.0f;
}
}
- __global__ void KernelLE(const int num, const float *in, const float x,
-
-__global__ void kernel_threshold(const float *src_data, float *des_data,
- float alpha, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
++__global__ void KernelLE(const size_t num, const float *in, const float x,
+ float *out) {
+ for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+ idx += blockDim.x * gridDim.x) {
+ out[idx] = in[idx] <= x ? 1.0f : 0.0f;
}
}
-void sum(int n, const float *in, float *out) {
- int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
- // here, we only need one block
- int num_blocks = 1;
- __global__ void KernelLT(const int num, const float *in, const float x,
- kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
++__global__ void KernelLT(const size_t num, const float *in, const float x,
+ float *out) {
+ for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+ idx += blockDim.x * gridDim.x) {
+ out[idx] = in[idx] < x ? 1.0f : 0.0f;
+ }
}
-void sum_row(int rows, int cols, int stride, const float *in, float *out) {
- int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
- int num_blocks = cols;
+// ********************************
+// Functions call kernels
+// ********************************
- kernel_sum_row << <num_blocks, threads_per_block>>>
- (in, out, rows, cols, stride);
+void set(const size_t n, const float v, float *out, cudaStream_t s) {
+ KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
}
-void sum_col(int rows, int cols, int stride, const float *in, float *out) {
- int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
- int num_blocks = rows;
+void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
- kernel_sum_col << <num_blocks, threads_per_block>>>
- (in, out, rows, cols, stride);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void add_row(int rows, int cols, int stride, const float *in_row,
- const float *in_mat, float *out) {
- dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
- dim3 num_blocks(
- cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
- rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
- kernel_add_vec_row << <num_blocks, threads_per_block>>>
- (in_row, in_mat, out, rows, cols, stride);
+
+void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void add(int n, const float *a, const float *b, float *out) {
- kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void sub(int n, const float *a, const float *b, float *out) {
- kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void exp(int n, const float *in, float *out) {
- kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+
+void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void log(int n, const float *in, float *out) {
- kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void sigmoid(int n, const float *in, float *out) {
- kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
- void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
++void sigmoid(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void clamp(const size_t n, const float low, const float high, const float *in,
+ float *out, cudaStream_t s) {
+ KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
}
-void sigmoid_grad(int n, const float *in, float *out) {
- kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
}
-void relu(int n, const float *in, float *out) {
- kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
}
-void relu_grad(int n, const float *in, float *out) {
- kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
}
-void tanh(int n, const float *in, float *out) {
- kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void div(const size_t n, const float x, const float *in, float *out,
+ cudaStream_t s) {
+ KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
}
-void tanh_grad(int n, const float *in, float *out) {
- kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void threshold(const size_t n, const float x, const float *in, float *out,
+ cudaStream_t s) {
+ KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
}
-void softplus(int n, const float *in, float *out) {
- kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void gt(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void ge(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void lt(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void le(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
}
-void softplus_grad(int n, const float *in, float *out) {
- kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void square(int n, const float *in, float *out) {
- kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void square_grad(int n, const float *in, float *out) {
- kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void sqrt(int n, const float *in, float *out) {
- kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void pow(int n, const float *a, const float *b, float *out) {
- kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void mult(int n, const float *a, const float *b, float *out) {
- kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+ int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+ // here, we only need one block
+ int num_blocks = 1;
+ KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
+}
+/*
+void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-void mult(int n, const float *a, const float x, float *out) {
- kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
+void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-void div(int n, const float *a, const float *b, float *out) {
- kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-void set_value(int n, float v, float *out) {
- kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
+
+void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-void threshold(int n, float alpha, const float *in, float *out) {
- kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
+void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-// follow the consistency guide for math API
-__global__ void KernelDiv(const size_t num, const float alpha, const float *in,
- float *out) {
- for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
- idx += blockDim.x * gridDim.x) {
- out[idx] = alpha / in[idx];
+
+__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+ int rows, int cols, int stride) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < rows; index += num_threads) {
+ dst_vec_data[index] = 0.0f;
+ for (int k = 0; k < cols; k++) {
+ dst_vec_data[index] += src_mat_data[index * stride + k];
+ }
}
}
@@@ -485,62 -485,30 +485,62 @@@ __global__ void kernel_sigmoid_grad(con
}
}
-void Set(const size_t num, const float x, float *out, cudaStream_t s) {
- KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
+
+__global__ void kernel_relu_grad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+ }
}
-void Div(const size_t num, float alpha, const float *in, float *out,
- cudaStream_t s) {
- KernelDiv << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, alpha, in, out);
+
+__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = (1.0f - src_data[index] * src_data[index]);
+ }
}
-void GT(const size_t num, const float *in, const float x, float *out,
- cudaStream_t s) {
- KernelGT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+
+__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+ }
}
-void GE(const size_t num, const float *in, const float x, float *out,
- cudaStream_t s) {
- KernelGE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void KernelSquareGrad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = 2 * src_data[index];
+ }
}
- __global__ void kernel_softmax_loss(const float *prob, const int *label,
-void LT(const size_t num, const float *in, const float x, float *out,
- cudaStream_t s) {
- KernelLT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
++__global__ void kernel_softmax_loss(const float *prob, const size_t *label,
+ float *loss, int n, int dim) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ float prob_of_truth = prob[index * dim + label[index]];
+ loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
+ }
}
- __global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
-void LE(const size_t num, const float *in, const float x, float *out,
- cudaStream_t s) {
- KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
++__global__ void kernel_softmax_gradient(float *grad, const size_t *label, int n,
+ int dim, float scale) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ int pos = index * dim + label[index];
+ grad[pos] = (grad[pos] - 1.0f) * scale;
+ }
}
+*/
+
} // namespace cuda
} // namespace singa
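
All of the rewritten kernels above share one grid-stride loop shape. A standalone CUDA sketch of that pattern, assuming the CU1DBLOCK/CU1DBLOCKF constants equal 1024 (their definitions are not shown in this diff):

#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>

static const int CU1DBLOCK = 1024;        // assumed; not shown in the diff
static const float CU1DBLOCKF = 1024.0f;  // assumed; not shown in the diff

// Grid-stride loop: each thread starts at its global index and advances by
// the total thread count, so any n is covered by any launch configuration.
__global__ void KernelGE(const size_t num, const float *in, const float x,
                         float *out) {
  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
       idx += blockDim.x * gridDim.x) {
    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
  }
}

int main() {
  const size_t n = 10000;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (size_t i = 0; i < n; ++i) in[i] = static_cast<float>(i);
  // Same launch shape as the wrappers in this commit.
  KernelGE<<<(int)ceil(n / CU1DBLOCKF), CU1DBLOCK>>>(n, in, 5000.0f, out);
  cudaDeviceSynchronize();
  printf("out[4999]=%.0f out[5000]=%.0f\n", out[4999], out[5000]);  // 0 1
  cudaFree(in);
  cudaFree(out);
  return 0;
}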
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.h
index d8a58a5,5c906a9..444f6ca
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@@ -31,66 -31,65 +31,66 @@@ namespace singa
// TODO(wangwei) make all function templates.
namespace cuda {
-void sum(int n, const float *in, float *out);
-void sum_row(int rows, int cols, int stride, const float *in, float *out);
-
-void sum_col(int rows, int cols, int stride, const float *in, float *out);
-
-void add_row(int rows, int cols, int stride, const float *in_row,
- const float *in_mat, float *out);
-
-void add(int n, const float *a, const float *b, float *out);
-
-void sub(int n, const float *a, const float *b, float *out);
-
-void exp(int n, const float *in, float *out);
-
-void log(int n, const float *in, float *out);
-
-void sigmoid(int n, const float *in, float *out);
-
-void sigmoid_grad(int n, const float *in, float *out);
-
-void relu(int n, const float *in, float *out);
-
-void relu_grad(int n, const float *in, float *out);
-
-void tanh(int n, const float *in, float *out);
-
-void tanh_grad(int n, const float *in, float *out);
+// 0 input
+void set(const size_t n, const float v, float *out, cudaStream_t s);
+
+// 1 input
+void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+void log(const size_t n, const float *in, float *out, cudaStream_t s);
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+void square(const size_t n, const float *in, float *out, cudaStream_t s);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s);
- void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
++void sigmoid(const size_t n, const float *in, float *out, cudaStream_t s);
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+void clamp(const size_t n, const float low, const float high, const float *in,
+ float *out, cudaStream_t s);
+
+void pow(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s);
-void softplus(int n, const float *in, float *out);
+void add(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s);
-void softplus_grad(int n, const float *in, float *out);
+void mult(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s);
-void square(int n, const float *in, float *out);
+void div(const size_t n, const float x, const float *in, float *out,
+ cudaStream_t s);
-void square_grad(int n, const float *in, float *out);
+void threshold(const size_t n, const float x, const float *in, float *out,
+ cudaStream_t s);
-void sqrt(int n, const float *in, float *out);
+void gt(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s);
+void ge(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s);
+void lt(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s);
+void le(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s);
-void pow(int n, const float *a, const float *b, float *out);
+// 2 inputs
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void mult(int n, const float *a, const float *b, float *out);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void mult(int n, const float *a, const float x, float *out);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void div(int n, const float *a, const float *b, float *out);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void set_value(int n, float v, float *out);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void threshold(int n, float alpha, const float *in, float *out);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s);
-// follow the consistency guide for math API
-void Div(const size_t num, const float x, const float *in, float *out,
- cudaStream_t s);
-void Set(const size_t num, const float x, float *out, cudaStream_t s);
-void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
} // cuda
} // namespace singa
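
A hedged usage sketch for the renamed wrappers declared above (the repo-relative include path is taken from this commit; build flags are assumed). Note that the wrappers accept a cudaStream_t but the launches in math_kernel.cu do not yet pass it to the kernels, so the parameter is effectively reserved for later use:

#include <cuda_runtime.h>
#include "src/core/tensor/math_kernel.h"

// The comparison wrappers write 1.0f/0.0f masks; the Tensor layer builds
// element-wise operators such as (A >= x) on top of them.
void mask_example(const float *d_in, float *d_out, size_t n, cudaStream_t s) {
  singa::cuda::ge(n, d_in, 0.5f, d_out, s);  // out[i] = in[i] >= 0.5 ? 1 : 0
  cudaStreamSynchronize(s);
}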
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor.cc
index e62386a,5ae375c..e6917d8
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@@ -639,92 -701,4 +639,91 @@@ void SumRows(const Tensor &M, Tensor *v
Mult(X, one, v);
}
}
+// ====================Random operations=====================================
+template <typename SType>
+void Bernoulli(const SType p, Tensor *out) {
+ TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+ auto prob = TypeCast<SType, DType>(p);
+ out->device()->Exec([prob, out](Context *ctx) {
+ Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
+ }, {}, {out->blob()}, true);
+ });
+}
+template void Bernoulli<float>(const float p, Tensor *out);
+
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out) {
+ TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+ auto l = TypeCast<SType, DType>(low);
+ auto h = TypeCast<SType, DType>(high);
+ out->device()->Exec([l, h, out](Context *ctx) {
+ Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
+ }, {}, {out->blob()}, true);
+ });
+}
+template void Uniform<float>(const float low, const float high, Tensor *out);
+
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out) {
+ TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+ auto m = TypeCast<SType, DType>(mean);
+ auto s = TypeCast<SType, DType>(std);
+ out->device()->Exec([m, s, out](Context *ctx) {
+ Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
+ }, {}, {out->blob()}, true);
+ });
+}
+template void Gaussian<float>(const float mean, const float std, Tensor *out);
+
+// ================Blas operations============================================
+template <typename SType>
+void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+ TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+ auto a = TypeCast<SType, DType>(alpha);
+ out->device()->Exec([a, in, out](Context *ctx) {
+ Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
+ }, {in.blob(), out->blob()}, {out->blob()});
+ });
+}
- template <>
- void Axpy(const float alpha, const Tensor &in, Tensor *out);
++template void Axpy(const float alpha, const Tensor &in, Tensor *out);
+
+Tensor Mult(const Tensor &A, const Tensor &B) {
+ Shape s;
+ s.push_back(A.shape(0));
+ if (B.nDim() == 2) s.push_back(B.shape(1));
+ Tensor out(s, A.device(), A.data_type());
+ Mult(A, B, &out);
+ return out;
+}
+
+void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+ Mult(1.0f, A, B, 0.0f, out);
+}
+
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+ Tensor *C) {
+ CHECK_EQ(A.shape().size(), 2u);
+ if (B.nDim() == 1u) {
+ TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+ auto a = TypeCast<SType, DType>(alpha);
+ auto b = TypeCast<SType, DType>(beta);
+ C->device()->Exec([a, A, b, B, C](Context *ctx) {
+ GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
+ B.blob(), b, C->blob(), ctx);
+ }, {A.blob(), B.blob()}, {C->blob()});
+ });
+ } else {
+ CHECK(!C->transpose());
+ TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+ auto a = TypeCast<SType, DType>(alpha);
+ auto b = TypeCast<SType, DType>(beta);
+ C->device()->Exec([a, A, b, B, C](Context *ctx) {
+ GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+ A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
+ }, {A.blob(), B.blob()}, {C->blob()});
+ });
+ }
+}
+
} // namespace singa
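
A small sketch of the new tensor-level entry points added above (shapes are illustrative; the include follows the tests in this commit, which reach the Tensor API through singa/model/optimizer.h):

#include "singa/model/optimizer.h"

void blas_sketch() {
  using singa::Shape;
  using singa::Tensor;
  Tensor A(Shape{2, 3}), x(Shape{3}), W(Shape{3, 4});
  singa::Gaussian(0.0f, 1.0f, &A);  // random init, added in this commit
  singa::Gaussian(0.0f, 1.0f, &x);
  singa::Gaussian(0.0f, 1.0f, &W);
  // Mult dispatches on the right operand's rank (the branch on B.nDim()
  // in tensor.cc above): a 1-D operand takes the GEMV path, 2-D takes GEMM.
  Tensor y = singa::Mult(A, x);  // shape {2}
  Tensor C = singa::Mult(A, W);  // shape {2, 4}
  // Axpy(alpha, in, out) computes out += alpha * in; it is the primitive
  // the optimizers below use for value -= lr * grad.
  singa::Axpy(-0.1f, y, &y);
}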
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/model/optimizer/adagrad.cc
----------------------------------------------------------------------
diff --cc src/model/optimizer/adagrad.cc
index 0000000,8bdb07c..0b8ec88
mode 000000,100644..100644
--- a/src/model/optimizer/adagrad.cc
+++ b/src/model/optimizer/adagrad.cc
@@@ -1,0 -1,35 +1,36 @@@
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ #ifndef SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+ #define SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+ #include "singa/model/optimizer.h"
+ #include <functional>
+ namespace singa {
+
+ void Adagrad::Setup(const OptimizerConf& conf) { delta_ = conf.delta(); }
+
+ void Adagrad::Apply(int step, float lr, const string& name, Tensor* grad,
+ Tensor* value) {
+ if (history_gradient_.find(name) == history_gradient_.end())
+ history_gradient_[name].ResetLike(*value);
+ Tensor& history = history_gradient_[name];
- history += (*grad) * (*grad);
- (*value) -= (*grad) * lr / Sqrt(history + delta_);
++ history += Square(*grad);
++ (*grad) /= Sqrt(history + delta_);
++ Axpy(-lr, *grad, value);
+ }
+ } // namespace singa
+ #endif // SRC_MODEL_OPTIMIZER_ADAGRAD_H_
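
In scalar form the update implemented by Adagrad::Apply above is h += g^2; w -= lr * g / sqrt(h + delta). A self-contained C++ sketch, with no SINGA types (the constants mirror the tests below):

#include <cmath>
#include <cstdio>

// Scalar Adagrad step: the accumulated squared gradient shrinks the
// effective learning rate of frequently updated parameters.
void adagrad_step(float lr, float delta, float g, float *w, float *h) {
  *h += g * g;                           // history += Square(grad)
  *w -= lr * g / std::sqrt(*h + delta);  // Axpy(-lr, grad / Sqrt(h+delta), w)
}

int main() {
  float w = 0.1f, h = 0.0f;
  adagrad_step(0.1f, 1e-8f, 0.01f, &w, &h);
  // On the first step g / sqrt(g*g + delta) is ~1, so the update is ~lr.
  std::printf("w=%g h=%g\n", w, h);
  return 0;
}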
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/model/optimizer/rmsprop.cc
----------------------------------------------------------------------
diff --cc src/model/optimizer/rmsprop.cc
index 0000000,cad333c..7b9934c
mode 000000,100644..100644
--- a/src/model/optimizer/rmsprop.cc
+++ b/src/model/optimizer/rmsprop.cc
@@@ -1,0 -1,38 +1,41 @@@
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ #ifndef SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+ #define SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+ #include "singa/model/optimizer.h"
+ #include <functional>
+ namespace singa {
+
+ void RMSProp::Setup(const OptimizerConf& conf) {
+ delta_ = conf.delta();
- rho_ = conf.delta();
++ rho_ = conf.rho();
+ }
+
+ void RMSProp::Apply(int step, float lr, const string& name, Tensor* grad,
+ Tensor* value) {
- if (history_gradient_.find(name) == history_gradient_.end())
++ if (history_gradient_.find(name) == history_gradient_.end()) {
+ history_gradient_[name].ResetLike(*value);
++ }
+ Tensor& history = history_gradient_[name];
- history = history * rho_ + (*grad) * (*grad) * (1 - rho_);
- (*value) -= (*grad) * lr / Sqrt(history + delta_);
++ history *= rho_;
++ Axpy(1 - rho_, Square(*grad), &history);
++ (*grad) /= Sqrt(history + delta_);
++ Axpy(-lr, *grad, value);
+ }
+ } // namespace singa
+ #endif // SRC_MODEL_OPTIMIZER_ADAGRAD_H_
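
Likewise, RMSProp::Apply above computes h = rho*h + (1-rho)*g^2; w -= lr * g / sqrt(h + delta); unlike Adagrad's monotone sum, the moving average lets the effective step size recover over time. A scalar sketch using values from the CPU test below:

#include <cmath>
#include <cstdio>

// Scalar RMSProp step: an exponential moving average of squared gradients
// replaces Adagrad's ever-growing sum.
void rmsprop_step(float lr, float rho, float delta, float g, float *w,
                  float *h) {
  *h = rho * (*h) + (1.0f - rho) * g * g;  // history*=rho; Axpy(1-rho, g^2, h)
  *w -= lr * g / std::sqrt(*h + delta);
}

int main() {
  float w = 0.1f, h = 0.0f;
  rmsprop_step(0.1f, 0.9f, 1e-8f, 0.01f, &w, &h);
  std::printf("w=%g h=%g\n", w, h);
  return 0;
}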
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/proto/model.proto
----------------------------------------------------------------------
diff --cc src/proto/model.proto
index d368296,c26aa35..ca6f0cd
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@@ -86,6 -86,9 +86,9 @@@ message OptimizerConf
// used by vanilla sgd and nesterov
optional float momentum = 5 [default = 0.9];
+
+ // delta is used to avoid division by zero
- optional float delta = 6 [default = 0.0000001];
++ optional float delta = 6 [default = 1e-8];
}
message ConstraintConf {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/test/singa/test_adagrad.cc
----------------------------------------------------------------------
diff --cc test/singa/test_adagrad.cc
index 0000000,1382467..80240b1
mode 000000,100644..100644
--- a/test/singa/test_adagrad.cc
+++ b/test/singa/test_adagrad.cc
@@@ -1,0 -1,92 +1,96 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
+ #include "singa_config.h"
+ #include <cmath>
+
+ TEST(Adagrad, ApplyCPU) {
+ singa::Adagrad adagrad;
+ float lr = 0.1f;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+ singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
++ singa::OptimizerConf conf;
++ adagrad.Setup(conf);
+ adagrad.Apply(0, lr, "xx", &grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ const float* newv1 = v1.data<const float*>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+ for (int i = 0; i < 4; ++i)
- EXPECT_FLOAT_EQ(newv1[i],
- v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++ EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
++ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ adagrad.Apply(1, lr, "xx", &grad, &value);
+ singa::Tensor v2 = value.Clone();
+ const float* newv2 = v2.data<const float*>();
+ for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+ for (int i = 0; i < 4; ++i)
- EXPECT_FLOAT_EQ(newv2[i],
- newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++ EXPECT_NEAR(newv2[i],
++ newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+
+ #ifdef USE_CUDA
+ TEST(Adagrad, ApplyCUDA) {
+ singa::Adagrad adagrad;
+ float lr = 0.1f;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+ singa::CudaGPU dev;
+ singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
++ singa::OptimizerConf conf;
++ adagrad.Setup(conf);
+ adagrad.Apply(0, lr, "xx", &grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ v1.ToHost();
+ const float* newv1 = v1.data<const float*>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+ for (int i = 0; i < 4; ++i)
- EXPECT_FLOAT_EQ(newv1[i],
- v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++ EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
++ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ adagrad.Apply(1, lr, "xx", &grad, &value);
+ singa::Tensor v2 = value.Clone();
+ v2.ToHost();
+ const float* newv2 = v2.data<const float*>();
+ for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+ for (int i = 0; i < 4; ++i)
+ EXPECT_FLOAT_EQ(newv2[i],
- newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++ newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()));
+ }
+ #endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/test/singa/test_rmsprop.cc
----------------------------------------------------------------------
diff --cc test/singa/test_rmsprop.cc
index 0000000,62101f7..8104f50
mode 000000,100644..100644
--- a/test/singa/test_rmsprop.cc
+++ b/test/singa/test_rmsprop.cc
@@@ -1,0 -1,103 +1,106 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
+ #include "singa_config.h"
+ #include <cmath>
+
+ TEST(RMSProp, ApplyCPU) {
+ singa::RMSProp rmsprop;
+ float lr = 0.1f;
- float rho = 0.002f;
++ float rho = 0.9;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+ singa::OptimizerConf conf;
+ conf.set_rho(rho);
++ conf.set_delta(1E-8);
+
+ singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
+ rmsprop.Setup(conf);
+ rmsprop.Apply(0, lr, "xx", &grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ const float* newv1 = v1.data<const float*>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+ for (int i = 0; i < 4; ++i)
- EXPECT_FLOAT_EQ(newv1[i],
- v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++ EXPECT_NEAR(newv1[i], v[i] - g[i] * lr / sqrt(history[i] + (float)1E-8),
++ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ rmsprop.Apply(1, lr, "xx", &grad, &value);
+ singa::Tensor v2 = value.Clone();
+ const float* newv2 = v2.data<const float*>();
+ for (int i = 0; i < 4; ++i)
- history[i] += history[i] * rho + g[i] * g[i] * (1 - rho);
++ history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+
+ for (int i = 0; i < 4; ++i)
- EXPECT_FLOAT_EQ(newv2[i],
- newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++ EXPECT_NEAR(newv2[i], newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8),
++ 1e-5);
+ }
+
+ #ifdef USE_CUDA
+ TEST(RMSProp, ApplyCUDA) {
+ singa::RMSProp rmsprop;
+ float lr = 0.1f;
- float rho = 0.002f;
++ float rho = 0.02;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+ singa::OptimizerConf conf;
+ conf.set_rho(rho);
++ conf.set_delta(1e-8);
+
+ singa::CudaGPU dev;
+ singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
++ rmsprop.Setup(conf);
+ rmsprop.Apply(0, lr, "xx", &grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ v1.ToHost();
+ const float* newv1 = v1.data<const float*>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+ for (int i = 0; i < 4; ++i)
- EXPECT_FLOAT_EQ(newv1[i],
- v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++ EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
++ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ rmsprop.Apply(1, lr, "xx", &grad, &value);
+ singa::Tensor v2 = value.Clone();
+ v2.ToHost();
+ const float* newv2 = v2.data<const float*>();
+ for (int i = 0; i < 4; ++i)
- history[i] += history[i] * rho + g[i] * g[i] * (1 - rho);
++ history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+
+ for (int i = 0; i < 4; ++i)
- EXPECT_FLOAT_EQ(newv2[i],
- newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++ EXPECT_NEAR(newv2[i],
++ newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+ #endif