Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/13 13:19:54 UTC

[01/50] [abbrv] incubator-singa git commit: SINGA-167 - Add Tensor Math function APIs

Repository: incubator-singa
Updated Branches:
  refs/heads/master d547a8610 -> 9c2869b9a


SINGA-167 - Add Tensor Math function APIs

Add basic linalg functions for Tensor

Add blas functions for Tensor.

Unify gemm and gemv in Tensor::Mult

This commit also contains code for the Param class, which would be removed in the next commit.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/02851fac
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/02851fac
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/02851fac

Branch: refs/heads/master
Commit: 02851fac11ae6455b60d1cd5be4c2b6f142696cf
Parents: e36bc92
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri May 13 21:00:48 2016 +0800
Committer: wangwei <wa...@gmail.com>
Committed: Tue May 17 00:40:23 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                       |   2 +-
 include/singa/core/math.h            | 273 ---------------------
 include/singa/core/tensor.h          | 285 +++++++++++-----------
 include/singa/model/layer.h          |  23 +-
 include/singa/model/param.h          |  97 ++++++++
 src/core/device/device.cc            |   1 +
 src/core/math/cpp_math.cc            |  54 -----
 src/core/math/cuda_math.cc           |  48 ----
 src/core/math/opencl_math.cc         |  24 --
 src/core/tensor/tensor.cc            | 379 ++++++++++++++++++++++++++----
 src/core/tensor/tensor_math.h        | 302 ++++++++++++++++++++++++
 src/core/tensor/tensor_math_cpp.h    |  57 +++++
 src/core/tensor/tensor_math_cuda.h   |  53 +++++
 src/core/tensor/tensor_math_opencl.h |  28 +++
 src/model/layer/layer.cc             |   8 +
 src/proto/layer.proto                |  22 +-
 test/singa/test_cpp_math.cc          |   4 +-
 test/singa/test_tensor.cc            |  35 +--
 test/singa/test_tensor_math.cc       |  84 +++++++
 19 files changed, 1135 insertions(+), 644 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21b3804..67a82e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
 PROJECT(singa)
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++11")
 
 # Flags
 IF(UNIX OR APPLE)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/include/singa/core/math.h
----------------------------------------------------------------------
diff --git a/include/singa/core/math.h b/include/singa/core/math.h
deleted file mode 100644
index 511d9ee..0000000
--- a/include/singa/core/math.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef SINGA_CORE_MATH_H_
-#define SINGA_CORE_MATH_H_
-#include <type_traits>
-#include "singa/core/common.h"
-#include "singa/utils/logging.h"
-
-namespace singa {
-
-/// \file math.h Math functions for linear algebra, neural net and random
-/// operations.
-/// All functions have a template argument, DType for DataType, Lib for the
-/// backend library, e.g., lib::Cublas, lib::Cudnn, etc.
-
-/// Some operations would have many config/hyper-parameters, e.g., Conv, and
-/// these config vary among diff implementations, e.g., cuda/cudnn/opencl.
-/// To separate the modules, we pass a OpConf pointer to the Tensor Op function.
-/// The specific fields are implemented by inheriting OpConf, and casting the
-/// pointer between the base and the sub-class.
-class OpConf {
- public:
-  template <typename T>
-  T* CastTo() {
-    static_assert(std::is_base_of<OpConf, T>::value,
-                  "The cast type must be a sub-class of OpConf");
-    return static_cast<T*>(this);
-  }
-};
-
-// ================Linear algebra functions====================================
-template <typename DType, typename Lib>
-void Sum(int count, const Blob* input, DType* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Abs(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Sign(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Base is e, Neper number
-template <typename DType, typename Lib>
-void Exp(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Natual logarithm, the base is e, Neper number.
-template <typename DType, typename Lib>
-void Log(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Do v^x for every v from the input tensor
-template <typename DType, typename Lib>
-void Pow(int count, DType x, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Do v^x for every v from the lhs and every x from rhs
-template <typename DType, typename Lib>
-void Pow(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Clamp every element into [low, high]
-template <typename DType, typename Lib>
-void Clamp(int count, DType low, DType high, const Blob* input, Blob* ret,
-           Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = x + input
-template <typename DType, typename Lib>
-void Add(int count, DType x, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = x * input
-/// div could be enabled by calling Mult with 1/x
-template <typename DType, typename Lib>
-void Mult(int count, DType x, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = lhs + rhs
-template <typename DType, typename Lib>
-void Add(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = lhs - rhs
-template <typename DType, typename Lib>
-void Sub(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = lhs * rhs
-template <typename DType, typename Lib>
-void Mult(int count, const Blob* lhs, const Blob* rhs, Blob* ret,
-          Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = lhs / rhs
-template <typename DType, typename Lib>
-void Div(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// outer-product.
-/// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
-template <typename DType, typename Lib>
-void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
-           Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lib>
-void SumRow(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lib>
-void SumCol(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of ret
-template <typename DType, typename Lib>
-void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Add the vector v to every column of A as the column of ret
-template <typename DType, typename Lib>
-void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
-// ===== Level 1
-/// return the index of the element with the max value.
-template <typename DType, typename Lib>
-void Amax(int count, const Blob* input, int* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// return the index of the element with the min value.
-template <typename DType, typename Lib>
-void Amin(int count, const Blob* input, int* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// ret = sum |x| for all x in input
-template <typename DType, typename Lib>
-void Asum(int count, const Blob* input, DType* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = alpha * input + ret
-template <typename DType, typename Lib>
-void Axpy(int count, DType alpha, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret *= x
-template <typename DType, typename Lib>
-void Scale(int count, DType x, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
-         Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== Level 2
-/// ret = alpha * op(A) * v + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(A) = m, cols(A) = n.
-template <typename DType, typename Lib>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
-          DType beta, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== Level 3
-/// ret = alpha * op(A) * op(B) + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(A) = m, cols(A) = n.
-template <typename DType, typename Lib>
-void GEMV(bool transA, bool transB, int m, int n, int k, DType alpha,
-          const Blob* A, const Blob* B, DType beta, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ================Random functions===========================================
-// The random generator should be extracted from ctx.
-template <typename DType, typename Lib>
-void Uniform(int count, DType low, DType high, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Gaussian(int count, DType mean, DType std, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-template <typename DType, typename Lib>
-void Bernoulli(int count, DType p, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret[i] would be 1 with prob p[i] and 0 with 1-p[i]. 0<= p[i] <= 1
-template <typename DType, typename Lib>
-void Bernoulli(int count, const Blob* p, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ================Neural net functions=======================================
-/// Do 2D conv.
-/// c is input image channel, w is input width, h is input height
-/// nb_kernel is output channel, kw, and kh are kenerl width and height
-/*
-template <typename DType, typename Lib>
-void Conv2D(int c, int w, int h, int nb_kernel, int kw, int kh,
-           const Blob* input, const Blob* kernel, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-*/
-}  // namespace singa
-
-#endif  // SINGA_CORE_MATH_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 725f657..4278078 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -20,23 +20,29 @@
 #define SINGA_CORE_TENSOR_H_
 
 #include <vector>
+#include <tuple>
 
 #include "singa/core/common.h"
 #include "singa/core/device.h"
-#include "singa/core/math.h"
 #include "singa/proto/core.pb.h"
 #include "singa/utils/logging.h"
 
 using std::vector;
+using std::tuple;
 namespace singa {
 
 typedef vector<int> Shape;
+inline int Product(vector<int>::const_iterator begin,
+                   vector<int>::const_iterator end);
 inline int Product(Shape shape) {
   if (shape.size() == 0)
     return 0;
+  return Product(shape.begin(), shape.end());
+}
+
+inline int Product(vector<int>::const_iterator begin,
+                   vector<int>::const_iterator end) {
+  CHECK(begin != end);
   int v = 1;
-  for (auto s : shape)
-    v *= s;
+  for (auto it = begin; it < end; it++)
+    v *= *it;
   return v;
 }
 
@@ -60,19 +66,20 @@ inline int SizeOf(DataType t) {
 class Tensor {
  public:
   ~Tensor();
-  Tensor() = default;
-  explicit Tensor(const Shape& shape, DataType dtype = kFloat32);
+  Tensor();
+  Tensor(Shape&& shape, DataType dtype = kFloat32);
+  Tensor(const Shape& shape, DataType dtype = kFloat32);
+  Tensor(Shape&& shape, Device* dev, DataType dtype = kFloat32);
   Tensor(const Shape& shape, Device* dev, DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(const Tensor& from);
-
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(Tensor&& from);
 
   /// For functions in xx_math.cc to access the blob.
   /// Users should not operate against Blob directly.
-  /// It will malloc memory for the tensor if not allocated before.
+  /// blob_ is allocated in constructors.
   Blob* blob() const {
     return blob_;
   }
@@ -82,9 +89,9 @@ class Tensor {
   }
 
   /// Return immutable Tensor values with given type.
-  template <typename T>
-  const T* data() {
-    return static_cast<const T*> (blob()->data());
+  template <typename DType>
+  const DType* data() const {
+    return static_cast<const DType*> (blob()->data());
   }
 
   /// data type, including kFloat16, kFloat32, kInt
@@ -96,20 +103,28 @@ class Tensor {
     return shape_;
   }
 
+  int nDim() const {
+    return shape_.size();
+  }
+
   bool transpose() const {
     return transpose_;
   }
 
+  /// Return the total number of elements.
   int Size() const {
     return blob_->size() / SizeOf(data_type_);
   }
 
+  /// Return the memory size in bytes.
   int MemSize() const {
     return blob_->size();
   }
 
+  /// Reset the tensor shape; it may reallocate the blob if MemSize() changes.
   void ReShape(const Shape& shape);
 
+  /// Reset the data type; it would reallocate the blob if the type changes.
   void AsType(DataType type);
 
   /// Reset the device.
@@ -119,8 +134,9 @@ class Tensor {
   /// Equivalent to ToDevice(host_dev).
   void ToHost();
 
-  /// For init the tensor values, copy 'size' bytes data.
-  void CopyDataFromHostPtr(const void* src, size_t size);
+  /// Initialize the tensor values by copying 'num' elements from 'src'.
+  template<typename DType>
+  void CopyDataFromHostPtr(const DType* src, int num);
 
   /// Copy data from another Tensor which may be on a diff device.
   /// Meta data would not be copied!
@@ -141,49 +157,39 @@ class Tensor {
   /// Copy the meta info with data blob shared.
   void operator=(Tensor&& t);
 
+
   void operator+=(const Tensor& t);
-  /*
-  void operator+=(Tensor&& t);
+  // void operator+=(Tensor&& t);
   void operator-=(const Tensor& t);
-  void operator-=(Tensor&& t);
+  // void operator-=(Tensor&& t);
   void operator*=(const Tensor& t);
-  void operator*=(Tensor&& t);
+  // void operator*=(Tensor&& t);
   void operator/=(const Tensor& t);
-  void operator/=(Tensor&& t);
+  // void operator/=(Tensor&& t);
 
   // Scalar operations.
 
   /// DType is a scalar type
-  template <typename T>
-  void operator+=(const T x);
+  template<typename DType>
+  void operator+=(DType x);
 
   /// DType is a scalar type
-  template <typename T>
-  void operator-=(const T x);
+  template <typename DType>
+  void operator-=(const DType x);
 
   /// DType is a scalar type
-  template <typename T>
-  void operator*=(const T x);
+  template <typename DType>
+  void operator*=(const DType x);
 
   /// DType is a scalar type
-  template <typename T>
-  void operator/=(const T x);
-
-  void Log(int base = 2);
-  void Tanh();
-  void Sigmoid();
-  void ReLU();
-
-  // random functions.
-  void Uniform(float low, float high);
-  template <typename T>
-  void Gaussian(float mean, float std);
+  template <typename DType>
+  void operator/=(const DType x);
 
   /// save Tensor into a proto msg
   // void ToProto(TensorProto* t);
   /// load Tensor from proto msg
   // void FromProto(const TensorProto& t);
-  */
+
  protected:
   bool transpose_ = false;
   DataType data_type_ = kFloat32;
@@ -194,142 +200,131 @@ class Tensor {
   Shape shape_;
 };
 
-/// For tensors with sparse content, e.g., missing columns or rows.
+// For tensors with sparse content, e.g., missing columns or rows.
 // class SparseTensor : public Tensor {};
 
-// ==================Simple Linear Algebra Operations=========================
-/*
-Tensor Tanh(const Tensor& t);
-Tensor Log(const Tensor& t);
-Tensor Sigmoid(const Tensor& t);
-Tensor ReLU(const Tensor& t);
-Tensor Softmax(const Tensor& t);
-*/
+/// Copy 'num' elements of src to dst.
+/// The first 'dst_offset' ('src_offset') elements of dst (src) are skipped.
 void CopyData(Tensor* dst,
               const Tensor& src,
-              int msize,
-              int src_offset = 0,
-              int dst_offset = 0);
+              int num,
+              int dst_offset = 0,
+              int src_offset = 0);
 
-// element-wise ops
+/// Copy 'nBytes' bytes of src data to dst.
+/// The first 'dst_offset' ('src_offset') bytes of dst (src) are skipped.
+void CopyRawData(Tensor* dst,
+              const Tensor& src,
+              int nBytes,
+              int dst_offset = 0,
+              int src_offset = 0);
+
+// ==================Simple Linear Algebra Operations=========================
+Tensor Abs(const Tensor& t);
+Tensor Exp(const Tensor& t);
+Tensor Log(const Tensor& t);
+Tensor ReLU(const Tensor& t);
+Tensor Sigmoid(const Tensor& t);
+Tensor Sign(const Tensor& t);
+Tensor Sqrt(const Tensor& t);
+Tensor Tanh(const Tensor& t);
+
+/// Regard the internal data as 2D, with shape_[0]*...*shape_[axis] rows
+/// and shape_[axis+1]*...*shape_[nDim()-1] columns,
+/// and do softmax along each row.
+Tensor Softmax(const Tensor& t, int axis = -1);
+void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
+
+/// Element-wise operation, ret[i]=t[i]^x
+template<typename DType>
+Tensor Pow(const Tensor& t, DType x);
+/// Element-wise operation, ret[i]=t[i]^x
+template<typename DType>
+void Pow(const Tensor& t, DType x, Tensor* ret);
+/// Element-wise operation, ret[i]=base[i]^exp[i]
+Tensor Pow(const Tensor& base, const Tensor& exp);
+/// Element-wise operation, ret[i]=base[i]^exp[i]
+void Pow(const Tensor& base, const Tensor& exp, Tensor* ret);
 
 Tensor operator+(const Tensor& lhs, const Tensor& rhs);
 void Add(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
-/*
 Tensor operator-(const Tensor& lhs, const Tensor& rhs);
 void Sub(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 Tensor operator*(const Tensor& lhs, const Tensor& rhs);
-void operator*(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
+void EltwiseMult(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 Tensor operator/(const Tensor& lhs, const Tensor& rhs);
-void operator/(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
+void Div(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 
-template <typename T>
-Tensor operator+(const T x, const Tensor& t);
-template <typename T>
-void operator+(const T x, const Tensor& t, Tensor* ret);
+template <typename DType>
+Tensor operator+(const Tensor& t, DType x);
+template <typename DType>
+void Add(const Tensor& t, DType x, Tensor* ret);
 
-template <typename T>
-Tensor operator-(const T x, const Tensor& t);
-template <typename T>
-void operator-(const T x, const Tensor& t, Tensor* ret);
+template <typename DType>
+Tensor operator-(const Tensor& t, DType x);
+template <typename DType>
+void Sub(const Tensor& t, DType x, Tensor* ret);
 
-template <typename T>
-Tensor operator*(const T x, const Tensor& t);
-template <typename T>
-void operator*(const T x, const Tensor& t, Tensor* ret);
+template <typename DType>
+Tensor operator*(const Tensor& t, DType x);
+template <typename DType>
+void EltwiseMult(const Tensor& t, DType x, Tensor* ret);
 
-template <typename T>
-Tensor operator/(const T x, const Tensor& t);
-template <typename T>
-void operator/(const T x, const Tensor& t, Tensor* ret);
+template <typename DType>
+Tensor operator/(const Tensor& t, DType x);
+template <typename DType>
+void Div(const Tensor& t, DType x, Tensor* ret);
 
 //================Blas operations============================================
+// ===== Level 1
+// TODO(wangwei) make amax/amin/asum a member function of tensor
+// void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
+// void Asum(Tensor Context* ctx);
+
+// template <typename DType>
+// void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+
+/// Do matrix-vector multiplication or matrix-matrix multiplication depending
+/// on the Tensor shape.  ret = lhs * rhs
+template <typename DType>
 Tensor Mult(const Tensor& lhs, const Tensor& rhs);
+/// Do matrix-vector multiplication or matrix-matrix multiplication depending
+/// on the Tensor shape.  ret = lhs * rhs
+template <typename DType>
 void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 
-tempalte<typename T> T Dot(const Tensor& lhs, const Tensor& rhs);
-
-//================Neural Net operations======================================
+/// Do matrix-vector multiplication or matrix-matrix multiplication depending
+/// on the Tensor shape.  ret = alpha * lhs * rhs + beta * ret
+template <typename DType>
+Tensor Mult(DType alpha, const Tensor& lhs, DType beta, const Tensor& rhs);
+/// Do matrix-vector multiplication or matrix-matrix multiplication depending
+/// on the Tensor shape.  ret = alpha * lhs * rhs + beta * ret
+template <typename DType>
+void Mult(DType alpha, const Tensor& lhs, DType beta, const Tensor& rhs,
+    Tensor* C);
 
-/// Convolution Op. 'Conf' is ConvConf;
-void Conv(const OpConf* conf,
-          const Tensor& input,
-          const Tensor& W,
-          const Tensor &b,
-          Tensor* ret);
+// template <typename DType> DType Dot(const Tensor& lhs, const Tensor& rhs);
 
 //================Random operations==========================================
-Tensor Uniform(float low, float high, const Shape& shape, Device* dev);
-
-Tensor Gaussian(float mean, float std, const Shape& shape, Device* dev);
-*/
-//============================================================================
-/// typedef DType accroding to type value.
-/// DType would be used in the code block __VA_ARGS__.
-#define TYPE_SWITCH(type, DType, ...)                               \
-  do {                                                              \
-    switch (type) {                                                 \
-      case kFloat32: {                                              \
-        typedef float DType;                                        \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      case kInt: {                                                  \
-        typedef int DType;                                          \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      case kChar: {                                                 \
-        typedef char DType;                                         \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      default:                                                      \
-        LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
-    }                                                               \
-  } while (0)
-
-/// typedef DType and Lib according to values of type and lib respectively.
-/// type is from DataType, and lib is from LibType.
-/// DType and Lib would be used in __VA_ARGS__.
-#define TYPE_LIB_SWITCH(dtype, DType, ltype, Lib, ...)                 \
-  do {                                                               \
-    const int _SwitchShift = 3;                                      \
-    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);                 \
-    switch (_SwitchHash) {                                           \
-      case ((kFloat32 << _SwitchShift) + kCuda): {                   \
-        typedef float DType;                                          \
-        typedef lib::Cuda Lib;                                            \
-        { __VA_ARGS__ }                                              \
-        break;                                                       \
-      }                                                              \
-      case ((kFloat32 << _SwitchShift) + kCudnn): {                  \
-        typedef float DType;                                          \
-        typedef lib::Cudnn Lib;                                           \
-        { __VA_ARGS__ }                                              \
-        break;                                                       \
-      }                                                              \
-      case ((kFloat32 << _SwitchShift) + kCpp): {                    \
-        typedef float DType;                                          \
-        typedef lib::Cpp Lib;                                             \
-        { __VA_ARGS__ }                                              \
-        break;                                                       \
-      }                                                              \
-      case ((kFloat32 << _SwitchShift) + kOpencl): {                \
-        typedef float DType;                                          \
-        typedef lib::Opencl Lib;                                          \
-        { __VA_ARGS__ }                                              \
-        break;                                                       \
-      }                                                              \
-      default:                                                       \
-        LOG(FATAL) << "Unknown combination of data type "            \
-                   << DataType_Name(dtype) << " and library "        \
-                   << LibType_Name(ltype);                             \
-    }                                                                \
-  } while (0)
-
-
+/// For each element x, set x = 1 with probability p and x = 0 otherwise.
+void Bernoulli(float p, Tensor* t);
+/// Fill in Tensor 't' following a uniform distribution over [low, high].
+void Uniform(float low, float high, Tensor* t);
+/// Fill in Tensor 't' following a Gaussian distribution.
+void Gaussian(float mean, float std, Tensor* t);
 
+//================Neural Net operations======================================
+// following API of cudnn, e.g., conv, pool, lrn, batchnorm, softmax
+void ConvFwd(const ConvConf& conf, const Tensor& x, const Tensor& w, Tensor* y);
+void ConvBwdBias(const ConvConf& conf, const Tensor& dy, Tensor* db);
+void ConvBwdFilter(const ConvConf& conf, const Tensor& dy, const Tensor& x,
+                   Tensor* dw);
+void ConvBwdData(const ConvConf& conf, const Tensor& dy, const Tensor& w,
+                 Tensor* dx);
+void PoolFwd(const PoolConf& conf, const Tensor& x, Tensor* y,
+             Tensor* mask = nullptr);
+void PoolBwd(const PoolConf& conf, const Tensor& y, const Tensor& dy,
+             const Tensor& x, Tensor* dx);
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 37f3fa8..7b9b6d4 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -45,7 +45,9 @@ class Layer {
   }
 
   /// Set meta data fields configured in 'conf' (a proto message).
-  virtual void Setup(const LayerConf& conf) {}
+  virtual void Setup(const LayerConf& conf) {
+    name_ = conf.name();
+  }
 
   /// Do feature transformation for given 'input' Tensor.
   /// It is the forward pass for feed-forward nets and rnn nets.
@@ -67,6 +69,7 @@ class Layer {
                                                const vector<Tensor>& input) {
     return vector<Tensor>{};
   }
+  // return <dx>  <dw (ParamGrad)>
 
   /// Move the layer (including its parameters and other Tensor) onto the given
   /// device
@@ -82,28 +85,26 @@ class Layer {
   }
 
   /// Serialize the layer info, including params_, into a LayerConf message.
-  virtual std::string ToProto(LayerConf* param) const = 0;
+  virtual void ToProto(LayerConf* conf) const {
+    conf->set_name(name_);
+  }
 
   /// Serialize the layer info, including params_, into a string representing
   /// a LayerParameter message.
-  /*
-  std::string ToProtoStr() const {
-    std:: string str;
-    SerializeToString(&str);
-  }
-  */
+  std::string ToProtoStr() const;
 
   /// Return all Param instances of this layer.
-  const vector<void*> params() const { return params_; }
+  /// Each layer could cache the Param objects.
+  /// To save memory, it could also create them when this function
+  /// is called.
+  const vector<Param*> GetParam();
 
   /// Each layer instance would optionally have a name.
   /// Used for debugging and logging.
   const std::string name() const { return name_; }
 
-
  protected:
   std::string name_;
-  std::vector<void*> params_;
 };
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/include/singa/model/param.h
----------------------------------------------------------------------
diff --git a/include/singa/model/param.h b/include/singa/model/param.h
new file mode 100644
index 0000000..b859b1c
--- /dev/null
+++ b/include/singa/model/param.h
@@ -0,0 +1,97 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_MODEL_PARAM_H_
+#define SINGA_MODEL_PARAM_H_
+#include "singa/core/tensor.h"
+#include <vector>
+#include <string>
+using std::vector;
+using std::string;
+namespace singa {
+/// Base Param class for storing a set of parameters, e.g., a weight matrix
+/// or a bias vector.
+/// It includes multiple Tensors for parameter values, gradients, etc.
+class Param {
+ public:
+  ~Param();
+  Param(const ParamSpec& conf);
+  Param(Param&& p);
+  Param(const Param& p);
+  void operator=(Param&& p);
+  void operator=(const Param& p);
+
+  Tensor& value() {
+    return value_;
+  }
+
+  Tensor& grad() {
+    return grad_;
+  }
+
+  void set_value(const Tensor& t) {
+    value_ = t;
+  }
+
+  void set_value(Tensor&& t) {
+    value_ = std::move(t);
+  }
+
+  void set_grad(const Tensor& t) {
+    grad_ = t;
+  }
+
+  void set_grad(Tensor&& t) {
+    grad_ = std::move(t);
+  }
+
+  // void Compress();
+  // string ToString();
+
+ protected:
+  string name_;
+  Tensor value_;
+  Tensor grad_;
+  float lr_mult_ = 1.0f, decay_mult_ = 1.0f;
+};
+
+class ParamGrad {
+// Returns the grad tensor, or data to recover the grad tensor, e.g., if
+// W = U * V, then ParamGrad could just store U and V. Provides functions
+// for serializing and deserializing.
+};
+
+// The updater just copies the ParamGrad to a device and submits ops to that
+// device, e.g., add grad; check update condition; apply SGD; copy back.
+// Consider RPC (no RDMA).
+
+inline Param* CreateParam(string type) {
+  Param* p = nullptr;
+  if (type == "default")
+    p = new Param(ParamSpec());
+  else
+    LOG(FATAL) << "Currently param type " << type << " is not implemented. "
+               << "Please use the 'default' type";
+  return p;
+}
+}  // namespace singa
+
+#endif  // SINGA_MODEL_PARAM_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 5bdab6f..4976a32 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -49,6 +49,7 @@ void Device::FreeBlob(Blob* blob) {
 
 void Device::CopyData(Blob* dst, const Blob& src, int len, int dst_offset,
                       int src_offset) {
+
   memcpy(reinterpret_cast<Byte*>(dst->mutable_data()) + dst_offset,
          (const Byte*)src.data() + src_offset, len);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/math/cpp_math.cc
----------------------------------------------------------------------
diff --git a/src/core/math/cpp_math.cc b/src/core/math/cpp_math.cc
deleted file mode 100644
index 638d693..0000000
--- a/src/core/math/cpp_math.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "singa/core/math.h"
-#include "singa/core/common.h"
-
-#ifdef USE_CBLAS
-#include <cblas.h>
-#endif
-
-namespace singa {
-template<>
-void Add<float, lib::Cpp>(int count,
-                     const Blob* lhs,
-                     const Blob* rhs,
-                     Blob* ret,
-                     Context* ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float*>(ret->mutable_data());
-  const float *lptr = static_cast<const float*>(lhs->data());
-  const float *rptr = static_cast<const float*>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] + rptr[i];
-  }
-}
-
-#ifdef USE_CBLAS
-template<>
-void Dot<float, lib::Cpp>(int count,
-                     const Blob* lhs,
-                     const Blob* rhs,
-                     float* ret,
-                     Context* ctx) {
-  float dptr = ret->mutable_data(), lptr = lhs->data(), rptr = rhs->data();
-  *ret = cblas_sdot(count, lptr, 1, rptr, 1);
-}
-
-#endif
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/math/cuda_math.cc
----------------------------------------------------------------------
diff --git a/src/core/math/cuda_math.cc b/src/core/math/cuda_math.cc
deleted file mode 100644
index 1cff1c2..0000000
--- a/src/core/math/cuda_math.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "singa/core/math.h"
-#include "singa/core/common.h"
-
-
-namespace singa {
-
-#ifdef USE_CUDA
-template<>
-void Add<float, lib::Cuda>(int count, const Blob* lhs, const Blob* rhs,
-                        Blob* ret, Context* ctx) {
-  cublasSetStream(ctx->handle, ctx->stream);
-  cublasScopy(ctx->handle, count, lhs->data(), 1, ret->mutable_data(), 1);
-  cublasSaxpy(ctx->handle, 1.0f, rhs->data(), 1, ret->mutable_data(), 1);
-}
-
-#ifdef USE_CUDNN
-template<>
-void Conv<float, lib::Cudnn>(const OpConf *conf,
-          const Blob* input,
-          const Blob* W,
-          const Blob* b,
-          Blob* ret,
-          Context* ctx) {
-  // auto conv_conf = conf->CastTo<ConvConf>();
-  // conv op
-}
-
-#endif
-#endif
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/math/opencl_math.cc
----------------------------------------------------------------------
diff --git a/src/core/math/opencl_math.cc b/src/core/math/opencl_math.cc
deleted file mode 100644
index 7012610..0000000
--- a/src/core/math/opencl_math.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "singa/core/math.h"
-
-namespace singa {
-
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 8fdc2ed..51b785e 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -15,28 +15,42 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "singa/core/tensor.h"
-#include "singa/core/math.h"
+#include "./tensor_math.h"
+#include "./tensor_math_cpp.h"
+#include "./tensor_math_cuda.h"
+#include "./tensor_math_opencl.h"
 
 namespace singa {
+
 Tensor::~Tensor() {
   if (blob_ != nullptr && blob_->DecRefCount() == 0)
     device_->FreeBlob(blob_);
   blob_ = nullptr;
 }
 
+Tensor::Tensor() {
+  device_ = &hostDeviceSingleton;
+}
+
 Tensor::Tensor(const Shape& shape, DataType dtype)
     : data_type_(dtype), device_(&hostDeviceSingleton), shape_(shape) {
   device_ = &hostDeviceSingleton;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-
+Tensor::Tensor(Shape&& shape, DataType dtype)
+    : data_type_(dtype), device_(&hostDeviceSingleton),
+      shape_(std::move(shape)) {
+  device_ = &hostDeviceSingleton;
+  blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+}
 Tensor::Tensor(const Shape& shape, Device* device, DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-
+Tensor::Tensor(Shape&& shape, Device* device, DataType dtype)
+    : data_type_(dtype), device_(device), shape_(std::move(shape)) {
+  blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+}
 Tensor::Tensor(const Tensor& t)
     : transpose_(t.transpose_),
       data_type_(t.data_type_),
@@ -50,7 +64,7 @@ Tensor::Tensor(Tensor&& t)
     : transpose_(t.transpose_),
       data_type_(t.data_type_),
       device_(t.device_),
-      shape_(t.shape_) {
+      shape_(std::move(t.shape_)) {
   blob_ = t.blob_;
   t.blob_ = nullptr;
 }
@@ -90,18 +104,26 @@ void Tensor::ToHost() {
   ToDevice(device_->host());
 }
 
-void Tensor::CopyDataFromHostPtr(const void* src, size_t size) {
+template<typename DType>
+void Tensor::CopyDataFromHostPtr(const DType* src, int num) {
+  CHECK_EQ(sizeof(DType), SizeOf(data_type_)) << "data_type is "
+                                              << DataType_Name(data_type_)
+                                              << " user given type is of size "
+                                              << sizeof(DType);
   if (src != nullptr)
-    device_->CopyDataFromHostPtr(blob(), src, size);
+    device_->CopyDataFromHostPtr(blob(), src, sizeof(DType) * num);
   else
     LOG(WARNING) << "Copy data from null host ptr";
 }
+template void Tensor::CopyDataFromHostPtr(const float* src, int num);
 
 void Tensor::CopyData(const Tensor& src) {
   CHECK_EQ(Size(), src.Size());
+  CHECK(blob_ != nullptr);
   // Do copy only if the src's blob is already initialized.
-  if (src.blob_ != nullptr)
-    singa::CopyData(this, src, Size() * SizeOf(data_type_), 0, 0);
+  if (src.blob_ != nullptr) {
+    singa::CopyData(this, src, Size(), 0, 0);
+  }
 }
 
 Tensor Tensor::Clone() {
@@ -112,8 +134,10 @@ Tensor Tensor::Clone() {
 }
 
 Tensor Tensor::T() const {
+  CHECK_EQ(shape_.size(), 2);
   Tensor t(*this);
   t.transpose_ = !transpose_;
+  std::swap(t.shape_[0], t.shape_[1]);
   return t;
 }
 
@@ -132,80 +156,315 @@ void Tensor::operator=(Tensor&& t) {
   if (blob_ != nullptr && blob_->DecRefCount() == 0)
     device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
-  shape_ = t.shape_;
+  shape_ = std::move(t.shape_);
   device_ = t.device_;
   blob_ = t.blob_;
   t.blob_ = nullptr;
 }
 
-void Tensor::operator+=(const Tensor& t) {
-  Add(*this, t, this);
-}
-// ====================Tensor Operations=======================================
+#define GenUnaryTensorArgMemberFunction(op, fn) \
+  void Tensor::op(const Tensor& t) { fn(*this, t, this); }
+
+GenUnaryTensorArgMemberFunction(operator+=, Add);
+GenUnaryTensorArgMemberFunction(operator-=, Sub);
+GenUnaryTensorArgMemberFunction(operator*=, EltwiseMult);
+GenUnaryTensorArgMemberFunction(operator/=, Div);
+
+#define GenUnaryScalarArgMemberFunction(op, fn) \
+  template <typename DType>                     \
+  void Tensor::op(DType x) {                    \
+    fn(*this, x, this);                         \
+  }                                             \
+  template void Tensor::op<float>(float x)
+
+GenUnaryScalarArgMemberFunction(operator-=, Sub);
+GenUnaryScalarArgMemberFunction(operator+=, Add);
+GenUnaryScalarArgMemberFunction(operator*=, EltwiseMult);
+GenUnaryScalarArgMemberFunction(operator/=, Div);
 
+// ====================Tensor Operations=======================================
 void CopyData(Tensor* dst,
               const Tensor& src,
-              int len,
+              int num,
               int dst_offset,
               int src_offset) {
-  CHECK_GE(src.MemSize(), src_offset + len);
-  CHECK_GE(dst->MemSize(), dst_offset + len);
+  CHECK_GE(src.Size(), src_offset + num);
+  CHECK_GE(dst->Size(), dst_offset + num);
+  int width = SizeOf(src.data_type());
+  CHECK_EQ(width, SizeOf(dst->data_type()));
+  CopyRawData(dst, src, num * width, dst_offset * width, src_offset * width);
+}
+
+void CopyRawData(Tensor* dst,
+              const Tensor& src,
+              int nBytes,
+              int dst_offset,
+              int src_offset) {
+  CHECK_GE(src.MemSize(), src_offset + nBytes);
+  CHECK_GE(dst->MemSize(), dst_offset + nBytes);
   Device* src_dev = src.device(), *dst_dev = dst->device();
   Blob* src_blob = src.blob(), *dst_blob = dst->blob();
   if (dst_dev->device_lib() != src_dev->device_lib()) {
     // let the none cpp device conduct copy op
     if (dst_dev->device_lib() == kCpp) {
-      src_dev->CopyData(dst_blob, *src_blob, len, dst_offset, src_offset);
+      src_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
     } else if (src_dev->device_lib() == kCpp) {
-      dst_dev->CopyData(dst_blob, *src_blob, len, dst_offset, src_offset);
+      dst_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
     } else {
       LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
     }
   } else {
-    src_dev->CopyData(dst_blob, *src_blob, len, dst_offset, src_offset);
+    src_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
   }
 }
+//============================================================================
+/// typedef DType according to type value.
+/// DType would be used in the code block __VA_ARGS__.
+#define TYPE_SWITCH(type, DType, ...)                               \
+  do {                                                              \
+    switch (type) {                                                 \
+      case kFloat32: {                                              \
+        typedef float DType;                                        \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kInt: {                                                  \
+        typedef int DType;                                          \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kChar: {                                                 \
+        typedef char DType;                                         \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      default:                                                      \
+        LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
+    }                                                               \
+  } while (0)
+
+/// typedef DType and Lib according to values of type and lib respectively.
+/// type is from DataType, and lib is from LibType.
+/// DType and Lib would be used in __VA_ARGS__.
+#define TYPE_LIB_SWITCH(dtype, DType, ltype, Lib, ...)        \
+  do {                                                        \
+    const int _SwitchShift = 3;                               \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);    \
+    switch (_SwitchHash) {                                    \
+      case ((kFloat32 << _SwitchShift) + kCuda): {            \
+        typedef float DType;                                  \
+        typedef lib::Cuda Lib;                                \
+        { __VA_ARGS__ }                                       \
+        break;                                                \
+      }                                                       \
+      case ((kFloat32 << _SwitchShift) + kCudnn): {           \
+        typedef float DType;                                  \
+        typedef lib::Cudnn Lib;                               \
+        { __VA_ARGS__ }                                       \
+        break;                                                \
+      }                                                       \
+      case ((kFloat32 << _SwitchShift) + kCpp): {             \
+        typedef float DType;                                  \
+        typedef lib::Cpp Lib;                                 \
+        { __VA_ARGS__ }                                       \
+        break;                                                \
+      }                                                       \
+      case ((kFloat32 << _SwitchShift) + kOpencl): {          \
+        typedef float DType;                                  \
+        typedef lib::Opencl Lib;                              \
+        { __VA_ARGS__ }                                       \
+        break;                                                \
+      }                                                       \
+      default:                                                \
+        LOG(FATAL) << "Unknown combination of data type "     \
+                   << DataType_Name(dtype) << " and library " \
+                   << LibType_Name(ltype);                    \
+    }                                                         \
+  } while (0)
+
+
+#define EltwiseUnaryTensorFn(fn, t, ret)                                   \
+  do {                                                                     \
+    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, { \
+      ret->device()->Submit(                                               \
+          [t, ret](Context* ctx) {                                         \
+            fn<DType, Lib>(t.Size(), t.blob(), ret->blob(), ctx);          \
+          },                                                               \
+          {t.blob()}, {ret->blob()});                                      \
+    });                                                                    \
+  } while (0)
+
+#define GenUnaryTensorFunction(fn)                    \
+  Tensor fn(const Tensor& t) {                        \
+    Tensor ret(t.shape(), t.device(), t.data_type()); \
+    auto* retptr = &ret;                              \
+    EltwiseUnaryTensorFn(fn, t, retptr);              \
+    return ret;                                       \
+  }
+
+GenUnaryTensorFunction(Abs);
+GenUnaryTensorFunction(Exp);
+GenUnaryTensorFunction(Log);
+GenUnaryTensorFunction(ReLU);
+GenUnaryTensorFunction(Sigmoid);
+GenUnaryTensorFunction(Sign);
+GenUnaryTensorFunction(Sqrt);
+GenUnaryTensorFunction(Tanh);
 
-Tensor operator+(const Tensor& lhs, const Tensor& rhs) {
-  Tensor ret(lhs.shape(), lhs.device());
-  Add(lhs, rhs, &ret);
+Tensor Softmax(const Tensor& t, int axis) {
+  Tensor ret(t.shape(), t.device(), t.data_type());
+  Softmax(t, &ret, axis);
   return ret;
 }
 
-void Add(const Tensor& lhs, const Tensor& rhs, Tensor* ret) {
-  TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->device_lib(), Lib, {
+void Softmax(const Tensor& t, Tensor* ret, int axis) {
+  int nrow = 1, ncol = t.Size(), size = ncol;
+  CHECK_GE(axis, -1);
+  CHECK_GT(t.shape().size(), 0);
+  if (axis > -1) {
+    nrow = Product(t.shape().begin(), t.shape().begin() + axis + 1);
+    CHECK_EQ(size % nrow, 0) << "Size = " << size << " nrow = " << nrow;
+    ncol = size / nrow;
+  }
+  TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, {
     ret->device()->Submit(
-        [lhs, rhs, ret](Context* ctx) {
-          Add<DType, Lib>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), ctx);
+        [nrow, ncol, t, ret](Context* ctx) {
+          Softmax<DType, Lib>(nrow, ncol, t.blob(), ret->blob(), ctx);
         },
-        {lhs.blob(), rhs.blob()}, {ret->blob()});
-  });
+        {t.blob()}, {ret->blob()});
+    });
 }
-/*
-Tensor operator-(const Tensor& lhs, const Tensor& rhs) {
-  Tensor ret(lhs.shape(), lhs.device());
-  Sub(lhs, rhs, &ret);
+
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
+  do {                                                                         \
+    TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->device_lib(), Lib, { \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
+      ret->device()->Submit(                                                   \
+          [lhs, rhs, ret](Context* ctx) {                                      \
+            fn<DType, Lib>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),    \
+                           ctx);                                               \
+          },                                                                   \
+          {lhs.blob(), rhs.blob()}, {ret->blob()});                            \
+    });                                                                        \
+  } while (0)
+
+#define GenBinaryTensorFunction(op, fn)                        \
+  Tensor op(const Tensor& lhs, const Tensor& rhs) {            \
+    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());    \
+    fn(lhs, rhs, &ret);                                        \
+    return ret;                                                \
+  }                                                            \
+  void fn(const Tensor& lhs, const Tensor& rhs, Tensor* ret) { \
+    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                  \
+  }
+
+GenBinaryTensorFunction(operator+, Add);
+GenBinaryTensorFunction(operator-, Sub);
+GenBinaryTensorFunction(operator*, EltwiseMult);
+GenBinaryTensorFunction(operator/, Div);
+GenBinaryTensorFunction(Pow, Pow);
+
+#define EltwiseTensorScalarFn(fn, t, x, ret)                                \
+  do {                                                                      \
+    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, {  \
+      static_assert(std::is_same<SType, DType>::value,                      \
+                    "The scalar type must match the Tensor data type");     \
+      ret->device()->Submit(                                                \
+          [t, x, ret](Context* ctx) {                                       \
+            fn<DType, Lib>(t.Size(), t.blob(), x, ret->blob(), ctx);        \
+          },                                                                 \
+          {t.blob()}, {ret->blob()});                                       \
+    });                                                                     \
+  } while (0)
+
+#define GenTensorScalarFunction(op, fn)                   \
+  template <typename SType>                               \
+  Tensor op(const Tensor& t, SType x) {                   \
+    Tensor ret(t.shape(), t.device(), t.data_type());     \
+    fn(t, x, &ret);                                       \
+    return ret;                                           \
+  }                                                       \
+  template <typename SType>                               \
+  void fn(const Tensor& t, SType x, Tensor* ret) {        \
+    EltwiseTensorScalarFn(fn, t, x, ret);                 \
+  }                                                       \
+  template Tensor op<float>(const Tensor& t, float x);    \
+  template void fn<float>(const Tensor& t, float x, Tensor* ret)
+
+GenTensorScalarFunction(operator+, Add);
+GenTensorScalarFunction(operator-, Sub);
+GenTensorScalarFunction(operator*, EltwiseMult);
+GenTensorScalarFunction(operator/, Div);
+GenTensorScalarFunction(Pow, Pow);
+
+// ================Blas operations============================================
+template <typename DType>
+Tensor Mult(const Tensor& lhs, const Tensor& rhs) {
+  size_t m = lhs.transpose() ? lhs.shape()[1] : lhs.shape()[0];
+  Shape s = rhs.shape().size() > 1
+                ? Shape{m, rhs.transpose() ? rhs.shape()[0] : rhs.shape()[1]}
+                : Shape{m};
+  Tensor ret(s, lhs.device(), lhs.data_type());
+  Mult<DType>(lhs, rhs, &ret);
+  return ret;
+}
+template Tensor Mult<float>(const Tensor& lhs, const Tensor& rhs);
+
+template <typename DType>
+void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret) {
+  // beta = 0 so that the initial contents of *ret are ignored
+  Mult(DType(1), lhs, DType(0), rhs, ret);
+}
+template void Mult<float>(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
+
+template <typename DType>
+Tensor Mult(DType alpha, const Tensor& A, DType beta, const Tensor& B) {
+  size_t m = A.transpose() ? A.shape()[1] : A.shape()[0];
+  Shape s = B.shape().size() > 1
+                ? Shape{m, B.transpose() ? B.shape()[0] : B.shape()[1]}
+                : Shape{m};
+  Tensor ret(s, A.device(), A.data_type());
+  Mult<DType>(alpha, A, beta, B, &ret);
   return ret;
 }
+template Tensor Mult<float>(float alpha, const Tensor& lhs, float beta,
+    const Tensor& rhs);
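+
+// Usage sketch: with A of shape (2, 3) and B of shape (3, 4),
+//   Tensor C = Mult<float>(A, B);       // C = A * B, shape (2, 4)
+//   Mult<float>(2.0f, A, 0.0f, B, &C);  // C = 2 * A * B + 0 * C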
 
-void Sub(const Tensor& lhs, const Tensor& rhs, Tensor *ret) {
-  TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->device_lib(), Lib, {
-      ret->device()->Submit(
-        [lhs, rhs, ret](Context* ctx) {
-          Sub<DType, Lib>(
-            lhs.Size(),
-            lhs.blob(),
-            rhs.blob(),
-            ret->blob(),
-            ctx);}
-        , {lhs.blob(), rhs.blob()}, {ret->blob()});
+template <typename SType>
+void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B,
+          Tensor* C) {
+  CHECK_EQ(A.shape().size(), 2);
+  bool transA = A.transpose();
+  int m = transA ? A.shape()[1] : A.shape()[0], n = 0;
+  if (B.shape().size() == 1) {
+    n = transA ? A.shape()[0] : A.shape()[1];
+    CHECK_EQ(C->Size(), static_cast<size_t>(m));
+    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
+      static_assert(std::is_same<SType, DType>::value,
+                    "The scalar type must be the same as the tensor data type");
+      C->device()->Submit(
+          [transA, m, n, alpha, A, beta, B, C](Context* ctx) {
+            GEMV<DType, Lib>(transA, m, n, alpha, A.blob(), B.blob(), beta,
+                             C->blob(), ctx);
+          },
+          {A.blob(), B.blob()}, {C->blob()});
       });
+  } else {
+    CHECK(!C->transpose());
+    bool transB = B.transpose();
+    int k = transB ? B.shape()[1] : B.shape()[0];
+    n = C->shape()[1];
+    CHECK_EQ(C->shape()[0], m);
+    CHECK_EQ(A.Size(), m * k);
+    CHECK_EQ(B.Size(), n * k);
+    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
+      static_assert(std::is_same<SType, DType>::value,
+                    "The scalar type must be the same as the tensor data type");
+      C->device()->Submit(
+          [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) {
+            GEMM<DType, Lib>(transA, transB, m, n, k, alpha, A.blob(),
+                             B.blob(), beta, C->blob(), ctx);
+          },
+          {A.blob(), B.blob()}, {C->blob()});
+    });
+  }
 }
+template void Mult<float>(float alpha, const Tensor& lhs, float beta,
+    const Tensor& rhs, Tensor* ret);
 
-// ================Blas operations============================================
 
 // ================Neural Net operations======================================
-
+/*
 void Conv(const OpConf* conf, const Tensor& input, const Tensor& W,
           const Tensor& b, Tensor* ret) {
   TYPE_LIB_SWITCH(input.data_type(), DType, input.device()->nn_lib(), Lib, {
@@ -218,5 +477,33 @@ void Conv(const OpConf* conf, const Tensor& input, const Tensor& W,
   });
 }
 */
+void Bernoulli(float threshold, Tensor* t) {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+    t->device()->Submit(
+        [threshold, t](Context* ctx) {
+          Bernoulli<DType, Lib>(t->Size(), threshold, t->blob(), ctx);
+        },
+        {}, {t->blob()});
+  });
+}
+
+void Uniform(float low, float high, Tensor* t) {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+    t->device()->Submit(
+        [low, high, t](Context* ctx) {
+          Uniform<DType, Lib>(t->Size(), low, high, t->blob(), ctx);
+        },
+        {}, {t->blob()});
+  });
+}
 
+void Gaussian(float mean, float std, Tensor* t) {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+    t->device()->Submit(
+        [mean, std, t](Context* ctx) {
+          Gaussian<DType, Lib>(t->Size(), mean, std, t->blob(), ctx);
+        },
+        {}, {t->blob()});
+  });
+}
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
new file mode 100644
index 0000000..a4f68e3
--- /dev/null
+++ b/src/core/tensor/tensor_math.h
@@ -0,0 +1,302 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_H_
+#define SINGA_CORE_TENSOR_TENSOR_MATH_H_
+#include <type_traits>
+#include "singa/core/common.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+
+/// \file tensor_math.h Math functions for linear algebra, neural net and
+/// random operations.
+/// All functions have a template argument, DType for DataType, Lib for the
+/// backend library, e.g., lib::Cublas, lib::Cudnn, etc.
+
+/// Some operations have many config/hyper-parameters, e.g., Conv, and
+/// these configs vary among different implementations, e.g., cuda/cudnn/opencl.
+/// To decouple the modules, we pass an OpConf pointer to the Tensor Op
+/// function. The specific fields are implemented by inheriting OpConf, and
+/// casting the pointer between the base and the sub-class.
+class OpConf {
+ public:
+  template <typename T>
+  T* CastTo() {
+    static_assert(std::is_base_of<OpConf, T>::value,
+                  "The cast type must be a sub-class of OpConf");
+    return static_cast<T*>(this);
+  }
+};
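+
+// Usage sketch (assuming a ConvConf sub-class of OpConf): a backend-specific
+// op recovers its concrete config from the base pointer, e.g.
+//   void MyConvFwd(OpConf* conf) {
+//     ConvConf* cc = conf->CastTo<ConvConf>();
+//     // ... read conv hyper-parameters from cc ...
+//   }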
+
+// ================Linear algebra functions====================================
+/// ret[i] = |input[i]|
+template <typename DType, typename Lib>
+void Abs(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// sum all elements of input into ret
+template <typename DType, typename Lib>
+void Sum(int count, const Blob* input, DType* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret[i] = sign(input[i])
+template <typename DType, typename Lib>
+void Sign(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Exponential, base e (Euler's number): ret[i] = exp(input[i])
+template <typename DType, typename Lib>
+void Exp(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Natural logarithm, base e (Euler's number): ret[i] = log(input[i]).
+template <typename DType, typename Lib>
+void Log(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, ret[i] = sqrt(input[i])
+template <typename DType, typename Lib>
+void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, ret[i] = tanh(input[i])
+template <typename DType, typename Lib>
+void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Element-wise operation, ret[i]=max(0, input[i])
+template <typename DType, typename Lib>
+void ReLU(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Element-wise operation, ret[i] = sigmoid(input[i])
+template <typename DType, typename Lib>
+void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, do v^x for every v from the input tensor
+template <typename DType, typename Lib>
+void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, do v^x for every v from the lhs and every x from rhs
+template <typename DType, typename Lib>
+void Pow(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
+template <typename DType, typename Lib>
+void Clamp(int count, DType low, DType high, const Blob* input, Blob* ret,
+           Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = input + x
+template <typename DType, typename Lib>
+void Add(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret = input - x
+template <typename DType, typename Lib>
+void Sub(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+  Add<DType, Lib>(count, input, -x, ret, ctx);
+}
+/// ret = input * x
+template <typename DType, typename Lib>
+void EltwiseMult(int count, const Blob* input, DType x, Blob* ret,
+                 Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret = input / x
+template <typename DType, typename Lib>
+void Div(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+  EltwiseMult<DType, Lib>(count, input, DType(1) / x, ret, ctx);
+}
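+
+// Note the composition above: the scalar Sub and Div have default
+// implementations in terms of Add and EltwiseMult, so a backend that only
+// specializes Add and EltwiseMult gets all four scalar ops; e.g.,
+// Sub(n, in, 2.0f, ret, ctx) dispatches to Add(n, in, -2.0f, ret, ctx).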
+
+/// ret = lhs + rhs
+template <typename DType, typename Lib>
+void Add(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = lhs - rhs
+template <typename DType, typename Lib>
+void Sub(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = lhs * rhs
+template <typename DType, typename Lib>
+void EltwiseMult(int count, const Blob* lhs, const Blob* rhs, Blob* ret,
+          Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = lhs / rhs
+template <typename DType, typename Lib>
+void Div(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Outer product.
+/// lhs and rhs are vectors of length m and n; ret is an m x n matrix.
+template <typename DType, typename Lib>
+void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
+           Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the input matrix into a vector
+template <typename DType, typename Lib>
+void SumRow(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Sum the columns of the input matrix into a vector
+template <typename DType, typename Lib>
+void SumCol(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of ret
+template <typename DType, typename Lib>
+void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
+            Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Add the vector v to every column of A as the column of ret
+template <typename DType, typename Lib>
+void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
+            Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// ===== Level 1
+/// return the index of the element with the max value.
+template <typename DType, typename Lib>
+void Amax(int count, const Blob* input, int* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// return the index of the element with the min value.
+template <typename DType, typename Lib>
+void Amin(int count, const Blob* input, int* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret = sum |x| for all x in input
+template <typename DType, typename Lib>
+void Asum(int count, const Blob* input, DType* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = alpha * input + ret
+template <typename DType, typename Lib>
+void Axpy(int count, DType alpha, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret *= x
+template <typename DType, typename Lib>
+void Scale(int count, DType x, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lib>
+void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
+         Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ===== Level 2
+/// ret = alpha * op(A) * v + beta * ret.
+/// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+template <typename DType, typename Lib>
+void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
+          DType beta, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ===== Level 3
+/// ret = alpha * op(A) * op(B) + beta * ret.
+/// op(A) = A if trans = false; A^T otherwise; rows(ret) = m, cols(ret) = n.
+template <typename DType, typename Lib>
+void GEMM(bool transA, bool transB, int m, int n, int k, DType alpha,
+          const Blob* A, const Blob* B, DType beta, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
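+
+// Dimension sketch: for untransposed C (m x n) = A (m x k) * B (k x n), the
+// call is
+//   GEMM<float, Lib>(false, false, m, n, k, 1.0f, A.blob(), B.blob(), 0.0f,
+//                    C->blob(), ctx);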
+
+// ================Random functions===========================================
+/// Each element of ret is 1 with probability p (= threshold) and 0 with
+/// probability 1-p, where 0 <= p <= 1.
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
+template <typename DType, typename Lib>
+void Bernoulli(int count, float threshold, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
+template <typename DType, typename Lib>
+void Uniform(int count, float low, float high, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
+template <typename DType, typename Lib>
+void Gaussian(int count, float mean, float std, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
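+
+// Sanity note on the samplers: Bernoulli(p) has mean p and variance
+// p * (1 - p); Uniform(low, high) has mean (low + high) / 2 and variance
+// (high - low)^2 / 12. The unit tests in test/singa/test_tensor_math.cc
+// estimate exactly these moments.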
+
+// ================Neural net functions=======================================
+template <typename DType, typename Lib>
+void ConvFwd(ConvConf* conf, const Blob* x, const Blob* w, Blob* y,
+             Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lib>
+void ConvBwdBias(const ConvConf* conf, const Blob* dy, Blob* db, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lib>
+void PoolFwd(const PoolConf* conf, const Blob* x, Blob* y, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lib>
+void PoolBwd(const PoolConf* conf, const Blob* y, const Blob* dy, const Blob* x,
+             Blob* dx, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+}  // namespace singa
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
new file mode 100644
index 0000000..a953085
--- /dev/null
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
+#define SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
+#include "./tensor_math.h"
+#include "singa/core/common.h"
+
+#ifdef USE_CBLAS
+#include <cblas.h>
+#endif
+
+namespace singa {
+template<>
+void Add<float, lib::Cpp>(int count,
+                     const Blob* lhs,
+                     const Blob* rhs,
+                     Blob* ret,
+                     Context* ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *dptr = static_cast<float*>(ret->mutable_data());
+  const float *lptr = static_cast<const float*>(lhs->data());
+  const float *rptr = static_cast<const float*>(rhs->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = lptr[i] + rptr[i];
+  }
+}
+
+#ifdef USE_CBLAS
+template<>
+void Dot<float, lib::Cpp>(int count,
+                     const Blob* lhs,
+                     const Blob* rhs,
+                     float* ret,
+                     Context* ctx) {
+  const float *lptr = static_cast<const float*>(lhs->data());
+  const float *rptr = static_cast<const float*>(rhs->data());
+  *ret = cblas_sdot(count, lptr, 1, rptr, 1);
+}
+
+#endif
+}  // namespace singa
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
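
As a further illustration of the specialization pattern (a sketch, not part of
this commit), an element-wise multiply for the Cpp backend would mirror the
Add specialization above:

template<>
void EltwiseMult<float, lib::Cpp>(int count, const Blob* lhs, const Blob* rhs,
                                  Blob* ret, Context* ctx) {
  float *dptr = static_cast<float*>(ret->mutable_data());
  const float *lptr = static_cast<const float*>(lhs->data());
  const float *rptr = static_cast<const float*>(rhs->data());
  for (int i = 0; i < count; i++) {
    dptr[i] = lptr[i] * rptr[i];  // element-wise product
  }
}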

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
new file mode 100644
index 0000000..e1c72d8
--- /dev/null
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
+#define  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
+#include "./tensor_math.h"
+#include "singa/core/common.h"
+
+
+namespace singa {
+
+#ifdef USE_CUDA
+template<>
+void Add<float, lib::Cuda>(int count, const Blob* lhs, const Blob* rhs,
+                           Blob* ret, Context* ctx) {
+  cublasSetStream(ctx->handle, ctx->stream);
+  const float alpha = 1.0f;
+  // ret = lhs, then ret += rhs
+  cublasScopy(ctx->handle, count, static_cast<const float*>(lhs->data()), 1,
+              static_cast<float*>(ret->mutable_data()), 1);
+  cublasSaxpy(ctx->handle, count, &alpha,
+              static_cast<const float*>(rhs->data()), 1,
+              static_cast<float*>(ret->mutable_data()), 1);
+}
+
+#ifdef USE_CUDNN
+template<>
+void Conv<float, lib::Cudnn>(const OpConf *conf,
+          const Blob* input,
+          const Blob* W,
+          const Blob* b,
+          Blob* ret,
+          Context* ctx) {
+  // auto conv_conf = conf->CastTo<ConvConf>();
+  // conv op
+}
+
+#endif
+#endif
+}  // namespace singa
+
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
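
For comparison, a cuBLAS-backed Scale (a sketch, not part of this commit)
would be a thin wrapper over cublasSscal, reusing the handle/stream members of
Context assumed above:

template<>
void Scale<float, lib::Cuda>(int count, float x, Blob* ret, Context* ctx) {
  cublasSetStream(ctx->handle, ctx->stream);
  // ret *= x, in place on the device buffer
  cublasSscal(ctx->handle, count, &x,
              static_cast<float*>(ret->mutable_data()), 1);
}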

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor_math_opencl.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h
new file mode 100644
index 0000000..c4b1347
--- /dev/null
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
+#define SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
+#include "./tensor_math.h"
+
+namespace singa {
+
+
+}  // namespace singa
+
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/model/layer/layer.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/layer.cc b/src/model/layer/layer.cc
index 1f0e34d..0e83cde 100644
--- a/src/model/layer/layer.cc
+++ b/src/model/layer/layer.cc
@@ -18,5 +18,13 @@
 #include "singa/model/layer.h"
 
 namespace singa {
+// Placeholder stubs; sub-classes override these with real logic. The second
+// function is assumed to be the gradient counterpart (two overloads differing
+// only in return type would not compile).
+const vector<Tensor> ComputeFeature(int flag, const vector<Tensor>& input) {
+  vector<Blob*> input_blobs;
+  return vector<Tensor>{};
+}
+
+void ComputeGradient(int flag, const vector<Tensor>& input) {
+  vector<Blob*> input_blobs;
+}
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/proto/layer.proto
----------------------------------------------------------------------
diff --git a/src/proto/layer.proto b/src/proto/layer.proto
index bb87af9..0fbbb5d 100644
--- a/src/proto/layer.proto
+++ b/src/proto/layer.proto
@@ -97,6 +97,10 @@ message ParamSpec {
 
   // The multiplier on the global weight decay for this parameter.
   optional float decay_mult = 4 [default = 1.0];
+
+  // SINGA field for creating different Params, e.g. SparseParam or
+  // CompressableParam. Currently only a default Param implementation exists.
+  optional string type = 20 [default = "default"];
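+  // Example (prototxt, hypothetical): type: "sparse" would select a
+  // SparseParam implementation once one exists.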
 }
 
 // NOTE
@@ -154,27 +158,27 @@ message LayerConf {
   optional ConcatConf concat_conf = 104;
   optional ContrastiveLossConf contrastive_loss_conf = 105;
   optional ConvolutionConf convolution_conf = 106;
-  optional DataConf data_conf = 107;
+  // optional DataConf data_conf = 107;
   optional DropoutConf dropout_conf = 108;
-  optional DummyDataConf dummy_data_conf = 109;
+  // optional DummyDataConf dummy_data_conf = 109;
   optional EltwiseConf eltwise_conf = 110;
   optional EmbedConf embed_conf = 137;
   optional ExpConf exp_conf = 111;
   optional FlattenConf flatten_conf = 135;
-  optional HDF5DataConf hdf5_data_conf = 112;
-  optional HDF5OutputConf hdf5_output_conf = 113;
+  // optional HDF5DataConf hdf5_data_conf = 112;
+  // optional HDF5OutputConf hdf5_output_conf = 113;
   optional HingeLossConf hinge_loss_conf = 114;
-  optional ImageDataConf image_data_conf = 115;
+  // optional ImageDataConf image_data_conf = 115;
   optional InfogainLossConf infogain_loss_conf = 116;
   optional InnerProductConf inner_product_conf = 117;
   optional LogConf log_conf = 134;
   optional LRNConf lrn_conf = 118;
-  optional MemoryDataConf memory_data_conf = 119;
+  // optional MemoryDataConf memory_data_conf = 119;
   optional MVNConf mvn_conf = 120;
   optional PoolingConf pooling_conf = 121;
   optional PowerConf power_conf = 122;
   optional PReLUConf prelu_conf = 131;
-  optional PythonConf python_conf = 130;
+  // optional PythonConf python_conf = 130;
   optional ReductionConf reduction_conf = 136;
   optional ReLUConf relu_conf = 123;
   optional ReshapeConf reshape_conf = 133;
@@ -185,7 +189,7 @@ message LayerConf {
   optional TanHConf tanh_conf = 127;
   optional ThresholdConf threshold_conf = 128;
   optional TileConf tile_conf = 138;
-  optional WindowDataConf window_data_conf = 129;
+  //optional WindowDataConf window_data_conf = 129;
 }
 
 // Message that stores hyper-parameters used to apply transformation
@@ -835,7 +839,7 @@ message PReLUConf {
   // Surpassing Human-Level Performance on ImageNet Classification, 2015.
 
   // Initial value of a_i. Default is a_i=0.25 for all i.
-  optional FillerParameter filler = 1;
+  optional FillerConf filler = 1;
   // Whether or not slope paramters are shared across channels.
   optional bool channel_shared = 2 [default = false];
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/test/singa/test_cpp_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_math.cc b/test/singa/test_cpp_math.cc
index 268785d..78c713f 100644
--- a/test/singa/test_cpp_math.cc
+++ b/test/singa/test_cpp_math.cc
@@ -20,8 +20,6 @@
 *************************************************************/
 
 #include "gtest/gtest.h"
-#include "singa/core/math.h"
+#include "../src/core/tensor/tensor_math_cpp.h"
 
-TEST(CppMath, Add) {
 
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index 04068ae..86200a8 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -15,7 +15,7 @@ TEST(TensorTest, TestConstructor) {
 
   EXPECT_NE(float_t.device(), nullptr);
 
-  singa::Tensor float16_t(singa::Shape{2,3}, singa::kFloat16);
+  singa::Tensor float16_t(Shape{2,3}, singa::kFloat16);
   EXPECT_EQ(singa::kFloat16, float16_t.data_type());
   EXPECT_EQ(6, float16_t.Size());
   EXPECT_EQ(12, float16_t.blob()->size());
@@ -68,7 +68,7 @@ TEST(TensorClass, ToDevice) {
 TEST(TensorClass, CopyDataFromHostPtr) {
   float data[] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(data, sizeof(float) * 3);
+  t.CopyDataFromHostPtr(data, 3);
   const float* dptr = static_cast<const float*>(t.blob()->data());
   EXPECT_FLOAT_EQ(1.0f, dptr[0]);
   EXPECT_FLOAT_EQ(2.0f, dptr[1]);
@@ -78,7 +78,7 @@ TEST(TensorClass, CopyDataFromHostPtr) {
 TEST(TensorClass, CopyData) {
   float data[] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(data, sizeof(float) * 3);
+  t.CopyDataFromHostPtr(data, 3);
 
   Tensor o(Shape{3});
   o.CopyData(t);
@@ -91,7 +91,7 @@ TEST(TensorClass, CopyData) {
 TEST(TensorClass, Clone) {
   float data[] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(data, sizeof(float) * 3);
+  t.CopyDataFromHostPtr(data, 3);
 
   Tensor o = t.Clone();
   const float* dptr = static_cast<const float*>(o.blob()->data());
@@ -110,30 +110,5 @@ TEST(TensorClass, T) {
   EXPECT_TRUE((t.shape() ==  o.shape()));
 }
 
-TEST(TensorClass, Add) {
-  const float data[] = {1.0f, 2.0f, 3.0f, 1.1f, 2.1f, 3.1f};
-  Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(data, sizeof(float) * 3);
 
-  Tensor o = t.Clone();
-  o += t;
-  const float* dptr = o.data<float>();
-  EXPECT_FLOAT_EQ(2.0f, dptr[0]);
-  EXPECT_FLOAT_EQ(4.0f, dptr[1]);
-  EXPECT_FLOAT_EQ(6.0f, dptr[2]);
-
-  Tensor p(Shape{3});
-  o += p;
-  const float* dptr1 = o.data<float>();
-  EXPECT_FLOAT_EQ(2.0f, dptr1[0]);
-  EXPECT_FLOAT_EQ(4.0f, dptr1[1]);
-  EXPECT_FLOAT_EQ(6.0f, dptr1[2]);
-
-  Tensor q(Shape{3});
-  q.CopyDataFromHostPtr(data + 3, sizeof(float) * 3);
-  t += q;
-  const float* dptr2 = t.data<float>();
-  EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
-  EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
-  EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
-}
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
new file mode 100644
index 0000000..51e7cfb
--- /dev/null
+++ b/test/singa/test_tensor_math.cc
@@ -0,0 +1,84 @@
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+using singa::Tensor;
+using singa::Shape;
+using singa::Device;
+
+class TestTensorMath : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    const float dat1[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+    const float dat2[] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
+    a.ReShape(singa::Shape{6});
+    b.ReShape(singa::Shape{6});
+    c.ReShape(singa::Shape{6, 1});
+    d.ReShape(singa::Shape{3, 2});
+
+    a.CopyDataFromHostPtr<float>(dat1, 6);
+    b.CopyDataFromHostPtr<float>(dat2, 6);
+  }
+  Tensor a, b, c, d;
+};
+
+TEST_F(TestTensorMath, MemberAddTensor) {
+  Tensor aa = a.Clone();
+  aa += a;
+  const float* dptr = aa.data<float>();
+  EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+  EXPECT_FLOAT_EQ(4.0f, dptr[1]);
+  EXPECT_FLOAT_EQ(6.0f, dptr[2]);
+
+  // check p is initialized to 0
+  Tensor p(Shape{6});
+  p += aa;
+  const float* dptr1 = p.data<float>();
+  EXPECT_FLOAT_EQ(2.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(4.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(6.0f, dptr1[2]);
+
+  a += b;
+  const float* dptr2 = a.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
+}
+/*
+TEST(TensorClass, SubTensor) {
+  Tensor a(Shape{2,3}), b(Shape{6});
+  float x[]={1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+  float y[]={1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
+  a.CopyDataFromHostPtr(x, 6);
+  b.CopyDataFromHostPtr(y, 6);
+  b -= a;
+  const float* dptr = b.data<float>();
+  EXPECT_FLOAT_EQ(0.1f, dptr[0]);
+  EXPECT_FLOAT_EQ(0.1f, dptr[1]);
+  EXPECT_FLOAT_EQ(0.1f, dptr[2]);
+  EXPECT_FLOAT_EQ(0.1f, dptr[5]);
+}
+*/
+
+TEST_F(TestTensorMath, AddTensors) {
+  Tensor ret(a.shape(), a.device(), a.data_type());
+  Add(a, b, &ret);
+  const float* dptr = ret.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr[5]);
+
+  const Tensor r = a + b;  // renamed from 'd' to avoid shadowing the fixture member
+  const float* dptr2 = r.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
+
+  Add(a, b, &a);
+  const float* dptr1 = a.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr1[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr1[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr1[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr1[5]);
+}


[37/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 94ca283..38a9291 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -5,17 +5,17 @@ using singa::Shape;
 using singa::Device;
 
 class TestTensorMath : public ::testing::Test {
-protected:
+ protected:
   virtual void SetUp() {
     a.Reshape(singa::Shape{6});
     b.Reshape(singa::Shape{6});
     c.Reshape(singa::Shape{6, 1});
     d.Reshape(singa::Shape{3, 2});
-		e.Reshape(singa::Shape{3, 2});
+    e.Reshape(singa::Shape{3, 2});
 
     a.CopyDataFromHostPtr<float>(dat1, 6);
     b.CopyDataFromHostPtr<float>(dat2, 6);
-		e.CopyDataFromHostPtr<float>(dat1, 6);
+    e.CopyDataFromHostPtr<float>(dat1, 6);
   }
   Tensor a, b, c, d, e;
   const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
@@ -23,264 +23,262 @@ protected:
 };
 
 TEST_F(TestTensorMath, MemberAbs) {
-	Tensor aa = a.Clone();
-	Tensor bb = b.Clone();
-	Tensor cc = aa - bb;
-	const float* dptr = cc.data<const float*>();
-	EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+  Tensor aa = a.Clone();
+  Tensor bb = b.Clone();
+  Tensor cc = aa - bb;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-0.1, dptr[0], 1e-5);
   EXPECT_NEAR(-0.1, dptr[1], 1e-5);
   EXPECT_NEAR(-0.1, dptr[2], 1e-5);
 
-	Tensor p = Abs(cc);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+  Tensor p = Abs(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(0.1, dptr1[0], 1e-5);
   EXPECT_NEAR(0.1, dptr1[1], 1e-5);
   EXPECT_NEAR(0.1, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberExp) {
-	Tensor p = Exp(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+  Tensor p = Exp(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
   EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
   EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberLog) {
-	Tensor p = Log(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+  Tensor p = Log(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
   EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
   EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberReLU) {
-	Tensor aa = a.Clone();
-	Tensor cc = aa - 2.0f;
-	const float* dptr = cc.data<const float*>();
-	EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
 
-	Tensor p = ReLU(cc);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+  Tensor p = ReLU(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
   EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
   EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberSigmoid) {
-	Tensor p = Sigmoid(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(1.0f/(1.0f + exp(-1.0f)), dptr1[0], 1e-5);
-  EXPECT_NEAR(1.0f/(1.0f + exp(-2.0f)), dptr1[1], 1e-5);
-  EXPECT_NEAR(1.0f/(1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+  Tensor p = Sigmoid(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberSign) {
-	Tensor aa = a.Clone();
-	Tensor cc = aa - 2.0f;
-	const float* dptr = cc.data<const float*>();
-	EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
 
-	Tensor p = Sign(cc);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_EQ(0.0f, dptr1[0]);
+  Tensor p = Sign(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_EQ(0.0f, dptr1[0]);
   EXPECT_EQ(0.0f, dptr1[1]);
   EXPECT_EQ(1.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberSqrt) {
-	Tensor p = Sqrt(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+  Tensor p = Sqrt(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
   EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
   EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberSquare) {
-	Tensor p = Square(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+  Tensor p = Square(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(1.0, dptr1[0], 1e-5);
   EXPECT_NEAR(4.0, dptr1[1], 1e-5);
   EXPECT_NEAR(9.0, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberTanh) {
-	Tensor p = Tanh(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+  Tensor p = Tanh(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
   EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
   EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, Sum) {
-	Tensor p1 = Sum(e, 0);
+  Tensor p1 = Sum(e, 0);
   const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
-	EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
+  EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
 
-	Tensor p2(Shape{3,1});
-	p2 = Sum(e, 1);
+  Tensor p2(Shape{3, 1});
+  p2 = Sum(e, 1);
   const float *dptr2 = p2.data<const float *>();
-	EXPECT_FLOAT_EQ(3.0f,dptr2[0]);
-	EXPECT_FLOAT_EQ(7.0f,dptr2[1]);
-	EXPECT_FLOAT_EQ(11.0f,dptr2[2]);
+  EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+  EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+  EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
 }
 
 TEST_F(TestTensorMath, SoftMax) {
-	Tensor p1(Shape{3,2});
-	p1 = SoftMax(e,0);
+  Tensor p1(Shape{3, 2});
+  p1 = SoftMax(e, 0);
   const float *dptr1 = p1.data<const float *>();
-	float sum = 0;
-	for(int i = 0; i < 6; i++) sum += exp(i+1);
-	EXPECT_NEAR(exp(1)/sum, dptr1[0],1e-5);
-	EXPECT_NEAR(exp(3)/sum, dptr1[2],1e-5);
-	EXPECT_NEAR(exp(5)/sum, dptr1[4],1e-5);
-	EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
-	EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
-	EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
-
-	Tensor p2(Shape{3,2});
-	p2 = SoftMax(e,1);
+  float sum = 0;
+  for (int i = 0; i < 6; i++) sum += exp(i + 1);
+  EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+  EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+  EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+  EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+
+  Tensor p2(Shape{3, 2});
+  p2 = SoftMax(e, 1);
   const float *dptr2 = p2.data<const float *>();
-	EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
-	EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
+  EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+  EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberLT) {
-	Tensor p1 = a < 2.0f;
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
-	EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
-	EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+  Tensor p1 = a < 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberLE) {
-	Tensor p1 = a <= 2.0f;
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
-	EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
-	EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+  Tensor p1 = a <= 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberGT) {
-	Tensor p1 = a > 2.0f;
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
-	EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
-	EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+  Tensor p1 = a > 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberGE) {
-	Tensor p1 = a >= 2.0f;
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
-	EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
-	EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+  Tensor p1 = a >= 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberPow) {
-	Tensor p1 = Pow(b,3.0f);
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(pow(1.1f,3.0f), dptr1[0]);
-	EXPECT_FLOAT_EQ(pow(2.1f,3.0f), dptr1[1]);
-	EXPECT_FLOAT_EQ(pow(3.1f,3.0f), dptr1[2]);
+  Tensor p1 = Pow(b, 3.0f);
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+  EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+  EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
 
-	//TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the function is complete
-	//Tensor p2 = Pow(a,b);
-	//const float *dptr2 = p2.data<const float *>();
-	//EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
-	//EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
-	//EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+  // TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the
+  // function is complete
+  // Tensor p2 = Pow(a,b);
+  // const float *dptr2 = p2.data<const float *>();
+  // EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+  // EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+  // EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
 }
 
-
 TEST_F(TestTensorMath, MemberSub) {
-	Tensor p1 = a - b;
-	const float* dptr1 = p1.data<const float*>();
-	EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+  Tensor p1 = a - b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
   EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
   EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberEltwiseMult) {
-	Tensor p1 = a * b;
-	const float* dptr1 = p1.data<const float*>();
-	EXPECT_NEAR(1.0*1.1, dptr1[0], 1e-5);
-  EXPECT_NEAR(2.0*2.1, dptr1[1], 1e-5);
-  EXPECT_NEAR(3.0*3.1, dptr1[2], 1e-5);
+  Tensor p1 = a * b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberDiv) {
-	Tensor p1 = a / b;
-	const float* dptr1 = p1.data<const float*>();
-	EXPECT_NEAR(1.0/1.1, dptr1[0], 1e-5);
-  EXPECT_NEAR(2.0/2.1, dptr1[1], 1e-5);
-  EXPECT_NEAR(3.0/3.1, dptr1[2], 1e-5);
+  Tensor p1 = a / b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
 
-	Tensor p2 = Div(10.0f,b);
-	const float* dptr2 = p2.data<const float*>();
-	EXPECT_NEAR(10.0/1.1, dptr2[0], 1e-5);
-  EXPECT_NEAR(10.0/2.1, dptr2[1], 1e-5);
-  EXPECT_NEAR(10.0/3.1, dptr2[2], 1e-5);
+  Tensor p2 = Div(10.0f, b);
+  const float *dptr2 = p2.data<const float *>();
+  EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+  EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+  EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
 
-	Tensor p3 = a / 8.0f;
-	const float* dptr3 = p3.data<const float*>();
-	EXPECT_NEAR(1.0/8.0, dptr3[0], 1e-5);
-  EXPECT_NEAR(2.0/8.0, dptr3[1], 1e-5);
-  EXPECT_NEAR(3.0/8.0, dptr3[2], 1e-5);
+  Tensor p3 = a / 8.0f;
+  const float *dptr3 = p3.data<const float *>();
+  EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+  EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+  EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberBernoulli) {
-	Tensor p1(Shape{10000});
-	Bernoulli(0.3f, &p1);
-	const float* dptr1 = p1.data<const float*>();
-	float sum = 0;
-	for(int i = 0; i < 10000; i++) sum += dptr1[i];
-	float mean = sum/10000;
-	EXPECT_NEAR(mean, 0.3f, 1e-2);
+  Tensor p1(Shape{10000});
+  Bernoulli(0.3f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.3f, 1e-2);
 
-	sum = 0;
-	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
-	float variance = sum/9999;
-	EXPECT_NEAR(variance, 0.3*0.7, 1e-2);
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
 }
 
 TEST_F(TestTensorMath, MemberUniform) {
-	Tensor p1(Shape{10000});
-	Uniform(0.1f,0.2f,&p1);
-	const float* dptr1 = p1.data<const float*>();
-	float sum = 0;
-	for(int i = 0; i < 10000; i++) sum += dptr1[i];
-	float mean = sum/10000;
-	EXPECT_NEAR(mean, 0.15f, 1e-3);
+  Tensor p1(Shape{10000});
+  Uniform(0.1f, 0.2f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.15f, 1e-3);
 
-	sum = 0;
-	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
-	float variance = sum/9999;
-	EXPECT_NEAR(variance, 0.01f/12, 1e-3);
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
 }
 
 TEST_F(TestTensorMath, MemberGaussian) {
-	Tensor p1(Shape{50000});
-	Gaussian(0.0f,1.0f,&p1);
-	const float* dptr1 = p1.data<const float*>();
-	float sum = 0;
-	for(int i = 0; i < 50000; i++) sum += dptr1[i];
-	float mean = sum/50000;
-	EXPECT_NEAR(mean, 0.0, 1e-2);
+  Tensor p1(Shape{50000});
+  Gaussian(0.0f, 1.0f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 50000; i++) sum += dptr1[i];
+  float mean = sum / 50000;
+  EXPECT_NEAR(mean, 0.0, 1e-2);
 
-	sum = 0;
-	for(int i = 0; i < 50000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
-	float variance = sum/49999;
-	EXPECT_NEAR(variance, 1.0, 1e-2);
+  sum = 0;
+  for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 49999;
+  EXPECT_NEAR(variance, 1.0, 1e-2);
 }
 
-
-
 TEST_F(TestTensorMath, MemberAddTensor) {
   Tensor aa = a.Clone();
   aa += a;
@@ -333,8 +331,7 @@ TEST_F(TestTensorMath, SetValue) {
   Tensor t(Shape{4});
   t.SetValue(0.3f);
   const float *ptr = t.data<const float *>();
-  for (int i = 0; i < 4; i++)
-    EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+  for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
 }
 
 TEST_F(TestTensorMath, Reshape) {
@@ -344,10 +341,15 @@ TEST_F(TestTensorMath, Reshape) {
   const float *ptr = t.data<const float *>();
   EXPECT_EQ(p.shape(0), 4u);
   EXPECT_EQ(p.shape(1), 1u);
-  for (int i = 0; i < 4; i++)
-    EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+  for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
 }
 #ifdef USE_CBLAS
+TEST_F(TestTensorMath, L2Cpp) {
+  float l2 = a.L2();
+  float target = 0.0f;
+  for (size_t i = 0; i < a.Size(); i++) target += dat1[i] * dat1[i];
+  EXPECT_FLOAT_EQ(l2, sqrt(target));
+}
 TEST_F(TestTensorMath, MultCpp) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   Tensor t(Shape{2, 2});
@@ -368,8 +370,7 @@ TEST_F(TestTensorMath, MultCpp) {
   Tensor s(Shape{4, 2});
   s.CopyDataFromHostPtr(y, 8);
   const float *sPtr = s.data<const float *>();
-  for (int i = 0; i < 8; i++)
-    EXPECT_FLOAT_EQ(sPtr[i], y[i]);
+  for (int i = 0; i < 8; i++) EXPECT_FLOAT_EQ(sPtr[i], y[i]);
   Tensor D = Mult(d, s.T());
   const float *DPtr = D.data<const float *>();
   for (int i = 0; i < 3; i++) {
@@ -423,7 +424,6 @@ TEST_F(TestTensorMath, SubColumnCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, DivColumnCpp) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
@@ -438,7 +438,6 @@ TEST_F(TestTensorMath, DivColumnCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, AddRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
@@ -453,7 +452,6 @@ TEST_F(TestTensorMath, AddRowCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, SubRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
@@ -468,7 +466,6 @@ TEST_F(TestTensorMath, SubRowCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, MultRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
@@ -483,7 +480,6 @@ TEST_F(TestTensorMath, MultRowCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, SumRowsCpp) {
   Tensor t(Shape{2});
   d.CopyDataFromHostPtr(dat1, 6);
@@ -498,7 +494,6 @@ TEST_F(TestTensorMath, SumRowsCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, SumColumnsCpp) {
   Tensor t(Shape{3});
   d.CopyDataFromHostPtr(dat1, 6);
@@ -514,6 +509,15 @@ TEST_F(TestTensorMath, SumColumnsCpp) {
 }
 #endif
 #ifdef USE_CUDA
+TEST_F(TestTensorMath, L2Cuda) {
+  singa::CudaGPU dev;
+  Tensor t(Shape{3, 2}, &dev);
+  t.CopyDataFromHostPtr(dat1, 6);
+  float l2 = t.L2();
+  float target = 0.0f;
+  for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
+  EXPECT_FLOAT_EQ(l2, sqrt(target));
+}
 TEST_F(TestTensorMath, MultCuda) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   singa::CudaGPU dev;
@@ -582,7 +586,6 @@ TEST_F(TestTensorMath, AddColumnCuda) {
   }
 }
 
-
 TEST_F(TestTensorMath, SubColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   singa::CudaGPU dev;
@@ -757,4 +760,5 @@ TEST_F(TestTensorMath, SumColumnCuda) {
     EXPECT_FLOAT_EQ(tptr[i], tmp);
   }
 }
+
 #endif


[40/50] [abbrv] incubator-singa git commit: SINGA-192 Implement optimization algorithms for v1

Posted by zh...@apache.org.
SINGA-192 Implement optimization algorithms for v1

Merge branch PR#164 into dev

Fix the bugs in the adagrad and rmsprop tests.
Note: EXPECT_NEAR (with a tolerance of 1e-5) is used to avoid spurious
failures from floating-point differences; this still needs testing on more
machines.
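
For reference, the AdaGrad rule these tests exercise is the standard one; a
scalar sketch (names hypothetical; the committed version operates on tensors):

#include <cmath>

// One AdaGrad step: accumulate the squared gradient, then scale the step by
// the running root; eps guards against division by zero.
float AdagradStep(float param, float grad, float* history, float lr,
                  float eps = 1e-8f) {
  *history += grad * grad;
  return param - lr * grad / (std::sqrt(*history) + eps);
}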


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5784bff3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5784bff3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5784bff3

Branch: refs/heads/master
Commit: 5784bff3e5ebfb3a992624d10f03f30cd5e520a3
Parents: 6d69047 178db01
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Jun 12 15:43:53 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 18:03:12 2016 +0800

----------------------------------------------------------------------
 include/singa/model/optimizer.h |  43 ++++++++++++++
 src/core/tensor/math_kernel.cu  |  14 ++---
 src/core/tensor/math_kernel.h   |   2 +-
 src/core/tensor/tensor.cc       |   3 +-
 src/model/optimizer/adagrad.cc  |  36 ++++++++++++
 src/model/optimizer/nesterov.cc |  43 ++++++++++++++
 src/model/optimizer/rmsprop.cc  |  41 ++++++++++++++
 src/proto/model.proto           |   3 +
 test/singa/test_adagrad.cc      |  96 +++++++++++++++++++++++++++++++
 test/singa/test_nesterov.cc     | 101 +++++++++++++++++++++++++++++++++
 test/singa/test_rmsprop.cc      | 106 +++++++++++++++++++++++++++++++++++
 11 files changed, 478 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.cu
index b618f9b,aed6add..484868a
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@@ -236,192 -300,151 +236,192 @@@ __global__ void KernelThreshold(const s
    }
  }
  
- __global__ void KernelGE(const int num, const float *in, const float x,
 -__global__ void kernel_div(const float *src_data_a, const float *src_data_b,
 -                           float *des_data, int n) {
 -  int index = blockIdx.x * blockDim.x + threadIdx.x;
 -  int num_threads = blockDim.x * gridDim.x;
 -  for (; index < n; index += num_threads) {
 -    des_data[index] = src_data_a[index] / src_data_b[index];
++__global__ void KernelGE(const size_t num, const float *in, const float x,
 +                         float *out) {
 +  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
 +       idx += blockDim.x * gridDim.x) {
 +    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
    }
  }
- __global__ void KernelGT(const int num, const float *in, const float x,
 -
 -__global__ static void kernel_set_value(float *data, float value, int n) {
 -  int index = blockIdx.x * blockDim.x + threadIdx.x;
 -  int num_threads = blockDim.x * gridDim.x;
 -  for (; index < n; index += num_threads) {
 -    data[index] = value;
++__global__ void KernelGT(const size_t num, const float *in, const float x,
 +                         float *out) {
 +  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
 +       idx += blockDim.x * gridDim.x) {
 +    out[idx] = in[idx] > x ? 1.0f : 0.0f;
    }
  }
- __global__ void KernelLE(const int num, const float *in, const float x,
 -
 -__global__ void kernel_threshold(const float *src_data, float *des_data,
 -                                 float alpha, int n) {
 -  int index = blockIdx.x * blockDim.x + threadIdx.x;
 -  int num_threads = blockDim.x * gridDim.x;
 -  for (; index < n; index += num_threads) {
 -    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
++__global__ void KernelLE(const size_t num, const float *in, const float x,
 +                         float *out) {
 +  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
 +       idx += blockDim.x * gridDim.x) {
 +    out[idx] = in[idx] <= x ? 1.0f : 0.0f;
    }
  }
 -void sum(int n, const float *in, float *out) {
 -  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
 -  //  here, we only need one block
 -  int num_blocks = 1;
  
- __global__ void KernelLT(const int num, const float *in, const float x,
 -  kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
++__global__ void KernelLT(const size_t num, const float *in, const float x,
 +                         float *out) {
 +  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
 +       idx += blockDim.x * gridDim.x) {
 +    out[idx] = in[idx] < x ? 1.0f : 0.0f;
 +  }
  }
  
 -void sum_row(int rows, int cols, int stride, const float *in, float *out) {
 -  int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
 -  int num_blocks = cols;
 +// ********************************
 +// Functions call kernels
 +// ********************************
  
 -  kernel_sum_row << <num_blocks, threads_per_block>>>
 -      (in, out, rows, cols, stride);
 +void set(const size_t n, const float v, float *out, cudaStream_t s) {
 +  KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
  }
  
 -void sum_col(int rows, int cols, int stride, const float *in, float *out) {
 -  int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
 -  int num_blocks = rows;
 +void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 +}
  
 -  kernel_sum_col << <num_blocks, threads_per_block>>>
 -      (in, out, rows, cols, stride);
 +void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
 -void add_row(int rows, int cols, int stride, const float *in_row,
 -             const float *in_mat, float *out) {
 -  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
 -  dim3 num_blocks(
 -      cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
 -      rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
 -  kernel_add_vec_row << <num_blocks, threads_per_block>>>
 -      (in_row, in_mat, out, rows, cols, stride);
 +
 +void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
 -void add(int n, const float *a, const float *b, float *out) {
 -  kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 +
 +void log(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
 -void sub(int n, const float *a, const float *b, float *out) {
 -  kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 +
 +void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
 -void exp(int n, const float *in, float *out) {
 -  kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +
 +void square(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
  
 -void log(int n, const float *in, float *out) {
 -  kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
  
 -void sigmoid(int n, const float *in, float *out) {
 -  kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 +}
- void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
++void sigmoid(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 +}
 +void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 +}
 +void clamp(const size_t n, const float low, const float high, const float *in,
 +           float *out, cudaStream_t s) {
 +  KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
  }
  
 -void sigmoid_grad(int n, const float *in, float *out) {
 -  kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void pow(const size_t n, const float *in, const float x, float *out,
 +         cudaStream_t s) {
 +  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
 -void relu(int n, const float *in, float *out) {
 -  kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void add(const size_t n, const float *in, const float x, float *out,
 +         cudaStream_t s) {
 +  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
 -void relu_grad(int n, const float *in, float *out) {
 -  kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void mult(const size_t n, const float *in, const float x, float *out,
 +          cudaStream_t s) {
 +  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
 -void tanh(int n, const float *in, float *out) {
 -  kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void div(const size_t n, const float x, const float *in, float *out,
 +          cudaStream_t s) {
 +  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
  }
  
 -void tanh_grad(int n, const float *in, float *out) {
 -  kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void threshold(const size_t n, const float x, const float *in, float *out,
 +               cudaStream_t s) {
 +  KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
  }
  
 -void softplus(int n, const float *in, float *out) {
 -  kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void gt(const size_t num, const float *in, const float x, float *out,
 +        cudaStream_t s) {
 +  KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 +}
 +void ge(const size_t num, const float *in, const float x, float *out,
 +        cudaStream_t s) {
 +  KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 +}
 +void lt(const size_t num, const float *in, const float x, float *out,
 +        cudaStream_t s) {
 +  KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 +}
 +void le(const size_t num, const float *in, const float x, float *out,
 +        cudaStream_t s) {
 +  KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
  }
  
 -void softplus_grad(int n, const float *in, float *out) {
 -  kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void pow(const size_t n, const float *in1, const float *in2, float *out,
 +         cudaStream_t s) {
 +  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
 -void square(int n, const float *in, float *out) {
 -  kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void add(const size_t n, const float *in1, const float *in2, float *out,
 +         cudaStream_t s) {
 +  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
 -void square_grad(int n, const float *in, float *out) {
 -  kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void sub(const size_t n, const float *in1, const float *in2, float *out,
 +         cudaStream_t s) {
 +  KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
 -void sqrt(int n, const float *in, float *out) {
 -  kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 +void mult(const size_t n, const float *in1, const float *in2, float *out,
 +          cudaStream_t s) {
 +  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
 -void pow(int n, const float *a, const float *b, float *out) {
 -  kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 +void div(const size_t n, const float *in1, const float *in2, float *out,
 +         cudaStream_t s) {
 +  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
 -void mult(int n, const float *a, const float *b, float *out) {
 -  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 +void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
 +  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
 +  //  here, we only need one block
 +  int num_blocks = 1;
 +  KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
 +}
 +/*
 +void square_grad(int n, const float *in, float *out, cudaStream_t s) {
 +  kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
 -void mult(int n, const float *a, const float x, float *out) {
 -  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
 +void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
 +  kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
 -void div(int n, const float *a, const float *b, float *out) {
 -  kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 +
 +void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
 +  kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
 -void set_value(int n, float v, float *out) {
 -  kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
 +
 +void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
 +  kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
 -void threshold(int n, float alpha, const float *in, float *out) {
 -  kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
 +void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
 +  kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
 -// follow the consistency guide for math API
 -__global__ void KernelDiv(const size_t num, const float alpha, const float *in,
 -                          float *out) {
 -  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
 -       idx += blockDim.x * gridDim.x) {
 -    out[idx] = alpha / in[idx];
 +
 +__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
 +                               int rows, int cols, int stride) {
 +  int index = blockIdx.x * blockDim.x + threadIdx.x;
 +  int num_threads = blockDim.x * gridDim.x;
 +  for (; index < rows; index += num_threads) {
 +    dst_vec_data[index] = 0.0f;
 +    for (int k = 0; k < cols; k++) {
 +      dst_vec_data[index] += src_mat_data[index * stride + k];
 +    }
    }
  }
  
@@@ -485,62 -485,30 +485,62 @@@ __global__ void kernel_sigmoid_grad(con
    }
  }
  
 -void Set(const size_t num, const float x, float *out, cudaStream_t s) {
 -  KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
 +
 +__global__ void kernel_relu_grad(const float *src_data, float *des_data,
 +                                 int n) {
 +  int index = blockIdx.x * blockDim.x + threadIdx.x;
 +  int num_threads = blockDim.x * gridDim.x;
 +  for (; index < n; index += num_threads) {
 +    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
 +  }
  }
 -void Div(const size_t num, float alpha, const float *in, float *out,
 -         cudaStream_t s) {
 -  KernelDiv << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, alpha, in, out);
 +
 +__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
 +                                 int n) {
 +  int index = blockIdx.x * blockDim.x + threadIdx.x;
 +  int num_threads = blockDim.x * gridDim.x;
 +  for (; index < n; index += num_threads) {
 +    des_data[index] = (1.0f - src_data[index] * src_data[index]);
 +  }
  }
  
 -void GT(const size_t num, const float *in, const float x, float *out,
 -        cudaStream_t s) {
 -  KernelGT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 +
 +__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
 +                                     int n) {
 +  int index = blockIdx.x * blockDim.x + threadIdx.x;
 +  int num_threads = blockDim.x * gridDim.x;
 +  for (; index < n; index += num_threads) {
 +    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
 +  }
  }
 -void GE(const size_t num, const float *in, const float x, float *out,
 -        cudaStream_t s) {
 -  KernelGE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 +__global__ void KernelSquareGrad(const float *src_data, float *des_data,
 +                                   int n) {
 +  int index = blockIdx.x * blockDim.x + threadIdx.x;
 +  int num_threads = blockDim.x * gridDim.x;
 +  for (; index < n; index += num_threads) {
 +    des_data[index] = 2 * src_data[index];
 +  }
  }
- __global__ void kernel_softmax_loss(const float *prob, const int *label,
 -void LT(const size_t num, const float *in, const float x, float *out,
 -        cudaStream_t s) {
 -  KernelLT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
++__global__ void kernel_softmax_loss(const float *prob, const size_t *label,
 +                                    float *loss, int n, int dim) {
 +  int index = blockIdx.x * blockDim.x + threadIdx.x;
 +  int num_threads = blockDim.x * gridDim.x;
 +  for (; index < n; index += num_threads) {
 +    float prob_of_truth = prob[index * dim + label[index]];
 +    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
 +  }
  }
- __global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
 -void LE(const size_t num, const float *in, const float x, float *out,
 -        cudaStream_t s) {
 -  KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
++__global__ void kernel_softmax_gradient(float *grad, const size_t *label, int n,
 +                                        int dim, float scale) {
 +  int index = blockIdx.x * blockDim.x + threadIdx.x;
 +  int num_threads = blockDim.x * gridDim.x;
 +  for (; index < n; index += num_threads) {
 +    int pos = index * dim + label[index];
 +    grad[pos] = (grad[pos] - 1.0f) * scale;
 +  }
  }
 +*/
 +
  
  }  // namespace cuda
  }  // namespace singa
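
The kernels above all use the same grid-stride loop idiom: each thread starts
at its global index and advances by blockDim.x * gridDim.x, so one launch
configuration covers any input length. A minimal standalone sketch of the
idiom (the kernel name and the scale operation are illustrative, not part of
this commit):

    __global__ void KernelScale(const size_t num, const float alpha,
                                const float *in, float *out) {
      for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
           idx += blockDim.x * gridDim.x) {
        out[idx] = alpha * in[idx];  // each thread handles every stride-th element
      }
    }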

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.h
index d8a58a5,5c906a9..444f6ca
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@@ -31,66 -31,65 +31,66 @@@ namespace singa 
  
  // TODO(wangwei) make all function templates.
  namespace cuda {
 -void sum(int n, const float *in, float *out);
  
 -void sum_row(int rows, int cols, int stride, const float *in, float *out);
 -
 -void sum_col(int rows, int cols, int stride, const float *in, float *out);
 -
 -void add_row(int rows, int cols, int stride, const float *in_row,
 -             const float *in_mat, float *out);
 -
 -void add(int n, const float *a, const float *b, float *out);
 -
 -void sub(int n, const float *a, const float *b, float *out);
 -
 -void exp(int n, const float *in, float *out);
 -
 -void log(int n, const float *in, float *out);
 -
 -void sigmoid(int n, const float *in, float *out);
 -
 -void sigmoid_grad(int n, const float *in, float *out);
 -
 -void relu(int n, const float *in, float *out);
 -
 -void relu_grad(int n, const float *in, float *out);
 -
 -void tanh(int n, const float *in, float *out);
 -
 -void tanh_grad(int n, const float *in, float *out);
 +// 0 input
 +void set(const size_t n, const float v, float *out, cudaStream_t s);
 +
 +// 1 input
 +void abs(const size_t n, const float *in, float *out, cudaStream_t s);
 +void sign(const size_t n, const float *in, float *out, cudaStream_t s);
 +void exp(const size_t n, const float *in, float *out, cudaStream_t s);
 +void log(const size_t n, const float *in, float *out, cudaStream_t s);
 +void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
 +void square(const size_t n, const float *in, float *out, cudaStream_t s);
 +void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
 +void relu(const size_t n, const float *in, float *out, cudaStream_t s);
- void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
++void sigmoid(const size_t n, const float *in, float *out, cudaStream_t s);
 +void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
 +void clamp(const size_t n, const float low, const float high, const float *in,
 +           float *out, cudaStream_t s);
 +
 +void pow(const size_t n, const float *in, const float x, float *out,
 +         cudaStream_t s);
  
 -void softplus(int n, const float *in, float *out);
 +void add(const size_t n, const float *in, const float x, float *out,
 +         cudaStream_t s);
  
 -void softplus_grad(int n, const float *in, float *out);
 +void mult(const size_t n, const float *in, const float x, float *out,
 +          cudaStream_t s);
  
 -void square(int n, const float *in, float *out);
 +void div(const size_t n, const float x, const float *in, float *out,
 +         cudaStream_t s);
  
 -void square_grad(int n, const float *in, float *out);
 +void threshold(const size_t n, const float x, const float *in, float *out,
 +               cudaStream_t s);
  
 -void sqrt(int n, const float *in, float *out);
 +void gt(const size_t num, const float *in, const float x, float *out,
 +        cudaStream_t s);
 +void ge(const size_t num, const float *in, const float x, float *out,
 +        cudaStream_t s);
 +void lt(const size_t num, const float *in, const float x, float *out,
 +        cudaStream_t s);
 +void le(const size_t num, const float *in, const float x, float *out,
 +        cudaStream_t s);
  
 -void pow(int n, const float *a, const float *b, float *out);
 +// 2 inputs
 +void pow(const size_t n, const float *in1, const float *in2, float *out,
 +         cudaStream_t s);
  
 -void mult(int n, const float *a, const float *b, float *out);
 +void add(const size_t n, const float *in1, const float *in2, float *out,
 +         cudaStream_t s);
  
 -void mult(int n, const float *a, const float x, float *out);
 +void sub(const size_t n, const float *in1, const float *in2, float *out,
 +         cudaStream_t s);
  
 -void div(int n, const float *a, const float *b, float *out);
 +void mult(const size_t n, const float *in1, const float *in2, float *out,
 +          cudaStream_t s);
  
 -void set_value(int n, float v, float *out);
 +void div(const size_t n, const float *in1, const float *in2, float *out,
 +         cudaStream_t s);
  
 -void threshold(int n, float alpha, const float *in, float *out);
 +void sum(const size_t n, const float *in, float *out, cudaStream_t s);
  
 -// follow the consistency guide for math API
 -void Div(const size_t num, const float x, const float *in, float *out,
 -         cudaStream_t s);
 -void Set(const size_t num, const float x, float *out, cudaStream_t s);
 -void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 -void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 -void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 -void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
  }  // cuda
  
  }  // namespace singa
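
Each declaration above maps onto a launch of ceil(n / CU1DBLOCKF) blocks of
CU1DBLOCKF threads; note that the stream argument s is accepted but not yet
forwarded to the launches in this commit, so the kernels run on the default
stream. A minimal host-side sketch of calling one wrapper, assuming float
buffers already on the device (the buffer names and length are illustrative):

    size_t n = 1024;
    float *in = nullptr, *out = nullptr;
    cudaMalloc(&in, n * sizeof(float));
    cudaMalloc(&out, n * sizeof(float));
    // ... fill `in` ...
    singa::cuda::sigmoid(n, in, out, nullptr);  // nullptr = default stream
    cudaDeviceSynchronize();                    // launches are asynchronous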

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor.cc
index e62386a,5ae375c..e6917d8
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@@ -639,92 -701,4 +639,91 @@@ void SumRows(const Tensor &M, Tensor *v
      Mult(X, one, v);
    }
  }
 +// ====================Random operations=====================================
 +template <typename SType>
 +void Bernoulli(const SType p, Tensor *out) {
 +  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
 +    auto prob = TypeCast<SType, DType>(p);
 +    out->device()->Exec([prob, out](Context *ctx) {
 +      Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
 +    }, {}, {out->blob()}, true);
 +  });
 +}
 +template void Bernoulli<float>(const float p, Tensor *out);
 +
 +template <typename SType>
 +void Uniform(const SType low, const SType high, Tensor *out) {
 +  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
 +    auto l = TypeCast<SType, DType>(low);
 +    auto h = TypeCast<SType, DType>(high);
 +    out->device()->Exec([l, h, out](Context *ctx) {
 +      Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
 +    }, {}, {out->blob()}, true);
 +  });
 +}
 +template void Uniform<float>(const float low, const float high, Tensor *out);
 +
 +template <typename SType>
 +void Gaussian(const SType mean, const SType std, Tensor *out) {
 +  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
 +    auto m = TypeCast<SType, DType>(mean);
 +    auto s = TypeCast<SType, DType>(std);
 +    out->device()->Exec([m, s, out](Context *ctx) {
 +      Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
 +    }, {}, {out->blob()}, true);
 +  });
 +}
 +template void Gaussian<float>(const float mean, const float std, Tensor *out);
 +
 +// ================Blas operations============================================
 +template <typename SType>
 +void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
 +  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
 +    auto a = TypeCast<SType, DType>(alpha);
 +    out->device()->Exec([a, in, out](Context *ctx) {
 +      Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
 +    }, {in.blob(), out->blob()}, {out->blob()});
 +  });
 +}
- template <>
- void Axpy(const float alpha, const Tensor &in, Tensor *out);
++template void Axpy(const float alpha, const Tensor &in, Tensor *out);
 +
 +Tensor Mult(const Tensor &A, const Tensor &B) {
 +  Shape s;
 +  s.push_back(A.shape(0));
 +  if (B.nDim() == 2) s.push_back(B.shape(1));
 +  Tensor out(s, A.device(), A.data_type());
 +  Mult(A, B, &out);
 +  return out;
 +}
 +
 +void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
 +  Mult(1.0f, A, B, 0.0f, out);
 +}
 +
 +template <typename SType>
 +void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
 +          Tensor *C) {
 +  CHECK_EQ(A.shape().size(), 2u);
 +  if (B.nDim() == 1u) {
 +    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
 +      auto a = TypeCast<SType, DType>(alpha);
 +      auto b = TypeCast<SType, DType>(beta);
 +      C->device()->Exec([a, A, b, B, C](Context *ctx) {
 +        GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
 +                          B.blob(), b, C->blob(), ctx);
 +      }, {A.blob(), B.blob()}, {C->blob()});
 +    });
 +  } else {
 +    CHECK(!C->transpose());
 +    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
 +      auto a = TypeCast<SType, DType>(alpha);
 +      auto b = TypeCast<SType, DType>(beta);
 +      C->device()->Exec([a, A, b, B, C](Context *ctx) {
 +        GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
 +                          A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
 +      }, {A.blob(), B.blob()}, {C->blob()});
 +    });
 +  }
 +}
 +
  }  // namespace singa
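
With gemv and gemm unified, Mult picks the BLAS routine from the rank of its
second operand: a 1-D B dispatches to GEMV, a 2-D B to GEMM. A minimal sketch
(the shapes are illustrative):

    singa::Tensor A(singa::Shape{3, 2});
    singa::Tensor x(singa::Shape{2});     // 1-D operand -> GEMV path
    singa::Tensor B(singa::Shape{2, 4});  // 2-D operand -> GEMM path
    singa::Tensor y = singa::Mult(A, x);  // y has shape {3}
    singa::Tensor C = singa::Mult(A, B);  // C has shape {3, 4}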

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/model/optimizer/adagrad.cc
----------------------------------------------------------------------
diff --cc src/model/optimizer/adagrad.cc
index 0000000,8bdb07c..0b8ec88
mode 000000,100644..100644
--- a/src/model/optimizer/adagrad.cc
+++ b/src/model/optimizer/adagrad.cc
@@@ -1,0 -1,35 +1,36 @@@
+ /**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+ #ifndef SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+ #define SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+ #include "singa/model/optimizer.h"
+ #include <functional>
+ namespace singa {
+ 
+ void Adagrad::Setup(const OptimizerConf& conf) { delta_ = conf.delta(); }
+ 
+ void Adagrad::Apply(int step, float lr, const string& name, Tensor* grad,
+                     Tensor* value) {
+   if (history_gradient_.find(name) == history_gradient_.end())
+     history_gradient_[name].ResetLike(*value);
+   Tensor& history = history_gradient_[name];
 -  history += (*grad) * (*grad);
 -  (*value) -= (*grad) * lr / Sqrt(history + delta_);
++  history += Square(*grad);
++  (*grad) /= Sqrt(history + delta_);
++  Axpy(-lr, *grad, value);
+ }
+ }  // namespace singa
+ #endif  // SRC_MODEL_OPTIMIZER_ADAGRAD_H_
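
In scalar form, the rewritten Apply is the standard AdaGrad update; per
parameter it computes the following (a sketch of the three tensor calls above,
with h the accumulated history, g the gradient and v the value):

    h = h + g * g;            // history += Square(*grad)
    g = g / sqrt(h + delta);  // (*grad) /= Sqrt(history + delta_)
    v = v - lr * g;           // Axpy(-lr, *grad, value)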

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/model/optimizer/rmsprop.cc
----------------------------------------------------------------------
diff --cc src/model/optimizer/rmsprop.cc
index 0000000,cad333c..7b9934c
mode 000000,100644..100644
--- a/src/model/optimizer/rmsprop.cc
+++ b/src/model/optimizer/rmsprop.cc
@@@ -1,0 -1,38 +1,41 @@@
+ /**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+ #ifndef SRC_MODEL_OPTIMIZER_RMSPROP_H_
+ #define SRC_MODEL_OPTIMIZER_RMSPROP_H_
+ #include "singa/model/optimizer.h"
+ #include <functional>
+ namespace singa {
+ 
+ void RMSProp::Setup(const OptimizerConf& conf) {
+   delta_ = conf.delta();
 -  rho_ = conf.delta();
++  rho_ = conf.rho();
+ }
+ 
+ void RMSProp::Apply(int step, float lr, const string& name, Tensor* grad,
+                     Tensor* value) {
 -  if (history_gradient_.find(name) == history_gradient_.end())
++  if (history_gradient_.find(name) == history_gradient_.end()) {
+     history_gradient_[name].ResetLike(*value);
++  }
+   Tensor& history = history_gradient_[name];
 -  history = history * rho_ + (*grad) * (*grad) * (1 - rho_);
 -  (*value) -= (*grad) * lr / Sqrt(history + delta_);
++  history *= rho_;
++  Axpy(1 - rho_, Square(*grad), &history);
++  (*grad) /= Sqrt(history + delta_);
++  Axpy(-lr, *grad, value);
+ }
+ }  // namespace singa
+ #endif  // SRC_MODEL_OPTIMIZER_RMSPROP_H_
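
Here the history is an exponential moving average rather than AdaGrad's
running sum; per parameter the rewritten Apply computes (a sketch, same
notation as for AdaGrad above):

    h = rho * h + (1 - rho) * g * g;  // history *= rho_; Axpy(1 - rho_, Square(*grad), &history)
    g = g / sqrt(h + delta);          // (*grad) /= Sqrt(history + delta_)
    v = v - lr * g;                   // Axpy(-lr, *grad, value)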

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/src/proto/model.proto
----------------------------------------------------------------------
diff --cc src/proto/model.proto
index d368296,c26aa35..ca6f0cd
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@@ -86,6 -86,9 +86,9 @@@ message OptimizerConf 
  
    // used by vanilla sgd and nesterov
    optional float momentum = 5 [default = 0.9];
+ 
+   // delta is used to avoid division by zero
 -  optional float delta = 6 [default = 0.0000001];
++  optional float delta = 6 [default = 1e-8];
  }
  
  message ConstraintConf {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/test/singa/test_adagrad.cc
----------------------------------------------------------------------
diff --cc test/singa/test_adagrad.cc
index 0000000,1382467..80240b1
mode 000000,100644..100644
--- a/test/singa/test_adagrad.cc
+++ b/test/singa/test_adagrad.cc
@@@ -1,0 -1,92 +1,96 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
+ #include "singa_config.h"
+ #include <cmath>
+ 
+ TEST(Adagrad, ApplyCPU) {
+   singa::Adagrad adagrad;
+   float lr = 0.1f;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
++  singa::OptimizerConf conf;
++  adagrad.Setup(conf);
+   adagrad.Apply(0, lr, "xx", &grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   const float* newv1 = v1.data<const float*>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+   for (int i = 0; i < 4; ++i)
 -    EXPECT_FLOAT_EQ(newv1[i],
 -                    v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++    EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
++                1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   adagrad.Apply(1, lr, "xx", &grad, &value);
+   singa::Tensor v2 = value.Clone();
+   const float* newv2 = v2.data<const float*>();
+   for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+ 
+   for (int i = 0; i < 4; ++i)
 -    EXPECT_FLOAT_EQ(newv2[i],
 -                    newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++    EXPECT_NEAR(newv2[i],
++                newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(Adagrad, ApplyCUDA) {
+   singa::Adagrad adagrad;
+   float lr = 0.1f;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::CudaGPU dev;
+   singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
++  singa::OptimizerConf conf;
++  adagrad.Setup(conf);
+   adagrad.Apply(0, lr, "xx", &grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   v1.ToHost();
+   const float* newv1 = v1.data<const float*>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+   for (int i = 0; i < 4; ++i)
 -    EXPECT_FLOAT_EQ(newv1[i],
 -                    v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++    EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
++                1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   adagrad.Apply(1, lr, "xx", &grad, &value);
+   singa::Tensor v2 = value.Clone();
+   v2.ToHost();
+   const float* newv2 = v2.data<const float*>();
+   for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+ 
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv2[i],
 -                    newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++                newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+ #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5784bff3/test/singa/test_rmsprop.cc
----------------------------------------------------------------------
diff --cc test/singa/test_rmsprop.cc
index 0000000,62101f7..8104f50
mode 000000,100644..100644
--- a/test/singa/test_rmsprop.cc
+++ b/test/singa/test_rmsprop.cc
@@@ -1,0 -1,103 +1,106 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
+ #include "singa_config.h"
+ #include <cmath>
+ 
+ TEST(RMSProp, ApplyCPU) {
+   singa::RMSProp rmsprop;
+   float lr = 0.1f;
 -  float rho = 0.002f;
++  float rho = 0.9f;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::OptimizerConf conf;
+   conf.set_rho(rho);
++  conf.set_delta(1E-8);
+ 
+   singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   rmsprop.Setup(conf);
+   rmsprop.Apply(0, lr, "xx", &grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   const float* newv1 = v1.data<const float*>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+   for (int i = 0; i < 4; ++i)
 -    EXPECT_FLOAT_EQ(newv1[i],
 -                    v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++    EXPECT_NEAR(newv1[i], v[i] - g[i] * lr / sqrt(history[i] + (float)1E-8),
++                1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   rmsprop.Apply(1, lr, "xx", &grad, &value);
+   singa::Tensor v2 = value.Clone();
+   const float* newv2 = v2.data<const float*>();
+   for (int i = 0; i < 4; ++i)
 -    history[i] += history[i] * rho + g[i] * g[i] * (1 - rho);
++    history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+ 
+   for (int i = 0; i < 4; ++i)
 -    EXPECT_FLOAT_EQ(newv2[i],
 -                    newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++    EXPECT_NEAR(newv2[i], newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8),
++                1e-5);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(RMSProp, ApplyCUDA) {
+   singa::RMSProp rmsprop;
+   float lr = 0.1f;
 -  float rho = 0.002f;
++  float rho = 0.02f;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::OptimizerConf conf;
+   conf.set_rho(rho);
++  conf.set_delta(1e-8);
+ 
+   singa::CudaGPU dev;
+   singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
++  rmsprop.Setup(conf);
+   rmsprop.Apply(0, lr, "xx", &grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   v1.ToHost();
+   const float* newv1 = v1.data<const float*>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+   for (int i = 0; i < 4; ++i)
 -    EXPECT_FLOAT_EQ(newv1[i],
 -                    v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++    EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
++                1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   rmsprop.Apply(1, lr, "xx", &grad, &value);
+   singa::Tensor v2 = value.Clone();
+   v2.ToHost();
+   const float* newv2 = v2.data<const float*>();
+   for (int i = 0; i < 4; ++i)
 -    history[i] += history[i] * rho + g[i] * g[i] * (1 - rho);
++    history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+ 
+   for (int i = 0; i < 4; ++i)
 -    EXPECT_FLOAT_EQ(newv2[i],
 -                    newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
++    EXPECT_NEAR(newv2[i],
++                newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+ #endif



[06/50] [abbrv] incubator-singa git commit: SINGA-171 - Create CppDevice and CudaDevice

Posted by zh...@apache.org.
SINGA-171 - Create CppDevice and CudaDevice

Add CppDevice and CudaDevice API.
Implement CppDevice and add test for it.
There is a link error for cudnn.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/282712ca
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/282712ca
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/282712ca

Branch: refs/heads/master
Commit: 282712caf1582bdc4e23d89fcc14d27eb0c7ad8e
Parents: b491875
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Tue May 17 17:24:40 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 19 14:01:24 2016 +0800

----------------------------------------------------------------------
 include/singa/core/common.h    |   5 +-
 include/singa/core/device.h    | 108 +++++++++++++++++-----
 include/singa/core/tensor.h    |  35 +------
 include/singa/utils/cuda.h     |  94 +++++++++++++++++++
 src/core/device/cpp_device.cc  |  19 +++-
 src/core/device/cuda_device.cc | 132 +++++++++++++++++++++++++++
 src/core/device/device.cc      |  43 +++++----
 src/core/tensor/tensor.cc      | 176 ++++++++++++++----------------------
 src/proto/core.proto           |  13 ++-
 test/singa/test_cpp_device.cc  |  71 +++++++++++++++
 test/singa/test_tensor_math.cc |  16 +---
 11 files changed, 509 insertions(+), 203 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 2f5b167..0fa301a 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -25,6 +25,7 @@
 #ifdef USE_CUDA
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
+#include <curand.h>
 #ifdef USE_CUDNN
 #include <cudnn.h>
 #endif
@@ -36,8 +37,6 @@ namespace lib {
 typedef struct _Cpp { } Cpp;
 /// To implement functions using cuda libraries
 typedef struct _Cuda { } Cuda;
-/// To implement function using cudnn
-typedef struct _Cudnn { } Cudnn;
 /// To implement functions using opencl libraries
 typedef struct _Opencl { } Opencl;
 }  // namespace lib
@@ -69,10 +68,10 @@ class Blob {
 
 typedef struct _Context {
   std::mt19937 random_generator;
-  unsigned long long seed;
 #ifdef USE_CUDA
   cublasHandle_t cublas_handle;
   cudaStream_t stream;
+  curandGenerator_t curand_generator;
 #ifdef USE_CUDNN
   cudnnHandle_t cudnn_handle;
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 9022041..29b7677 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -48,6 +48,8 @@ class CallbackArg {
 typedef function<void(CallbackArg*)> CallbackFn;
 
 /// Allocate memory and execute Tensor operations.
+/// There are three types of devices distinguished by their programming
+/// languages, namely cpp, cuda and opencl.
 class Device {
  public:
   /// Operation has a function, and read/write blobs.
@@ -63,8 +65,8 @@ class Device {
   /// max mem size to use (in MB), identifier of scheduler type (default
   /// scheduler runs operations synchronously) and virtual memory type (default
   /// vm only provides garbage collection).
-  Device(int id, int num_executors = 16, string scheduler = "sync",
-         string vm = "gc-only");
+  Device(int id, int num_executors, string scheduler, string vm);
+  virtual void SetRandSeed(unsigned seed) = 0;
 
   /// Called by Tensor.
   Blob* NewBlob(int size);
@@ -73,14 +75,16 @@ class Device {
   void FreeBlob(Blob* blob);
 
   /// Copy data within or across devices.
-  void CopyData(Blob* dst, const Blob& src, int len, int dst_offset,
-                int src_offset);
+  void CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
+                      CopyDirection direction, int dst_offset, int src_offset);
 
-  void CopyDataFromHostPtr(Blob* dst, const void* src, size_t size);
+  void CopyDataFromHostPtr(Blob* dst, const void* src, size_t nBytes,
+                           size_t dst_offset = 0);
   /// Submit the operation to the device, which may execute it right now or
   /// delay it depending on the scheduler.
-  void Exec(function<void(Context*)> fn, const vector<Blob*> read_blobs,
-              const vector<Blob*> write_blobs, bool use_rand_generator = false);
+  void Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
+                    const vector<Blob*> write_blobs,
+                    bool use_rand_generator = false);
 
   // Wait for one event.
   // void WaitFor();
@@ -88,14 +92,19 @@ class Device {
   /// wait for all operations submitted to this device.
   void Sync();
 
-  LibType device_lib() const { return device_lib_; }
-  LibType nn_lib() const { return nn_lib_; }
+  DeviceType type() const {
+    return device_type_;
+  }
 
   Device* host() const { return host_; }
+  int id() const { return id_; }
 
  protected:
   /// Execute one operation on one executor.
-  virtual void Exec(int operation, int executor) = 0;
+  virtual void DoExec(function<void(Context*)>&& fn, int executor) = 0;
+
+  virtual void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                          CopyDirection direction, Context* ctx) = 0;
 
   /// Allocate device memory.
   virtual void* Malloc(int size) = 0;
@@ -105,31 +114,39 @@ class Device {
 
  protected:
   int id_ = 0;
-  Scheduler* scheduler_ = nullptr;
-  VirtualMemory* vm_ = nullptr;
-  /// could be kCudnn
-  LibType nn_lib_;
+  int num_executors_ = 0;
+  unsigned seed_ = 0;
+  // Scheduler* scheduler_ = nullptr;
+  // VirtualMemory* vm_ = nullptr;
   /// could be kCpp, kCuda, kOpencl
-  LibType device_lib_;
+  DeviceType device_type_;
   // SafeQueue<Operation> op_queue_;
   // SafeQueue<Operation> op_log_;
   /// The host device
-  Context ctx_;
   Device* host_;
 };
-// Implement Device using Cpp libs.
+
+// Implement Device functions using cpp.
 class CppDevice : public Device {
  public:
-  CppDevice(int id, int num_executors);
-
-  void Exec(int operation, int executor) override;
+  CppDevice(int id, int num_executors = 1,
+            string scheduler = "sync", string vm = "gc-only");
 
+  void SetRandSeed(unsigned seed) override;
  protected:
+  void DoExec(function<void(Context*)>&& fn, int executor) override;
+
+  void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                  CopyDirection direction, Context* ctx) override;
+
   /// Allocate cpu memory.
   void* Malloc(int size) override;
 
   /// Free cpu memory.
   void Free(void* ptr) override;
+
+ protected:
+  Context ctx_;
 };
 
 /// a singleton CppDevice as the host for all devices.
@@ -138,9 +155,56 @@ extern CppDevice hostDeviceSingleton;
 // Implement Device using OpenCL libs.
 // class OpenclDevice : public Device { };
 
-// Implement Device using Cuda libs for Nvidia GPUs.
-// class CudaDevice : public Device { };
+#ifdef USE_CUDA
+// Implement Device using cuda.
+class CudaDevice : public Device {
+ public:
+  ~CudaDevice();
+  CudaDevice(int id, int num_executors = 1, string scheduler = "sync",
+         string vm = "gc-only");
+
+  void SetRandSeed(unsigned seed) override;
+  static void DeviceQuery();
+  /// This function checks the availability of GPU #device_id.
+  /// It attempts to create a context on the device by calling cudaFree(0).
+  /// cudaSetDevice() alone is not sufficient to check the availability.
+  /// It lazily records device_id but does not initialize a
+  /// context. So it does not know if the host thread has the permission to use
+  /// the device or not.
+  ///
+  /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
+  /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
+  /// even if the device is exclusively occupied by another process or thread.
+  /// Cuda operations that initialize the context are needed to check
+  /// the permission. cudaFree(0) is one of those with no side effect,
+  /// except the context initialization.
+  static bool CheckDevice(const int device_id);
+  /// This function finds the first available device by checking devices with
+  /// ordinal from start_id to the highest available value. In the
+  /// EXCLUSIVE_PROCESS or EXCLUSIVE_THREAD mode, if it succeeds, it also
+  /// claims the device due to the initialization of the context.
+  static int FindDevice(const int start_id);
+ protected:
+  void DoExec(function<void(Context*)>&& fn, int executor) override;
+
+  void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                  CopyDirection direction, Context* ctx) override;
+
+  /// Allocate gpu memory.
+  void* Malloc(int size) override;
+
+  /// Free gpu memory.
+  void Free(void* ptr) override;
+
+ protected:
+  Context ctx_;
+};
+
+#endif  // USE_CUDA
 
+// Implement a CudaHost device, which uses cuda functions for memory
+// malloc/free.
+// class CudaHost : public Device {}
 }  // namespace singa
 
 #endif  // SINGA_CORE_DEVICE_H_
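
Exec is the single entry point for submitting work against blobs; the read and
write lists are what a non-trivial scheduler would use for dependency
tracking. A minimal sketch against the cpp device (the blob size and the
memset body are illustrative):

    singa::CppDevice dev(0);
    singa::Blob* b = dev.NewBlob(4 * sizeof(float));
    dev.Exec([b](singa::Context* ctx) {
      // runs via DoExec; may be deferred by the scheduler
      memset(b->mutable_data(), 0, 4 * sizeof(float));  // needs <cstring>
    }, {}, {b});
    dev.Sync();      // wait for all submitted operations
    dev.FreeBlob(b);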

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 88a895b..03bf443 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -208,20 +208,12 @@ class Tensor {
 
 /// Copy 'num' elements of src to dst.
 /// The first 'src_offset' ('dst_offset') elements will be skipped.
-void CopyData(Tensor* dst,
+void CopyDataToFrom(Tensor* dst,
               const Tensor& src,
               size_t num,
               size_t src_offset = 0,
               size_t dst_offset = 0);
 
-/// Copy 'nBytes' bytes of src data to dst.
-/// The first 'src_offset' ('dst_offset') bytes will be skipped.
-void CopyRawData(Tensor* dst,
-              const Tensor& src,
-              size_t nBytes,
-              size_t src_offset = 0,
-              size_t dst_offset = 0);
-
 // ==================Simple Linear Algebra Operations=========================
 Tensor Abs(const Tensor& t);
 Tensor Exp(const Tensor& t);
@@ -279,6 +271,8 @@ template <typename DType>
 void Div(const Tensor& t, DType x, Tensor* ret);
 
 // ================Blas operations============================================
+// We fix the scalar argument type to be float.
+
 // ===== Level 1
 // TODO(wangwei) make amax/amin/asum a member function of tensor
 // void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
@@ -289,25 +283,19 @@ void Div(const Tensor& t, DType x, Tensor* ret);
 
 /// Do matrix vector multiplication or matrix matrix multiplication depending
 /// on the Tensor shape.  ret = lhs * rhs
-template <typename DType>
 Tensor Mult(const Tensor& lhs, const Tensor& rhs);
 /// Do matrix vector multiplication or matrix matrix multiplication depending
 /// on the Tensor shape.  ret = lhs * rhs
-template <typename DType>
 void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 
 /// Do matrix vector multiplication or matrix matrix multiplication depending
 /// on the Tensor shape.  ret = alpha lhs * rhs + beta * ret
-template <typename DType>
-Tensor Mult(DType alpha, const Tensor& lhs, DType beta, const Tensor& rhs);
+Tensor Mult(float alpha, const Tensor& lhs, float beta, const Tensor& rhs);
 /// Do matrix vector multiplication or matrix matrix multiplication depending
 /// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
-template <typename DType>
-void Mult(DType alpha, const Tensor& lhs, DType beta, const Tensor& rhs,
+void Mult(float alpha, const Tensor& lhs, float beta, const Tensor& rhs,
     Tensor* C);
 
-// tempalte<typename DType> T Dot(const Tensor& lhs, const Tensor& rhs);
-
 // ================Random operations==========================================
 /// For each element x set x = 1 if random() < p; otherwise x = 0.
 void Bernoulli(float p, Tensor* t);
@@ -316,19 +304,6 @@ void Uniform(float low, float high, Tensor* t);
 /// Fill in Tensor 't' following Gaussian distribution.
 void Gaussian(float mean, float std, Tensor* t);
 
-// ================Neural Net operations======================================
-/* following API of cudnn, e.g., conv, pool, lrn, batchnorm, softmax
-void ConvFwd(const ConvConf& conf, const Tensor& x, const Tensor& w, Tensor* y);
-void ConvBwdBias(const ConvConf& conf, const Tensor& dy, Tensor* db);
-void ConvBwdFilter(const ConvConf& conf, const Tensor& dy, const Tensor& x,
-                   Tensor* dw);
-void ConvBwdData(const ConvConf& conf, const Tensor& dy, const Tensor& w,
-                 Tensor* db);
-void PoolFwd(const PoolConf& conf, const Tensor& x, Tensor* y,
-             Tensor* mask = nullptr);
-void PoolBwd(const PoolConf& conf, const Tensor& y, const Tensor& dy,
-             const Tensor& x, Tensor* dx);
-*/
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_
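
The three random functions fill an existing tensor in place on whatever device
it lives on. A minimal sketch:

    singa::Tensor t(singa::Shape{2, 3});
    singa::Gaussian(0.0f, 1.0f, &t);  // mean 0, std 1
    singa::Uniform(-1.0f, 1.0f, &t);  // low -1, high 1
    singa::Bernoulli(0.5f, &t);       // each element 1 with probability 0.5, else 0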

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/include/singa/utils/cuda.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda.h b/include/singa/utils/cuda.h
new file mode 100644
index 0000000..b2bb5c5
--- /dev/null
+++ b/include/singa/utils/cuda.h
@@ -0,0 +1,94 @@
+// from caffe include/caffe/util/device_alternative.hpp
+
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+//
+// CUDA macros
+//
+
+// CUDA: various checks for different function calls.
+#define CUDA_CHECK(condition) \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do { \
+    cudaError_t error = condition; \
+    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
+  } while (0)
+
+#define CUBLAS_CHECK(condition) \
+  do { \
+    cublasStatus_t status = condition; \
+    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
+      << cublasGetErrorString(status); \
+  } while (0)
+
+#define CURAND_CHECK(condition) \
+  do { \
+    curandStatus_t status = condition; \
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
+      << curandGetErrorString(status); \
+  } while (0)
+
+const char* cublasGetErrorString(cublasStatus_t error) {
+  switch (error) {
+  case CUBLAS_STATUS_SUCCESS:
+    return "CUBLAS_STATUS_SUCCESS";
+  case CUBLAS_STATUS_NOT_INITIALIZED:
+    return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_ALLOC_FAILED:
+    return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_INVALID_VALUE:
+    return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_ARCH_MISMATCH:
+    return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_MAPPING_ERROR:
+    return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_EXECUTION_FAILED:
+    return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
+#if CUDA_VERSION >= 6000
+  case CUBLAS_STATUS_NOT_SUPPORTED:
+    return "CUBLAS_STATUS_NOT_SUPPORTED";
+#endif
+#if CUDA_VERSION >= 6050
+  case CUBLAS_STATUS_LICENSE_ERROR:
+    return "CUBLAS_STATUS_LICENSE_ERROR";
+#endif
+  }
+  return "Unknown cublas status";
+}
+
+const char* curandGetErrorString(curandStatus_t error) {
+  switch (error) {
+  case CURAND_STATUS_SUCCESS:
+    return "CURAND_STATUS_SUCCESS";
+  case CURAND_STATUS_VERSION_MISMATCH:
+    return "CURAND_STATUS_VERSION_MISMATCH";
+  case CURAND_STATUS_NOT_INITIALIZED:
+    return "CURAND_STATUS_NOT_INITIALIZED";
+  case CURAND_STATUS_ALLOCATION_FAILED:
+    return "CURAND_STATUS_ALLOCATION_FAILED";
+  case CURAND_STATUS_TYPE_ERROR:
+    return "CURAND_STATUS_TYPE_ERROR";
+  case CURAND_STATUS_OUT_OF_RANGE:
+    return "CURAND_STATUS_OUT_OF_RANGE";
+  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+  case CURAND_STATUS_LAUNCH_FAILURE:
+    return "CURAND_STATUS_LAUNCH_FAILURE";
+  case CURAND_STATUS_PREEXISTING_FAILURE:
+    return "CURAND_STATUS_PREEXISTING_FAILURE";
+  case CURAND_STATUS_INITIALIZATION_FAILED:
+    return "CURAND_STATUS_INITIALIZATION_FAILED";
+  case CURAND_STATUS_ARCH_MISMATCH:
+    return "CURAND_STATUS_ARCH_MISMATCH";
+  case CURAND_STATUS_INTERNAL_ERROR:
+    return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+  return "Unknown curand status";
+}
+
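
The intent is that every raw cuda/cublas/curand call is wrapped so a failing
status aborts with a readable message instead of being silently ignored. A
minimal sketch of the intended use (the allocation size is illustrative):

    float* ptr = nullptr;
    CUDA_CHECK(cudaMalloc(&ptr, 1024 * sizeof(float)));
    CUDA_CHECK(cudaMemset(ptr, 0, 1024 * sizeof(float)));
    CUDA_CHECK(cudaFree(ptr));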

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/src/core/device/cpp_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_device.cc b/src/core/device/cpp_device.cc
index 42f95c3..d0e051e 100644
--- a/src/core/device/cpp_device.cc
+++ b/src/core/device/cpp_device.cc
@@ -18,13 +18,18 @@
 #include "singa/core/device.h"
 namespace singa {
 CppDevice hostDeviceSingleton(-1, 1);
-CppDevice::CppDevice(int id, int num_executors) {
-  nn_lib_ = kCpp;
-  device_lib_ = kCpp;
-  host_ = &hostDeviceSingleton;
+CppDevice::CppDevice(int id, int num_executors, string scheduler,
+         string vm) : Device(id, num_executors, scheduler, vm) {
+  device_type_ = kCpp;
+  host_ = nullptr;
 }
 
-void CppDevice::Exec(int operation, int executor) {
+void CppDevice::SetRandSeed(unsigned seed) {
+  ctx_.random_generator.seed(seed);
+}
+void CppDevice::DoExec(function<void(Context*)>&& fn, int executor) {
+  CHECK_EQ(executor, 0);
+  fn(&ctx_);
 }
 
 void* CppDevice::Malloc(int size) {
@@ -35,4 +40,8 @@ void CppDevice::Free(void* ptr) {
   free(ptr);
 }
 
+void CppDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
+                           CopyDirection direction, Context* ctx) {
+  memcpy(dst, src, nBytes);
+}
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/src/core/device/cuda_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_device.cc b/src/core/device/cuda_device.cc
index 76c646e..1f6de60 100644
--- a/src/core/device/cuda_device.cc
+++ b/src/core/device/cuda_device.cc
@@ -15,10 +15,142 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#ifdef USE_CUDA
+#include <chrono>
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+
 #include "singa/core/device.h"
+#include "singa/utils/cuda.h"
 namespace singa {
 
+const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
+                                   cudaMemcpyDeviceToHost,
+                                   cudaMemcpyDeviceToDevice};
+
+CudaDevice::~CudaDevice() {
+  if (ctx_.cublas_handle)
+    CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
+  if (ctx_.curand_generator)
+    CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
+#ifdef USE_CUDNN
+  if (ctx_.cudnn_handle) {
+    auto status = cudnnDestroy(ctx_.cudnn_handle);
+    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+  }
+#endif
+}
+
+CudaDevice::CudaDevice(int id, int num_executors,
+                       string scheduler, string vm)
+    : Device(id, num_executors, scheduler, vm) {
+  device_type_ = kCuda;
+  host_ = nullptr; // TODO(wangwei) add host device
+  ctx_.stream = NULL; // use the default sync stream
+  // TODO(wangwei) create one handle for each steam?
+  CUBLAS_CHECK(cublasCreate(&ctx_.cublas_handle));
+  // use curandCreateGeneratorHost for CudaHost device
+  CURAND_CHECK(
+      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+  SetRandSeed(seed);
+  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+
+#ifdef USE_CUDNN
+  // TODO(wangwei) create one handle for each stream?
+  auto status = cudnnCreate(&ctx_.cudnn_handle);
+  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif  // USE_CUDNN
+}
+
+void CudaDevice::SetRandSeed(unsigned seed) {
+  CHECK(ctx_.curand_generator);
+  CURAND_CHECK(
+      curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
+}
+
+void CudaDevice::DoExec(function<void(Context*)>&& fn, int executor) {
+  fn(&ctx_);
+}
+
+void CudaDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
+                            CopyDirection direction, Context* ctx) {
+  cudaMemcpy(dst, src, nBytes, copyKind[direction]);
+  // TODO(wangwei) use async copy
+  // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
+}
 
+/// Allocate gpu memory.
+void* CudaDevice::Malloc(int size) {
+  void* ptr = nullptr;
+  cudaMalloc(&ptr, size);
+  return ptr;
+}
+
+/// Free gpu memory.
+void CudaDevice::Free(void* ptr) {
+  CHECK_NE(ptr, nullptr);
+  cudaFree(ptr);
+}
+
+
+// ==========Following code is from Caffe src/caffe/common.cpp=================
+
+void CudaDevice::DeviceQuery() {
+  cudaDeviceProp prop;
+  int device;
+  if (cudaSuccess != cudaGetDevice(&device)) {
+    printf("No cuda device present.\n");
+    return;
+  }
+  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+  LOG(INFO) << "Device id:                     " << device;
+  LOG(INFO) << "Major revision number:         " << prop.major;
+  LOG(INFO) << "Minor revision number:         " << prop.minor;
+  LOG(INFO) << "Name:                          " << prop.name;
+  LOG(INFO) << "Total global memory:           " << prop.totalGlobalMem;
+  LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock;
+  LOG(INFO) << "Total registers per block:     " << prop.regsPerBlock;
+  LOG(INFO) << "Warp size:                     " << prop.warpSize;
+  LOG(INFO) << "Maximum memory pitch:          " << prop.memPitch;
+  LOG(INFO) << "Maximum threads per block:     " << prop.maxThreadsPerBlock;
+  LOG(INFO) << "Maximum dimension of block:    "
+      << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
+      << prop.maxThreadsDim[2];
+  LOG(INFO) << "Maximum dimension of grid:     "
+      << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
+      << prop.maxGridSize[2];
+  LOG(INFO) << "Clock rate:                    " << prop.clockRate;
+  LOG(INFO) << "Total constant memory:         " << prop.totalConstMem;
+  LOG(INFO) << "Texture alignment:             " << prop.textureAlignment;
+  LOG(INFO) << "Concurrent copy and execution: "
+      << (prop.deviceOverlap ? "Yes" : "No");
+  LOG(INFO) << "Number of multiprocessors:     " << prop.multiProcessorCount;
+  LOG(INFO) << "Kernel execution timeout:      "
+      << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
+  return;
+}
+
+bool CudaDevice::CheckDevice(const int device_id) {
+  bool r = ((cudaSuccess == cudaSetDevice(device_id)) &&
+            (cudaSuccess == cudaFree(0)));
+  // reset any error that may have occurred.
+  cudaGetLastError();
+  return r;
+}
+
+int CudaDevice::FindDevice(const int start_id) {
+  int count = 0;
+  CUDA_CHECK(cudaGetDeviceCount(&count));
+  for (int i = start_id; i < count; i++) {
+    if (CheckDevice(i)) return i;
+  }
+  return -1;
+}
 
 
 }
+#endif  // USE_CUDA
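
Together, CheckDevice and FindDevice implement the selection flow described in
the device.h comments: probe ordinals until one can actually initialize a
context. A minimal sketch:

    int id = singa::CudaDevice::FindDevice(0);  // first usable ordinal, or -1;
                                                // in exclusive modes this also claims it
    if (id >= 0) {
      singa::CudaDevice dev(id);
      singa::CudaDevice::DeviceQuery();  // logs properties of the current device
    }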

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 33f5bd8..153637c 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -20,44 +20,53 @@
 
 namespace singa {
 Device::Device(int id, int num_executors, string scheduler, string vm)
-    : id_(id) {
-  scheduler_ = nullptr;
-  vm_ = nullptr;
-  ctx_.seed = 0;
-  ctx_.random_generator = std::mt19937(ctx_.seed);
+    : id_(id), num_executors_(num_executors) {
+      // TODO(wangwei) create scheduler and vm.
 }
 
-void Device::Exec(function<void(Context*)> fn, const vector<Blob*> read_blobs,
+void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
                     const vector<Blob*> write_blobs, bool use_rand_generator) {
-  fn(&ctx_);
+  // TODO(wangwei) execute operations scheduled by the scheduler.
+  DoExec(std::move(fn), 0);
 }
 
+// TODO(wangwei) get Blob from the memory manager
 Blob* Device::NewBlob(int size) {
   if (size > 0) {
-    void* ptr = malloc(size);
-    memset(ptr, 0, size);
+    void* ptr = Malloc(size);
+    // memset(ptr, 0, size);
     return new Blob(ptr, size);
   } else {
     return nullptr;
   }
 }
 
+// TODO(wangwei) return Blob to the memory manager
 void Device::FreeBlob(Blob* blob) {
   if (blob != nullptr) {
-    free(blob->mutable_data());
+    Free(blob->mutable_data());
     delete blob;
   }
 }
 
-void Device::CopyData(Blob* dst, const Blob& src, int len, int dst_offset,
-                      int src_offset) {
-
-  memcpy(reinterpret_cast<Byte*>(dst->mutable_data()) + dst_offset,
-         (const Byte*)src.data() + src_offset, len);
+void Device::CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
+                            CopyDirection direct, int dst_offset,
+                            int src_offset) {
+  this->Exec(
+      [this, dst, src, nBytes, direct, dst_offset, src_offset](Context* ctx) {
+        this->CopyToFrom((Byte*)dst->mutable_data() + dst_offset,
+                         (Byte*)src->data() + src_offset, nBytes, direct, ctx);
+      },
+      {src}, {dst});
 }
 
-void Device::CopyDataFromHostPtr(Blob* dst, const void* src, size_t size) {
-  memcpy(dst->mutable_data(), src, size);
+void Device::CopyDataFromHostPtr(Blob* dst, const void* src, size_t nBytes,
+                                 size_t dst_offset) {
+  auto direct = device_type_ == kCpp ? kHostToHost : kHostToDevice;
+  void* dstptr = (Byte*)dst->mutable_data() + dst_offset;
+  Exec([this, dstptr, src, nBytes,
+        direct](Context* ctx) { CopyToFrom(dstptr, src, nBytes, direct, ctx); },
+       {}, {dst});
 }
 void Device::Sync() {}
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 0e5570d..339262e 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -115,16 +115,17 @@ void Tensor::ToHost() {
   ToDevice(device_->host());
 }
 
-template<typename DType>
+template <typename DType>
 void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) {
-  CHECK_EQ(sizeof(DType), SizeOf(data_type_)) << "data_type is "
-                                              << DataType_Name(data_type_)
-                                              << " user given type is of size "
-                                              << sizeof(DType);
-  if (src != nullptr)
-    device_->CopyDataFromHostPtr(blob(), src, sizeof(DType) * num);
-  else
+  CHECK_EQ(sizeof(DType), SizeOf(data_type_))
+      << "data_type is " << DataType_Name(data_type_)
+      << " user given type is of size " << sizeof(DType);
+  if (src != nullptr) {
+    auto direction = device_->type() == kCpp ? kHostToHost : kHostToDevice;
+    device_->CopyDataFromHostPtr(blob(), src, sizeof(DType) * num, direction);
+  } else {
     LOG(WARNING) << "Copy data from null host ptr";
+  }
 }
 template void Tensor::CopyDataFromHostPtr(const float* src, size_t num);
 
@@ -133,7 +134,7 @@ void Tensor::CopyData(const Tensor& src) {
   CHECK(blob_ != nullptr);
   // Do copy only if the src's blob is already initialized.
   if (src.blob_ != nullptr) {
-    singa::CopyData(this, src, Size(), 0, 0);
+    singa::CopyDataToFrom(this, src, Size(), 0, 0);
   }
 }
 
@@ -197,38 +198,32 @@ GenUnaryScalarArgMemberFunction(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFunction(operator/=, Div);
 
 // ====================Tensor Operations=======================================
-void CopyData(Tensor* dst,
-              const Tensor& src,
-              size_t num,
-              size_t dst_offset,
-              size_t src_offset) {
-  CHECK_GE(src.Size(), src_offset + num);
-  CHECK_GE(dst->Size(), dst_offset + num);
+void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num,
+                    size_t dst_offset, size_t src_offset) {
   auto width = SizeOf(src.data_type());
   CHECK_EQ(width, SizeOf(dst->data_type()));
-  CopyRawData(dst, src, num * width, dst_offset * width, src_offset * width);
-}
-
-void CopyRawData(Tensor* dst,
-              const Tensor& src,
-              size_t nBytes,
-              size_t dst_offset,
-              size_t src_offset) {
+  size_t nBytes = num * width;
+  dst_offset *= width;
+  src_offset *= width;
   CHECK_GE(src.MemSize(), src_offset + nBytes);
   CHECK_GE(dst->MemSize(), dst_offset + nBytes);
-  Device* src_dev = src.device(), *dst_dev = dst->device();
-  Blob* src_blob = src.blob(), *dst_blob = dst->blob();
-  if (dst_dev->device_lib() != src_dev->device_lib()) {
+
+  Device *src_dev = src.device(), *dst_dev = dst->device();
+  Blob *from = src.blob(), *to = dst->blob();
+  if (dst_dev->type() != src_dev->type()) {
     // let the non-Cpp device conduct the copy op
-    if (dst_dev->device_lib() == kCpp) {
-      src_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
-    } else if (src_dev->device_lib() == kCpp) {
-      dst_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
+    if (dst_dev->type() == kCpp) {
+      src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, dst_offset,
+                              src_offset);
+    } else if (src_dev->type() == kCpp) {
+      dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, dst_offset,
+                              src_offset);
     } else {
       LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
     }
   } else {
-    src_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
+    auto direct = src_dev->type() == kCpp ? kHostToHost : kDeviceToDevice;
+    src_dev->CopyDataToFrom(to, from, nBytes, direct, dst_offset, src_offset);
   }
 }
 //============================================================================
@@ -257,52 +252,46 @@ void CopyRawData(Tensor* dst,
     }                                                               \
   } while (0)
 
-/// typedef DType and Lib according to values of type and lib respectively.
-/// type is from DataType, and lib is from LibType.
-/// DType and Lib would be used in __VA_ARGS__.
-#define TYPE_LIB_SWITCH(dtype, DType, ltype, Lib, ...)        \
+/// typedef DType and Dev according to values of type and dev respectively.
+/// type is from DataType, and dev is from DeviceType.
+/// DType and Dev would be used in __VA_ARGS__.
+#define TYPE_LIB_SWITCH(dtype, DType, dev, Dev, ...)        \
   do {                                                        \
     const int _SwitchShift = 3;                               \
-    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);    \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (dev);    \
     switch (_SwitchHash) {                                    \
       case ((kFloat32 << _SwitchShift) + kCuda): {            \
         typedef float DType;                                  \
-        typedef lib::Cuda Lib;                                \
-        { __VA_ARGS__ }                                       \
-        break;                                                \
-      }                                                       \
-      case ((kFloat32 << _SwitchShift) + kCudnn): {           \
-        typedef float DType;                                  \
-        typedef lib::Cudnn Lib;                               \
+        typedef lib::Cuda Dev;                                \
         { __VA_ARGS__ }                                       \
         break;                                                \
       }                                                       \
       case ((kFloat32 << _SwitchShift) + kCpp): {             \
         typedef float DType;                                  \
-        typedef lib::Cpp Lib;                                 \
+        typedef lib::Cpp Dev;                                 \
         { __VA_ARGS__ }                                       \
         break;                                                \
       }                                                       \
       case ((kFloat32 << _SwitchShift) + kOpencl): {          \
         typedef float DType;                                  \
-        typedef lib::Opencl Lib;                              \
+        typedef lib::Opencl Dev;                              \
         { __VA_ARGS__ }                                       \
         break;                                                \
       }                                                       \
       default:                                                \
         LOG(FATAL) << "Unknown combination of data type "     \
                    << DataType_Name(dtype) << " and library " \
-                   << LibType_Name(ltype);                    \
+                   << DeviceType_Name(dev);                    \
     }                                                         \
   } while (0)
 
 
 #define EltwiseUnaryTensorFn(fn, t, ret)                                   \
   do {                                                                     \
-    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, { \
+    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->type(), Dev, { \
       ret->device()->Exec(                                               \
           [t, ret](Context* ctx) {                                         \
-            fn<DType, Lib>(t.Size(), t.blob(), ret->blob(), ctx);          \
+            fn<DType, Dev>(t.Size(), t.blob(), ret->blob(), ctx);          \
           },                                                               \
           {t.blob()}, {ret->blob()});                                      \
     });                                                                    \
@@ -340,10 +329,10 @@ void Softmax(const Tensor& t, Tensor* ret, int axis) {
     CHECK_EQ(size % nrow, 0) << "Size = " << size << " nrow = " << nrow;
     ncol = size / nrow;
   }
-  TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, {
+  TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->type(), Dev, {
     ret->device()->Exec(
         [nrow, ncol, t, ret](Context* ctx) {
-          Softmax<DType, Lib>(nrow, ncol, t.blob(), ret->blob(), ctx);
+          Softmax<DType, Dev>(nrow, ncol, t.blob(), ret->blob(), ctx);
         },
         {t.blob()}, {ret->blob()});
     });
@@ -351,11 +340,11 @@ void Softmax(const Tensor& t, Tensor* ret, int axis) {
 
 #define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
   do {                                                                         \
-    TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->device_lib(), Lib, { \
+    TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->type(), Dev, { \
       CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
       ret->device()->Exec(                                                     \
           [lhs, rhs, ret](Context* ctx) {                                      \
-            fn<DType, Lib>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),    \
+            fn<DType, Dev>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),    \
                            ctx);                                               \
           },                                                                   \
           {lhs.blob(), rhs.blob()}, {ret->blob()});                            \
@@ -378,17 +367,17 @@ GenBinaryTensorFunction(operator*, EltwiseMult);
 GenBinaryTensorFunction(operator/, Div);
 GenBinaryTensorFunction(Pow, Pow);
 
-#define EltwiseTensorScalarFn(fn, t, x, ret)                               \
-  do {                                                                     \
-    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, { \
-      static_assert(std::is_same<SType, DType>::value,                     \
-                    "The Scalar type must match the Tensor data type");    \
-      ret->device()->Exec(                                                 \
-          [t, x, ret](Context* ctx) {                                      \
-            fn<DType, Lib>(t.Size(), t.blob(), x, ret->blob(), ctx);       \
-          },                                                               \
-          {t.blob()}, {ret->blob()});                                      \
-    });                                                                    \
+#define EltwiseTensorScalarFn(fn, t, x, ret)                            \
+  do {                                                                  \
+    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->type(), Dev, {    \
+      static_assert(std::is_same<SType, DType>::value,                  \
+                    "The Scalar type must match the Tensor data type"); \
+      ret->device()->Exec(                                              \
+          [t, x, ret](Context* ctx) {                                   \
+            fn<DType, Dev>(t.Size(), t.blob(), x, ret->blob(), ctx);    \
+          },                                                            \
+          {t.blob()}, {ret->blob()});                                   \
+    });                                                                 \
   } while (0)
 
 #define GenTensorScalarFunction(op, fn)                \
@@ -412,43 +401,33 @@ GenTensorScalarFunction(operator/, Div);
 GenTensorScalarFunction(Pow, Pow);
 
 // ================Blas operations============================================
-template <typename DType>
 Tensor Mult(const Tensor& lhs, const Tensor& rhs) {
   Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());
-  Mult<DType>(lhs, rhs, &ret);
+  Mult(lhs, rhs, &ret);
   return ret;
 }
-template Tensor Mult<float>(const Tensor& lhs, const Tensor& rhs);
 
-template <typename DType>
 void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret) {
-  Mult(DType(1), lhs, DType(1), rhs, ret);
+  Mult(1, lhs, 1, rhs, ret);
 }
-template void Mult<float>(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 
-template <typename DType>
-Tensor Mult(DType alpha, const Tensor& A, DType beta, const Tensor& B) {
+Tensor Mult(float alpha, const Tensor& A, float beta, const Tensor& B) {
   Tensor ret(A.shape(), A.device(), A.data_type());
-  Mult<DType>(alpha, A, beta, B, &ret);
+  Mult(alpha, A, beta, B, &ret);
   return ret;
 }
-template Tensor Mult<float>(float alpha, const Tensor& lhs, float beta,
-    const Tensor& rhs);
 
-template <typename SType>
-void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B,
+void Mult(float alpha, const Tensor& A, float beta, const Tensor& B,
           Tensor* C) {
   CHECK_EQ(A.shape().size(), 2u);
   bool transA = A.transpose();
   size_t m = transA ? A.shape()[1] : A.shape()[0], n = 0;
   if (B.shape().size() == 1u) {
     n = C->Size();
-    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
-      static_assert(std::is_same<SType, DType>::value,
-                    "The scalar type must be the same as the tensor data type");
+    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->type(), Dev, {
       C->device()->Exec(
           [transA, m, n, alpha, A, beta, B, C](Context* ctx) {
-            GEMV<DType, Lib>(transA, m, n, alpha, A.blob(), B.blob(), beta,
+            GEMV<DType, Dev>(transA, m, n, alpha, A.blob(), B.blob(), beta,
                              C->blob(), ctx);
           },
           {A.blob(), B.blob()}, {C->blob()});
@@ -461,61 +440,42 @@ void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B,
     CHECK_EQ(C->shape()[0], m);
     CHECK_EQ(A.Size(), m * k);
     CHECK_EQ(B.Size(), n * k);
-    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
-      static_assert(std::is_same<SType, DType>::value,
-                    "The scalar type must be the same as the tensor data type");
+    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->type(), Dev, {
       C->device()->Exec(
           [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) {
-            GEMM<DType, Lib>(transA, transB, m, n, k, alpha, A.blob(), B.blob(),
+            GEMM<DType, Dev>(transA, transB, m, n, k, alpha, A.blob(), B.blob(),
                              beta, C->blob(), ctx);
           },
           {A.blob(), B.blob()}, {C->blob()});
     });
   }
 }
-template void Mult<float>(float alpha, const Tensor& lhs, float beta,
-    const Tensor& rhs, Tensor* ret);
 
-
-// ================Neural Net operations======================================
-/*
-void Conv(const OpConf* conf, const Tensor& input, const Tensor& W,
-          const Tensor& b, Tensor* ret) {
-  TYPE_LIB_SWITCH(input.data_type(), DType, input.device()->nn_lib(), Lib, {
-    ret->device()->Exec(
-        [conf, input, W, b, ret](Context* ctx) {
-          Conv<DType, Lib>(conf, input.blob(), W.blob(), b.blob(), ret->blob(),
-                           ctx);
-        },
-        {input.blob(), W.blob(), b.blob()}, {ret->blob()});
-  });
-}
-*/
 void Bernoulli(float p, Tensor* t) {
-  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->type(), Dev, {
     t->device()->Exec(
         [p, t](Context* ctx) {
-          Bernoulli<DType, Lib>(t->Size(), p, t->blob(), ctx);
+          Bernoulli<DType, Dev>(t->Size(), p, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });
 }
 
 void Uniform(float low, float high, Tensor* t) {
-  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->type(), Dev, {
     t->device()->Exec(
         [low, high, t](Context* ctx) {
-          Uniform<DType, Lib>(t->Size(), low, high, t->blob(), ctx);
+          Uniform<DType, Dev>(t->Size(), low, high, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });
 }
 
 void Gaussian(float mean, float std, Tensor* t) {
-  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->type(), Dev, {
     t->device()->Exec(
         [mean, std, t](Context* ctx) {
-          Gaussian<DType, Lib>(t->Size(), mean, std, t->blob(), ctx);
+          Gaussian<DType, Dev>(t->Size(), mean, std, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index f366ed0..f99aba4 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -30,10 +30,17 @@ enum DataType {
   kNumDataType = 5;
 }
 
-enum LibType {
+enum DeviceType {
   kCpp = 0;
   kCuda = 1;
   kOpencl = 2;
-  kCudnn = 3;
-  kNumLibType = 4;
+  kNumDeviceType = 3;
+}
+
+enum CopyDirection {
+  kHostToHost = 0;
+  kHostToDevice = 1;
+  kDeviceToHost = 2;
+  kDeviceToDevice = 3;
+  kNumDirection = 4;
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/test/singa/test_cpp_device.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_device.cc b/test/singa/test_cpp_device.cc
new file mode 100644
index 0000000..d2c0149
--- /dev/null
+++ b/test/singa/test_cpp_device.cc
@@ -0,0 +1,71 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include  "singa/core/device.h"
+#include "singa/proto/core.pb.h"
+
+using singa::CppDevice;
+using singa::Blob;
+TEST(CppDevice, Constructor) {
+  CppDevice dev(0, 1);
+  EXPECT_EQ(0, dev.id());
+}
+
+TEST(CppDevice, MemoryMallocFree) {
+  CppDevice dev(0, 1);
+  Blob* b = dev.NewBlob(4);
+  EXPECT_NE(nullptr, b);
+  EXPECT_EQ(4, b->size());
+  dev.FreeBlob(b);
+}
+
+TEST(CppDevice, Exec) {
+  CppDevice dev(0, 1);
+  Blob* b = dev.NewBlob(4);
+  int x = 1, y = 3, z = 0;
+  dev.Exec([x, y, &z](singa::Context *ctx) {
+      z = x + y;
+      }, {b}, {b}, false);
+  EXPECT_EQ(x + y, z);
+}
+
+TEST(CppDevice, CopyData) {
+  CppDevice dev(0, 1);
+  Blob* b = dev.NewBlob(4);
+  char s[] = {'a', 'b', 'c', 'x'};
+  dev.CopyDataFromHostPtr(b, s, 4);
+  const char* bstr = static_cast<const char*>(b->data());
+  EXPECT_EQ('a', bstr[0]);
+  EXPECT_EQ('b', bstr[1]);
+  EXPECT_EQ('x', bstr[3]);
+
+  Blob* c = dev.NewBlob(4);
+  dev.CopyDataToFrom(c, b, 4, singa::kHostToHost, 0, 0);
+  const char* cstr = static_cast<const char*>(c->data());
+
+  EXPECT_EQ('a', cstr[0]);
+  EXPECT_EQ('b', cstr[1]);
+  EXPECT_EQ('x', cstr[3]);
+  dev.FreeBlob(b);
+  dev.FreeBlob(c);
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/282712ca/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 51e7cfb..ccd91a0 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -43,21 +43,7 @@ TEST_F(TestTensorMath, MemberAddTensor) {
   EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
   EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
 }
-/*
-TEST(TensorClass, SubTensor) {
-  Tensor a(Shape{2,3}), b(Shape{6});
-  float x[]={1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
-  float y[]={1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
-  a.CopyDataFromHostPtr(x, 6);
-  b.CopyDataFromHostPtr(y, 6);
-  b -= a;
-  const float* dptr = b.data<float>();
-  EXPECT_FLOAT_EQ(0.1f, dptr[0]);
-  EXPECT_FLOAT_EQ(0.1f, dptr[1]);
-  EXPECT_FLOAT_EQ(0.1f, dptr[2]);
-  EXPECT_FLOAT_EQ(0.1f, dptr[5]);
-}
-*/
+
 
 TEST_F(TestTensorMath, AddTensors) {
   Tensor ret(a.shape(), a.device(), a.data_type());



[33/50] [abbrv] incubator-singa git commit: SINGA-192 Implement optimization algorithms for v1

Posted by zh...@apache.org.
SINGA-192 Implement optimization algorithms for v1

Implement optimization algorithms for Singa v1, including Nesterov,
AdaGrad, and RMSProp, and add unit test cases for these algorithms.
Currently only Nesterov passes its test case; AdaGrad and RMSProp need a
Sqrt() operation for Tensor, which has not been implemented yet.
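
For reference, here is a minimal standalone sketch (plain C++, not the
SINGA API; the function names and signatures are illustrative only) of the
per-element update rules these three optimizers implement:

#include <cmath>

// One update step per scalar parameter: v is the parameter value, hist the
// per-parameter optimizer state, g the gradient, lr the learning rate.
void nesterov(float &v, float &hist, float g, float lr, float mom) {
  float prev = hist;
  hist = mom * hist + lr * g;             // new velocity
  v -= (1 + mom) * hist - mom * prev;     // look-ahead correction
}

void adagrad(float &v, float &hist, float g, float lr, float delta) {
  hist += g * g;                          // accumulate squared gradients
  v -= lr * g / std::sqrt(hist + delta);
}

void rmsprop(float &v, float &hist, float g, float lr, float rho,
             float delta) {
  hist = rho * hist + (1 - rho) * g * g;  // moving average of g^2
  v -= lr * g / std::sqrt(hist + delta);
}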


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/178db014
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/178db014
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/178db014

Branch: refs/heads/master
Commit: 178db0144208fd5d5e7de58a575d0ea6300fdfdf
Parents: 01aaf49
Author: WANG Ji <ij...@gmail.com>
Authored: Sat Jun 11 15:00:18 2016 +0800
Committer: WANG Ji <ij...@gmail.com>
Committed: Sat Jun 11 16:38:27 2016 +0800

----------------------------------------------------------------------
 include/singa/model/optimizer.h |  43 +++++++++++++++
 src/model/optimizer/adagrad.cc  |  35 ++++++++++++
 src/model/optimizer/nesterov.cc |  43 +++++++++++++++
 src/model/optimizer/rmsprop.cc  |  38 +++++++++++++
 src/proto/model.proto           |   3 +
 test/singa/test_adagrad.cc      |  92 +++++++++++++++++++++++++++++++
 test/singa/test_nesterov.cc     | 101 ++++++++++++++++++++++++++++++++++
 test/singa/test_rmsprop.cc      | 103 +++++++++++++++++++++++++++++++++++
 8 files changed, 458 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/include/singa/model/optimizer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/optimizer.h b/include/singa/model/optimizer.h
index 7ca9f53..7da1db8 100644
--- a/include/singa/model/optimizer.h
+++ b/include/singa/model/optimizer.h
@@ -168,6 +168,49 @@ class SGD : Optimizer {
   std::function<float(int)> momentum_generator_;
 };
 
+// =============Nesterov======================================================
+class Nesterov : Optimizer {
+ public:
+  void Setup(const OptimizerConf& conf);
+  /// Apply the updating algorithm.
+  void Apply(int step, float lr, const string& name, Tensor* grad,
+             Tensor* value) override;
+
+  /// The argument function returns the momentum value given the current running
+  /// step (i.e., iterations/mini-batches).
+  void SetMomentumGenerator(std::function<float(int)> func) {
+    momentum_generator_ = func;
+  }
+
+ private:
+  std::unordered_map<string, Tensor> history_gradient_;
+  std::function<float(int)> momentum_generator_;
+};
+
+// =============Adagrad=======================================================
+class Adagrad : Optimizer {
+ public:
+  void Setup(const OptimizerConf& conf);
+  /// Apply the updating algorithm.
+  void Apply(int step, float lr, const string& name, Tensor* grad,
+             Tensor* value) override;
+
+ private:
+  std::unordered_map<string, Tensor> history_gradient_;
+  float delta_;
+};
+// =============RMSProp=======================================================
+class RMSProp : Optimizer {
+ public:
+  void Setup(const OptimizerConf& conf);
+  /// Apply the updating algorithm.
+  void Apply(int step, float lr, const string& name, Tensor* grad,
+             Tensor* value) override;
+
+ private:
+  std::unordered_map<string, Tensor> history_gradient_;
+  float delta_, rho_;
+};
 // ============LocalAllReduce for single node multiple workers ==============
 /// Updater for training models on a single node with multiple devices (workers)
 /// All model parameters are partitioned such that each parameter is updated on

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/src/model/optimizer/adagrad.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/adagrad.cc b/src/model/optimizer/adagrad.cc
new file mode 100644
index 0000000..8bdb07c
--- /dev/null
+++ b/src/model/optimizer/adagrad.cc
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+#define SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+#include "singa/model/optimizer.h"
+#include <functional>
+namespace singa {
+
+void Adagrad::Setup(const OptimizerConf& conf) { delta_ = conf.delta(); }
+
+void Adagrad::Apply(int step, float lr, const string& name, Tensor* grad,
+                    Tensor* value) {
+  if (history_gradient_.find(name) == history_gradient_.end())
+    history_gradient_[name].ResetLike(*value);
+  Tensor& history = history_gradient_[name];
+  history += (*grad) * (*grad);
+  (*value) -= (*grad) * lr / Sqrt(history + delta_);
+}
+}  // namespace singa
+#endif  // SRC_MODEL_OPTIMIZER_ADAGRAD_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/src/model/optimizer/nesterov.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/nesterov.cc b/src/model/optimizer/nesterov.cc
new file mode 100644
index 0000000..95c5531
--- /dev/null
+++ b/src/model/optimizer/nesterov.cc
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_NESTEROV_H_
+#define SRC_MODEL_OPTIMIZER_NESTEROV_H_
+#include "singa/model/optimizer.h"
+#include <functional>
+namespace singa {
+
+void Nesterov::Setup(const OptimizerConf& conf) {
+  float m = conf.momentum();
+  SetMomentumGenerator([m](int step) { return m; });
+}
+
+void Nesterov::Apply(int step, float lr, const string& name, Tensor* grad,
+                     Tensor* value) {
+  if (momentum_generator_) {
+    float mom = momentum_generator_(step);
+    if (history_gradient_.find(name) == history_gradient_.end())
+      history_gradient_[name].ResetLike(*value);
+    Tensor& history = history_gradient_[name];
+    Tensor tmp = history;
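+    // Nesterov momentum: compute the new velocity, then apply the
+    // look-ahead update (1 + mom) * v_new - mom * v_old.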
+    history = history * mom + (*grad) * lr;
+    tmp = history * (1 + mom) - tmp * mom;
+    (*value) -= tmp;
+  }
+}
+}  // namespace singa
+#endif  // SRC_MODEL_OPTIMIZER_NESTEROV_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/src/model/optimizer/rmsprop.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/rmsprop.cc b/src/model/optimizer/rmsprop.cc
new file mode 100644
index 0000000..cad333c
--- /dev/null
+++ b/src/model/optimizer/rmsprop.cc
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_RMSPROP_H_
+#define SRC_MODEL_OPTIMIZER_RMSPROP_H_
+#include "singa/model/optimizer.h"
+#include <functional>
+namespace singa {
+
+void RMSProp::Setup(const OptimizerConf& conf) {
+  delta_ = conf.delta();
+  rho_ = conf.rho();
+}
+
+void RMSProp::Apply(int step, float lr, const string& name, Tensor* grad,
+                    Tensor* value) {
+  if (history_gradient_.find(name) == history_gradient_.end())
+    history_gradient_[name].ResetLike(*value);
+  Tensor& history = history_gradient_[name];
+  history = history * rho_ + (*grad) * (*grad) * (1 - rho_);
+  (*value) -= (*grad) * lr / Sqrt(history + delta_);
+}
+}  // namespace singa
+#endif  // SRC_MODEL_OPTIMIZER_RMSPROP_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index d368296..c26aa35 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -86,6 +86,9 @@ message OptimizerConf {
 
   // used by vanilla sgd and nesterov
   optional float momentum = 5 [default = 0.9];
+
+  // delta is used to avoid dividing zero
+  optional float delta = 6 [default = 0.0000001];
 }
 
 message ConstraintConf {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/test/singa/test_adagrad.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_adagrad.cc b/test/singa/test_adagrad.cc
new file mode 100644
index 0000000..1382467
--- /dev/null
+++ b/test/singa/test_adagrad.cc
@@ -0,0 +1,92 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/model/optimizer.h"
+#include "singa_config.h"
+#include <cmath>
+
+TEST(Adagrad, ApplyCPU) {
+  singa::Adagrad adagrad;
+  float lr = 0.1f;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  adagrad.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<const float*>();
+  float history[4];
+  for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv1[i],
+                    v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
+
+  grad.CopyDataFromHostPtr(g, 4);
+  adagrad.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv2[i],
+                    newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
+}
+
+#ifdef USE_CUDA
+TEST(Adagrad, ApplyCUDA) {
+  singa::Adagrad adagrad;
+  float lr = 0.1f;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::CudaGPU dev;
+  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  adagrad.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<const float*>();
+  float history[4];
+  for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv1[i],
+                    v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
+
+  grad.CopyDataFromHostPtr(g, 4);
+  adagrad.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv2[i],
+                    newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
+}
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/test/singa/test_nesterov.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_nesterov.cc b/test/singa/test_nesterov.cc
new file mode 100644
index 0000000..e7083c8
--- /dev/null
+++ b/test/singa/test_nesterov.cc
@@ -0,0 +1,101 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/model/optimizer.h"
+#include "singa_config.h"
+
+TEST(Nesterov, ApplyCPU) {
+  singa::Nesterov nesterov;
+  float lr = 0.1f;
+  auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+  nesterov.SetMomentumGenerator(func);
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  nesterov.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<const float*>();
+  float history[4], tmp[4];
+  for (int i = 0; i < 4; ++i) {
+    history[i] = g[i] * lr;
+    tmp[i] = history[i] * (1 + func(0));
+  }
+  for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+
+  grad.CopyDataFromHostPtr(g, 4);
+  nesterov.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; ++i) {
+    tmp[i] = history[i];
+    history[i] = history[i] * func(1) + g[i] * lr;
+    tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+  }
+
+  for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+}
+
+#ifdef USE_CUDA
+TEST(Nesterov, ApplyCUDA) {
+  singa::Nesterov nesterov;
+  float lr = 0.1f;
+  auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+  nesterov.SetMomentumGenerator(func);
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::CudaGPU dev;
+  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  nesterov.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<const float*>();
+  float history[4], tmp[4];
+  for (int i = 0; i < 4; ++i) {
+    history[i] = g[i] * lr;
+    tmp[i] = history[i] * (1 + func(0));
+  }
+  for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+
+  grad.CopyDataFromHostPtr(g, 4);
+  nesterov.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; ++i) {
+    tmp[i] = history[i];
+    history[i] = history[i] * func(1) + g[i] * lr;
+    tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+  }
+
+  for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+}
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/test/singa/test_rmsprop.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_rmsprop.cc b/test/singa/test_rmsprop.cc
new file mode 100644
index 0000000..62101f7
--- /dev/null
+++ b/test/singa/test_rmsprop.cc
@@ -0,0 +1,103 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/model/optimizer.h"
+#include "singa_config.h"
+#include <cmath>
+
+TEST(RMSProp, ApplyCPU) {
+  singa::RMSProp rmsprop;
+  float lr = 0.1f;
+  float rho = 0.002f;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::OptimizerConf conf;
+  conf.set_rho(rho);
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  rmsprop.Setup(conf);
+  rmsprop.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<const float*>();
+  float history[4];
+  for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv1[i],
+                    v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
+
+  grad.CopyDataFromHostPtr(g, 4);
+  rmsprop.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; ++i)
+    history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv2[i],
+                    newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
+}
+
+#ifdef USE_CUDA
+TEST(RMSProp, ApplyCUDA) {
+  singa::RMSProp rmsprop;
+  float lr = 0.1f;
+  float rho = 0.002f;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::OptimizerConf conf;
+  conf.set_rho(rho);
+
+  singa::CudaGPU dev;
+  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  rmsprop.Setup(conf);
+  rmsprop.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<const float*>();
+  float history[4];
+  for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv1[i],
+                    v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
+
+  grad.CopyDataFromHostPtr(g, 4);
+  rmsprop.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; ++i)
+    history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv2[i],
+                    newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8));
+}
+#endif


[11/50] [abbrv] incubator-singa git commit: SINGA-177 Add full cmake support for the compilation of singa_v1

Posted by zh...@apache.org.
SINGA-177 Add full cmake support for the compilation of singa_v1

Add #include "singa_config.h" to files that use compile-time macros.
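
The guard pattern this enables is the usual generated-config idiom; a
minimal sketch (illustrative only, assuming CMake's configure_file()
generates singa_config.h with macros such as USE_CUDA):

// example.cc -- illustrative, not part of this patch
#include "singa_config.h"  // generated by CMake at configure time
#ifdef USE_CUDA
#include <cuda_runtime.h>
// CUDA-specific code is compiled only when the build enables CUDA
#endif  // USE_CUDA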


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/72923b1c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/72923b1c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/72923b1c

Branch: refs/heads/master
Commit: 72923b1cbd6ba61da5abe7e3708a7515ad658607
Parents: 611554f
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon May 23 23:33:04 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon May 23 23:33:04 2016 +0800

----------------------------------------------------------------------
 include/singa/core/common.h        |  1 +
 include/singa/core/device.h        |  2 +-
 include/singa/utils/cuda_utils.h   |  7 ++++++-
 src/CMakeLists.txt                 | 18 +++++++++---------
 src/core/device/cuda_gpu.cc        |  1 +
 src/core/tensor/tensor_math_cuda.h |  1 +
 src/model/layer/cudnn_dropout.cc   |  2 +-
 src/model/layer/cudnn_dropout.h    |  7 ++++---
 src/model/layer/cudnn_utils.h      |  1 +
 test/singa/test_cudnn_dropout.cc   |  2 +-
 10 files changed, 26 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 61c1c41..9d005c4 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -20,6 +20,7 @@
 #define SINGA_CORE_COMMON_H_
 #include <random>
 #include <chrono>
+#include "./singa_config.h"
 #include "singa/utils/logging.h"
 
 #ifdef USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index a67b564..23c2431 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -23,7 +23,7 @@
 #include <vector>
 #include <string>
 #include <functional>
-
+#include "singa_config.h"
 #include "singa/core/common.h"
 #include "singa/core/memory.h"
 #include "singa/core/scheduler.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/include/singa/utils/cuda_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h
index b2bb5c5..076d0d1 100644
--- a/include/singa/utils/cuda_utils.h
+++ b/include/singa/utils/cuda_utils.h
@@ -1,5 +1,9 @@
 // from caffe include/caffe/util/device_alternative.hpp
+#ifndef SINGA_UTILS_CUDA_UTILS_H_
+#define SINGA_UTILS_CUDA_UTILS_H_
 
+#include "singa_config.h"
+#ifdef USE_CUDA
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -91,4 +95,5 @@ const char* curandGetErrorString(curandStatus_t error) {
   }
   return "Unknown curand status";
 }
-
+#endif
+#endif  // SINGA_UTILS_CUDA_UTILS_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 39383bd..92e7fe5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,14 +3,14 @@ FILE(GLOB proto_files proto/*.proto)
 singa_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files})
 # include python files either to force generation
 ADD_LIBRARY(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
-LIST(APPEND singa_linker_libs proto)
+LIST(APPEND SINGA_LINKER_LIBS proto)
 
 #FILE(GLOB_RECURSE utils_source ${CMAKE_CURRENT_SOURCE_DIR}/utils/ "*.cc")
 AUX_SOURCE_DIRECTORY(utils utils_source)
 #message(STATUS "UTILS ${utils_source}")
 ADD_LIBRARY(singa_utils SHARED ${utils_source})
-TARGET_LINK_LIBRARIES(singa_utils ${singa_linker_libs})
-LIST(APPEND singa_linker_libs singa_utils)
+TARGET_LINK_LIBRARIES(singa_utils ${SINGA_LINKER_LIBS})
+LIST(APPEND SINGA_LINKER_LIBS singa_utils)
 
 
 #FILE(GLOB_RECURSE core_source ${CMAKE_CURRENT_SOURCE_DIR}/core/ "*.cc")
@@ -20,17 +20,17 @@ AUX_SOURCE_DIRECTORY(core/scheduler core_source)
 AUX_SOURCE_DIRECTORY(core/tensor core_source)
 #message(STATUS "CORE ${core_source}")
 ADD_LIBRARY(singa_core SHARED ${core_source})
-TARGET_LINK_LIBRARIES(singa_core ${singa_linker_libs})
-LIST(APPEND singa_linker_libs singa_core)
-#MESSAGE(STATUS "link libs " ${singa_linker_libs})
+TARGET_LINK_LIBRARIES(singa_core ${SINGA_LINKER_LIBS})
+LIST(APPEND SINGA_LINKER_LIBS singa_core)
+#MESSAGE(STATUS "link libs " ${SINGA_LINKER_LIBS})
 
 #FILE(GLOB_RECURSE model_source ${CMAKE_CURRENT_SOURCE_DIR}/model/ "*.cc")
 AUX_SOURCE_DIRECTORY(model model_source)
 AUX_SOURCE_DIRECTORY(model/layer model_source)
 #MESSAGE(STATUS "MODEL ${model_source}")
 ADD_LIBRARY(singa_model SHARED ${model_source})
-TARGET_LINK_LIBRARIES(singa_model ${singa_linker_libs})
-LIST(APPEND singa_linker_libs singa_model)
+TARGET_LINK_LIBRARIES(singa_model ${SINGA_LINKER_LIBS})
+LIST(APPEND SINGA_LINKER_LIBS singa_model)
 
 #ADD_LIBRARY(singa_layer SHARED ${LAYER_SOURCE})
 #ADD_LIBRARY(singa_model SHARED ${MODEL_SOURCE})
@@ -40,4 +40,4 @@ LIST(APPEND singa_linker_libs singa_model)
 #TARGET_LINK_LIBRARIES(singa_layer singa_core singa_utils)
 #TARGET_LINK_LIBRARIES(singa_model singa_layer singa_core singa_utils)
 
-#ADD_LIBRARY(singa SHARED ${singa_linker_libs})
+#ADD_LIBRARY(singa SHARED ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 8eafc4c..59a5f45 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "singa_config.h"
 #ifdef USE_CUDA
 #include <cublas_v2.h>
 #include <cuda.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 991e8bb..40f9210 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -19,6 +19,7 @@
 #ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
 #define  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
 #include "./tensor_math.h"
+#include "singa_config.h"
 #include "singa/core/common.h"
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
index 65cd8e5..64a581b 100644
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@ -15,11 +15,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "./cudnn_dropout.h"
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
 #if CUDNN_VERSION_MAJOR >= 5
 
-#include "./cudnn_dropout.h"
 #include <cudnn.h>
 #include <chrono>
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index a7d00e0..d3b3de6 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -18,16 +18,17 @@
 
 #ifndef SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
 #define SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
+#include "singa_config.h"
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
- #if CUDNN_VERSION_MAJOR >= 5
+#if CUDNN_VERSION_MAJOR >= 5
+#include "./dropout.h"
+
 #include <cudnn.h>
 #include <utility>
 #include <string>
 #include <vector>
 
-#include "./dropout.h"
-#include "singa/core/common.h"
 #include "singa/model/layer.h"
 #include "singa/proto/core.pb.h"
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/src/model/layer/cudnn_utils.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_utils.h b/src/model/layer/cudnn_utils.h
index 298ee5c..039a1ac 100644
--- a/src/model/layer/cudnn_utils.h
+++ b/src/model/layer/cudnn_utils.h
@@ -18,6 +18,7 @@
 #ifndef SRC_MODEL_LAYER_CUDNN_UTILS_H_
 #define SRC_MODEL_LAYER_CUDNN_UTILS_H_
 
+#include "singa_config.h"
 #ifdef USE_CUDNN
 
 #include <cudnn.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/72923b1c/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_dropout.cc b/test/singa/test_cudnn_dropout.cc
index 5fdc554..393d555 100644
--- a/test/singa/test_cudnn_dropout.cc
+++ b/test/singa/test_cudnn_dropout.cc
@@ -18,11 +18,11 @@
 * under the License.
 *
 *************************************************************/
+#include "../src/model/layer/cudnn_dropout.h"
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
 //#if CUDNN_MAJOR_VERSION >= 5
 
-#include "../src/model/layer/cudnn_dropout.h"
 #include "gtest/gtest.h"
 
 bool inline GetBitValue(const char* x, int pos) {


[41/50] [abbrv] incubator-singa git commit: SINGA-190 - Add prelu layer and flatten layer

Posted by zh...@apache.org.
SINGA-190 - Add prelu layer and flatten layer

Format code. Fix compilation warnings.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/58be3f80
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/58be3f80
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/58be3f80

Branch: refs/heads/master
Commit: 58be3f8079e8d00c9fee4e1ce319786cc4e9f225
Parents: 5afd81b
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Jun 12 22:31:46 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 22:31:46 2016 +0800

----------------------------------------------------------------------
 src/model/layer/flatten.cc |  19 +--
 src/model/layer/flatten.h  |  13 +-
 src/model/layer/prelu.cc   |  62 ++------
 src/model/layer/prelu.h    |  11 +-
 src/proto/model.proto      | 326 ++++++++++++++++++++++------------------
 test/singa/test_flatten.cc |  68 ++++-----
 test/singa/test_prelu.cc   |  46 +++---
 7 files changed, 261 insertions(+), 284 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/58be3f80/src/model/layer/flatten.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/flatten.cc b/src/model/layer/flatten.cc
index 3ed37fe..7341394 100644
--- a/src/model/layer/flatten.cc
+++ b/src/model/layer/flatten.cc
@@ -31,22 +31,15 @@ const Tensor Flatten::Forward(int flag, const Tensor &input) {
   if (!Axis()) {
     // reshape to 1D
     size_t dim = output.Size();
-    output.Reshape(Shape {
-      dim
-    });
-    output_shape_ = Shape { dim }
-    ;
+    output.Reshape(Shape{dim});
+    output_shape_ = Shape{dim};
   } else {
     // reshape to 2D
     size_t dim1 = 1, dim2;
-    for (int i = 0; i < Axis(); i++)
-      dim1 *= output.shape(i);
+    for (int i = 0; i < Axis(); i++) dim1 *= output.shape(i);
     dim2 = output.Size() / dim1;
-    output.Reshape(Shape {
-      dim1, dim2
-    });
-    output_shape_ = Shape { dim1, dim2 }
-    ;
+    output.Reshape(Shape{dim1, dim2});
+    output_shape_ = Shape{dim1, dim2};
   }
   return output;
 }
@@ -55,7 +48,7 @@ const std::pair<Tensor, vector<Tensor> > Flatten::Backward(int flag,
                                                            const Tensor &grad) {
   vector<Tensor> param_grad;
   Tensor input_grad = grad;
-  input_grad.Reshape(Input_shape());
+  input_grad.Reshape(input_shape_);
   return std::make_pair(input_grad, param_grad);
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/58be3f80/src/model/layer/flatten.h
----------------------------------------------------------------------
diff --git a/src/model/layer/flatten.h b/src/model/layer/flatten.h
index cb36542..580b2ba 100644
--- a/src/model/layer/flatten.h
+++ b/src/model/layer/flatten.h
@@ -24,7 +24,7 @@
 
 namespace singa {
 class Flatten : public Layer {
-public:
+ public:
   /// \copydoc Layer::layer_type();
   const std::string layer_type() const override { return "Flatten"; }
 
@@ -35,15 +35,14 @@ public:
   const Tensor Forward(int flag, const Tensor &input) override;
 
   /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
-  const std::pair<Tensor, vector<Tensor> > Backward(int flag,
-                                                    const Tensor &grad)
-      override;
+  const std::pair<Tensor, vector<Tensor> > Backward(
+      int flag, const Tensor &grad) override;
 
   const int Axis() const { return axis_; }
-  const Shape Input_shape() const { return input_shape_; }
-  const Shape Output_shape() const { return output_shape_; }
+  const Shape input_shape() const { return input_shape_; }
+  const Shape output_shape() const { return output_shape_; }
 
-protected:
+ protected:
   /// flatten layer reshape the input to 2D, one from 0 to axis_-1, one from
   /// axis_ to end.
   /// if axis_ is 0, reshape the input to 1D.
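
To make the reshape rule above concrete, here is a minimal standalone sketch (not the SINGA API; FlattenShape is a hypothetical helper written for illustration):

    #include <cstddef>
    #include <vector>

    // Collapse dims [0, axis) into dim1 and [axis, rank) into dim2;
    // axis == 0 collapses everything into a single dimension.
    std::vector<size_t> FlattenShape(const std::vector<size_t> &in, size_t axis) {
      if (axis == 0) {
        size_t dim = 1;
        for (size_t d : in) dim *= d;
        return {dim};
      }
      size_t dim1 = 1, dim2 = 1;
      for (size_t i = 0; i < axis; i++) dim1 *= in[i];
      for (size_t i = axis; i < in.size(); i++) dim2 *= in[i];
      return {dim1, dim2};
    }
    // e.g. FlattenShape({2, 1, 3, 2}, 3) yields {6, 2}, matching the
    // expectations in test_flatten.cc further down.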

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/58be3f80/src/model/layer/prelu.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/prelu.cc b/src/model/layer/prelu.cc
index 1d6a2e7..b916bed 100644
--- a/src/model/layer/prelu.cc
+++ b/src/model/layer/prelu.cc
@@ -25,8 +25,7 @@ void PReLU::Setup(const LayerConf &conf) {
   channel_shared_ = conf.prelu_conf().channel_shared();
   format_ = conf.prelu_conf().format();
   // Push back params into param_values_
-  for (const auto &spec : conf.param())
-    param_specs_.push_back(spec);
+  for (const auto &spec : conf.param()) param_specs_.push_back(spec);
   param_values_.push_back(&a_);
 }
 
@@ -41,26 +40,18 @@ const Tensor PReLU::Forward(int flag, const Tensor &input) {
         c = temp.shape(1);
         h = temp.shape(2);
         w = temp.shape(3);
-        temp.Reshape(Shape {
-          n *c, h *w
-        });
-        Tensor temp_a(Shape {
-          n, c
-        });
+        temp.Reshape(Shape{n * c, h * w});
+        Tensor temp_a(Shape{n, c});
         Uniform(1.f, 1.f, &temp_a);
         MultRow(a_, &temp_a);
-        temp_a.Reshape(Shape {
-          n *c
-        });
+        temp_a.Reshape(Shape{n * c});
         MultColumn(temp_a, &temp);
       } else if (format_ == "NHWC") {
         n = temp.shape(0);
         h = temp.shape(1);
         w = temp.shape(2);
         c = temp.shape(3);
-        temp.Reshape(Shape {
-          n *h *w, c
-        });
+        temp.Reshape(Shape{n * h * w, c});
         MultRow(a_, &temp);
       } else {
         LOG(FATAL) << "Incorrect input format for prelu layer.";
@@ -74,8 +65,7 @@ const Tensor PReLU::Forward(int flag, const Tensor &input) {
     const float a = a_.data<const float *>()[0];
     output = input * ((input > 0.f) + (input <= 0.f) * a);
   }
-  if (flag & kTrain)
-    buf_.push(input);
+  if (flag & kTrain) buf_.push(input);
   return output;
 }
 
@@ -96,33 +86,21 @@ const std::pair<Tensor, vector<Tensor> > PReLU::Backward(int flag,
         c = temp1.shape(1);
         h = temp1.shape(2);
         w = temp1.shape(3);
-        temp1.Reshape(Shape {
-          n *c, h *w
-        });
-        Tensor temp_a(Shape {
-          n, c
-        });
+        temp1.Reshape(Shape{n * c, h * w});
+        Tensor temp_a(Shape{n, c});
         Uniform(1.f, 1.f, &temp_a);
         MultRow(a_, &temp_a);
-        temp_a.Reshape(Shape {
-          n *c
-        });
+        temp_a.Reshape(Shape{n * c});
         MultColumn(temp_a, &temp1);
-        temp1.Reshape(Shape {
-          n, c, h, w
-        });
+        temp1.Reshape(Shape{n, c, h, w});
       } else if (format_ == "NHWC") {
         n = temp1.shape(0);
         h = temp1.shape(1);
         w = temp1.shape(2);
         c = temp1.shape(3);
-        temp1.Reshape(Shape {
-          n *h *w, c
-        });
+        temp1.Reshape(Shape{n * h * w, c});
         MultRow(a_, &temp1);
-        temp1.Reshape(Shape {
-          n, h, w, c
-        });
+        temp1.Reshape(Shape{n, h, w, c});
       } else {
         LOG(FATAL) << "Incorrect input format for prelu layer.";
       }
@@ -130,22 +108,14 @@ const std::pair<Tensor, vector<Tensor> > PReLU::Backward(int flag,
       LOG(FATAL) << "Incorrect input format for prelu layer.";
     }
     input_grad = grad * input * ((input > 0.f) + temp1);
-    Tensor temp2 = grad * input * (input <= 0.f), temp3(Shape {
-      n *c
-    });
+    Tensor temp2 = grad * input * (input <= 0.f), temp3(Shape{n * c});
     if (format_ == "NCHW") {
-      temp2.Reshape(Shape {
-        n *c, h *w
-      });
+      temp2.Reshape(Shape{n * c, h * w});
       SumColumns(temp2, &temp3);
-      temp3.Reshape(Shape {
-        n, c
-      });
+      temp3.Reshape(Shape{n, c});
       SumRows(temp3, &da);
     } else if (format_ == "NHWC") {
-      temp2.Reshape(Shape {
-        n *h *w, c
-      });
+      temp2.Reshape(Shape{n * h * w, c});
       SumRows(temp2, &da);
     }
   } else {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/58be3f80/src/model/layer/prelu.h
----------------------------------------------------------------------
diff --git a/src/model/layer/prelu.h b/src/model/layer/prelu.h
index 1a01d98..d165fe2 100644
--- a/src/model/layer/prelu.h
+++ b/src/model/layer/prelu.h
@@ -26,7 +26,7 @@ namespace singa {
 class PReLU : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-   const std::string layer_type() const override { return "PReLU"; }
+  const std::string layer_type() const override { return "PReLU"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const LayerConf &conf) override;
@@ -35,9 +35,8 @@ class PReLU : public Layer {
   const Tensor Forward(int flag, const Tensor &input) override;
 
   /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
-  const std::pair<Tensor, vector<Tensor> > Backward(int flag,
-                                                    const Tensor &grad)
-      override;
+  const std::pair<Tensor, vector<Tensor> > Backward(
+      int flag, const Tensor &grad) override;
 
   void ToDevice(Device *device);
 
@@ -52,8 +51,8 @@ class PReLU : public Layer {
 
  protected:
   bool channel_shared_;
-  std::string format_; // format_ has two valid value, i.e. NCHW, NHWC
-  Tensor a_; // shape of a_ is 2D, i.e. (channels, 1)
+  std::string format_;  // format_ has two valid values, i.e. NCHW or NHWC
+  Tensor a_;            // shape of a_ is 2D, i.e. (channels, 1)
   std::stack<Tensor> buf_;
 };
 }  // namespace singa
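
For reference, the elementwise rule PReLU applies (as exercised in test_prelu.cc below) is y = max(x, 0) + a * min(x, 0), where a is the learned negative slope. A minimal standalone sketch, not the SINGA API:

    #include <algorithm>

    // Scalar PReLU; in the channel-wise case a is picked per channel, e.g.
    // for NCHW the tests index the slope with pos = i / (h * w) % c / div_factor.
    float PReLUScalar(float x, float a) {
      return std::max(x, 0.f) + a * std::min(x, 0.f);
    }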

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/58be3f80/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 1d1f3cf..590fdd6 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -33,59 +33,67 @@ package singa;
 /// using Python (or C++/Java).
 
 // Specifies the shape (dimensions) of a Blob.
-message BlobShape { repeated int64 dim = 1[packed = true]; }
+message BlobShape {
+  repeated int64 dim = 1 [packed = true];
+}
 
 message BlobProto {
   optional BlobShape shape = 7;
-  repeated float data = 5[packed = true];
-  repeated float diff = 6[packed = true];
-  repeated double double_data = 8[packed = true];
-  repeated double double_diff = 9[packed = true];
+  repeated float data = 5 [packed = true];
+  repeated float diff = 6 [packed = true];
+  repeated double double_data = 8 [packed = true];
+  repeated double double_diff = 9 [packed = true];
 
   // 4D dimensions -- deprecated.  Use "shape" instead.
-  optional int32 num = 1[default = 0];
-  optional int32 channels = 2[default = 0];
-  optional int32 height = 3[default = 0];
-  optional int32 width = 4[default = 0];
+  optional int32 num = 1 [default = 0];
+  optional int32 channels = 2 [default = 0];
+  optional int32 height = 3 [default = 0];
+  optional int32 width = 4 [default = 0];
 }
 
 message FillerConf {
   // The filler type, case insensitive
-  optional string type = 1[default = 'constant'];
-  optional float value = 2[default = 0]; // the value in constant filler
-  optional float min = 3[default = 0];   // the min value in uniform filler
-  optional float max = 4[default = 1];   // the max value in uniform filler
-  optional float mean = 5[default = 0];  // the mean value in Gaussian filler
-  optional float std = 6[default = 1];   // the std value in Gaussian filler
+  optional string type = 1 [default = 'constant'];
+  optional float value = 2 [default = 0]; // the value in constant filler
+  optional float min = 3 [default = 0]; // the min value in uniform filler
+  optional float max = 4 [default = 1]; // the max value in uniform filler
+  optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
+  optional float std = 6 [default = 1]; // the std value in Gaussian filler
   // The expected number of non-zero output weights for a given input in
   // Gaussian filler -- the default -1 means don't perform sparsification.
   /* optional int32 sparse = 7 [default = -1]; */
   // Normalize the filler variance by fan_in, fan_out, or their average.
   // Applies to 'xavier' and 'msra' fillers.
   enum VarianceNorm {
-    FAN_IN = 0; FAN_OUT = 1; AVERAGE = 2;
-  } optional VarianceNorm variance_norm = 8[default = FAN_IN];
+    FAN_IN = 0;
+    FAN_OUT = 1;
+    AVERAGE = 2;
+  }
+  optional VarianceNorm variance_norm = 8 [default = FAN_IN];
 }
 
 /// SINGA message
 message OptimizerConf {
   // case insensitive
-  optional string type = 1[default = "sgd"];
+  optional string type = 1 [default = "sgd"];
 
   // used by RMSprop and Adadelta
-  optional float rho = 2[default = 0.001];
+  optional float rho = 2 [default = 0.001];
 
   // used by Adam and AdamMax
-  optional float beta_1 = 3[default = 0.9];
-  optional float beta_2 = 4[default = 0.999];
+  optional float beta_1 = 3 [default = 0.9];
+  optional float beta_2 = 4 [default = 0.999];
 
   // used by vanilla sgd and nesterov
-  optional float momentum = 5[default = 0.9];
+  optional float momentum = 5 [default = 0.9];
+
+  // delta is used to avoid dividing zero
+  optional float delta = 6 [default = 1e-8];
 }
 
 message ConstraintConf {
   // case insensitive to limit the parameter value/gradient scale
-  optional string type = 1[default = "l2"];
+  optional string type = 1 [default = "l2"];
   // e.g., the threshold for limiting the parameter scale.
   optional float threshold = 2;
 }
@@ -93,7 +101,7 @@ message ConstraintConf {
 /// SINGA message
 message RegularizerConf {
   // case insensitive to regularize the parameters, e.g., L2.
-  optional string type = 1[default = "l2"];
+  optional string type = 1 [default = "l2"];
   // e.g., the weight decay for L2 regularizer
   optional float coefficient = 2;
 }
@@ -119,10 +127,10 @@ message ParamSpec {
   */
 
   // The multiplier on the global learning rate for this parameter.
-  optional float lr_mult = 3[default = 1.0];
+  optional float lr_mult = 3 [default = 1.0];
 
   // The multiplier on the global weight decay for this parameter.
-  optional float decay_mult = 4[default = 1.0];
+  optional float decay_mult = 4 [default = 1.0];
 
   // SINGA uses this filed internally. Users just configure the fillers in
   // Layer specific conf message as caffe (style).
@@ -132,13 +140,14 @@ message ParamSpec {
 }
 
 enum Phase {
-  kTrain = 4; kEval = 8;
-}
-    // NOTE
-    // Update the next available ID when you add a new LayerConf field.
-    //
-    // LayerConf next available layer-specific ID: 139 (last added: tile_param)
-    message LayerConf {
+  kTrain = 4;
+  kEval = 8;
+}
+// NOTE
+// Update the next available ID when you add a new LayerConf field.
+//
+// LayerConf next available layer-specific ID: 139 (last added: tile_param)
+message LayerConf {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
   /* repeated string bottom = 3; // the name of each bottom blob */
@@ -242,8 +251,7 @@ message TransformationConf {
   optional uint32 crop_size = 3 [default = 0];
   // mean_file and mean_value cannot be specified at the same time
   optional string mean_file = 4;
-  // if specified can be repeated once (would substract it from all the
-channels)
+  // if specified can be repeated once (would subtract it from all the channels)
   // or can be repeated the same number of times as channels
   // (would subtract them from the corresponding channel)
   repeated float mean_value = 5;
@@ -260,33 +268,34 @@ message LossConf {
   optional int32 ignore_label = 1;
   // If true, normalize each batch across all instances (including spatial
   // dimesions, but not ignored instances); else, divide by batch size only.
-  optional bool normalize = 2[default = true];
+  optional bool normalize = 2 [default = true];
 }
 
 message MetricConf {
   // When computing accuracy, count as correct by comparing the true label to
   // the top k scoring classes.  By default, only compare to the top scoring
   // class (i.e. argmax).
-  optional uint32 top_k = 1[default = 1];
+  optional uint32 top_k = 1 [default = 1];
 
   // The "label" axis of the prediction blob, whose argmax corresponds to the
   // predicted label -- may be negative to index from the end (e.g., -1 for the
   // last axis).  For example, if axis == 1 and the predictions are
   // (N x C x H x W), the label blob is expected to contain N*H*W ground truth
   // labels with integer values in {0, 1, ..., C-1}.
-  optional int32 axis = 2[default = 1];
+  optional int32 axis = 2 [default = 1];
 
   // If specified, ignore instances with the given label.
   optional int32 ignore_label = 3;
 }
-// Messages that store hyper-parameters used by individual layer types follow,
-// in
+// Messages that store hyper-parameters used by individual layer types follow, in
 // alphabetical order.
 
+
+
 message ArgMaxConf {
   // If true produce pairs (argmax, maxval)
-  optional bool out_max_val = 1[default = false];
-  optional uint32 top_k = 2[default = 1];
+  optional bool out_max_val = 1 [default = false];
+  optional uint32 top_k = 2 [default = 1];
   // The axis along which to maximise -- may be negative to index from the
   // end (e.g., -1 for the last axis).
   // By default ArgMaxLayer maximizes over the flattened trailing dimensions
@@ -299,51 +308,54 @@ message ConcatConf {
   // end (e.g., -1 for the last axis).  Other axes must have the
   // same dimension for all the bottom blobs.
   // By default, ConcatLayer concatenates blobs along the "channels" axis (1).
-  optional int32 axis = 2[default = 1];
+  optional int32 axis = 2 [default = 1];
 
   // DEPRECATED: alias for "axis" -- does not support negative indexing.
-  optional uint32 concat_dim = 1[default = 1];
+  optional uint32 concat_dim = 1 [default = 1];
 }
 
 message ContrastiveLossConf {
   // margin for dissimilar pair
-  optional float margin = 1[default = 1.0];
+  optional float margin = 1 [default = 1.0];
   // The first implementation of this cost did not exactly match the cost of
   // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
   // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
   // Hadsell paper. New models should probably use this version.
   // legacy_version = true uses (margin - d^2). This is kept to support /
   // reproduce existing models and results
-  optional bool legacy_version = 2[default = false];
+  optional bool legacy_version = 2 [default = false];
 }
 
 message ConvolutionConf {
   optional uint32 num_output = 1; // The number of outputs for the layer
-  optional bool bias_term = 2[default = true]; // whether to have bias terms
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
 
   // Pad, kernel size, and stride are all given as a single value for equal
   // dimensions in all spatial dimensions, or once per spatial dimension.
-  repeated uint32 pad = 3;         // The padding size; defaults to 0
+  repeated uint32 pad = 3; // The padding size; defaults to 0
   repeated uint32 kernel_size = 4; // The kernel size
-  repeated uint32 stride = 6;      // The stride; defaults to 1
+  repeated uint32 stride = 6; // The stride; defaults to 1
 
   // For 2D convolution only, the *_h and *_w versions may also be used to
   // specify both spatial dimensions.
-  optional uint32 pad_h = 9[default = 0];  // The padding height (2D only)
-  optional uint32 pad_w = 10[default = 0]; // The padding width (2D only)
-  optional uint32 kernel_h = 11;           // The kernel height (2D only)
-  optional uint32 kernel_w = 12;           // The kernel width (2D only)
-  optional uint32 stride_h = 13;           // The stride height (2D only)
-  optional uint32 stride_w = 14;           // The stride width (2D only)
+  optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
+  optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
+  optional uint32 kernel_h = 11; // The kernel height (2D only)
+  optional uint32 kernel_w = 12; // The kernel width (2D only)
+  optional uint32 stride_h = 13; // The stride height (2D only)
+  optional uint32 stride_w = 14; // The stride width (2D only)
 
   // SINGA: not supported.
   // optional uint32 group = 5 [default = 1]; // The group size for group conv
 
   optional FillerConf weight_filler = 7; // The filler for the weight
-  optional FillerConf bias_filler = 8;   // The filler for the bias
+  optional FillerConf bias_filler = 8; // The filler for the bias
   enum Engine {
-    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
-  } optional Engine engine = 15[default = DEFAULT];
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 15 [default = DEFAULT];
 
   // The axis to interpret as "channels" when performing convolution.
   // Preceding dimensions are treated as independent inputs;
@@ -365,12 +377,13 @@ message ConvolutionConf {
   // SINGA: not supported;
   // optional bool force_nd_im2col = 17 [default = false];
 
+
   // SINGA: add by xiangrui
   // cudnn workspace size in MB
-  optional int32 workspace_byte_limit = 50[default = 512];
+  optional int32 workspace_byte_limit = 50 [default = 512];
   // cudnn algorithm preference
   // options: "fastest", "limited_workspace", "no_workspace"
-  optional string prefer = 51[default = "fastest"];
+  optional string prefer = 51 [default = "fastest"];
   // input shape
   optional int32 channels = 52;
   optional int32 height = 53;
@@ -414,7 +427,7 @@ message DataConf {
 */
 
 message DropoutConf {
-  optional float dropout_ratio = 1[default = 0.5]; // dropout ratio
+  optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
 }
 
 // DummyDataLayer fills any number of arbitrarily shaped blobs with random
@@ -438,13 +451,16 @@ message DummyDataConf {
 
 message EltwiseConf {
   enum EltwiseOp {
-    PROD = 0; SUM = 1; MAX = 2;
-  } optional EltwiseOp operation = 1[default = SUM]; // element-wise operation
+    PROD = 0;
+    SUM = 1;
+    MAX = 2;
+  }
+  optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
   repeated float coeff = 2; // blob-wise coefficient for SUM operation
 
   // Whether to use an asymptotically slower (for >2 inputs) but stabler method
   // of computing the gradient for the PROD operation. (No effect for SUM op.)
-  optional bool stable_prod_grad = 3[default = true];
+  optional bool stable_prod_grad = 3 [default = true];
 }
 
 // Message that stores hyper-parameters used by EmbedLayer
@@ -455,9 +471,9 @@ message EmbedConf {
   // 1 greater than the maximum possible input value.
   optional uint32 input_dim = 2;
 
-  optional bool bias_term = 3[default = true]; // Whether to use a bias term
-  optional FillerConf weight_filler = 4;       // The filler for the weight
-  optional FillerConf bias_filler = 5;         // The filler for the bias
+  optional bool bias_term = 3 [default = true]; // Whether to use a bias term
+  optional FillerConf weight_filler = 4; // The filler for the weight
+  optional FillerConf bias_filler = 5; // The filler for the bias
 
 }
 
@@ -466,21 +482,21 @@ message ExpConf {
   // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
   // Or if base is set to the default (-1), base is set to e,
   // so y = exp(shift + scale * x).
-  optional float base = 1[default = -1.0];
-  optional float scale = 2[default = 1.0];
-  optional float shift = 3[default = 0.0];
+  optional float base = 1 [default = -1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
 }
 
 /// Message that stores hyper-parameters used by FlattenLayer
 message FlattenConf {
   // The first axis to flatten: all preceding axes are retained in the output.
   // May be negative to index from the end (e.g., -1 for the last axis).
-  optional int32 axis = 1[default = 1];
+  optional int32 axis = 1 [default = 1];
 
   // The last axis to flatten: all following axes are retained in the output.
   // May be negative to index from the end (e.g., the default -1 for the last
   // axis).
-  optional int32 end_axis = 2[default = -1];
+  optional int32 end_axis = 2 [default = -1];
 }
 
 /*
@@ -506,10 +522,11 @@ message HDF5OutputConf {
 
 message HingeLossConf {
   enum Norm {
-    L1 = 1; L2 = 2;
+    L1 = 1;
+    L2 = 2;
   }
-      // Specify the Norm to use L1 or L2
-      optional Norm norm = 1[default = L1];
+  // Specify the Norm to use L1 or L2
+  optional Norm norm = 1 [default = L1];
 }
 
 /*
@@ -552,29 +569,29 @@ message InfogainLossConf {
 
 message InnerProductConf {
   optional uint32 num_output = 1; // The number of outputs for the layer
-  optional bool bias_term = 2[default = true]; // whether to have bias terms
-  optional FillerConf weight_filler = 3;       // The filler for the weight
-  optional FillerConf bias_filler = 4;         // The filler for the bias
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional FillerConf weight_filler = 3; // The filler for the weight
+  optional FillerConf bias_filler = 4; // The filler for the bias
 
   // The first axis to be lumped into a single inner product computation;
   // all preceding axes are retained in the output.
   // May be negative to index from the end (e.g., -1 for the last axis).
-  optional int32 axis = 5[default = 1];
+  optional int32 axis = 5 [default = 1];
 }
 
 message DenseConf {
   optional uint32 num_output = 1; // The number of outputs for the layer
-  optional bool bias_term = 2[default = true]; // whether to have bias terms
-  optional FillerConf weight_filler = 3;       // The filler for the weight
-  optional FillerConf bias_filler = 4;         // The filler for the bias
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional FillerConf weight_filler = 3; // The filler for the weight
+  optional FillerConf bias_filler = 4; // The filler for the bias
 
   // The first axis to be lumped into a single inner product computation;
   // all preceding axes are retained in the output.
   // May be negative to index from the end (e.g., -1 for the last axis).
-  optional int32 axis = 5[default = 1];
+  optional int32 axis = 5 [default = 1];
 
   optional uint32 num_input = 20; // The number of inputs for the layer
-  optional bool transpose = 21[default = false]; // whether transpose or not
+  optional bool transpose = 21 [default = false]; // whether transpose or not
 }
 
 // Message that stores hyper-parameters used by LogLayer
@@ -582,20 +599,22 @@ message LogConf {
   // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
   // Or if base is set to the default (-1), base is set to e,
   // so y = ln(shift + scale * x) = log_e(shift + scale * x)
-  optional float base = 1[default = -1.0];
-  optional float scale = 2[default = 1.0];
-  optional float shift = 3[default = 0.0];
+  optional float base = 1 [default = -1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
 }
 
 // Message that stores hyper-parameters used by LRNLayer
 message LRNConf {
-  optional uint32 local_size = 1[default = 5];
-  optional float alpha = 2[default = 1.];
-  optional float beta = 3[default = 0.75];
+  optional uint32 local_size = 1 [default = 5];
+  optional float alpha = 2 [default = 1.];
+  optional float beta = 3 [default = 0.75];
   enum NormRegion {
-    ACROSS_CHANNELS = 0; WITHIN_CHANNEL = 1;
-  } optional NormRegion norm_region = 4[default = ACROSS_CHANNELS];
-  optional float k = 5[default = 1.];
+    ACROSS_CHANNELS = 0;
+    WITHIN_CHANNEL = 1;
+  }
+  optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
+  optional float k = 5 [default = 1.];
 }
 
 message MemoryDataConf {
@@ -607,30 +626,33 @@ message MemoryDataConf {
 
 message MVNConf {
   // This parameter can be set to false to normalize mean only
-  optional bool normalize_variance = 1[default = true];
+  optional bool normalize_variance = 1 [default = true];
 
   // This parameter can be set to true to perform DNN-like MVN
-  optional bool across_channels = 2[default = false];
+  optional bool across_channels = 2 [default = false];
 
   // Epsilon for not dividing by zero while normalizing variance
-  optional float eps = 3[default = 1e-9];
+  optional float eps = 3 [default = 1e-9];
 }
 
 message PoolingConf {
   enum PoolMethod {
-    MAX = 0; AVE = 1; STOCHASTIC = 2;
-  } optional PoolMethod pool = 1[default = MAX]; // The pooling method
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
   // Pad, kernel size, and stride are all given as a single value for equal
   // dimensions in height and width or as Y, X pairs.
-  optional uint32 pad = 4[default = 0];    // The padding size (equal in Y, X)
-  optional uint32 pad_h = 9[default = 0];  // The padding height
-  optional uint32 pad_w = 10[default = 0]; // The padding width
-  optional uint32 kernel_size = 2;         // The kernel size (square)
-  optional uint32 kernel_h = 5;            // The kernel height
-  optional uint32 kernel_w = 6;            // The kernel width
-  optional uint32 stride = 3[default = 1]; // The stride (equal in Y, X)
-  optional uint32 stride_h = 7;            // The stride height
-  optional uint32 stride_w = 8;            // The stride width
+  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
+  optional uint32 pad_h = 9 [default = 0]; // The padding height
+  optional uint32 pad_w = 10 [default = 0]; // The padding width
+  optional uint32 kernel_size = 2; // The kernel size (square)
+  optional uint32 kernel_h = 5; // The kernel height
+  optional uint32 kernel_w = 6; // The kernel width
+  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
+  optional uint32 stride_h = 7; // The stride height
+  optional uint32 stride_w = 8; // The stride width
   /*
   enum Engine {
     DEFAULT = 0;
@@ -641,20 +663,20 @@ message PoolingConf {
   */
   // If global_pooling then it will pool over the size of the bottom by doing
   // kernel_h = bottom->height and kernel_w = bottom->width
-  optional bool global_pooling = 12[default = false];
+  optional bool global_pooling = 12 [default = false];
   // Shape of source
   optional int32 channels = 50;
   optional int32 height = 51;
   optional int32 width = 52;
   // whether to propagate nan
-  optional bool nan_prop = 53[default = false];
+  optional bool nan_prop = 53 [default = false];
 }
 
 message PowerConf {
   // PowerLayer computes outputs y = (shift + scale * x) ^ power.
-  optional float power = 1[default = 1.0];
-  optional float scale = 2[default = 1.0];
-  optional float shift = 3[default = 0.0];
+  optional float power = 1 [default = 1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
 }
 /*
 message PythonConf {
@@ -665,8 +687,7 @@ message PythonConf {
   // string, dictionary in Python dict format, JSON, etc. You may parse this
   // string in `setup` method and use it in `forward` and `backward`.
   optional string param_str = 3 [default = ''];
-  // Whether this PythonLayer is shared among worker solvers during data
-parallelism.
+  // Whether this PythonLayer is shared among worker solvers during data parallelism.
   // If true, each worker solver sequentially run forward from this layer.
   // This value should be set true if you are using it as a data layer.
   optional bool share_in_parallel = 4 [default = false];
@@ -676,8 +697,13 @@ parallelism.
 // Message that stores hyper-parameters used by ReductionLayer
 message ReductionConf {
   enum ReductionOp {
-    SUM = 1; ASUM = 2; SUMSQ = 3; MEAN = 4;
-  } optional ReductionOp operation = 1[default = SUM]; // reduction operation
+    SUM = 1;
+    ASUM = 2;
+    SUMSQ = 3;
+    MEAN = 4;
+  }
+
+  optional ReductionOp operation = 1 [default = SUM]; // reduction operation
 
   // The first axis to reduce to a scalar -- may be negative to index from the
   // end (e.g., -1 for the last axis).
@@ -692,9 +718,9 @@ message ReductionConf {
   // If axis == 0 (the default), the output Blob always has the empty shape
   // (count 1), performing reduction across the entire input --
   // often useful for creating new loss functions.
-  optional int32 axis = 2[default = 0];
+  optional int32 axis = 2 [default = 0];
 
-  optional float coeff = 3[default = 1.0]; // coefficient for output
+  optional float coeff = 3 [default = 1.0]; // coefficient for output
 }
 
 // Message that stores hyper-parameters used by ReLULayer
@@ -704,7 +730,7 @@ message ReLUConf {
   // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
   // improve neural network acoustic models. In ICML Workshop on Deep Learning
   // for Audio, Speech, and Language Processing.
-  optional float negative_slope = 1[default = 0];
+  optional float negative_slope = 1 [default = 0];
   /*
   enum Engine {
     DEFAULT = 0;
@@ -775,50 +801,58 @@ message ReshapeConf {
   //   reshape_param { shape { dim: 2  dim: 1  dim: 8  }  }
   //   reshape_param { shape { dim: 1 }  axis: 1  num_axes: 0 }
   //
-  optional int32 axis = 2[default = 0];
-  optional int32 num_axes = 3[default = -1];
+  optional int32 axis = 2 [default = 0];
+  optional int32 num_axes = 3 [default = -1];
 }
 
 message SigmoidConf {
   enum Engine {
-    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
-  } optional Engine engine = 1[default = DEFAULT];
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
 }
 
 message SliceConf {
   // The axis along which to slice -- may be negative to index from the end
   // (e.g., -1 for the last axis).
   // By default, SliceLayer concatenates blobs along the "channels" axis (1).
-  optional int32 axis = 3[default = 1];
+  optional int32 axis = 3 [default = 1];
   repeated uint32 slice_point = 2;
 
   // DEPRECATED: alias for "axis" -- does not support negative indexing.
-  optional uint32 slice_dim = 1[default = 1];
+  optional uint32 slice_dim = 1 [default = 1];
 }
 
-// Message that stores hyper-parameters used by SoftmaxLayer,
-// SoftmaxWithLossLayer
+// Message that stores hyper-parameters used by SoftmaxLayer, SoftmaxWithLossLayer
 message SoftmaxConf {
   enum Engine {
-    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
-  } optional Engine engine = 1[default = DEFAULT];
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
 
   // The axis along which to perform the softmax -- may be negative to index
   // from the end (e.g., -1 for the last axis).
   // Any other axes will be evaluated as independent softmaxes.
-  optional int32 axis = 2[default = 1];
+  optional int32 axis = 2 [default = 1];
 }
 
 message TanHConf {
   enum Engine {
-    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
-  } optional Engine engine = 1[default = DEFAULT];
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
 }
 
 // Message that stores hyper-parameters used by TileLayer
 message TileConf {
   // The index of the axis to tile.
-  optional int32 axis = 1[default = 1];
+  optional int32 axis = 1 [default = 1];
 
   // The number of copies (tiles) of the blob to output.
   optional int32 tiles = 2;
@@ -826,7 +860,7 @@ message TileConf {
 
 // Message that stores hyper-parameters used by ThresholdLayer
 message ThresholdConf {
-  optional float threshold = 1[default = 0]; // Strictly positive values
+  optional float threshold = 1 [default = 0]; // Strictly positive values
 }
 
 /*
@@ -866,12 +900,18 @@ message WindowDataConf {
 
 message SPPConf {
   enum PoolMethod {
-    MAX = 0; AVE = 1; STOCHASTIC = 2;
-  } optional uint32 pyramid_height = 1;
-  optional PoolMethod pool = 2[default = MAX]; // The pooling method
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional uint32 pyramid_height = 1;
+  optional PoolMethod pool = 2 [default = MAX]; // The pooling method
   enum Engine {
-    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
-  } optional Engine engine = 6[default = DEFAULT];
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 6 [default = DEFAULT];
 }
 
 message PReLUConf {
@@ -881,15 +921,15 @@ message PReLUConf {
   // Initial value of a_i. Default is a_i=0.25 for all i.
   optional FillerConf filler = 1;
   // Whether or not slope paramters are shared across channels.
-  optional bool channel_shared = 2[default = false];
-  // format of the input. Default is NCHW.
-  optional string format = 50[default = "NCHW"];
+  optional bool channel_shared = 2 [default = false];
+
+  optional string format = 20 [default = "NCHW"];
 }
 
 message BatchNormConf {
   // Used in the moving average computation runningMean =
   // newMean*factor + runningMean*(1-factor).
-  optional double factor = 1[default = 0.9];
+  optional double factor = 1 [default = 0.9];
   // input shape
   optional int32 channels = 2;
   optional int32 height = 3;
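
As a quick check of the moving-average comment above: with the default factor = 0.9, runningMean = 1.0 and newMean = 2.0 give runningMean = 2.0 * 0.9 + 1.0 * (1 - 0.9) = 1.9.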

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/58be3f80/test/singa/test_flatten.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_flatten.cc b/test/singa/test_flatten.cc
index 906e4b8..0ba8d3c 100644
--- a/test/singa/test_flatten.cc
+++ b/test/singa/test_flatten.cc
@@ -36,10 +36,10 @@ TEST(Flatten, Setup) {
 }
 
 TEST(Flatten, ForwardCPU) {
-  const float x[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -4.f, 1.5f, -1.5f, 0.f, -0.5f,
-                      -2.f, -1.f };
+  const float x[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                     1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
   size_t n = sizeof(x) / sizeof(float);
-  singa::Shape s = { 2, 1, 3, 2 };
+  singa::Shape s = {2, 1, 3, 2};
   singa::Tensor in(s);
   in.CopyDataFromHostPtr<float>(x, n);
 
@@ -52,22 +52,19 @@ TEST(Flatten, ForwardCPU) {
 
   singa::Tensor out = flt.Forward(singa::kTrain, in);
   EXPECT_EQ(n, out.Size());
-  EXPECT_EQ(6, out.shape(0));
-  EXPECT_EQ(2, out.shape(1));
+  EXPECT_EQ(6u, out.shape(0));
+  EXPECT_EQ(2u, out.shape(1));
   const float *yptr = out.data<const float *>();
-  for (size_t i = 0; i < n; i++)
-    EXPECT_FLOAT_EQ(x[i], yptr[i]);
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);
 }
 
 TEST(Flatten, BackwardCPU) {
   // directly use input as the output_grad for backward
   // note that only the shape of input really matters
-  const float dy[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -4.f, 1.5f, -1.5f, 0.f, -0.5f,
-                       -2.f, -1.f };
+  const float dy[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                      1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
   size_t n = sizeof(dy) / sizeof(float);
-  singa::Tensor in(singa::Shape {
-    2, 1, 3, 2
-  });
+  singa::Tensor in(singa::Shape{2, 1, 3, 2});
   in.CopyDataFromHostPtr<float>(dy, n);
 
   int axis = 2;
@@ -81,24 +78,20 @@ TEST(Flatten, BackwardCPU) {
   const auto out = flt.Backward(singa::kTrain, temp);
   const float *xptr = out.first.data<const float *>();
   EXPECT_EQ(n, out.first.Size());
-  EXPECT_EQ(2, out.first.shape(0));
-  EXPECT_EQ(1, out.first.shape(1));
-  EXPECT_EQ(3, out.first.shape(2));
-  EXPECT_EQ(2, out.first.shape(3));
-  for (size_t i = 0; i < n; i++)
-    EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+  EXPECT_EQ(2u, out.first.shape(0));
+  EXPECT_EQ(1u, out.first.shape(1));
+  EXPECT_EQ(3u, out.first.shape(2));
+  EXPECT_EQ(2u, out.first.shape(3));
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
 }
 
 #ifdef USE_CUDA
 TEST(Flatten, ForwardGPU) {
-  const float x[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -4.f, 1.5f, -1.5f, 0.f, -0.5f,
-                      -2.f, -1.f };
+  const float x[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                     1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
   size_t n = sizeof(x) / sizeof(float);
   singa::CudaGPU cuda(0, 1);
-  singa::Tensor in(singa::Shape {
-    2, 1, 3, 2
-  },
-                   &cuda);
+  singa::Tensor in(singa::Shape{2, 1, 3, 2}, &cuda);
   in.CopyDataFromHostPtr<float>(x, n);
 
   int axis = 3;
@@ -112,24 +105,20 @@ TEST(Flatten, ForwardGPU) {
   singa::CppCPU host(0, 1);
   out.ToDevice(&host);
   EXPECT_EQ(n, out.Size());
-  EXPECT_EQ(6, out.shape(0));
-  EXPECT_EQ(2, out.shape(1));
+  EXPECT_EQ(6u, out.shape(0));
+  EXPECT_EQ(2u, out.shape(1));
   const float *yptr = out.data<const float *>();
-  for (size_t i = 0; i < n; i++)
-    EXPECT_FLOAT_EQ(x[i], yptr[i]);
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);
 }
 
 TEST(Flatten, BackwardGPU) {
   // directly use input as the output_grad for backward
   // note that only the shape of input really matters
-  const float dy[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -4.f, 1.5f, -1.5f, 0.f, -0.5f,
-                       -2.f, -1.f };
+  const float dy[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                      1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
   size_t n = sizeof(dy) / sizeof(float);
   singa::CudaGPU cuda(0, 1);
-  singa::Tensor in(singa::Shape {
-    2, 1, 3, 2
-  },
-                   &cuda);
+  singa::Tensor in(singa::Shape{2, 1, 3, 2}, &cuda);
   in.CopyDataFromHostPtr<float>(dy, n);
 
   int axis = 2;
@@ -146,11 +135,10 @@ TEST(Flatten, BackwardGPU) {
   in_diff.ToDevice(&host);
   const float *xptr = in_diff.data<const float *>();
   EXPECT_EQ(n, in_diff.Size());
-  EXPECT_EQ(2, in_diff.shape(0));
-  EXPECT_EQ(1, in_diff.shape(1));
-  EXPECT_EQ(3, in_diff.shape(2));
-  EXPECT_EQ(2, in_diff.shape(3));
-  for (size_t i = 0; i < n; i++)
-    EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+  EXPECT_EQ(2u, in_diff.shape(0));
+  EXPECT_EQ(1u, in_diff.shape(1));
+  EXPECT_EQ(3u, in_diff.shape(2));
+  EXPECT_EQ(2u, in_diff.shape(3));
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
 }
 #endif // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/58be3f80/test/singa/test_prelu.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_prelu.cc b/test/singa/test_prelu.cc
index 2dde9e9..6fc372b 100644
--- a/test/singa/test_prelu.cc
+++ b/test/singa/test_prelu.cc
@@ -39,13 +39,11 @@ TEST(PReLU, Setup) {
 }
 
 TEST(PReLU, ForwardCPU) {
-  const float x[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -1.f, -1.f, 2.f, -1.f, -2.f,
-                      -2.f, -1.f };
+  const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                     -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
   size_t n = sizeof(x) / sizeof(float);
   size_t batchsize = 2, c = 3, h = 2, w = 1;
-  singa::Tensor in(singa::Shape {
-    batchsize, h, w, c
-  });
+  singa::Tensor in(singa::Shape{batchsize, h, w, c});
   in.CopyDataFromHostPtr<float>(x, n);
 
   PReLU prelu;
@@ -55,10 +53,8 @@ TEST(PReLU, ForwardCPU) {
   preluconf->set_format("NHWC");
   prelu.Setup(conf);
 
-  const float neg_slope[] = { 0.25f, 0.5f, 0.75f };
-  singa::Tensor a(singa::Shape {
-    c
-  });
+  const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+  singa::Tensor a(singa::Shape{c});
   a.CopyDataFromHostPtr<float>(neg_slope, c);
   prelu.Set_a(a);
 
@@ -79,17 +75,15 @@ TEST(PReLU, ForwardCPU) {
       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
     }
   }
-  for (size_t i = 0; i < n; i++)
-    EXPECT_FLOAT_EQ(y[i], yptr[i]);
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
 }
 
 TEST(PReLU, BackwardCPU) {
-  const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f, -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+  const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                     -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
   size_t n = sizeof(x) / sizeof(float);
   size_t batchsize = 2, c = 3, h = 2, w = 1;
-  singa::Tensor in(singa::Shape {
-    batchsize, c, h, w
-  });
+  singa::Tensor in(singa::Shape{batchsize, c, h, w});
   in.CopyDataFromHostPtr<float>(x, n);
 
   PReLU prelu;
@@ -99,20 +93,16 @@ TEST(PReLU, BackwardCPU) {
   preluconf->set_format("NCHW");
   prelu.Setup(conf);
 
-  const float neg_slope[] = { 0.25f, 0.5f, 0.75f };
-  singa::Tensor a(singa::Shape {
-    c
-  });
+  const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+  singa::Tensor a(singa::Shape{c});
   a.CopyDataFromHostPtr<float>(neg_slope, c);
   prelu.Set_a(a);
 
   singa::Tensor out = prelu.Forward(singa::kTrain, in);
 
-  const float grad[] = { 1.f, 2.f, -2.f, -1.f, -1.f, -3.f, 2.f, -2.f, 1.f, 1.f,
-                         -2.f, 0.f };
-  singa::Tensor out_diff(singa::Shape {
-    batchsize, c, h, w
-  });
+  const float grad[] = {1.f, 2.f,  -2.f, -1.f, -1.f, -3.f,
+                        2.f, -2.f, 1.f,  1.f,  -2.f, 0.f};
+  singa::Tensor out_diff(singa::Shape{batchsize, c, h, w});
   out_diff.CopyDataFromHostPtr<float>(grad, n);
   const auto ret = prelu.Backward(singa::kTrain, out_diff);
   const float *xptr = ret.first.data<const float *>();
@@ -120,7 +110,7 @@ TEST(PReLU, BackwardCPU) {
   float *dx = new float[n];
   size_t div_factor = prelu.Channel_shared() ? c : 1;
   size_t params = prelu.Channel_shared() ? 1 : c;
-  float da[] = { 0.f, 0.f, 0.f };
+  float da[] = {0.f, 0.f, 0.f};
   if (prelu.Format() == "NCHW") {
     for (size_t i = 0; i < n; i++) {
       size_t pos = i / (h * w) % c / div_factor;
@@ -142,8 +132,6 @@ TEST(PReLU, BackwardCPU) {
       da[pos] += grad[i] * std::min(x[i], 0.f);
     }
   }
-  for (size_t i = 0; i < n; i++)
-    EXPECT_FLOAT_EQ(dx[i], xptr[i]);
-  for (size_t i = 0; i < params; i++)
-    EXPECT_FLOAT_EQ(da[i], aptr[i]);
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+  for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
 }



[20/50] [abbrv] incubator-singa git commit: SINGA-178 Add Convolution layer and Pooling layer

Posted by zh...@apache.org.
SINGA-178 Add Convolution layer and Pooling layer

Add CudnnConvolution and CudnnPooling layers.
Add tests for these two layers.
Both tests and cpplint.py checks pass.

(Added on May 30th)
The parameters and specs are processed as follows; a minimal sketch is given after the list.
1. Each layer is responsible for pushing its own parameter tensors into the param_values_ vector, and pushing its own param specs into the param_specs_ vector.
2. Each layer is responsible for deleting the tensors it created, including params and buffer tensors.
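
A minimal sketch of convention 1, mirroring the Convolution::Setup code in the diff below (weight_ and bias_ are the layer's own member tensors):

    // Inside a layer's Setup(const LayerConf& conf):
    for (const auto& spec : conf.param()) param_specs_.push_back(spec);
    // Assume the order of params is: weight, bias.
    param_values_.push_back(&weight_);
    param_values_.push_back(&bias_);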


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/152056d4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/152056d4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/152056d4

Branch: refs/heads/master
Commit: 152056d4896cc96dfa63ee5ca53a1df861e408de
Parents: c6ae786
Author: XiangruiCAI <ca...@gmail.com>
Authored: Thu May 26 17:40:25 2016 +0800
Committer: XiangruiCAI <ca...@gmail.com>
Committed: Mon May 30 15:42:01 2016 +0800

----------------------------------------------------------------------
 include/singa/model/layer.h          |  22 ++-
 src/model/layer/convolution.cc       |  94 ++++++++++++
 src/model/layer/convolution.h        |  78 ++++++++++
 src/model/layer/cudnn_convolution.cc | 232 ++++++++++++++++++++++++++++++
 src/model/layer/cudnn_convolution.h  |  70 +++++++++
 src/model/layer/cudnn_pooling.cc     | 129 +++++++++++++++++
 src/model/layer/cudnn_pooling.h      |  57 ++++++++
 src/model/layer/pooling.cc           |  79 ++++++++++
 src/model/layer/pooling.h            |  63 ++++++++
 src/proto/model.proto                |  16 +++
 test/singa/test_cudnn_convolution.cc | 205 ++++++++++++++++++++++++++
 test/singa/test_cudnn_pooling.cc     | 141 ++++++++++++++++++
 12 files changed, 1173 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 5803295..c6a3bd1 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -44,11 +44,7 @@ class Layer {
 
   // ============= Following Functions could be override =====================
   /// Destruct objects created by this layer.
-  virtual ~Layer() {
-    for (Tensor* t : param_values_) {
-      delete t;
-    }
-  }
+  virtual ~Layer() {}
 
   /// Each layer sub-class would optionaly have a type name.
   /// Used for debugging and logging.
@@ -64,7 +60,7 @@ class Layer {
   /// batchsize is 1.
   virtual void Setup(const LayerConf& conf) {
     name_ = conf.name();
-    for (const auto& spec : conf.param()) param_specs_.push_back(spec);
+    // for (const auto& spec : conf.param()) param_specs_.push_back(spec);
     // TODO(wangwei) load param values from checkpoint blobs.
   }
 
@@ -130,21 +126,21 @@ class Layer {
   /// Move the layer (including its parameters and other internal Tensor) onto
   /// the given device
   virtual void ToDevice(Device* device) {
-    for (auto p : param_values_) p->ToDevice(device);
+    //for (auto p : param_values_) p->ToDevice(device);
   }
 
   /// Set the data type of Tensor in this layer.
   virtual void AsType(DataType dtype) {
-    for (auto p : param_values_) p->AsType(dtype);
+    //for (auto p : param_values_) p->AsType(dtype);
   }
 
   /// Serialize the layer info (including params) into a LayerConf proto message
   virtual void ToProto(LayerConf* conf) const {
-    conf->set_name(name_);
-    for (const auto& spec : param_specs_) {
-      ParamSpec* p = conf->add_param();
-      p->CopyFrom(spec);
-    }
+    //conf->set_name(name_);
+    //for (const auto& spec : param_specs_) {
+    //  ParamSpec* p = conf->add_param();
+    //  p->CopyFrom(spec);
+    //}
     // TODO(wangwei) add param values into conf;
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
new file mode 100644
index 0000000..6406a31
--- /dev/null
+++ b/src/model/layer/convolution.cc
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./convolution.h"
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+using std::vector;
+
+void Convolution::Setup(const LayerConf &conf) {
+  Layer::Setup(conf);
+  ConvolutionConf conv_conf = conf.convolution_conf();
+  // kernel_size, pad, and stride are repeated fields.
+  if (conv_conf.kernel_size_size() > 0) {
+    kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
+  } else {
+    kernel_w_ = conv_conf.kernel_w();
+    kernel_h_ = conv_conf.kernel_h();
+  }
+  CHECK_NE(kernel_w_, 0);
+  CHECK_NE(kernel_h_, 0);
+
+  if (conv_conf.pad_size() > 0) {
+    pad_w_ = pad_h_ = conv_conf.pad(0);
+  } else {
+    pad_w_ = conv_conf.pad_w();
+    pad_h_ = conv_conf.pad_h();
+  }
+
+  if (conv_conf.stride_size() > 0) {
+    stride_w_ = stride_h_ = conv_conf.stride(0);
+  } else {
+    stride_w_ = conv_conf.stride_w();
+    stride_h_ = conv_conf.stride_h();
+  }
+
+  num_filters_ = conv_conf.num_output();
+  bias_term_ = conv_conf.bias_term();
+
+  // Shape of src
+  channels_ = conv_conf.channels();
+  height_ = conv_conf.height();
+  width_ = conv_conf.width();
+
+  conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+  conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
+  col_height_ = channels_ * kernel_w_ * kernel_h_;
+  col_width_ = conv_height_ * conv_width_;
+
+  // Setup shape of weight_ and bias_
+  weight_.Reshape(Shape{num_filters_, col_height_});
+  bias_.Reshape(Shape{num_filters_});
+  // Push back params into param_values_
+  // Assume the order of param is: weight, bias
+  for (const auto& spec : conf.param()) param_specs_.push_back(spec);
+  param_values_.push_back(&weight_);
+  param_values_.push_back(&bias_);
+}
+
+/// \copydoc Layer::Forward(int flag, const Tensor&)
+const Tensor Convolution::Forward(int flag, const Tensor &input) {
+  Tensor output;
+  // will be used in cpp version later
+  Tensor col_data(Shape{col_height_, col_width_});
+  Tensor col_grad(Shape{col_height_, col_width_});
+  return output;
+}
+
+/// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+const std::pair<Tensor, vector<Tensor>> Convolution::Backward(
+    int flag, const Tensor &grad) {
+  vector<Tensor> param_grad;
+  Tensor input_grad;
+
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa
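
As a worked example of the output-size arithmetic in Setup above (hypothetical values): with height_ = 5, pad_h_ = 1, kernel_h_ = 3 and stride_h_ = 2, conv_height_ = (5 + 2 * 1 - 3) / 2 + 1 = 3; with channels_ = 3 and a 3x3 kernel, col_height_ = 3 * 3 * 3 = 27 and col_width_ = conv_height_ * conv_width_.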

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h
new file mode 100644
index 0000000..a9bf833
--- /dev/null
+++ b/src/model/layer/convolution.h
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_CONVOLUTION_H_
+#define SRC_MODEL_LAYER_CONVOLUTION_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include <stack>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Convolution : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "Convolution"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf &conf) override;
+
+  // void SetupParam(const Tensor &input);
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor &input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor &grad) override;
+
+  size_t kernel_w() const { return kernel_w_; }
+  size_t kernel_h() const { return kernel_h_; }
+  size_t pad_w() const { return pad_w_; }
+  size_t pad_h() const { return pad_h_; }
+  size_t stride_w() const { return stride_w_; }
+  size_t stride_h() const { return stride_h_; }
+  size_t num_filters() const { return num_filters_; }
+  size_t batchsize() const { return batchsize_; }
+  size_t channels() const { return channels_; }
+  size_t height() const { return height_; }
+  size_t width() const { return width_; }
+  bool bias_term() const { return bias_term_; }
+  const Tensor &weight() const { return weight_; }
+  const Tensor &bias() const { return bias_; }
+
+  void set_weight(Tensor w) {
+    weight_.ResetLike(w);
+    weight_.CopyData(w);
+  }
+  void set_bias(Tensor b) {
+    bias_.ResetLike(b);
+    bias_.CopyData(b);
+  }
+
+ protected:
+  size_t kernel_w_, pad_w_, stride_w_;
+  size_t kernel_h_, pad_h_, stride_h_;
+  size_t batchsize_, channels_, height_, width_;
+  size_t col_height_, col_width_, conv_height_, conv_width_, num_filters_;
+  Tensor weight_, bias_;
+  // store intermediate data, i.e., input tensor
+  std::stack<Tensor> buf_;
+  bool bias_term_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_CONVOLUTION_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
new file mode 100644
index 0000000..ec7cd6a
--- /dev/null
+++ b/src/model/layer/cudnn_convolution.cc
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./cudnn_convolution.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <chrono>
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+CudnnConvolution::~CudnnConvolution() {
+  if (bias_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
+  if (filter_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
+  if (conv_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+void CudnnConvolution::Setup(const LayerConf &conf) {
+  Convolution::Setup(conf);
+  ConvolutionConf conv_conf = conf.convolution_conf();
+  // convert MB to bytes
+  workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
+  pref_ = conv_conf.algo_pref();
+  CHECK(pref_ == "fastest" || pref_ == "limited_workspace" ||
+        pref_ == "no_workspace")
+      << "CudnnConvolution only supports three algorithm preferences: fastest, "
+         "limited_workspace and no_workspace";
+}
+
+void CudnnConvolution::ToDevice(Device *device) {
+  weight_.ToDevice(device);
+  bias_.ToDevice(device);
+  workspace_.ToDevice(device);
+}
+
+void CudnnConvolution::InitCudnn(DataType dtype, Device *dev, Context *ctx) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), batchsize_,
+                                         channels_, height_, width_));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize_,
+      num_filters_, conv_height_, conv_width_));
+  if (bias_term_)
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), 1,
+                                           num_filters_, 1, 1));
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
+                                              stride_h_, stride_w_, 1, 1,
+                                              CUDNN_CROSS_CORRELATION));
+#if CUDNN_VERSION_MAJOR == 5
+  CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                         CUDNN_TENSOR_NCHW, num_filters_,
+                                         channels_, kernel_h_, kernel_w_));
+#elif CUDNN_VERSION_MAJOR == 4
+  CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(
+      filter_desc_, GetCudnnDataType(dtype), CUDNN_TENSOR_NCHW, num_filters_,
+      channels_, kernel_h_, kernel_w_));
+#else
+  LOG(FATAL) << "Not supported CUDNN version = " << CUDNN_VERSION_MAJOR;
+#endif
+
+  cudnnConvolutionFwdPreference_t fwd_pref;
+  cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+  cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+  if (pref_ == "fastest") {
+    fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+  } else if (pref_ == "limited_workspace") {
+    fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+  } else if (pref_ == "no_workspace") {
+    fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+  } else {
+    LOG(FATAL) << "Algorithm preference is not implemented!";
+  }
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+      ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+      workspace_byte_limit_, &fp_alg_));
+
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+      ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+      bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+      ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+      bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+
+  size_t fp_byte, bp_data_byte, bp_filter_byte;
+  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+      ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+      &fp_byte));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+      ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+      bp_data_alg_, &bp_data_byte));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+      bp_filter_alg_, &bp_filter_byte));
+  workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                         sizeof(float) +
+                     1;
+  workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
+  CHECK_EQ(input.device()->lang(), kCuda);
+  CHECK_EQ(input.shape().size(), 4);
+  buf_.push(input);
+  batchsize_ = input.shape()[0];
+  DataType dtype = input.data_type();
+  Device *dev = input.device();
+
+  if (!has_init_cudnn_) InitCudnn(dtype, dev, dev->context(0));
+
+  Shape shape{batchsize_, num_filters_, conv_height_, conv_width_};
+  Tensor output(shape, dev, dtype);
+  float alpha = 1.f, beta = 0.f;
+  output.device()->Exec(
+      [input, output, alpha, beta, this](Context *ctx) {
+        Blob *inblob = input.blob(), *outblob = output.blob(),
+             *wblob = this->weight_.blob();
+        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
+                                inblob->data(), this->filter_desc_,
+                                wblob->data(), this->conv_desc_, this->fp_alg_,
+                                this->workspace_.blob()->mutable_data(),
+                                this->workspace_count_ * sizeof(float), &beta,
+                                this->y_desc_, outblob->mutable_data());
+      },
+      {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
+
+  if (bias_term_) {
+    beta = 1.f;
+    output.device()->Exec(
+        [output, alpha, beta, this](Context *ctx) {
+          Blob *outblob = output.blob(), *bblob = this->bias_.blob();
+          cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
+                         bblob->data(), &beta, this->y_desc_,
+                         outblob->mutable_data());
+        },
+        {output.blob(), bias_.blob()}, {output.blob()});
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
+    int flag, const Tensor &grad) {
+  CHECK_EQ(grad.device()->lang(), kCuda);
+  CHECK_EQ(grad.shape().size(), 4);
+  Tensor src_data = buf_.top();
+  buf_.pop();
+  float alpha = 1.f, beta = 0.f;
+  vector<Tensor> param_grad;
+  Tensor dx;
+  dx.ResetLike(src_data);
+  Tensor db, dw;
+  db.ResetLike(bias_);
+  dw.ResetLike(weight_);
+
+  // LOG(ERROR) << "backward bias";
+  if (bias_term_) {
+    dx.device()->Exec(
+        [grad, db, alpha, beta, this](Context *ctx) {
+          Blob *dyblob = grad.blob(), *dbblob = db.blob();
+          cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
+                                       dyblob->data(), &beta, this->bias_desc_,
+                                       dbblob->mutable_data());
+        },
+        {grad.blob()}, {db.blob()});
+  }
+  // LOG(ERROR) << "backward w";
+  dx.device()->Exec(
+      [grad, dw, src_data, alpha, beta, this](Context *ctx) {
+        Blob *inblob = src_data.blob(), *dyblob = grad.blob(),
+             *dwblob = dw.blob();
+        cudnnConvolutionBackwardFilter(
+            ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(),
+            this->y_desc_, dyblob->data(), this->conv_desc_,
+            this->bp_filter_alg_, this->workspace_.blob()->mutable_data(),
+            this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
+            dwblob->mutable_data());
+      },
+      {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
+
+  // LOG(ERROR) << "backward src";
+  dx.device()->Exec(
+      [dx, grad, alpha, beta, this](Context *ctx) {
+        Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
+             *dxblob = dx.blob();
+        cudnnConvolutionBackwardData(
+            ctx->cudnn_handle, &alpha, this->filter_desc_, wblob->data(),
+            this->y_desc_, dyblob->data(), this->conv_desc_, this->bp_data_alg_,
+            this->workspace_.blob()->mutable_data(),
+            this->workspace_count_ * sizeof(float), &beta, this->x_desc_,
+            dxblob->mutable_data());
+      },
+      {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
+  param_grad.push_back(dw);
+  param_grad.push_back(db);
+  return std::make_pair(dx, param_grad);
+}
+
+}  // namespace singa
+#endif  // USE_CUDNN
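
For reference, InitCudnn above sizes one shared workspace as the largest of
the three per-phase byte requirements reported by cuDNN, rounded up to whole
floats. A minimal standalone sketch of that arithmetic (the function name and
sample sizes are hypothetical):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // Element count of a float workspace that can serve the forward,
    // backward-data and backward-filter phases (sizes in bytes).
    size_t WorkspaceFloatCount(size_t fwd, size_t bwd_data, size_t bwd_filter) {
      size_t max_bytes = std::max(std::max(fwd, bwd_data), bwd_filter);
      return max_bytes / sizeof(float) + 1;  // +1 covers a trailing partial float
    }

    int main() {
      std::printf("%zu\n", WorkspaceFloatCount(1000, 4096, 2048));  // prints 1025
      return 0;
    }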

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.h b/src/model/layer/cudnn_convolution.h
new file mode 100644
index 0000000..cf04be0
--- /dev/null
+++ b/src/model/layer/cudnn_convolution.h
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
+#define SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
+#include "singa_config.h"
+#ifdef USE_CUDNN
+#include <string>
+#include <utility>
+#include <vector>
+#include "./convolution.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnConvolution : public Convolution {
+ public:
+  ~CudnnConvolution();
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "CudnnConvolution"; }
+
+  const Tensor Forward(int flag, const Tensor &input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor &grad) override;
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf &conf) override;
+  /// Init cudnn related data structures.
+  void InitCudnn(DataType dtype, Device *dev, Context *ctx);
+
+  void ToDevice(Device *device) override;
+
+  size_t workspace_byte_limit() { return workspace_byte_limit_; }
+  string pref() { return pref_; }
+
+ protected:
+  bool has_init_cudnn_ = false;
+  cudnnTensorDescriptor_t x_desc_ = nullptr;
+  cudnnTensorDescriptor_t y_desc_ = nullptr;
+  cudnnTensorDescriptor_t bias_desc_ = nullptr;
+  cudnnFilterDescriptor_t filter_desc_ = nullptr;
+  cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
+  cudnnConvolutionFwdAlgo_t fp_alg_;
+  cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+  cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+  size_t workspace_byte_limit_, workspace_count_;
+  Tensor workspace_;
+  string pref_;
+};
+
+}  // namespace singa
+
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.cc b/src/model/layer/cudnn_pooling.cc
new file mode 100644
index 0000000..d68bcd2
--- /dev/null
+++ b/src/model/layer/cudnn_pooling.cc
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./cudnn_pooling.h"
+#ifdef USE_CUDNN
+
+#include <cudnn.h>
+#include <chrono>
+
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+CudnnPooling::~CudnnPooling() {
+  if (pool_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+void CudnnPooling::Setup(const LayerConf &conf) {
+  Pooling::Setup(conf);
+  PoolingConf pool_conf = conf.pooling_conf();
+  if (pool_conf.nan_prop())
+    nan_prop_ = CUDNN_PROPAGATE_NAN;
+  else
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+}
+
+void CudnnPooling::InitCudnn(DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  CUDNN_CHECK(cudnnCreatePoolingDescriptor(&pool_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), batchsize_,
+                                         channels_, height_, width_));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize_,
+      channels_, pooled_height_, pooled_width_));
+  auto pool_method = CUDNN_POOLING_MAX;
+  if (pool_ == PoolingConf_PoolMethod_MAX)
+    pool_method = CUDNN_POOLING_MAX;
+  else if (pool_ == PoolingConf_PoolMethod_AVE)
+    pool_method = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+  else
+    LOG(FATAL) << "Not implemented!";
+
+#if CUDNN_VERSION_MAJOR == 5
+  CUDNN_CHECK(cudnnSetPooling2dDescriptor(pool_desc_, pool_method, nan_prop_,
+                                          kernel_h_, kernel_w_, pad_h_, pad_w_,
+                                          stride_h_, stride_w_));
+#elif CUDNN_VERSION_MAJOR == 4
+  CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(pool_desc_, pool_method, nan_prop_,
+                                             kernel_h_, kernel_w_, pad_h_,
+                                             pad_w_, stride_h_, stride_w_));
+#else
+  LOG(FATAL) << "Not supported CUDNN version = " << CUDNN_VERSION_MAJOR;
+#endif
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
+  CHECK_EQ(input.device()->lang(), kCuda);
+  CHECK_EQ(input.shape().size(), 4);
+  buf_.push(input);
+  batchsize_ = input.shape()[0];
+  DataType dtype = input.data_type();
+  Device *dev = input.device();
+  float alpha = 1.0f, beta = 0.0f;
+  if (!has_init_cudnn_) InitCudnn(dtype);
+
+  Shape shape{batchsize_, channels_, pooled_height_, pooled_width_};
+  Tensor output = Tensor(shape, dev, dtype);
+  output.device()->Exec(
+      [input, output, alpha, beta, this](Context *ctx) {
+        Blob *inblob = input.blob(), *outblob = output.blob();
+        cudnnPoolingForward(ctx->cudnn_handle, this->pool_desc_, &alpha,
+                            this->x_desc_, inblob->data(), &beta, this->y_desc_,
+                            outblob->mutable_data());
+      },
+      {input.blob()}, {output.blob()});
+  buf_.push(output);
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnPooling::Backward(
+    int flag, const Tensor &grad) {
+  CHECK_EQ(grad.device()->lang(), kCuda);
+  CHECK_EQ(grad.shape().size(), 4);
+  vector<Tensor> param_grad;
+  Tensor dx;
+  Tensor data = buf_.top();
+  buf_.pop();
+  Tensor src_data = buf_.top();
+  buf_.pop();
+  dx.ResetLike(src_data);
+
+  float alpha = 1.0f, beta = 0.0f;
+  dx.device()->Exec(
+      [dx, grad, src_data, data, alpha, beta, this](Context *ctx) {
+        Blob *dyblob = grad.blob(), *dxblob = dx.blob(),
+             *yblob = data.blob(), *xblob = src_data.blob();
+        cudnnPoolingBackward(ctx->cudnn_handle, this->pool_desc_, &alpha,
+                             this->y_desc_, yblob->data(), this->y_desc_,
+                             dyblob->data(), this->x_desc_, xblob->data(),
+                             &beta, this->x_desc_, dxblob->mutable_data());
+      },
+      {grad.blob(), data.blob(), src_data.blob()}, {dx.blob()});
+
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+#endif  // USE_CUDNN
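
Note the LIFO discipline above: Forward pushes the input and then the output
onto buf_, so Backward pops the output first (data) and the input second
(src_data). A minimal sketch of that ordering, assuming nothing beyond
std::stack:

    #include <cassert>
    #include <stack>
    #include <string>

    int main() {
      std::stack<std::string> buf;
      buf.push("input");   // pushed first in Forward
      buf.push("output");  // pushed second in Forward
      std::string data = buf.top();      // Backward pops the output first
      buf.pop();
      std::string src_data = buf.top();  // then the original input
      buf.pop();
      assert(data == "output" && src_data == "input");
      return 0;
    }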

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.h b/src/model/layer/cudnn_pooling.h
new file mode 100644
index 0000000..14bdf40
--- /dev/null
+++ b/src/model/layer/cudnn_pooling.h
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LAYER_CUDNN_POOLING_H_
+#define SRC_MODEL_LAYER_CUDNN_POOLING_H_
+#include "singa_config.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "./pooling.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnPooling : public Pooling {
+ public:
+  ~CudnnPooling();
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "CudnnPooling"; }
+
+  void Setup(const LayerConf &conf) override;
+  const Tensor Forward(int flag, const Tensor &input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor &grad) override;
+
+  /// Init cudnn related data structures.
+  void InitCudnn(DataType dtype);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnTensorDescriptor_t x_desc_ = nullptr;
+  cudnnTensorDescriptor_t y_desc_ = nullptr;
+  cudnnPoolingDescriptor_t pool_desc_ = nullptr;
+  cudnnNanPropagation_t nan_prop_;
+};
+}  // namespace singa
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_POOLING_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling.cc b/src/model/layer/pooling.cc
new file mode 100644
index 0000000..05c6bc9
--- /dev/null
+++ b/src/model/layer/pooling.cc
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./pooling.h"
+#include "singa/model/layer.h"
+namespace singa {
+
+void Pooling::Setup(const LayerConf& conf) {
+  Layer::Setup(conf);
+
+  PoolingConf pool_conf = conf.pooling_conf();
+  if (pool_conf.has_kernel_size()) {
+    kernel_w_ = kernel_h_ = pool_conf.kernel_size();
+  } else {
+    kernel_w_ = pool_conf.kernel_w();
+    kernel_h_ = pool_conf.kernel_h();
+  }
+  CHECK_NE(kernel_w_, 0);
+  CHECK_NE(kernel_h_, 0);
+
+  if (pool_conf.has_pad()) {
+    pad_w_ = pad_h_ = pool_conf.pad();
+  } else {
+    pad_w_ = pool_conf.pad_w();
+    pad_h_ = pool_conf.pad_h();
+  }
+
+  if (pool_conf.has_stride()) {
+    stride_w_ = stride_h_ = pool_conf.stride();
+  } else {
+    stride_w_ = pool_conf.stride_w();
+    stride_h_ = pool_conf.stride_h();
+  }
+
+  pool_ = pool_conf.pool();
+  CHECK(pool_ == PoolingConf_PoolMethod_AVE ||
+        pool_ == PoolingConf_PoolMethod_MAX ||
+        pool_ == PoolingConf_PoolMethod_STOCHASTIC)
+      << "Padding implemented only for average and max pooling.";
+
+  channels_ = pool_conf.channels();
+  height_ = pool_conf.height();
+  width_ = pool_conf.width();
+  pooled_height_ =
+      static_cast<size_t>((height_ + 2 * pad_h_ - kernel_h_) / stride_h_) + 1;
+  pooled_width_ =
+      static_cast<size_t>((width_ + 2 * pad_w_ - kernel_w_) / stride_w_) + 1;
+}
+
+const Tensor Pooling::Forward(int flag, const Tensor& input) {
+  // Placeholder: CPU pooling is not implemented yet; see CudnnPooling.
+  Tensor out;
+  return out;
+}
+
+const std::pair<Tensor, vector<Tensor>> Pooling::Backward(int flag,
+                                                          const Tensor& grad) {
+  // Placeholder: empty gradients until the CPU version is implemented.
+  vector<Tensor> param_grad;
+  Tensor input_grad;
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa
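
The pooled_height_/pooled_width_ computation above is the usual output-size
rule, floor((in + 2 * pad - kernel) / stride) + 1. A small standalone check of
that arithmetic, matching the CudnnPooling tests later in this commit (the
helper name is hypothetical):

    #include <cstddef>
    #include <cstdio>

    size_t PooledSize(size_t in, size_t pad, size_t kernel, size_t stride) {
      return (in + 2 * pad - kernel) / stride + 1;  // size_t division floors
    }

    int main() {
      // 3x3 input, 2x2 kernel, stride 1, no padding -> 2x2 output.
      std::printf("%zu x %zu\n", PooledSize(3, 0, 2, 1), PooledSize(3, 0, 2, 1));
      return 0;
    }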

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling.h b/src/model/layer/pooling.h
new file mode 100644
index 0000000..ce6670d
--- /dev/null
+++ b/src/model/layer/pooling.h
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_POOLING_H_
+#define SRC_MODEL_LAYER_POOLING_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include <stack>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Pooling : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "Pooling"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf& conf) override;
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  size_t kernel_w() const { return kernel_w_; }
+  size_t kernel_h() const { return kernel_h_; }
+  size_t pad_w() const { return pad_w_; }
+  size_t pad_h() const { return pad_h_; }
+  size_t stride_w() const { return stride_w_; }
+  size_t stride_h() const { return stride_h_; }
+  PoolingConf_PoolMethod pool_method() const { return pool_; }
+  size_t batchsize() const { return batchsize_; }
+  size_t channels() const { return channels_; }
+  size_t height() const { return height_; }
+  size_t width() const { return width_; }
+
+ protected:
+  size_t kernel_w_, pad_w_, stride_w_;
+  size_t kernel_h_, pad_h_, stride_h_;
+  size_t batchsize_, channels_, height_, width_, pooled_height_, pooled_width_;
+  PoolingConf_PoolMethod pool_;
+  // To store the input and output(of forward) tensors
+  std::stack<Tensor> buf_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_POOLING_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 51225ee..03ad6ad 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -334,6 +334,16 @@ message ConvolutionConf {
   // implementation; for input blobs with num_axes != 2, this option is
   // ignored and the ND implementation will be used.)
   optional bool force_nd_im2col = 17 [default = false];
+  // added by xiangrui
+  // cudnn workspace size in MB
+  optional int32 workspace_byte_limit = 50 [default = 512];
+  // cudnn algorithm preference
+  // options: "fastest", "limited_workspace", "no_workspace"
+  optional string algo_pref = 51 [default = "fastest"];
+  // input shape
+  optional int32 channels = 52;
+  optional int32 height = 53;
+  optional int32 width = 54;
 }
 
 /*
@@ -595,6 +605,12 @@ message PoolingConf {
   // If global_pooling then it will pool over the size of the bottom by doing
   // kernel_h = bottom->height and kernel_w = bottom->width
   optional bool global_pooling = 12 [default = false];
+  // Shape of source
+  optional int32 channels = 50;
+  optional int32 height = 51;
+  optional int32 width = 52;
+  // whether to propagate nan
+  optional bool nan_prop = 53 [default = false];
 }
 
 message PowerConf {
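
The new workspace_byte_limit field above is specified in MB, and
CudnnConvolution::Setup converts it to bytes with a 20-bit shift. A one-line
check of that conversion, using the proto default:

    #include <cstddef>
    #include <cstdio>

    int main() {
      size_t limit_mb = 512;                // proto default, in MB
      size_t limit_bytes = limit_mb << 20;  // MB -> bytes
      std::printf("%zu\n", limit_bytes);    // prints 536870912
      return 0;
    }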

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_convolution.cc b/test/singa/test_cudnn_convolution.cc
new file mode 100644
index 0000000..0955c82
--- /dev/null
+++ b/test/singa/test_cudnn_convolution.cc
@@ -0,0 +1,205 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/cudnn_convolution.h"
+#ifdef USE_CUDNN
+
+#include "gtest/gtest.h"
+
+using singa::CudnnConvolution;
+TEST(CudnnConvolution, Setup) {
+  CudnnConvolution conv;
+  EXPECT_EQ("CudnnConvolution", conv.layer_type());
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(2);
+  convconf->set_kernel_w(2);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(1);
+  convconf->set_stride_w(1);
+  convconf->set_num_output(2);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_algo_pref("fastest");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  EXPECT_EQ(2, conv.kernel_h());
+  EXPECT_EQ(2, conv.kernel_w());
+  EXPECT_EQ(1, conv.pad_h());
+  EXPECT_EQ(1, conv.pad_w());
+  EXPECT_EQ(1, conv.stride_h());
+  EXPECT_EQ(1, conv.stride_w());
+  EXPECT_EQ(2, conv.num_filters());
+  EXPECT_EQ(true, conv.bias_term());
+  EXPECT_EQ(256 << 20, conv.workspace_byte_limit());
+  EXPECT_STREQ("fastest", conv.pref().c_str());
+  EXPECT_EQ(1, conv.channels());
+  EXPECT_EQ(3, conv.height());
+  EXPECT_EQ(3, conv.width());
+}
+
+TEST(CudnnConvolution, Forward) {
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                          6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  // Set weight and bias manually
+  const size_t num_filters = 1;
+  const float we[num_filters * c * h * w] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, c * h * w}, &cuda);
+  weight.CopyDataFromHostPtr(we, num_filters * c * h * w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_algo_pref("fastest");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out1.ToDevice(&host);
+  const float *outptr1 = out1.data<const float *>();
+  // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
+  EXPECT_EQ(4, out1.Size());
+
+  EXPECT_EQ(3.0f, outptr1[0]);
+  EXPECT_EQ(7.0f, outptr1[1]);
+  EXPECT_EQ(-3.0f, outptr1[2]);
+  EXPECT_EQ(12.0f, outptr1[3]);
+}
+
+TEST(CudnnConvolution, Backward) {
+  // src_data
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                              6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  // Set weight_ and bias_ manually
+  const size_t num_filters = 1;
+  const float we[num_filters * c * src_h * src_w] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, c * src_h * src_w},
+                       &cuda);
+  weight.CopyDataFromHostPtr(we, num_filters * c * src_h * src_w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_algo_pref("fastest");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w}, &cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
+
+  const auto ret = conv.Backward(singa::kTrain, grad);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToDevice(&host);
+  const float *dx = in_grad.data<const float *>();
+  const float *wptr = we;
+  EXPECT_EQ(9, in_grad.Size());
+  EXPECT_EQ(dy[0] * wptr[4], dx[0]);
+  EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
+  EXPECT_EQ(dy[1] * wptr[4], dx[2]);
+  EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]);
+  EXPECT_EQ(
+      dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0],
+      dx[4]);
+  EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]);
+  EXPECT_EQ(dy[2] * wptr[4], dx[6]);
+  EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]);
+  EXPECT_EQ(dy[3] * wptr[4], dx[8]);
+
+  singa::Tensor dw = ret.second[0];
+  singa::Tensor db = ret.second[1];
+  dw.ToDevice(&host);
+  db.ToDevice(&host);
+  const float *dbptr = db.data<const float *>();
+  EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
+
+  const float *dwptr = dw.data<const float *>();
+  EXPECT_EQ(9, dw.Size());
+  EXPECT_EQ(dy[3] * x[4], dwptr[0]);
+  EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
+  EXPECT_EQ(dy[2] * x[4], dwptr[2]);
+  EXPECT_EQ(dy[1] * x[1] + dy[3] * x[7], dwptr[3]);
+  EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[1] * x[2] + dy[2] * x[6] + dy[3] * x[8],
+                  dwptr[4]);
+  EXPECT_EQ(dy[0] * x[1] + dy[2] * x[7], dwptr[5]);
+  EXPECT_EQ(dy[1] * x[4], dwptr[6]);
+  EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
+  EXPECT_EQ(dy[0] * x[4], dwptr[8]);
+}
+#endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/test/singa/test_cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_pooling.cc b/test/singa/test_cudnn_pooling.cc
new file mode 100644
index 0000000..0bfd620
--- /dev/null
+++ b/test/singa/test_cudnn_pooling.cc
@@ -0,0 +1,141 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/cudnn_pooling.h"
+#ifdef USE_CUDNN
+
+#include "gtest/gtest.h"
+
+using singa::CudnnPooling;
+TEST(CudnnPooling, Setup) {
+  CudnnPooling pool;
+  EXPECT_EQ("CudnnPooling", pool.layer_type());
+
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(1);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(1);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(2);
+  poolconf->set_stride_w(1);
+  poolconf->set_channels(1);
+  poolconf->set_height(3);
+  poolconf->set_width(3);
+  pool.Setup(conf);
+
+  EXPECT_EQ(singa::PoolingConf_PoolMethod_MAX, pool.pool_method());
+  EXPECT_EQ(1, pool.kernel_h());
+  EXPECT_EQ(2, pool.kernel_w());
+  EXPECT_EQ(1, pool.pad_h());
+  EXPECT_EQ(0, pool.pad_w());
+  EXPECT_EQ(2, pool.stride_h());
+  EXPECT_EQ(1, pool.stride_w());
+  EXPECT_EQ(1, pool.channels());
+  EXPECT_EQ(3, pool.height());
+  EXPECT_EQ(3, pool.width());
+}
+
+TEST(CudnnPooling, Forward) {
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                          6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  CudnnPooling pool;
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(2);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(0);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(1);
+  poolconf->set_stride_w(1);
+  poolconf->set_channels(1);
+  poolconf->set_height(3);
+  poolconf->set_width(3);
+  pool.Setup(conf);
+
+  // Parameter "flag" does not influence pooling
+  singa::Tensor out1 = pool.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out1.ToDevice(&host);
+  const float *outptr1 = out1.data<const float *>();
+  // Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
+  EXPECT_EQ(4, out1.Size());
+  EXPECT_EQ(5.0f, outptr1[0]);
+  EXPECT_EQ(6.0f, outptr1[1]);
+  EXPECT_EQ(8.0f, outptr1[2]);
+  EXPECT_EQ(9.0f, outptr1[3]);
+}
+
+TEST(CudnnPooling, Backward) {
+  // src_data
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  CudnnPooling pool;
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(2);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(0);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(1);
+  poolconf->set_stride_w(1);
+  poolconf->set_channels(1);
+  poolconf->set_height(3);
+  poolconf->set_width(3);
+  pool.Setup(conf);
+
+  singa::Tensor out1 = pool.Forward(singa::kTrain, in);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, &cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
+
+  const auto ret = pool.Backward(singa::kTrain, grad);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToDevice(&host);
+  const float *dx = in_grad.data<const float *>();
+  EXPECT_EQ(9, in_grad.Size());
+  EXPECT_EQ(0.0f, dx[0]);
+  EXPECT_EQ(0.0f, dx[1]);
+  EXPECT_EQ(0.0f, dx[2]);
+  EXPECT_EQ(0.0f, dx[3]);
+  EXPECT_EQ(0.1f, dx[4]);
+  EXPECT_EQ(0.2f, dx[5]);
+  EXPECT_EQ(0.0f, dx[6]);
+  EXPECT_EQ(0.3f, dx[7]);
+  EXPECT_EQ(0.4f, dx[8]);
+}
+#endif  // USE_CUDNN


[44/50] [abbrv] incubator-singa git commit: SINGA-195 Channel for sending training statistics

Posted by zh...@apache.org.
SINGA-195 Channel for sending training statistics

Add utils/channel.h, utils/channel.cc, test/test_channel.cc

Channel provides a unified way of sending training statistics
to different destinations (stderr and/or files).

Upon setup, configure it globally:
InitChannel();
SetChannelDirectory(path);

To get a channel instance:
GetChannel(channel_name);

Then send messages:
channel->Send(message);
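
Putting the pieces together, a minimal usage sketch (the channel name, message
text and directory are illustrative; the API is as declared in channel.h below):

    #include "singa/utils/channel.h"

    int main(int argc, char** argv) {
      singa::InitChannel(argv[0]);         // one-time global initialization
      singa::SetChannelDirectory("/tmp");  // directory for persisted channel files
      singa::Channel* chn = singa::GetChannel("train");
      chn->EnableDestStderr(true);         // also mirror messages to stderr
      chn->Send("step 100, loss 0.25");    // appended to /tmp/train
      return 0;
    }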


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a2a8e34b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a2a8e34b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a2a8e34b

Branch: refs/heads/master
Commit: a2a8e34b4c33be406af081c731d7ee990936701f
Parents: 26df5ac
Author: WANG Sheng <wa...@gmail.com>
Authored: Mon Jun 13 11:20:43 2016 +0800
Committer: WANG Sheng <wa...@gmail.com>
Committed: Mon Jun 13 11:28:16 2016 +0800

----------------------------------------------------------------------
 include/singa/utils/channel.h | 76 +++++++++++++++++++++++++++++
 src/utils/channel.cc          | 99 ++++++++++++++++++++++++++++++++++++++
 test/singa/test_channel.cc    | 39 +++++++++++++++
 3 files changed, 214 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2a8e34b/include/singa/utils/channel.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/channel.h b/include/singa/utils/channel.h
new file mode 100644
index 0000000..7cd7aa3
--- /dev/null
+++ b/include/singa/utils/channel.h
@@ -0,0 +1,76 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_CHANNEL_H_
+#define SINGA_UTILS_CHANNEL_H_
+
+#include <google/protobuf/message.h>
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+
+namespace singa {
+
+class Channel {
+ public:
+  explicit Channel(const std::string& name);
+  ~Channel();
+
+  inline const std::string& GetName() { return name_; }
+  inline void EnableDestStderr(bool enable) { stderr_ = enable; }
+  inline void EnableDestFile(bool enable) { file_ = enable; }
+  void SetDestFilePath(const std::string& file);
+  void Send(const std::string& message);
+  void Send(const google::protobuf::Message& message);
+
+ private:
+  std::string name_ = "";
+  bool stderr_ = false;
+  bool file_ = false;
+  std::ofstream os_;
+};
+
+class ChannelManager {
+ public:
+  ChannelManager() {}
+  ~ChannelManager();
+
+  void Init();
+  void SetDefaultDir(const char* dir);
+  Channel* GetInstance(const std::string& channel);
+
+ private:
+  std::string dir_ = "";
+  std::map<std::string, Channel*> name2ptr_;
+};
+
+/// Initialize the channel facility for global usage
+void InitChannel(const char* argv);
+/// Set the directory name for persisting channel content
+void SetChannelDirectory(const char* path);
+/// Get the channel instance
+Channel* GetChannel(const std::string& channel_name);
+
+}  // namespace singa
+
+#endif  // SINGA_UTILS_CHANNEL_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2a8e34b/src/utils/channel.cc
----------------------------------------------------------------------
diff --git a/src/utils/channel.cc b/src/utils/channel.cc
new file mode 100644
index 0000000..52909a3
--- /dev/null
+++ b/src/utils/channel.cc
@@ -0,0 +1,99 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* 
+*   http://www.apache.org/licenses/LICENSE-2.0
+* 
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/utils/channel.h"
+
+#include "singa/utils/logging.h"
+#include "singa/utils/singleton.h"
+
+namespace singa {
+
+ChannelManager::~ChannelManager() {
+  for (auto it : name2ptr_) {
+    if (it.second != nullptr) delete(it.second);
+  }
+}
+
+void ChannelManager::Init() {
+  // do nothing here
+}
+
+void ChannelManager::SetDefaultDir(const char* dir) {
+  if (dir != nullptr) {
+    dir_ = dir;
+    if (!dir_.empty() && dir_.back() != '/') dir_ += '/';
+  }
+}
+
+Channel* ChannelManager::GetInstance(const std::string& channel) {
+  // find the channel
+  if (name2ptr_.find(channel) == name2ptr_.end()) {
+    // create new channel
+    Channel* chn = new Channel(channel);
+    chn->SetDestFilePath(dir_+channel);
+    chn->EnableDestFile(true);
+    name2ptr_[channel] = chn;
+  }
+  return name2ptr_[channel];
+}
+
+Channel::Channel(const std::string& name) {
+  name_ = name;
+}
+
+Channel::~Channel() {
+  if (os_.is_open()) os_.close();
+}
+
+void Channel::SetDestFilePath(const std::string& file) {
+  // file is append only
+  if (os_.is_open()) os_.close();
+  os_.open(file.c_str(), std::ios::app);
+  if (os_.is_open() == false)
+    LOG(WARNING) << "Cannot open channel file (" << file << ")";
+}
+
+void Channel::Send(const std::string& message) {
+  if (stderr_) fprintf(stderr, "%s\n", message.c_str());
+  if (file_ && os_.is_open()) os_ << message << "\n";
+}
+
+void Channel::Send(const google::protobuf::Message& message) {
+  if (stderr_) fprintf(stderr, "%s\n", message.DebugString().c_str());
+  if (file_ && os_.is_open()) message.SerializeToOstream(&os_);
+}
+
+void InitChannel(const char* argv) {
+  ChannelManager* mng = Singleton<ChannelManager>().Instance();
+  mng->Init();
+}
+
+void SetChannelDirectory(const char* path) {
+  ChannelManager * mng = Singleton<ChannelManager>().Instance();
+  mng->SetDefaultDir(path);
+}
+
+Channel* GetChannel(const std::string& channel_name) {
+  ChannelManager * mng = Singleton<ChannelManager>().Instance();
+  return mng->GetInstance(channel_name);
+}
+
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2a8e34b/test/singa/test_channel.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_channel.cc b/test/singa/test_channel.cc
new file mode 100644
index 0000000..77d7cbc
--- /dev/null
+++ b/test/singa/test_channel.cc
@@ -0,0 +1,39 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* 
+*   http://www.apache.org/licenses/LICENSE-2.0
+* 
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/utils/channel.h"
+
+TEST(Channel, InitChannel) {
+  singa::InitChannel("");
+  singa::SetChannelDirectory("/tmp");
+}
+
+TEST(Channel, SendStringToFile) {
+  singa::Channel* chn = singa::GetChannel("test_channel");
+  chn->Send("test to file");
+}
+
+TEST(Channel, SendStringToFileAndStderr) {
+  singa::Channel* chn = singa::GetChannel("test_channel");
+  chn->EnableDestStderr(true);
+  chn->Send("test to both file and stderr");
+}


[45/50] [abbrv] incubator-singa git commit: SINGA-184 Add Cross Entropy loss computation

Posted by zh...@apache.org.
SINGA-184 Add Cross Entropy loss computation

Merge with asf dev.
Fix the bug in CUDA cross-entropy caused by the target type (it should be
int instead of float).


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/21e4b2d7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/21e4b2d7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/21e4b2d7

Branch: refs/heads/master
Commit: 21e4b2d79a4bd3acfb8b487cf96c197da464ae70
Parents: ec17aca 26df5ac
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Jun 13 13:04:23 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 13:04:23 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                          |   4 +-
 cmake/Dependencies.cmake                |   7 +-
 cmake/ProtoBuf.cmake                    | 116 ----
 include/singa/core/common.h             |   2 +-
 include/singa/core/tensor.h             | 400 +++++++-------
 src/CMakeLists.txt                      |  27 +-
 src/core/tensor/math_kernel.cu          | 707 +++++++++++++------------
 src/core/tensor/math_kernel.h           |  93 ++--
 src/core/tensor/tensor.cc               | 761 ++++++++++++++-------------
 src/core/tensor/tensor_math.h           | 404 +++++++-------
 src/core/tensor/tensor_math_cpp.h       | 610 +++++++++++++++------
 src/core/tensor/tensor_math_cuda.h      | 423 +++++++++++----
 src/model/layer/activation.cc           |  10 +-
 src/model/layer/batchnorm.cc            |  70 +++
 src/model/layer/batchnorm.h             |  84 +++
 src/model/layer/cudnn_activation.cc     |  13 +-
 src/model/layer/cudnn_batchnorm.cc      | 214 ++++++++
 src/model/layer/cudnn_batchnorm.h       |  60 +++
 src/model/layer/cudnn_convolution.cc    | 183 ++++---
 src/model/layer/cudnn_lrn.cc            | 118 +++++
 src/model/layer/cudnn_lrn.h             |  56 ++
 src/model/layer/cudnn_pooling.cc        |   7 +-
 src/model/layer/cudnn_softmax.cc        |   4 +-
 src/model/layer/dense.cc                |  86 +++
 src/model/layer/dense.h                 |  70 +++
 src/model/layer/flatten.cc              |  55 ++
 src/model/layer/flatten.h               |  53 ++
 src/model/layer/lrn.cc                  |  59 +++
 src/model/layer/lrn.h                   |  70 +++
 src/model/layer/prelu.cc                | 139 +++++
 src/model/layer/prelu.h                 |  59 +++
 src/model/layer/softmax.cc              |  10 +-
 src/model/loss/softmax_cross_entropy.cc |   8 +-
 src/proto/model.proto                   |  39 +-
 test/CMakeLists.txt                     |   1 +
 test/singa/test_activation.cc           |   8 +-
 test/singa/test_cross_entropy.cc        |   4 +-
 test/singa/test_cudnn_activation.cc     |   6 +-
 test/singa/test_cudnn_batchnorm.cc      | 257 +++++++++
 test/singa/test_cudnn_convolution.cc    | 181 +++++++
 test/singa/test_cudnn_lrn.cc            | 205 ++++++++
 test/singa/test_cudnn_softmax.cc        |   6 +-
 test/singa/test_dense.cc                | 249 +++++++++
 test/singa/test_flatten.cc              | 144 +++++
 test/singa/test_mse.cc                  |   1 -
 test/singa/test_prelu.cc                | 137 +++++
 test/singa/test_sgd.cc                  |   2 +-
 test/singa/test_softmax.cc              |  12 +-
 test/singa/test_tensor_math.cc          | 303 ++++++++++-
 49 files changed, 4873 insertions(+), 1664 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/CMakeLists.txt
----------------------------------------------------------------------
diff --cc CMakeLists.txt
index fbe3adc,fbe3adc..a9d9b17
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -10,7 -10,7 +10,9 @@@ LIST(APPEND CMAKE_MODULE_PATH ${PROJECT
  IF(UNIX OR APPLE)
    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
  ENDIF()
--
++IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
++  SET(NVCC_FLAG "${NVCC_FLAG} -g -G ")
++ENDIF()
  #message(STATUS "${CMAKE_CXX_FLAGS}")
  SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
  #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --cc include/singa/core/tensor.h
index 865e1e4,cd750c5..8cfa705
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@@ -212,199 -213,171 +213,180 @@@ Tensor Reshape(const Tensor &in, Shape 
  
  /// Copy 'num' elements of src to dst.
  /// The first 'src_offset' ('dst_offset') elements will be skipped.
- void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                     size_t src_offset = 0, size_t dst_offset = 0);
- 
- // ==================Simple Linear Algebra Operations=========================
- Tensor Abs(const Tensor &t);
- Tensor Exp(const Tensor &t);
- Tensor Log(const Tensor &t);
- Tensor ReLU(const Tensor &t);
- Tensor Sigmoid(const Tensor &t);
- Tensor Sign(const Tensor &t);
- Tensor Sqrt(const Tensor &t);
- Tensor Square(const Tensor &t);
- Tensor Tanh(const Tensor &t);
+ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                     const size_t src_offset = 0, const size_t dst_offset = 0);
+ 
+ // =============Element-wise operations====================================
+ Tensor Abs(const Tensor &in);
+ Tensor Exp(const Tensor &in);
+ Tensor Log(const Tensor &in);
+ Tensor ReLU(const Tensor &in);
+ Tensor Sigmoid(const Tensor &in);
+ Tensor Sign(const Tensor &in);
+ Tensor Sqrt(const Tensor &in);
+ Tensor Square(const Tensor &in);
+ Tensor Tanh(const Tensor &in);
+ 
+ /// Element-wise operation, out[i]=in[i]^x
+ template <typename SType>
+ Tensor Pow(const Tensor &in, const SType x);
+ /// Element-wise operation, out[i]=in[i]^x
+ template <typename SType>
+ void Pow(const Tensor &in, const SType x, Tensor *out);
+ /// Element-wise operation, out[i]=base[i]^exp[i]
+ Tensor Pow(const Tensor &base, const Tensor &exp);
+ /// Element-wise operation, out[i]=base[i]^exp[i]
+ void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
  
+ /// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f
  template <typename SType>
- SType Sum(const Tensor &t);
- /// Sum elements in the Tensor, currently only support vector and matrix.
- /// if 'axis' is 0, sum all rows into a single row
- /// if 'axis' is 1, sum all columns into a single column
- /// TODO(wangwei) support arbitrary Tensor like numpy.sum
- Tensor Sum(const Tensor &t, int axis);
+ Tensor operator<(const Tensor &in, const SType x);
+ template <typename SType>
+ void LT(const Tensor &in, const SType x, Tensor *out);
  
- /// Average elements in the Tensor, currently only support vector and matrix.
- /// if 'axis' is 0, average all rows into a single row
- /// if 'axis' is 1, average all columns into a single column
- /// TODO(wangwei) support arbitrary Tensor like numpy.average
- Tensor Average(const Tensor &t, int axis);
- /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
- Tensor SoftMax(const Tensor &in);
- /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
- void SoftMax(const Tensor &in, Tensor *out);
+ /// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator<=(const Tensor &in, const SType x);
+ template <typename SType>
+ void LE(const Tensor &in, const SType x, Tensor *out);
 -
+ /// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator>(const Tensor &in, const SType x);
+ template <typename SType>
+ void GT(const Tensor &in, const SType x, Tensor *out);
  
- /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
- /// and shape_[axis+1]*...*shape_[nDim()] columns.
- /// and do softmax along each row.
- // Tensor Softmax(const Tensor& t, int axis = -1);
- // void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
- 
- /// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator<(const Tensor &t, const DType x);
- template <typename DType>
- void LT(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator<=(const Tensor &t, const DType x);
- template <typename DType>
- void LE(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator>(const Tensor &t, const DType x);
- template <typename DType>
- void GT(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator>=(const Tensor &t, const DType x);
- template <typename DType>
- void GE(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise opeartion, ret[i]=t[i]^x
- template <typename DType>
- Tensor Pow(const Tensor &t, DType x);
- /// Element-wise opeartion, ret[i]=t[i]^x
- template <typename DType>
- void Pow(const Tensor &t, DType x, Tensor *ret);
- /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
- Tensor Pow(const Tensor &base, Tensor exp);
- /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
- void Pow(const Tensor &base, const Tensor &exp, Tensor *ret);
+ /// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator>=(const Tensor &in, const SType x);
+ template <typename SType>
+ void GE(const Tensor &in, const SType x, Tensor *out);
  
  Tensor operator+(const Tensor &lhs, const Tensor &rhs);
- void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator-(const Tensor &lhs, const Tensor &rhs);
- void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator*(const Tensor &lhs, const Tensor &rhs);
- void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator/(const Tensor &lhs, const Tensor &rhs);
- void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- 
- template <typename DType>
- Tensor operator+(const Tensor &t, DType x);
- template <typename DType>
- void Add(const Tensor &t, DType x, Tensor *ret);
- 
- template <typename DType>
- Tensor operator-(const Tensor &t, DType x);
- template <typename DType>
- void Sub(const Tensor &t, DType x, Tensor *ret);
+ void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  
- template <typename DType>
- Tensor operator*(const Tensor &t, DType x);
- template <typename DType>
- void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
- 
- template <typename DType>
- Tensor operator/(const Tensor &t, DType x);
- template <typename DType>
- void Div(const Tensor &t, DType x, Tensor *ret);
- 
- // ================Blas operations============================================
- // We fix the scalar argument type to be float.
+ template <typename SType>
+ Tensor operator+(const Tensor &in, const SType x);
+ template <typename SType>
+ void Add(const Tensor &in, const SType x, Tensor *out);
  
- // ===== Level 1
- // TODO(wangwei) make amax/amin/asum a member function of tensor
- // void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
- // void Asum(Tensor Context* ctx);
+ template <typename SType>
+ Tensor operator-(const Tensor &in, const SType x);
+ template <typename SType>
+ void Sub(const Tensor &in, const SType x, Tensor *out);
  
- // template <typename DType>
- // void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+ template <typename SType>
+ Tensor operator*(const Tensor &in, const SType x);
+ template <typename SType>
+ void EltwiseMult(const Tensor &in, const SType x, Tensor *out);
  
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape.  result = A * B
- Tensor Mult(const Tensor &A, const Tensor &B);
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape.  C = A * B
- void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+ /// For each element e of Tensor 'in', compute e / x
+ template <typename SType>
+ Tensor operator/(const Tensor &in, const SType x);
+ /// For each element e of Tensor 'in', compute e / x into out
+ template <typename SType>
+ void Div(const Tensor &in, const SType x, Tensor *out);
  
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
- void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs,
-           const float beta, Tensor *C);
+ /// For each element e of Tensor 'in', compute x/e
+ template <typename SType>
+ Tensor Div(const SType x, const Tensor &in);
+ /// For each element e of Tensor 'in', compute x/e into 'out'
+ template <typename SType>
+ void Div(const SType x, const Tensor &in, Tensor *out);
  
- // ================Random operations==========================================
- /// For each element x set x = 1 if random() < p; otherwise x = 1.
- void Bernoulli(float p, Tensor *t);
- /// Fill in Tensor 't' following uniform distribution.
- void Uniform(float low, float high, Tensor *t);
- /// Fill in Tensor 't' following Gaussian distribution.
- void Gaussian(float mean, float std, Tensor *t);
+ template <typename SType>
+ SType Sum(const Tensor &in);
 -
+ // ============Matrix (row/column) operations==================================
+ /// Average elements in the Tensor; currently only vectors and matrices are
+ /// supported. If 'axis' is 0, average all rows into a single row;
+ /// if 'axis' is 1, average all columns into a single column.
+ /// TODO(wangwei) support arbitrary Tensor like numpy.average
+ Tensor Average(const Tensor &in, const int axis);
 -/// Sum elements in the Tensor, currently only support vector and matrix.
 -/// if 'axis' is 0, sum all rows into a single row
 -/// if 'axis' is 1, sum all columns into a single column
 -/// TODO(wangwei) support arbitrary Tensor like numpy.sum
 -Tensor Sum(const Tensor &in, const int axis);
 -/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
 -/// and shape_[axis]*...*shape_[nDim()] columns.
 -/// and do softmax along each row.
 -Tensor SoftMax(const Tensor &in, const int axis = 0);
 -void SoftMax(const Tensor &in, const int axis, Tensor *out);
  
- // follow the consistency guide
- // https://issues.apache.org/jira/browse/SINGA-182
- // ============Matrix vector operations=======================================
  /// Add vector 'v' to each column of matrix M (in place)
  void AddColumn(const Tensor &v, Tensor *M);
- void AddColumn(const float alpha, const float beta, const Tensor &v,
+ /// For each column 'c' of matrix out, do c=alpha*v + beta*c
+ template <typename SType>
+ void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                 Tensor *out);
- /// Sub column 'v' by each column of matrix M
- void SubColumn(const Tensor &v, Tensor *M);
- /// Multiply column 'v' and each column of matrix M; write results into 'out'
- void MultColumn(const Tensor &v, Tensor *M);
- /// Divide column 'v' by each column of matrix M; write results into 'out'
- void DivColumn(const Tensor &v, Tensor *M);
- 
  /// Add vector 'v' to each row of matrix 'out' (in place)
  void AddRow(const Tensor &v, Tensor *out);
- void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
- /// Sub row 'v' by each row of matrix M; write results into 'out'
- void SubRow(const Tensor &v, Tensor *M);
- /// Multiply row 'v' with each row of matrix M; write results into 'out'
- void MultRow(const Tensor &v, Tensor *M);
+ /// For each row 'r' of matrix M, do r=alpha*v + beta*r
+ template <typename SType>
+ void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M);
+ /// Divide each column of matrix M by vector 'v' (in place)
+ void DivColumn(const Tensor &v, Tensor *M);
  /// Divide each row of matrix M by vector 'v' (in place)
  void DivRow(const Tensor &v, Tensor *M);
- 
- /// Sum all rows of matrix M into a single row as 'out'
- void SumRows(const Tensor &M, Tensor *out);
+ /// Multiply each column of matrix M by vector 'v' element-wise (in place)
+ void MultColumn(const Tensor &v, Tensor *M);
+ /// Multiply each row of matrix M by vector 'v' element-wise (in place)
+ void MultRow(const Tensor &v, Tensor *M);
++/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
++Tensor SoftMax(const Tensor &in);
++/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
++void SoftMax(const Tensor &in, Tensor *out);
+ /// Subtract vector 'v' from each column of matrix M (in place)
+ void SubColumn(const Tensor &v, Tensor *M);
+ /// Subtract vector 'v' from each row of matrix M (in place)
+ void SubRow(const Tensor &v, Tensor *M);
  /// Sum all columns of matrix M into a single column as 'out'
  void SumColumns(const Tensor &M, Tensor *out);
+ /// Sum all rows of matrix M into a single row as 'out'
+ void SumRows(const Tensor &M, Tensor *out);
+ 
++/// Sum elements in the Tensor; currently only vectors and matrices are
++/// supported. If 'axis' is 0, sum all rows into a single row;
++/// if 'axis' is 1, sum all columns into a single column.
++/// TODO(wangwei) support arbitrary Tensor like numpy.sum
++Tensor Sum(const Tensor &in, const int axis);
++
+ // ================Random operations==========================================
+ /// For each element x, set x = 1 if random() < p; otherwise x = 0.
+ template <typename SType>
+ void Bernoulli(const SType p, Tensor *out);
+ /// Fill in Tensor 'out' following Gaussian distribution.
+ template <typename SType>
+ void Gaussian(const SType mean, const SType std, Tensor *out);
+ /// Fill in Tensor 'out' following uniform distribution.
+ template <typename SType>
+ void Uniform(const SType low, const SType high, Tensor *out);
+ 
+ // ================Blas operations============================================
+ // TODO(wangwei) make amax/amin/asum a member function of tensor
  
- /// For each element x of Tensor 'in', compute alpha/x
+ /// out = alpha*in + out
  template <typename SType>
- Tensor Div(const SType alpha, const Tensor &in);
+ void Axpy(SType alpha, const Tensor &in, Tensor *out);
  
- /// For each element x of Tensor 'in', compute alpha/x into 'out'
+ /// Do matrix-vector multiplication or matrix-matrix multiplication depending
+ /// on the Tensor shape.  result = A * B
+ Tensor Mult(const Tensor &A, const Tensor &B);
+ /// Do matrix-vector multiplication or matrix-matrix multiplication depending
+ /// on the Tensor shape.  C = A * B
+ void Mult(const Tensor &A, const Tensor &B, Tensor *C);
 -
+ /// Do matrix-vector multiplication or matrix-matrix multiplication depending
+ /// on the Tensor shape. C = alpha * A * B + beta * C
  template <typename SType>
- void Div(const SType alpha, const Tensor &in, Tensor *out);
- 
- /*
- /// Multiply each column of the lhs matrix with the rhs column
- Tensor MultColumn(const Tensor &lhs, const Tensor &rhs);
- void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Multiply each row of the lhs matrix with the rhs row
- Tensor MultRow(const Tensor &lhs, const Tensor &rhs);
- void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Div each row of the lhs matrix with the rhs column
- Tensor DivColumn(const Tensor &lhs, const Tensor &rhs);
- void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Divide each row of the lhs matrix by the rhs row
- Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
- void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- */
+ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+           Tensor *C);
 +
++// *****************
++// Misc.
++// ****************
 +/// Compute the cross entropy loss given the prediction probability 'p' and
 +/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
 +/// or 2-d matrix. The loss is computed into the 1-d vector 'loss'.
- void ComputeCrossEntropy(const Tensor& t, Tensor* p);
++void ComputeCrossEntropy(const Tensor& p, const Tensor& t, Tensor* loss);
 +/// Compute dx, given prediction probability 'p' (p=softmax(x)) and
 +/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
 +/// or 2-d matrix. dx has the same shape as 'p' and is computed into 'p'.
 +void SoftmaxCrossEntropyBwd(const Tensor& t, Tensor* p);
  }  // namespace singa
  
  #endif  // SINGA_CORE_TENSOR_H_
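
A minimal usage sketch of the API declared above (a hypothetical driver, not
part of this commit; it assumes a Tensor constructible from a Shape on the
default device and only calls names declared in this header):

    #include "singa/core/tensor.h"
    using namespace singa;

    void Example() {
      Tensor a(Shape{2, 3}), b(Shape{2, 3});
      Gaussian(0.0f, 1.0f, &a);   // fill a with samples from N(0, 1)
      Uniform(-1.0f, 1.0f, &b);   // fill b with samples from U(-1, 1)

      Tensor c = ReLU(a) + b;     // element-wise ops return new Tensors
      Tensor mask = c > 0.0f;     // mask[i] = (c[i] > 0) ? 1.f : 0.f
      EltwiseMult(c, mask, &c);   // zero out the non-positive entries of c

      Tensor w(Shape{3, 4});
      Gaussian(0.0f, 0.01f, &w);
      Tensor y = Mult(c, w);      // GEMM: (2x3) * (3x4) -> (2x4)
      Axpy(0.5f, y, &y);          // y += 0.5 * y
    }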

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/CMakeLists.txt
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.cu
index f12763e,b618f9b..21ebdd8
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@@ -309,142 -258,170 +258,202 @@@ __global__ void KernelLE(const int num
    }
  }
  
- __global__ static void kernel_set_value(float *data, float value, int n) {
-   int index = blockIdx.x * blockDim.x + threadIdx.x;
-   int num_threads = blockDim.x * gridDim.x;
-   for (; index < n; index += num_threads) {
-     data[index] = value;
+ __global__ void KernelLT(const int num, const float *in, const float x,
+                          float *out) {
+   for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+        idx += blockDim.x * gridDim.x) {
+     out[idx] = in[idx] < x ? 1.0f : 0.0f;
+   }
+ }
++__global__ void KernelComputeCrossEntropy(const size_t batchsize,
++                                          const size_t dim, const float *p,
++                                          const int *t, float *loss) {
++  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
++  size_t num_threads = blockDim.x * gridDim.x;
++  for (; sample < batchsize; sample += num_threads) {
++    float prob_of_truth = p[sample * dim + t[sample]];
++    loss[sample] = -std::log(max(prob_of_truth, FLT_MIN));
 +  }
 +}
  
- __global__ void kernel_threshold(const float *src_data, float *des_data,
-                                  float alpha, int n) {
-   int index = blockIdx.x * blockDim.x + threadIdx.x;
-   int num_threads = blockDim.x * gridDim.x;
-   for (; index < n; index += num_threads) {
-     des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
++__global__ void KernelSoftmaxCrossEntropyBwd(const size_t batchsize,
++                                             const size_t dim, const float *p,
++                                             const int *t, float *grad) {
++  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
++  size_t num_threads = blockDim.x * gridDim.x;
++  for (; sample < batchsize; sample += num_threads) {
++    size_t pos = sample * dim + t[sample];
++    grad[pos] = p[pos] - 1.0f;  // TODO(wangwei) Consider p and grad are diff
 +  }
 +}
- void sum(int n, const float *in, float *out) {
-   int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-   //  here, we only need one block
-   int num_blocks = 1;
+ // ********************************
+ // Functions call kernels
+ // ********************************
+ 
+ void set(const size_t n, const float v, float *out, cudaStream_t s) {
+   KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
+ }
+ 
+ void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
+ void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
  
- void sum_row(int rows, int cols, int stride, const float *in, float *out) {
-   int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-   int num_blocks = cols;
+ void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
+ 
+ void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_row << <num_blocks, threads_per_block>>>
-       (in, out, rows, cols, stride);
+ void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
  
- void sum_col(int rows, int cols, int stride, const float *in, float *out) {
-   int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
-   int num_blocks = rows;
+ void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_col << <num_blocks, threads_per_block>>>
-       (in, out, rows, cols, stride);
+ void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void add_row(int rows, int cols, int stride, const float *in_row,
-              const float *in_mat, float *out) {
-   dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-   dim3 num_blocks(
-       cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
-       rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
-   kernel_add_vec_row << <num_blocks, threads_per_block>>>
-       (in_row, in_mat, out, rows, cols, stride);
+ 
+ void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void add(int n, const float *a, const float *b, float *out) {
-   kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
+   KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void sub(int n, const float *a, const float *b, float *out) {
-   kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void exp(int n, const float *in, float *out) {
-   kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void clamp(const size_t n, const float low, const float high, const float *in,
+            float *out, cudaStream_t s) {
+   KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
  }
  
- void log(int n, const float *in, float *out) {
-   kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void pow(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+   KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void sigmoid(int n, const float *in, float *out) {
-   kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void add(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+   KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void sigmoid_grad(int n, const float *in, float *out) {
-   kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void mult(const size_t n, const float *in, const float x, float *out,
+           cudaStream_t s) {
+   KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void relu(int n, const float *in, float *out) {
-   kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void div(const size_t n, const float x, const float *in, float *out,
+           cudaStream_t s) {
+   KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
  }
  
- void relu_grad(int n, const float *in, float *out) {
-   kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void threshold(const size_t n, const float x, const float *in, float *out,
+                cudaStream_t s) {
+   KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
  }
  
- void tanh(int n, const float *in, float *out) {
-   kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void gt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void ge(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void lt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void le(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
  }
  
- void tanh_grad(int n, const float *in, float *out) {
-   kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void pow(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void softplus(int n, const float *in, float *out) {
-   kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void add(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void softplus_grad(int n, const float *in, float *out) {
-   kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void sub(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void square(int n, const float *in, float *out) {
-   kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void mult(const size_t n, const float *in1, const float *in2, float *out,
+           cudaStream_t s) {
+   KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void square_grad(int n, const float *in, float *out) {
-   kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void div(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void sqrt(int n, const float *in, float *out) {
-   kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+   int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+   //  here, we only need one block
+   int num_blocks = 1;
+   KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
  }
 +
- void pow(int n, const float *a, const float *b, float *out) {
-   kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
++void ComputeCrossEntropy(size_t batchsize, const size_t dim, const float *p,
++                         const int *t, float *loss, cudaStream_t stream) {
++  KernelComputeCrossEntropy <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>>
++      (batchsize, dim, p, t, loss);
 +}
 +
- void mult(int n, const float *a, const float *b, float *out) {
-   kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
++void SoftmaxCrossEntropyBwd(size_t batchsize, const size_t dim, const float *p,
++                            const int *t, float *grad, cudaStream_t stream) {
++  KernelSoftmaxCrossEntropyBwd <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>>
++      (batchsize, dim, p, t, grad);
++}
+ /*
+ void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void mult(int n, const float *a, const float x, float *out) {
-   kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
+ void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void div(int n, const float *a, const float *b, float *out) {
-   kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ 
+ void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void set_value(int n, float v, float *out) {
-   kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
+ 
+ void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void threshold(int n, float alpha, const float *in, float *out) {
-   kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
+ void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- // follow the consistency guide for math API
- __global__ void KernelDiv(const size_t num, const float alpha, const float *in,
-                           float *out) {
-   for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-        idx += blockDim.x * gridDim.x) {
-     out[idx] = alpha / in[idx];
+ 
+ __global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                                int rows, int cols, int stride) {
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   int num_threads = blockDim.x * gridDim.x;
+   for (; index < rows; index += num_threads) {
+     dst_vec_data[index] = 0.0f;
+     for (int k = 0; k < cols; k++) {
+       dst_vec_data[index] += src_mat_data[index * stride + k];
+     }
    }
  }
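
Every wrapper above follows the same launch convention: a 1-d grid of
CU1DBLOCK-thread blocks with a grid-stride loop inside the kernel, so any n is
covered regardless of grid size. A self-contained sketch of that pattern
(CU1DBLOCKF is redefined here for illustration and the kernel is hypothetical,
not one from this file):

    #include <cuda_runtime.h>
    #include <cmath>

    static const float CU1DBLOCKF = 1024.0f;  // threads per block

    // Grid-stride loop: each thread steps through the array by the total
    // thread count, so the kernel is correct for any grid size.
    __global__ void KernelScale(const size_t n, const float x, float *out) {
      for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
           idx += blockDim.x * gridDim.x) {
        out[idx] *= x;
      }
    }

    void scale(const size_t n, const float x, float *out, cudaStream_t s) {
      // ceil(n / CU1DBLOCKF) rounds the block count up; as in the wrappers
      // above, the stream argument is accepted but not yet forwarded.
      KernelScale <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, out);
    }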
  

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.h
index 09953e4,d8a58a5..976b78f
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@@ -31,72 -31,66 +31,73 @@@ namespace singa 
  
  // TODO(wangwei) make all function templates.
  namespace cuda {
- void sum(int n, const float *in, float *out);
  
- void sum_row(int rows, int cols, int stride, const float *in, float *out);
- 
- void sum_col(int rows, int cols, int stride, const float *in, float *out);
- 
- void add_row(int rows, int cols, int stride, const float *in_row,
-              const float *in_mat, float *out);
- 
- void add(int n, const float *a, const float *b, float *out);
- 
- void sub(int n, const float *a, const float *b, float *out);
- 
- void exp(int n, const float *in, float *out);
- 
- void log(int n, const float *in, float *out);
- 
- void sigmoid(int n, const float *in, float *out);
- 
- void sigmoid_grad(int n, const float *in, float *out);
- 
- void relu(int n, const float *in, float *out);
- 
- void relu_grad(int n, const float *in, float *out);
- 
- void tanh(int n, const float *in, float *out);
- 
- void tanh_grad(int n, const float *in, float *out);
+ // 0 input
+ void set(const size_t n, const float v, float *out, cudaStream_t s);
+ 
+ // 1 input
+ void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+ void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+ void log(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+ void square(const size_t n, const float *in, float *out, cudaStream_t s);
+ void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+ void relu(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
+ void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+ void clamp(const size_t n, const float low, const float high, const float *in,
+            float *out, cudaStream_t s);
+ 
+ void pow(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
  
- void softplus(int n, const float *in, float *out);
+ void add(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
  
- void softplus_grad(int n, const float *in, float *out);
+ void mult(const size_t n, const float *in, const float x, float *out,
+           cudaStream_t s);
  
- void square(int n, const float *in, float *out);
+ void div(const size_t n, const float x, const float *in, float *out,
+          cudaStream_t s);
  
- void square_grad(int n, const float *in, float *out);
+ void threshold(const size_t n, const float x, const float *in, float *out,
+                cudaStream_t s);
  
- void sqrt(int n, const float *in, float *out);
+ void gt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void ge(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void lt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void le(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
  
- void pow(int n, const float *a, const float *b, float *out);
+ // 2 inputs
+ void pow(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void mult(int n, const float *a, const float *b, float *out);
+ void add(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void mult(int n, const float *a, const float x, float *out);
+ void sub(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void div(int n, const float *a, const float *b, float *out);
+ void mult(const size_t n, const float *in1, const float *in2, float *out,
+           cudaStream_t s);
  
- void set_value(int n, float v, float *out);
+ void div(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void threshold(int n, float alpha, const float *in, float *out);
+ void sum(const size_t n, const float *in, float *out, cudaStream_t s);
  
- // follow the consistency guide for math API
 +void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
 +                         const float *p, const int *t, float *loss,
 +                         cudaStream_t stream);
- void Div(const size_t num, const float x, const float *in, float *out,
-          cudaStream_t s);
- void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void Set(const size_t num, const float x, float *out, cudaStream_t s);
 +void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
 +                            const float *p, const int *t, float *grad,
 +                            cudaStream_t stream);
 +
  }  // cuda
  
  }  // namespace singa
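
A sketch of how the two loss wrappers above are meant to be driven (the
surrounding function and device pointers are hypothetical; note the backend
implementations check that the gradient buffer aliases 'p', matching the
in-place design):

    #include <cuda_runtime.h>
    #include "math_kernel.h"

    void LossExample(float *d_p, const int *d_t, float *d_loss,
                     size_t batchsize, size_t dim, cudaStream_t stream) {
      // d_p: batchsize x dim softmax probabilities; d_t: batchsize labels.
      singa::cuda::ComputeCrossEntropy(batchsize, dim, d_p, d_t, d_loss,
                                       stream);
      // Backward reuses d_p as the gradient buffer (p == grad).
      singa::cuda::SoftmaxCrossEntropyBwd(batchsize, dim, d_p, d_t, d_p,
                                          stream);
      cudaStreamSynchronize(stream);
    }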

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor.cc
index 1ac25c6,e62386a..4e0d94b
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@@ -592,42 -557,8 +556,8 @@@ void AddRow(const SType alpha, const ST
      Mult(alpha, one, vmat, beta, M);
    }
  }
- void ComputeCrossEntropy(const Tensor& t, Tensor* p) {
-   CHECK_LE(p->nDim(), 2u);
-   CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
-   size_t batchsize = 1;
-   if (p->nDim() == 2u) batchsize = p->shape(0);
-   size_t dim = p->Size() / batchsize;
-   TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
-     p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
-       ComputeCrossEntropy<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
-                                        p->blob(), ctx);
-     }, {p->blob(), t.blob()}, {p->blob()});
-   });
- }
- 
- template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
-   Tensor out(in.shape(), in.device(), in.data_type());
-   Div(alpha, in, &out);
-   return out;
- }
- 
- template Tensor Div<float>(const float, const Tensor &);
- 
- template <typename SType>
- void Div(const SType alpha, const Tensor &in, Tensor *out) {
-   CheckDataTypeAndLang(in, *out);
-   CHECK(in.shape() == out->shape());
-   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-     // TODO(wangwei) type cast SType to DType;
-     in.device()->Exec(
-         [alpha, in, out](Context *ctx) {
-           Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
-         },
-         {in.blob()}, {out->blob()});
-   });
- }
- template void Div<float>(const float, const Tensor &, Tensor *);
 -template <>
++template
+ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
  
  /// Divide column 'v' by each column of matrix M; write results into 'out'
  void DivColumn(const Tensor &v, Tensor *M) {
@@@ -725,4 -639,92 +638,122 @@@ void SumRows(const Tensor &M, Tensor *v
      Mult(X, one, v);
    }
  }
+ // ====================Random operations=====================================
+ template <typename SType>
+ void Bernoulli(const SType p, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto prob = TypeCast<SType, DType>(p);
+     out->device()->Exec([prob, out](Context *ctx) {
+       Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Bernoulli<float>(const float p, Tensor *out);
+ 
+ template <typename SType>
+ void Uniform(const SType low, const SType high, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto l = TypeCast<SType, DType>(low);
+     auto h = TypeCast<SType, DType>(high);
+     out->device()->Exec([l, h, out](Context *ctx) {
+       Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Uniform<float>(const float low, const float high, Tensor *out);
+ 
+ template <typename SType>
+ void Gaussian(const SType mean, const SType std, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto m = TypeCast<SType, DType>(mean);
+     auto s = TypeCast<SType, DType>(std);
+     out->device()->Exec([m, s, out](Context *ctx) {
+       Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Gaussian<float>(const float mean, const float std, Tensor *out);
+ 
+ // ================Blas operations============================================
+ template <typename SType>
+ void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+     auto a = TypeCast<SType, DType>(alpha);
+     out->device()->Exec([a, in, out](Context *ctx) {
+       Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
+     }, {in.blob(), out->blob()}, {out->blob()});
+   });
+ }
 -template <>
 -void Axpy(const float alpha, const Tensor &in, Tensor *out);
++template void Axpy(const float alpha, const Tensor &in, Tensor *out);
+ 
+ Tensor Mult(const Tensor &A, const Tensor &B) {
+   Shape s;
+   s.push_back(A.shape(0));
+   if (B.nDim() == 2) s.push_back(B.shape(1));
+   Tensor out(s, A.device(), A.data_type());
+   Mult(A, B, &out);
+   return out;
+ }
+ 
+ void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+   Mult(1.0f, A, B, 0.0f, out);
+ }
+ 
+ template <typename SType>
+ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+           Tensor *C) {
+   CHECK_EQ(A.shape().size(), 2u);
+   if (B.nDim() == 1u) {
+     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+       auto a = TypeCast<SType, DType>(alpha);
+       auto b = TypeCast<SType, DType>(beta);
+       C->device()->Exec([a, A, b, B, C](Context *ctx) {
+         GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
+                           B.blob(), b, C->blob(), ctx);
+       }, {A.blob(), B.blob()}, {C->blob()});
+     });
+   } else {
+     CHECK(!C->transpose());
+     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+       auto a = TypeCast<SType, DType>(alpha);
+       auto b = TypeCast<SType, DType>(beta);
+       C->device()->Exec([a, A, b, B, C](Context *ctx) {
+         GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+                           A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
+       }, {A.blob(), B.blob()}, {C->blob()});
+     });
+   }
+ }
+ 
++
++// ************************
++// Misc.
++// ***********************
++void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
++  CHECK_LE(p.nDim(), 2u);
++  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
++  size_t batchsize = 1;
++  if (p.nDim() == 2u) batchsize = p.shape(0);
++  size_t dim = p.Size() / batchsize;
++  TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
++    p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) {
++      ComputeCrossEntropy<DType, Lang>(batchsize, dim, p.blob(), t.blob(),
++                                       loss->blob(), ctx);
++    }, {p.blob(), t.blob()}, {loss->blob()});
++  });
++}
++void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
++  CHECK_LE(p->nDim(), 2u);
++  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
++  size_t batchsize = 1;
++  if (p->nDim() == 2u)
++    batchsize = p->shape(0);
++  size_t dim = p->Size() / batchsize;
++  TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
++    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
++      SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
++                                          p->blob(), ctx);
++    }, {p->blob(), t.blob()}, {p->blob()});
++  });
++}
  }  // namespace singa
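
A quick numeric check of the two routines above (values chosen by hand, not
taken from this commit's tests): with batchsize = 1, dim = 3, p = {0.2, 0.7,
0.1} and integer target t = {1}, ComputeCrossEntropy writes loss = {-log(0.7)}
~= {0.357}, and SoftmaxCrossEntropyBwd rewrites p in place to {0.2, 0.7 - 1,
0.1} = {0.2, -0.3, 0.1}, i.e. dx = p - onehot(t) for p = softmax(x).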

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math.h
index bcf4908,b86e1cb..12490d1
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@@ -269,109 -293,74 +293,95 @@@ void Scale(const size_t num, const DTyp
  template <typename DType, typename Lang>
  void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
           Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+   LOG(FATAL) << "Dot Not Implemented";
  }
  
- // ===== Level 2
- /// ret = alpha * op(A) * v + beta * ret.
- /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+ /// out = alpha * A * v + beta * out.
+ /// 'trans' indicates whether A's internal data layout is transposed
  template <typename DType, typename Lang>
- void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
-           DType beta, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+           const Blob *A, const Blob *v, const DType beta, Blob *out,
+           Context *ctx) {
+   LOG(FATAL) << "GEMV Not Implemented";
  }
  
- // ===== Level 3
- 
- // ================Random functions===========================================
- /// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
- // Get the random generator from 'ctx'
- // If DType is not float, then convert the threshold to DType
- template <typename DType, typename Lang>
- void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
- // The random generator should be extracted from ctx.
- // If DType is not float, then convert the low and high to DType
+ /// Multiply a matrix with a diagonal matrix constructed using values from 'v'.
+ /// If side_right is true, compute M * diag(v); otherwise compute diag(v) * M.
  template <typename DType, typename Lang>
- void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
+           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
+   LOG(FATAL) << "DGMM Not Implemented";
  }
- // The random generator should be extracted from ctx.
- // If DType is not float, then convert the mean and std to DType
+ 
+ /// C = alpha * A * B + beta * C.
+ /// 'transA' indicates whether A's internal data layout is transposed
  template <typename DType, typename Lang>
- void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void GEMM(const bool transA, const bool transB, const size_t nrowA,
+           const size_t ncolB, const size_t ncolA, const DType alpha,
+           const Blob *A, const Blob *B, const DType beta, Blob *C,
+           Context *ctx) {
+   LOG(FATAL) << "GEMM Not Implemented";
  }
  
- // ========follow the consistency guide of math API
- 
 +/// Compute the cross entropy loss for each of the 'batchsize' samples, given
 +/// the 'dim'-way probabilities 'p' and the integer target labels 't'.
 +template <typename DType, typename Lang>
 +void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
 +                         const Blob *p, const Blob *t, Blob *loss,
 +                         Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
- template <typename DType, typename Lang>
- void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
-          Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
 +
- /// multiply a matrix with a diagnoal matrix constructed using values from 'v'.
- /// if matrix_lef_side is true, do M*v; else do v*M
 +template <typename DType, typename Lang>
- void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
-           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
++void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
++                            const Blob *p, const Blob *t, Blob *grad,
++                            Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
 +
- /// C = alpha * A * B + beta * C.
- /// transA indicates if the internal data layout is transposed of A
+ // **************************************
+ // Matrix functions
+ // **************************************
+ /*
+ /// Add the vector v to every column of A as the column of out
  template <typename DType, typename Lang>
- void GEMM(const bool transA, const bool transB, const size_t nrowA,
-           const size_t ncolB, const size_t ncolA, const DType alpha,
-           const Blob *A, const Blob *B, const DType beta, Blob *C,
-           Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+             Blob *out, Context *ctx) {
+   LOG(FATAL) << "AddCol Not Implemented";
  }
- /// ret[i]=(input[i]<x)?1.f:0.f
+ // TODO(wangwei) unify AddRow and AddCol.
+ /// Add the vector v to every row of A as the row of out
  template <typename DType, typename Lang>
- void LT(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+             Blob *out, Context *ctx) {
+   LOG(FATAL) << "AddRow Not Implemented";
  }
- /// ret[i]=(input[i]<=x)?1.f:0.f
- template <typename DType, typename Lang>
- void LE(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
- /// ret[i]=(input[i]>x)?1.f:0.f
+ /// outer-product.
+ /// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
  template <typename DType, typename Lang>
- void GT(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+            Blob *out, Context *ctx) {
+   LOG(FATAL) << "Outer Not Implemented";
  }
- /// ret[i]=(input[i]>=x)?1.f:0.f
+ 
+ /// Sum the columns of the in matrix into a vector
  template <typename DType, typename Lang>
- void GE(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                 Context *ctx) {
+   LOG(FATAL) << "SumColumns Not Implemented";
  }
 +template <typename DType, typename Lang>
 +void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
 +
+ // TODO(wangwei) unify SumRow and SumCol.
+ /// Sum the rows of the in matrix into a vector
  template <typename DType, typename Lang>
- void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
-                             const Blob *p, const Blob *t, Blob *grad,
-                             Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+              Context *ctx) {
+   LOG(FATAL) << "SumRows Not Implemented";
  }
- 
+ */
  }  // namespace singa
  #endif  // SINGA_CORE_MATH_H_
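
The pattern throughout this header is a generic function template that aborts
with "Not Implemented", shadowed by a full specialization in each backend
file. A standalone sketch of that dispatch (the Blob/Context stand-ins below
are simplified stubs, not the real singa types):

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    namespace lang { struct Cpp {}; struct Cuda {}; }
    struct Blob { float *data; };
    struct Context {};

    // Generic fallback: chosen when no backend specialization exists.
    template <typename DType, typename Lang>
    void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
      std::fprintf(stderr, "Set Not Implemented\n");
      std::abort();
    }

    // Full specialization for the CPU backend shadows the fallback.
    template <>
    void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
                               Context *ctx) {
      for (size_t i = 0; i < num; i++) out->data[i] = x;
    }

    int main() {
      float buf[4];
      Blob b{&buf[0]};
      Context ctx;
      Set<float, lang::Cpp>(4, 2.5f, &b, &ctx);  // picks the CPU version
      return 0;
    }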

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math_cpp.h
index 907c656,0b280a3..c5d092b
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@@ -323,32 -420,196 +422,229 @@@ void GEMM<float, lang::Cpp>(const bool 
    cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
                lda, BPtr, ldb, beta, CPtr, ldc);
  }
- #endif  // USE_CBLAS
+ 
+ #else
  
  template <>
- void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                            Context *ctx) {
+ void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                             Context *ctx) {
+   size_t maxPos = 0;
+   float maxVal = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     if (i == 0) {
+       maxVal = inPtr[i];
+     } else if (inPtr[i] > maxVal) {
+       maxVal = inPtr[i];
+       maxPos = i;
+     }
+   }
+   *out = maxPos;
+ }
+ template <>
+ void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                             Context *ctx) {
+   size_t minPos = 0;
+   float minVal = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     if (i == 0) {
+       minVal = inPtr[i];
+     } else if (inPtr[i] < minVal) {
+       minVal = inPtr[i];
+       minPos = i;
+     }
+   }
+   *out = minPos;
+ }
+ 
+ template <>
+ void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                             Context *ctx) {
+   float sum = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     sum += fabs(inPtr[i]);
+   }
+   *out = sum;
+ }
+ 
+ template <>
+ void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                             Blob *out, Context *ctx) {
    float *outPtr = static_cast<float *>(out->mutable_data());
-   for (size_t i = 0; i < num; i++) outPtr[i] = x;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     outPtr[i] += alpha * inPtr[i];
+   }
  }
+ 
+ template <>
+ void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                              Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   for (size_t i = 0; i < num; i++) {
+     outPtr[i] *= x;
+   }
+ }
+ 
+ template <>
+ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                            float *out, Context *ctx) {
+   float sum = 0;
+   const float *in1Ptr = static_cast<const float *>(in1->data());
+   const float *in2Ptr = static_cast<const float *>(in2->data());
+   for (size_t i = 0; i < num; i++) {
+     sum += in1Ptr[i] * in2Ptr[i];
+   }
+   *out = sum;
+ }
+ 
+ template <>
+ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                             const float alpha, const Blob *A, const Blob *v,
+                             const float beta, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < m; r++) {
+     float sum = 0;
+     for (size_t c = 0; c < n; c++) {
+       size_t idx = trans ? c * m + r : r * n + c;
+       sum += APtr[idx] * vPtr[c];
+     }
+     outPtr[r] = alpha * sum + beta * outPtr[r];
+   }
+ }
+ 
+ #endif  // USE_CBLAS
++template <>
++void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize,
++                                           const size_t dim, const Blob *p,
++                                           const Blob *t, Blob *loss,
++                                           Context *ctx) {
++  const float *pPtr = static_cast<const float *>(p->data());
++  const int *tPtr = static_cast<const int *>(t->data());
++  float *lossPtr = static_cast<float *>(loss->mutable_data());
++  for (size_t i = 0; i < batchsize; i++) {
++    int truth_idx = tPtr[i];
++    CHECK_GE(truth_idx, 0);
++    float prob_of_truth = pPtr[i * dim + truth_idx];
++    lossPtr[i] = -std::log(std::max(prob_of_truth, FLT_MIN));
++  }
++}
++
 +template <>
 +void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
 +                                              const size_t dim, const Blob *p,
 +                                              const Blob *t,
 +                                              Blob *grad, Context *ctx) {
 +  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
 +  // const float* pPtr = static_cast<const float*>(p->data());
-   const float *tPtr = static_cast<const float *>(t->data());
++  const int *tPtr = static_cast<const int *>(t->data());
 +  float *gradPtr = static_cast<float *>(grad->mutable_data());
 +
 +  for (size_t i = 0; i < batchsize; i++) {
 +    int truth_idx = static_cast<int>(tPtr[i]);
 +    CHECK_GE(truth_idx, 0);
 +    gradPtr[i * dim + truth_idx] -= 1.0;
 +  }
 +}
 +
  
+ // =========Matrix operations ================================================
+ /*
+ template <>
+ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *A, const Blob *v, Blob *out,
+                               Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+     }
+   }
+ }
+ 
+ template <>
+ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *A, const Blob *v, Blob *out,
+                               Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+     }
+   }
+ }
+ template <>
+ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                              const Blob *in2, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *in1Ptr = static_cast<const float *>(in1->data());
+   const float *in2Ptr = static_cast<const float *>(in2->data());
+   for (size_t r = 0; r < m; r++) {
+     size_t offset = r * n;
+     for (size_t c = 0; c < n; c++) {
+       outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+     }
+   }
+ }
+ template <>
+ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   float *bPtr = new float[ncol];
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     float denom = 0.f;
+     for (size_t c = 0; c < ncol; c++) {
+       bPtr[c] = exp(inPtr[offset + c]);
+       denom += bPtr[c];
+     }
+     for (size_t c = 0; c < ncol; c++) {
+       size_t idx = offset + c;
+       outPtr[idx] = bPtr[c] / denom;
+     }
+   }
+   delete[] bPtr;
+ }
+ 
+ template <>
+ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                   const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t c = 0; c < ncol; c++) {
+     outPtr[c] = 0.f;
+   }
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[c] += inPtr[offset + c];
+     }
+   }
+ }
+ 
+ template <>
+ void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     outPtr[r] = 0.f;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[r] += inPtr[offset + c];
+     }
+   }
+ }
+ */
  }  // namespace singa
  
  #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
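
The index arithmetic in the fallback GEMV above is worth a worked example.
The Blob keeps the matrix in its original layout, so for trans = false the
logical element (r, c) of the m x n matrix sits at row-major offset r * n + c,
while for trans = true the buffer stores the untransposed n x m matrix and the
same logical element sits at c * m + r. With m = 2, n = 3, A = {{1,2,3},
{4,5,6}}, v = {1,1,1}, alpha = 1 and beta = 0 (values chosen by hand), the
loop produces out = {6, 15}.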

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math_cuda.h
index c69620c,e2597d5..9a8839e
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@@ -131,55 -398,6 +398,28 @@@ void GEMM<float, lang::Cuda>(const boo
                             BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
  }
  
 +template <>
- void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out, Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::GE(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out,  Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::GT(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out, Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::LE(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out,  Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::LT(num, inPtr, x, outPtr, ctx->stream);
- }
- 
- template<>
- void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
-                             Context *ctx) {
-   float *outPtr = static_cast<float *>(out->mutable_data());
-   cuda::Set(num, x, outPtr, ctx->stream);
++void ComputeCrossEntropy<float, lang::Cuda>(const size_t batchsize,
++                                            const size_t dim, const Blob *p,
++                                            const Blob *t, Blob *loss,
++                                            Context *ctx) {
++  const float *pPtr = static_cast<const float *>(p->data());
++  const int *tPtr = static_cast<const int *>(t->data());
++  float *lossPtr = static_cast<float *>(loss->mutable_data());
++  cuda::ComputeCrossEntropy(batchsize, dim, pPtr, tPtr, lossPtr, ctx->stream);
 +}
- 
 +template <>
 +void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
 +                                               const size_t dim, const Blob *p,
 +                                               const Blob *t, Blob *grad,
 +                                               Context *ctx) {
 +  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
 +  const float *pPtr = static_cast<const float *>(p->data());
 +  const int *tPtr = static_cast<const int *>(t->data());
 +  float *gradPtr = static_cast<float *>(grad->mutable_data());
 +  cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
 +                               ctx->stream);
 +}
- 
  }  // namespace singa
  
  #endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --cc src/model/layer/softmax.cc
index 8af1d76,b379fc1..f554f25
--- a/src/model/layer/softmax.cc
+++ b/src/model/layer/softmax.cc
@@@ -25,14 -25,16 +25,17 @@@ void Softmax::Setup(const LayerConf& co
  }
  
  const Tensor Softmax::Forward(int flag, const Tensor& input) {
+   Tensor output;
    if (input.nDim() == 1) {
-     buf_.push(SoftMax(input));
 -    Tensor tmp = Reshape(input, Shape{1, input.Size()});
 -      output = SoftMax(tmp, 0);
++    output = SoftMax(input);
    } else {
 -    output = SoftMax(input, axis_);
 +    size_t nrow = Product(input.shape(), 0, axis_);
 +    const Tensor& tmp = Reshape(input, Shape{nrow, input.Size() / nrow});
-     buf_.push(SoftMax(tmp));
++    output = SoftMax(tmp);
    }
-   return buf_.top();
+   if (flag & kTrain)
+     buf_.push(output);
+   return output;
  }
  
  const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,
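
To make the reshape above concrete (an illustrative example, not part of the commit): for an input of shape {4, 3, 5} with axis_ = 1, nrow = Product(input.shape(), 0, 1) = 4, so SoftMax runs on a 4 x 15 matrix, normalizing each row across all trailing dimensions.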

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/model/loss/softmax_cross_entropy.cc
----------------------------------------------------------------------
diff --cc src/model/loss/softmax_cross_entropy.cc
index 4ca323a,0000000..bed3348
mode 100644,000000..100644
--- a/src/model/loss/softmax_cross_entropy.cc
+++ b/src/model/loss/softmax_cross_entropy.cc
@@@ -1,53 -1,0 +1,53 @@@
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +#include <stack>
 +#include "singa/model/loss.h"
 +
 +namespace singa {
 +
- 
- Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
++Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction,
++                                    const Tensor& target) {
 +  CHECK(buf_.empty()) << "Do not call Forward more than once in a row."
 +                      << " The calling pattern is [Forward|Evaluate] Backward";
 +  size_t batchsize = 1;
 +  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
 +  size_t dim = prediction.Size() / batchsize;
 +  const Tensor& input = Reshape(prediction, Shape{batchsize, dim});
 +  Tensor prob = SoftMax(input);
 +
 +  // buffer intermediate data
 +  buf_.push(prob);
 +  buf_.push(target);
-   Tensor loss = prob.Clone();
++  Tensor loss(Shape{batchsize}, prob.device(), prob.data_type());
 +
-   ComputeCrossEntropy(target, &loss);
++  ComputeCrossEntropy(prob, target, &loss);
 +  return loss;
 +}
 +
 +Tensor SoftmaxCrossEntropy::Backward() {
 +  const Tensor target = buf_.top();
 +  buf_.pop();
 +  Tensor prob = buf_.top();
 +  buf_.pop();
 +  SoftmaxCrossEntropyBwd(target, &prob);
 +  return prob;
 +}
 +}  // namespace singa
 +
 +
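
For readers new to this Loss API, a minimal usage sketch (my own, mirroring the test file that follows; the data values are taken from that test):

  singa::SoftmaxCrossEntropy cross_entropy;
  singa::Tensor p(singa::Shape{2, 4});      // logits, float
  singa::Tensor t(singa::Shape{2, 1});      // labels
  t.AsType(singa::kInt);                    // labels are stored as int
  const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
  const int tdat[2] = {0, 2};
  p.CopyDataFromHostPtr(pdat, 8);
  t.CopyDataFromHostPtr(tdat, 2);
  singa::Tensor l = cross_entropy.Forward(p, t);  // per-sample losses, Shape{2}
  singa::Tensor g = cross_entropy.Backward();     // gradient w.r.t. the logits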

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cross_entropy.cc
index 6b8cb69,0000000..0eb36e5
mode 100644,000000..100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@@ -1,114 -1,0 +1,116 @@@
 +/************************************************************
 +*
 +* Licensed to the Apache Software Foundation (ASF) under one
 +* or more contributor license agreements.  See the NOTICE file
 +* distributed with this work for additional information
 +* regarding copyright ownership.  The ASF licenses this file
 +* to you under the Apache License, Version 2.0 (the
 +* "License"); you may not use this file except in compliance
 +* with the License.  You may obtain a copy of the License at
 +*
 +*   http://www.apache.org/licenses/LICENSE-2.0
 +*
 +* Unless required by applicable law or agreed to in writing,
 +* software distributed under the License is distributed on an
 +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 +* KIND, either express or implied.  See the License for the
 +* specific language governing permissions and limitations
 +* under the License.
 +*
 +*************************************************************/
 +
 +#include "gtest/gtest.h"
 +#include "singa/core/tensor.h"
 +#include "singa/core/device.h"
 +#include "singa/model/loss.h"
 +#include "singa_config.h"
 +
 +using singa::Tensor;
 +class TestSoftmaxCrossEntropy : public ::testing::Test {
 + protected:
 +  virtual void SetUp() {
 +    p.Reshape(singa::Shape{2, 4});
 +    t.Reshape(singa::Shape{2, 1});
 +  }
 +  const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
-   const float tdat[2] = {0.0, 2.0};
++  const int tdat[2] = {0, 2};
 +
 +  singa::Tensor p, t;
 +};
 +
 +TEST_F(TestSoftmaxCrossEntropy, CppForward) {
 +  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  const Tensor& loss = cross_entropy.Forward(p, t);
 +  auto ldat = loss.data<const float*>();
 +
 +  const float result_test = -log(0.25);
 +  EXPECT_FLOAT_EQ(ldat[0], result_test);
 +  EXPECT_FLOAT_EQ(ldat[1], result_test);
 +}
 +
 +TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
 +  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  cross_entropy.Forward(p, t);
 +  const Tensor& grad = cross_entropy.Backward();
 +
 +  auto gdat = grad.data<const float*>();
 +  EXPECT_FLOAT_EQ(gdat[0], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[1], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[2], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[3], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[4], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[5], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[6], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[7], 0.25);
 +}
 +
 +#ifdef USE_CUDA
 +
 +TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  singa::CudaGPU dev;
 +  p.ToDevice(&dev);
 +  t.ToDevice(&dev);
 +  p.CopyDataFromHostPtr(pdat, 8);
 +  t.AsType(singa::kInt);  // labels are int, matching the Cpp tests
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  Tensor loss = cross_entropy.Forward(p, t);
 +  loss.ToHost();
 +  auto ldat = loss.data<const float*>();
 +
 +  const float result_test = -log(0.25);
 +  EXPECT_FLOAT_EQ(ldat[0], result_test);
 +  EXPECT_FLOAT_EQ(ldat[1], result_test);
 +}
 +
 +TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  singa::CudaGPU dev;
 +  p.ToDevice(&dev);
 +  t.ToDevice(&dev);
 +  p.CopyDataFromHostPtr(pdat, 8);
 +  t.AsType(singa::kInt);  // labels are int, matching the Cpp tests
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  cross_entropy.Forward(p, t);
 +  Tensor grad = cross_entropy.Backward();
 +
 +  grad.ToHost();
 +  auto gdat = grad.data<const float*>();
 +  EXPECT_FLOAT_EQ(gdat[0], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[1], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[2], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[3], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[4], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[5], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[6], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[7], 0.25);
 +}
 +#endif  // USE_CUDA
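
A quick check of the expected values in these tests (my own derivation, not from the commit): all eight logits are equal, so the softmax over each row of four entries is uniform, p_j = 0.25. Hence

  \ell = -\log p_t = -\log 0.25, \qquad
  \frac{\partial \ell}{\partial z_j} = p_j - \mathbf{1}[j = t]
    = \begin{cases} 0.25 - 1 = -0.75, & j = t, \\ 0.25, & j \neq t, \end{cases}

which matches the EXPECT_FLOAT_EQ values above for target classes 0 and 2.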

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_mse.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --cc test/singa/test_softmax.cc
index 09dfcd9,c087605..fff8510
--- a/test/singa/test_softmax.cc
+++ b/test/singa/test_softmax.cc
@@@ -36,13 -36,14 +36,14 @@@ TEST(Softmax, Setup) 
    EXPECT_EQ(2, sft.Axis());
  }
  
+ #ifdef USE_CBLAS
  TEST(Softmax, Forward) {
    const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
--  size_t n = sizeof(x) / sizeof(float);
    size_t row = 2;
    size_t col = 3;
++  size_t n = row * col;
    singa::Tensor in(singa::Shape{row, col});
--  in.CopyDataFromHostPtr<float>(x, n);
++  in.CopyDataFromHostPtr<float>(x, row * col);
  
    int axis = 1;
    Softmax sft;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --cc test/singa/test_tensor_math.cc
index 8368c55,38a9291..1092d69
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@@ -20,6 -22,263 +22,261 @@@ class TestTensorMath : public ::testing
    const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
  };
  
+ TEST_F(TestTensorMath, MemberAbs) {
+   Tensor aa = a.Clone();
+   Tensor bb = b.Clone();
+   Tensor cc = aa - bb;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+   EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+   EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+ 
+   Tensor p = Abs(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberExp) {
+   Tensor p = Exp(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+   EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+   EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLog) {
+   Tensor p = Log(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+   EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+   EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberReLU) {
+   Tensor aa = a.Clone();
+   Tensor cc = aa - 2.0f;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+ 
+   Tensor p = ReLU(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSigmoid) {
+   Tensor p = Sigmoid(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+   EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+   EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSign) {
+   Tensor aa = a.Clone();
+   Tensor cc = aa - 2.0f;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+ 
+   Tensor p = Sign(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_EQ(0.0f, dptr1[0]);
+   EXPECT_EQ(0.0f, dptr1[1]);
+   EXPECT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSqrt) {
+   Tensor p = Sqrt(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+   EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+   EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSquare) {
+   Tensor p = Square(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+   EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+   EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberTanh) {
+   Tensor p = Tanh(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+   EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+   EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, Sum) {
+   Tensor p1 = Sum(e, 0);
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
+ 
+   Tensor p2(Shape{3, 1});
+   p2 = Sum(e, 1);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+   EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+   EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
+ }
+ 
+ TEST_F(TestTensorMath, SoftMax) {
 -  Tensor p1(Shape{3, 2});
 -  p1 = SoftMax(e, 0);
++  Tensor p1 = SoftMax(Reshape(e, Shape{1, 6}));
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 6; i++) sum += exp(i + 1);
+   EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+   EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+   EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+   EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+   EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+   EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+ 
 -  Tensor p2(Shape{3, 2});
 -  p2 = SoftMax(e, 1);
++  Tensor p2 = SoftMax(e);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+   EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLT) {
+   Tensor p1 = a < 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLE) {
+   Tensor p1 = a <= 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGT) {
+   Tensor p1 = a > 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGE) {
+   Tensor p1 = a >= 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberPow) {
+   Tensor p1 = Pow(b, 3.0f);
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+   EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+   EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
+ 
+   // TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the
+   // function is complete
+   // Tensor p2 = Pow(a,b);
+   // const float *dptr2 = p2.data<const float *>();
+   // EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+   // EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+   // EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSub) {
+   Tensor p1 = a - b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberEltwiseMult) {
+   Tensor p1 = a * b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberDiv) {
+   Tensor p1 = a / b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
+ 
+   Tensor p2 = Div(10.0f, b);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+   EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+   EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
+ 
+   Tensor p3 = a / 8.0f;
+   const float *dptr3 = p3.data<const float *>();
+   EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+   EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+   EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberBernoulli) {
+   Tensor p1(Shape{10000});
+   Bernoulli(0.3f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 10000; i++) sum += dptr1[i];
+   float mean = sum / 10000;
+   EXPECT_NEAR(mean, 0.3f, 1e-2);
+ 
+   sum = 0;
+   for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 9999;
+   EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
+ }
+ 
+ TEST_F(TestTensorMath, MemberUniform) {
+   Tensor p1(Shape{10000});
+   Uniform(0.1f, 0.2f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 10000; i++) sum += dptr1[i];
+   float mean = sum / 10000;
+   EXPECT_NEAR(mean, 0.15f, 1e-3);
+ 
+   sum = 0;
+   for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 9999;
+   EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGaussian) {
+   Tensor p1(Shape{50000});
+   Gaussian(0.0f, 1.0f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 50000; i++) sum += dptr1[i];
+   float mean = sum / 50000;
+   EXPECT_NEAR(mean, 0.0, 1e-2);
+ 
+   sum = 0;
+   for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 49999;
+   EXPECT_NEAR(variance, 1.0, 1e-2);
+ }
+ 
  TEST_F(TestTensorMath, MemberAddTensor) {
    Tensor aa = a.Clone();
    aa += a;


[31/50] [abbrv] incubator-singa git commit: SINGA-162 Transfer the codebase for SINGA v1.0 into dev branch

Posted by zh...@apache.org.
SINGA-162 Transfer the codebase for SINGA v1.0 into dev branch

Add guard flags in test files to support testing without CUDA.
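
The guard pattern is a plain preprocessor fence (a sketch; SomeLayer and SomeCudaTest are placeholder names), so the file still compiles when SINGA is built without CUDA:

  #ifdef USE_CUDA
  TEST(SomeLayer, SomeCudaTest) {
    singa::CudaGPU dev;  // only available when built with CUDA
    // ... device-side assertions ...
  }
  #endif  // USE_CUDA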


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/04e23d1a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/04e23d1a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/04e23d1a

Branch: refs/heads/master
Commit: 04e23d1a60d5160ff319f63fe89d715feee53b57
Parents: 7a19165
Author: WANG Sheng <wa...@gmail.com>
Authored: Fri Jun 3 15:31:06 2016 +0800
Committer: WANG Sheng <wa...@gmail.com>
Committed: Fri Jun 3 15:31:06 2016 +0800

----------------------------------------------------------------------
 test/singa/test_dense.cc       |  8 ++++----
 test/singa/test_mse.cc         |  2 ++
 test/singa/test_sgd.cc         |  2 +-
 test/singa/test_softmax.cc     |  2 ++
 test/singa/test_tensor_math.cc | 10 ++++++----
 5 files changed, 15 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/04e23d1a/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
index 606f819..5050d7e 100644
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@ -37,7 +37,7 @@ TEST(Dense, Setup) {
   EXPECT_EQ(3u, dense.num_output());
   EXPECT_EQ(2u, dense.num_input());
 }
-
+#ifdef USE_CBLAS
 TEST(Dense, ForwardCpp) {
   Dense dense;
 
@@ -75,9 +75,8 @@ TEST(Dense, ForwardCpp) {
                        x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
                       outptr1[i * 3 + j]);
 }
-
-
-
+#endif  // USE_CBLAS
+#ifdef USE_CUDA
 TEST(Dense, BackwardCpp) {
   Dense dense;
 
@@ -140,6 +139,7 @@ TEST(Dense, BackwardCpp) {
   for (int i = 0; i < 3; i++)
     EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
 }
+#endif
 
 #ifdef USE_CUDA
 TEST(Dense, ForwardCuda) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/04e23d1a/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 67f583c..ccaab7a 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -66,6 +66,7 @@ TEST_F(TestMSE, CppBackward) {
     EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
 }
 #endif
+#ifdef USE_CUDA
 TEST_F(TestMSE, CudaForward) {
   singa::MSE mse;
   singa::CudaGPU dev;
@@ -98,3 +99,4 @@ TEST_F(TestMSE, CudaBackward) {
   for (size_t i = 0; i < grad.Size(); i++)
     EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
 }
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/04e23d1a/test/singa/test_sgd.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
index a660556..c0b6e2b 100644
--- a/test/singa/test_sgd.cc
+++ b/test/singa/test_sgd.cc
@@ -82,7 +82,7 @@ TEST(SGD, ApplyWithMomentum) {
   }
 }
 
-#ifndef USE_CUDA
+#ifdef USE_CUDA
 TEST(SGD, ApplyWithoutMomentumCuda) {
   singa::SGD sgd;
   const float v[4] = {0.1, 0.2, 0.3, 0.4};

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/04e23d1a/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_softmax.cc b/test/singa/test_softmax.cc
index 6ee8b3f..c087605 100644
--- a/test/singa/test_softmax.cc
+++ b/test/singa/test_softmax.cc
@@ -36,6 +36,7 @@ TEST(Softmax, Setup) {
   EXPECT_EQ(2, sft.Axis());
 }
 
+#ifdef USE_CBLAS
 TEST(Softmax, Forward) {
   const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
   size_t n = sizeof(x) / sizeof(float);
@@ -108,3 +109,4 @@ TEST(Softmax, Backward) {
   EXPECT_FLOAT_EQ(dx[4], xptr[4]);
   EXPECT_FLOAT_EQ(dx[5], xptr[5]);
 }
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/04e23d1a/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 8368c55..170b96c 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -252,6 +252,7 @@ TEST_F(TestTensorMath, SumColumnsCpp) {
   }
 }
 #endif
+#ifdef USE_CUDA
 TEST_F(TestTensorMath, MultCuda) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   singa::CudaGPU dev;
@@ -337,7 +338,7 @@ TEST_F(TestTensorMath, SubColumnCuda) {
     }
   }
 }
-
+#endif
 TEST_F(TestTensorMath, MultColumnCpp) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
@@ -351,7 +352,7 @@ TEST_F(TestTensorMath, MultColumnCpp) {
     }
   }
 }
-
+#ifdef USE_CUDA
 TEST_F(TestTensorMath, MultColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   singa::CudaGPU dev;
@@ -432,7 +433,7 @@ TEST_F(TestTensorMath, MultRowCuda) {
     }
   }
 }
-
+#endif
 TEST_F(TestTensorMath, DivRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
@@ -446,7 +447,7 @@ TEST_F(TestTensorMath, DivRowCpp) {
     }
   }
 }
-
+#ifdef USE_CUDA
 TEST_F(TestTensorMath, DivRowCuda) {
   const float x[2] = {1.1f, 2.1f};
   singa::CudaGPU dev;
@@ -495,3 +496,4 @@ TEST_F(TestTensorMath, SumColumnCuda) {
     EXPECT_FLOAT_EQ(tptr[i], tmp);
   }
 }
+#endif


[16/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
SINGA-182 Clean math function APIs and implementations

Add comments (coding guidelines) in the corresponding math function files.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/fbd52197
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/fbd52197
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/fbd52197

Branch: refs/heads/master
Commit: fbd52197e369e1066ad367bcbde502f451462190
Parents: 3171459
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu May 26 14:46:50 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 26 14:46:50 2016 +0800

----------------------------------------------------------------------
 src/core/tensor/math_kernel.h      |  2 ++
 src/core/tensor/tensor_math.h      | 41 ++++++++++++++++++++++-----------
 src/core/tensor/tensor_math_cpp.h  |  3 +++
 src/core/tensor/tensor_math_cuda.h |  1 +
 4 files changed, 33 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbd52197/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5367f4a..b016007 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -25,6 +25,8 @@
 #include "singa_config.h"
 #ifdef USE_CUDA
 
+/// TODO(wangwei) Clean the function APIs as commented in tensor_math.h
+///  Add 'Context *ctx' as an argument to all cuda functions.
 namespace singa {
 /*
   void softmaxloss_forward(int n, int dim, const float *prob,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbd52197/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index d55e15a..b53d4cb 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -27,21 +27,19 @@ namespace singa {
 /// operations.
 /// All functions have a template argument, DType for DataType, Lang for the
 /// device programming language, e.g., lang::Cpp, lang::Cuda
+///
+/// TODO(wangwei) Clean the functions to make the function APIs consistent:
+/// 1. All function names should be like XxxYyy or XY, i.e., capitalize the first
+///    letter.
+/// 2. Order functions alphabetically by function name.
+/// 3. Function argument order is [const basic type] [const Blob] [mutable Blob].
+/// 4. For function argument names, use 'num' for the total number of elements in
+///    elementwise operations; use 'in1' 'in2' for input blobs; use 'out' for
+///    output blob or value. With exceptions for some functions, e.g.,
+///      Scale(const float alpha, const Blob* in, Blob* out);
+///    For such cases, use x, v, alpha, etc for scalar types.
+///    For blas functions, follow the blas style for argument names.
 
-/// Some operations would have many config/hyper-parameters, e.g., Conv, and
-/// these config vary among diff implementations, e.g., cuda/cudnn/opencl.
-/// To separate the modules, we pass a OpConf pointer to the Tensor Op function.
-/// The specific fields are implemented by inheriting OpConf, and casting the
-/// pointer between the base and the sub-class.
-class OpConf {
- public:
-  template <typename T>
-  T* CastTo() {
-    static_assert(std::is_base_of<OpConf, T>::value,
-                  "The cast type must be a sub-class of OpConf");
-    return static_cast<T*>(this);
-  }
-};
 
 // ================Linear algebra functions====================================
 /// ret[i] = |input[i]|
@@ -292,6 +290,21 @@ void Gaussian(int count, float mean, float std, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
+/*Some operations would have many config/hyper-parameters, e.g., Conv, and
+these config vary among diff implementations, e.g., cuda/cudnn/opencl.
+To separate the modules, we pass a OpConf pointer to the Tensor Op function.
+The specific fields are implemented by inheriting OpConf, and casting the
+pointer between the base and the sub-class.
+class OpConf {
+ public:
+  template <typename T>
+  T* CastTo() {
+    static_assert(std::is_base_of<OpConf, T>::value,
+                  "The cast type must be a sub-class of OpConf");
+    return static_cast<T*>(this);
+  }
+};
+*/
 }  // namespace singa
 
 #endif  // SINGA_CORE_MATH_H_
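
As a sketch of what these conventions ask for (illustrative only; EltwiseMult is a stand-in name, with the body styled after the CPU kernels elsewhere in this series):

  template <>
  void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1,
                                     const Blob *in2, Blob *out, Context *ctx) {
    // Capitalized name; argument order [const basic type][const Blob][mutable Blob];
    // 'num' for the element count, 'in1'/'in2' for inputs, 'out' for the output.
    float *outPtr = static_cast<float *>(out->mutable_data());
    const float *in1Ptr = static_cast<const float *>(in1->data());
    const float *in2Ptr = static_cast<const float *>(in2->data());
    for (size_t i = 0; i < num; i++) outPtr[i] = in1Ptr[i] * in2Ptr[i];
  }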

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbd52197/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 7dc35c9..5ce33ad 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -24,6 +24,9 @@
 #include <cblas.h>
 #endif
 
+/// TODO(wangwei) Clean the implementations following the comments in
+/// tensor_math.h.
+/// For Blob argument xxx, name its pointer as xxxPtr.
 namespace singa {
 template <>
 void Square<float, lang::Cpp>(int count, const Blob* input,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbd52197/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 12fc58e..f26b5a3 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -26,6 +26,7 @@
 
 namespace singa {
 
+// TODO(wangwei) Clean implementations following comments in tensor_math_cpp.h.
 // TODO(wangwei) optimize using stream
 template<>
 void Add<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,


[26/50] [abbrv] incubator-singa git commit: SINGA-188 Add Dense layer

Posted by zh...@apache.org.
SINGA-188 Add Dense layer

Minor changes to format the code and update the IDs of DenseConf fields.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/64ea2065
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/64ea2065
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/64ea2065

Branch: refs/heads/master
Commit: 64ea2065411ed29d6870d75c8577cbe086f4daa7
Parents: 73d4a34
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu Jun 2 12:02:16 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu Jun 2 12:02:16 2016 +0800

----------------------------------------------------------------------
 src/model/layer/dense.cc |  20 ++---
 src/model/layer/dense.h  |  15 ++--
 src/proto/model.proto    |   8 +-
 test/singa/test_dense.cc | 177 ++++++++++++++++++++++--------------------
 4 files changed, 115 insertions(+), 105 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/64ea2065/src/model/layer/dense.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
index ebee62a..29ff8cb 100644
--- a/src/model/layer/dense.cc
+++ b/src/model/layer/dense.cc
@@ -29,7 +29,7 @@ Dense::~Dense() {
 }
 void Dense::Setup(const LayerConf &conf) {
   Layer::Setup(conf);
-  DenseConf dense_conf = conf.dense_conf();
+  auto dense_conf = conf.dense_conf();
   hdim_ = dense_conf.num_output();
   vdim_ = dense_conf.num_input();
   transpose_ = dense_conf.transpose();
@@ -45,7 +45,8 @@ void Dense::Setup(const LayerConf &conf) {
 /// \copydoc Layer::Forward(int flag, const Tensor&)
 const Tensor Dense::Forward(int flag, const Tensor &input) {
   Tensor output;
-  if (transpose_)
+
+  if (transpose_)  // use the transposed version of weight_ for computing
     output = Mult(input, weight_);
   else
     output = Mult(input, weight_.T());
@@ -55,8 +56,8 @@ const Tensor Dense::Forward(int flag, const Tensor &input) {
 }
 
 /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
-const std::pair<Tensor, vector<Tensor>>
-Dense::Backward(int flag, const Tensor &grad) {
+const std::pair<Tensor, vector<Tensor>> Dense::Backward(int flag,
+                                                        const Tensor &grad) {
   vector<Tensor> param_grad;
   Tensor src_data = buf_.top();
   buf_.pop();
@@ -65,11 +66,10 @@ Dense::Backward(int flag, const Tensor &grad) {
   dw.ResetLike(weight_);
   dx.ResetLike(src_data);
   SumRows(grad, &db);
-  if (transpose_){
-    dx = Mult(grad, weight_.T()); 
+  if (transpose_) {
+    dx = Mult(grad, weight_.T());
     dw = Mult(src_data.T(), grad);
-  }
-  else{
+  } else {
     dx = Mult(grad, weight_);
     dw = Mult(grad.T(), src_data);
   }
@@ -78,8 +78,8 @@ Dense::Backward(int flag, const Tensor &grad) {
   return std::make_pair(dx, param_grad);
 }
 
-void Dense::ToDevice(Device *device) { 
+void Dense::ToDevice(Device *device) {
   weight_.ToDevice(device);
-  bias_.ToDevice(device); 
+  bias_.ToDevice(device);
 }
 } // namespace singa
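
For reference, the backward rules implemented above follow from the forward computation $y = x W^{\top} + b$ of the non-transposed branch (my own derivation, consistent with the code):

  \frac{\partial L}{\partial x} = \frac{\partial L}{\partial y}\, W, \qquad
  \frac{\partial L}{\partial W} = \Big(\frac{\partial L}{\partial y}\Big)^{\top} x, \qquad
  \frac{\partial L}{\partial b} = \sum_i \frac{\partial L}{\partial y_i},

i.e., dx = Mult(grad, weight_), dw = Mult(grad.T(), src_data), and SumRows(grad, &db); the transposed branch simply swaps the roles of W and W^T.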

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/64ea2065/src/model/layer/dense.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.h b/src/model/layer/dense.h
index d686a01..a5a6f66 100644
--- a/src/model/layer/dense.h
+++ b/src/model/layer/dense.h
@@ -33,7 +33,6 @@ class Dense : public Layer {
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const LayerConf& conf) override;
 
-  void SetupParam(const Tensor& input);
   /// \copydoc Layer::Forward(int flag, const Tensor&)
   const Tensor Forward(int flag, const Tensor& input) override;
 
@@ -42,12 +41,12 @@ class Dense : public Layer {
                                                    const Tensor& grad) override;
 
   void ToDevice(Device* device) override;
-  
+
   size_t num_output() const { return hdim_; }
   size_t num_input() const { return vdim_; }
   bool transpose() const { return transpose_; }
-  const Tensor &weight() const { return weight_; }
-  const Tensor &bias() const { return bias_; }
+  const Tensor& weight() const { return weight_; }
+  const Tensor& bias() const { return bias_; }
 
   void set_weight(Tensor w) {
     weight_.ResetLike(w);
@@ -58,9 +57,11 @@ class Dense : public Layer {
     bias_.CopyData(b);
   }
 
-protected:
-  size_t batchsize_, vdim_, hdim_;
-  bool transpose_;
+ protected:
+  /// Used in auto-encoders, where the decoder shares the transposed weight
+  /// matrix of the encoder.
+  bool transpose_ = false;
+  size_t vdim_, hdim_;
   Tensor weight_, bias_;
   // Tensor data_, grad_;
   std::stack<Tensor> buf_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/64ea2065/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 75e2be7..16ba62f 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -208,11 +208,8 @@ message LayerConf {
   // optional ImageDataConf image_data_conf = 115;
   optional InfogainLossConf infogain_loss_conf = 116;
   optional InnerProductConf inner_product_conf = 117;
-  optional DenseConf dense_conf = 150;
   optional LogConf log_conf = 134;
   optional LRNConf lrn_conf = 118;
-  // Used in SINGA
-  optional MetricConf metric_conf = 200;
   // optional MemoryDataConf memory_data_conf = 119;
   optional MVNConf mvn_conf = 120;
   optional PoolingConf pooling_conf = 121;
@@ -230,6 +227,10 @@ message LayerConf {
   optional ThresholdConf threshold_conf = 128;
   optional TileConf tile_conf = 138;
   //optional WindowDataConf window_data_conf = 129;
+
+  // Used in SINGA
+  optional DenseConf dense_conf = 201;
+  optional MetricConf metric_conf = 200;
 }
 
 // Message that stores hyper-parameters used to apply transformation
@@ -584,6 +585,7 @@ message DenseConf {
   // all preceding axes are retained in the output.
   // May be negative to index from the end (e.g., -1 for the last axis).
   optional int32 axis = 5 [default = 1];
+
   optional uint32 num_input = 20; // The number of inputs for the layer
   optional bool transpose = 21 [default = false]; // whether transpose or not
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/64ea2065/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
index 6d136af..606f819 100644
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@ -19,8 +19,8 @@
 *
 *************************************************************/
 #include "../src/model/layer/dense.h"
-
 #include "gtest/gtest.h"
+#include "singa_config.h"
 
 using singa::Dense;
 TEST(Dense, Setup) {
@@ -34,8 +34,8 @@ TEST(Dense, Setup) {
   denseconf->set_transpose(false);
   dense.Setup(conf);
 
-  EXPECT_EQ(3, dense.num_output());
-  EXPECT_EQ(2, dense.num_input());
+  EXPECT_EQ(3u, dense.num_output());
+  EXPECT_EQ(2u, dense.num_input());
 }
 
 TEST(Dense, ForwardCpp) {
@@ -47,82 +47,40 @@ TEST(Dense, ForwardCpp) {
   denseconf->set_num_output(3);
   denseconf->set_transpose(false);
   dense.Setup(conf);
-  
 
   const size_t batchsize = 3, vdim = 2, hdim = 3;
-  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
-                                      6.0f};
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   singa::Tensor in(singa::Shape{batchsize, vdim});
   in.CopyDataFromHostPtr(x, batchsize * vdim);
-  
+
   // set weight
-  const float we[hdim * vdim] = {
-      1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
   singa::Tensor weight(singa::Shape{hdim, vdim});
   weight.CopyDataFromHostPtr(we, hdim * vdim);
 
-  const float bia[hdim] = {
-      1.0f, 1.0f, 1.0f};
+  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
   singa::Tensor bias(singa::Shape{hdim});
   bias.CopyDataFromHostPtr(bia, hdim);
- 
+
   dense.set_weight(weight);
   dense.set_bias(bias);
-  
+
   singa::Tensor out1 = dense.Forward(singa::kTrain, in);
   singa::CppCPU host(0, 1);
   const float *outptr1 = out1.data<const float *>();
-  EXPECT_EQ(9, out1.Size());
+  EXPECT_EQ(9u, out1.Size());
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 3; j++)
-      EXPECT_FLOAT_EQ((x[i * 2 +  0] * we[j * 2 + 0] + x[i * 2 + 1] * we[j * 2 + 1] + bia[j]), outptr1[i * 3 + j]);
+      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
+                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
+                      outptr1[i * 3 + j]);
 }
 
-TEST(Dense, ForwardCuda) {
-  Dense dense;
 
-  singa::LayerConf conf;
-  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-  denseconf->set_num_input(2);
-  denseconf->set_num_output(3);
-  denseconf->set_transpose(false);
-  dense.Setup(conf);
-  
-
-  const size_t batchsize = 3, vdim = 2, hdim = 3;
-  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
-                                      6.0f};
-  singa::CudaGPU cuda(0, 1);
-  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
-  in.CopyDataFromHostPtr(x, batchsize * vdim);
-  
-  // set weight
-  const float we[hdim * vdim] = {
-      1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
-  weight.CopyDataFromHostPtr(we, hdim * vdim);
-
-  const float bia[hdim] = {
-      1.0f, 1.0f, 1.0f};
-  singa::Tensor bias(singa::Shape{hdim}, &cuda);
-  bias.CopyDataFromHostPtr(bia, hdim);
- 
-  dense.set_weight(weight);
-  dense.set_bias(bias);
-  
-  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
-  singa::CppCPU host(0, 1);
-  out1.ToDevice(&host);
-  const float *outptr1 = out1.data<const float *>();
-  EXPECT_EQ(9, out1.Size());
-  for (int i = 0; i < 3; i++)
-    for (int j = 0; j < 3; j++)
-      EXPECT_FLOAT_EQ((x[i * 2 +  0] * we[j * 2 + 0] + x[i * 2 + 1] * we[j * 2 + 1] + bia[j]), outptr1[i * 3 + j]);
-}
 
 TEST(Dense, BackwardCpp) {
   Dense dense;
-  
+
   singa::LayerConf conf;
   singa::DenseConf *denseconf = conf.mutable_dense_conf();
   denseconf->set_num_input(2);
@@ -131,30 +89,28 @@ TEST(Dense, BackwardCpp) {
   dense.Setup(conf);
 
   const size_t batchsize = 3, vdim = 2, hdim = 3;
-  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
-                                      6.0f};
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   singa::CudaGPU cuda(0, 1);
   singa::Tensor in(singa::Shape{batchsize, vdim});
   in.CopyDataFromHostPtr(x, batchsize * vdim);
 
   // set weight
-  const float we[hdim * vdim] = {
-      1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
   singa::Tensor weight(singa::Shape{hdim, vdim});
   weight.CopyDataFromHostPtr(we, hdim * vdim);
-  
-  const float bia[hdim] = {
-      1.0f, 1.0f, 1.0f};
+
+  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
   singa::Tensor bias(singa::Shape{hdim});
   bias.CopyDataFromHostPtr(bia, hdim);
-  
+
   dense.set_weight(weight);
   dense.set_bias(bias);
 
   singa::Tensor out1 = dense.Forward(singa::kTrain, in);
 
   // grad
-  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 3.0f, 3.0f};
+  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
+                                      2.0f, 3.0f, 3.0f, 3.0f};
   singa::Tensor grad(singa::Shape{batchsize, hdim});
   grad.CopyDataFromHostPtr(dy, batchsize * hdim);
 
@@ -164,24 +120,70 @@ TEST(Dense, BackwardCpp) {
   singa::Tensor dweight = ret.second.at(0);
   singa::Tensor dbias = ret.second.at(1);
   const float *dx = in_grad.data<const float *>();
-  EXPECT_EQ(6, in_grad.Size());
+  EXPECT_EQ(6u, in_grad.Size());
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 2; j++)
-      EXPECT_FLOAT_EQ((dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 +  j] + dy[i * 3 +  2] * we[2 * 2 + j]), dx[i * 2 + j]);
+      EXPECT_FLOAT_EQ(
+          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
+           dy[i * 3 + 2] * we[2 * 2 + j]),
+          dx[i * 2 + j]);
   const float *dweightx = dweight.data<const float *>();
-  EXPECT_EQ(6, dweight.Size());
+  EXPECT_EQ(6u, dweight.Size());
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 2; j++)
-      EXPECT_FLOAT_EQ((dy[0 * 3 + i] * x[0 *2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] + dy[2 * 3 + i] * x[2 * 2 + j]), dweightx[i * 2 + j]);
+      EXPECT_FLOAT_EQ(
+          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
+           dy[2 * 3 + i] * x[2 * 2 + j]),
+          dweightx[i * 2 + j]);
   const float *dbiasx = dbias.data<const float *>();
-  EXPECT_EQ(3, dbias.Size());
+  EXPECT_EQ(3u, dbias.Size());
   for (int i = 0; i < 3; i++)
     EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
 }
 
+#ifdef USE_CUDA
+TEST(Dense, ForwardCuda) {
+  Dense dense;
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_input(2);
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(conf);
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+
+  // set weight
+  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+
+  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim}, &cuda);
+  bias.CopyDataFromHostPtr(bia, hdim);
+
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out1.ToDevice(&host);
+  const float *outptr1 = out1.data<const float *>();
+  EXPECT_EQ(9u, out1.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++)
+      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
+                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
+                      outptr1[i * 3 + j]);
+}
 TEST(Dense, BackwardCuda) {
   Dense dense;
-  
+
   singa::LayerConf conf;
   singa::DenseConf *denseconf = conf.mutable_dense_conf();
   denseconf->set_num_input(2);
@@ -190,30 +192,28 @@ TEST(Dense, BackwardCuda) {
   dense.Setup(conf);
 
   const size_t batchsize = 3, vdim = 2, hdim = 3;
-  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
-                                      6.0f};
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   singa::CudaGPU cuda(0, 1);
   singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
   in.CopyDataFromHostPtr(x, batchsize * vdim);
 
   // set weight
-  const float we[hdim * vdim] = {
-      1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
   singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
   weight.CopyDataFromHostPtr(we, hdim * vdim);
-  
-  const float bia[hdim] = {
-      1.0f, 1.0f, 1.0f};
+
+  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
   singa::Tensor bias(singa::Shape{hdim}, &cuda);
   bias.CopyDataFromHostPtr(bia, hdim);
-  
+
   dense.set_weight(weight);
   dense.set_bias(bias);
 
   singa::Tensor out1 = dense.Forward(singa::kTrain, in);
 
   // grad
-  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 3.0f, 3.0f};
+  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
+                                      2.0f, 3.0f, 3.0f, 3.0f};
   singa::Tensor grad(singa::Shape{batchsize, hdim}, &cuda);
   grad.CopyDataFromHostPtr(dy, batchsize * hdim);
 
@@ -224,19 +224,26 @@ TEST(Dense, BackwardCuda) {
   singa::Tensor dbias = ret.second.at(1);
   in_grad.ToDevice(&host);
   const float *dx = in_grad.data<const float *>();
-  EXPECT_EQ(6, in_grad.Size());
+  EXPECT_EQ(6u, in_grad.Size());
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 2; j++)
-      EXPECT_FLOAT_EQ((dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 +  j] + dy[i * 3 +  2] * we[2 * 2 + j]), dx[i * 2 + j]);
+      EXPECT_FLOAT_EQ(
+          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
+           dy[i * 3 + 2] * we[2 * 2 + j]),
+          dx[i * 2 + j]);
   dweight.ToDevice(&host);
   const float *dweightx = dweight.data<const float *>();
-  EXPECT_EQ(6, dweight.Size());
+  EXPECT_EQ(6u, dweight.Size());
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 2; j++)
-      EXPECT_FLOAT_EQ((dy[0 * 3 + i] * x[0 *2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] + dy[2 * 3 + i] * x[2 * 2 + j]), dweightx[i * 2 + j]);
+      EXPECT_FLOAT_EQ(
+          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
+           dy[2 * 3 + i] * x[2 * 2 + j]),
+          dweightx[i * 2 + j]);
   dbias.ToDevice(&host);
   const float *dbiasx = dbias.data<const float *>();
-  EXPECT_EQ(3, dbias.Size());
+  EXPECT_EQ(3u, dbias.Size());
   for (int i = 0; i < 3; i++)
     EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
 }
+#endif


[28/50] [abbrv] incubator-singa git commit: SINGA-174 Add Batch Normalization layer and Local Response Normalization layer.

Posted by zh...@apache.org.
SINGA-174 Add Batch Normalization layer and Local Response Normalization layer.

Remove the buffering of input/output tensors when the flag passed to
Layer::Forward() is not kTrain. The buffered input/output are only consumed
by Layer::Backward(), which is invoked only during training.
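
The pattern applied across the layers below, as a minimal sketch (SomeLayer and Compute are placeholders):

  const Tensor SomeLayer::Forward(int flag, const Tensor &input) {
    Tensor output = Compute(input);  // layer-specific computation
    if (flag & kTrain)               // buffer only when Backward() will run
      buf_.push(input);
    return output;
  }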


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/fa2ea304
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/fa2ea304
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/fa2ea304

Branch: refs/heads/master
Commit: fa2ea304d8989818a80780c9f428e0fcc19db031
Parents: eadd3f9
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu Jun 2 14:01:45 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu Jun 2 14:02:08 2016 +0800

----------------------------------------------------------------------
 src/model/layer/activation.cc        | 10 +++++++---
 src/model/layer/cudnn_activation.cc  | 13 ++++++++-----
 src/model/layer/cudnn_convolution.cc |  3 ++-
 src/model/layer/cudnn_lrn.cc         | 16 ++++++++++------
 src/model/layer/cudnn_pooling.cc     |  7 +++++--
 src/model/layer/cudnn_softmax.cc     |  4 +++-
 src/model/layer/dense.cc             |  5 +++--
 src/model/layer/softmax.cc           | 10 +++++++---
 test/singa/test_activation.cc        |  8 ++++----
 test/singa/test_cudnn_activation.cc  |  6 +++---
 test/singa/test_cudnn_softmax.cc     |  6 +++---
 test/singa/test_softmax.cc           |  6 +++---
 12 files changed, 58 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/src/model/layer/activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/activation.cc b/src/model/layer/activation.cc
index 464e24d..e7c0696 100644
--- a/src/model/layer/activation.cc
+++ b/src/model/layer/activation.cc
@@ -32,13 +32,16 @@ const Tensor Activation::Forward(int flag, const Tensor& input) {
   Tensor output;
   if (mode_ == "SIGMOID") {
     output = Sigmoid(input);
-    buf_.push(output);
+    if (flag & kTrain)
+      buf_.push(output);
   } else if (mode_ == "TANH") {
     output = Tanh(input);
-    buf_.push(output);
+    if (flag & kTrain)
+      buf_.push(output);
   } else if (mode_ == "RELU") {
     output = ReLU(input);
-    buf_.push(input);
+    if (flag & kTrain)
+      buf_.push(input);
   } else {
     LOG(FATAL) << "Unkown activation: " << mode_;
   }
@@ -48,6 +51,7 @@ const Tensor Activation::Forward(int flag, const Tensor& input) {
 const std::pair<Tensor, vector<Tensor>> Activation::Backward(
     int flag, const Tensor& grad) {
   vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
   // inout means either input or output, but only one is valid for an
   // activation.
   Tensor input_grad, inout = buf_.top();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.cc b/src/model/layer/cudnn_activation.cc
index 73c70d7..8ecbbc7 100644
--- a/src/model/layer/cudnn_activation.cc
+++ b/src/model/layer/cudnn_activation.cc
@@ -75,11 +75,13 @@ const Tensor CudnnActivation::Forward(int flag, const Tensor& input) {
         inblob->data(), &beta, this->desc_, outblob->mutable_data()));
 #endif
   }, {input.blob()}, {output.blob()});
-  if (cudnn_mode_ == CUDNN_ACTIVATION_SIGMOID ||
-      cudnn_mode_ == CUDNN_ACTIVATION_TANH) {
-    buf_.push(output);
-  } else if (cudnn_mode_ == CUDNN_ACTIVATION_RELU) {
-    buf_.push(input);
+  if (flag & kTrain) {
+    if (cudnn_mode_ == CUDNN_ACTIVATION_SIGMOID ||
+        cudnn_mode_ == CUDNN_ACTIVATION_TANH) {
+      buf_.push(output);
+    } else if (cudnn_mode_ == CUDNN_ACTIVATION_RELU) {
+      buf_.push(input);
+    }
   }
   return output;
 }
@@ -88,6 +90,7 @@ const std::pair<Tensor, vector<Tensor>> CudnnActivation::Backward(
     int flag, const Tensor& grad) {
   vector<Tensor> param_grad;
   Tensor dx;  // inout = buf_.top();
+  CHECK(!buf_.empty());
   // inout means either used as input or output, only one is valid for one type
   // of activation
   Tensor inout = buf_.top();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index 922b7e0..97aa256 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -138,7 +138,7 @@ void CudnnConvolution::InitCudnn(const Tensor& input) {
 const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
   CHECK_EQ(input.device()->lang(), kCuda);
   CHECK_EQ(input.nDim(), 4u);
-  buf_.push(input);
+  if (flag & kTrain) buf_.push(input);  // buffer the input for backward
   size_t batchsize = input.shape()[0];
   DataType dtype = input.data_type();
   Device *dev = input.device();
@@ -175,6 +175,7 @@ const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
   CHECK(has_init_cudnn_);
   CHECK_EQ(grad.device()->lang(), kCuda);
   CHECK_EQ(grad.nDim(), 4u);
+  CHECK(!buf_.empty());
   Tensor src_data = buf_.top();
   buf_.pop();
   vector<Tensor> param_grad;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/src/model/layer/cudnn_lrn.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_lrn.cc b/src/model/layer/cudnn_lrn.cc
index ee661b6..1012813 100644
--- a/src/model/layer/cudnn_lrn.cc
+++ b/src/model/layer/cudnn_lrn.cc
@@ -33,7 +33,7 @@ void CudnnLRN::InitCudnn(const Shape& shape , DataType dtype) {
   CHECK(!has_init_cudnn_);
   mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&shape_desc_));
-  CHECK_EQ(shape.size(), 4);
+  CHECK_EQ(shape.size(), 4u);
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(shape_desc_,
       CUDNN_TENSOR_NCHW,
       GetCudnnDataType(dtype),
@@ -58,9 +58,9 @@ const Tensor CudnnLRN::Forward(int flag, const Tensor& input) {
   output.ResetLike(input);
   output.device()->Exec(
       [=](Context* ctx) {
-        Blob *inblob = input.blob(), *outblob = output.blob();
-        const float alpha = 1.0f, beta = 0.0f;
-        CUDNN_CHECK(cudnnLRNCrossChannelForward(ctx->cudnn_handle,
+      Blob *inblob = input.blob(), *outblob = output.blob();
+      const float alpha = 1.0f, beta = 0.0f;
+      CUDNN_CHECK(cudnnLRNCrossChannelForward(ctx->cudnn_handle,
             this->lrn_desc_,
             this->mode_,
             &alpha,
@@ -70,8 +70,11 @@ const Tensor CudnnLRN::Forward(int flag, const Tensor& input) {
             this->shape_desc_,
             outblob->mutable_data()));
       }, {input.blob()}, {output.blob()});
-  buf_.push(input);
-  buf_.push(output);
+
+  if (flag & kTrain) {
+    buf_.push(input);
+    buf_.push(output);
+  }
   return output;
 }
 
@@ -79,6 +82,7 @@ const std::pair<Tensor, vector<Tensor>> CudnnLRN::Backward(
     int flag, const Tensor& grad) {
   vector <Tensor> param_grad;
   Tensor dx;
+  CHECK(!buf_.empty());
   Tensor output = buf_.top();
   buf_.pop();
   Tensor input = buf_.top();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.cc b/src/model/layer/cudnn_pooling.cc
index afbc490..842685d 100644
--- a/src/model/layer/cudnn_pooling.cc
+++ b/src/model/layer/cudnn_pooling.cc
@@ -80,7 +80,6 @@ void CudnnPooling::InitCudnn(const Tensor& input) {
 const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
   CHECK_EQ(input.device()->lang(), kCuda);
   CHECK_EQ(input.nDim(), 4u);
-  buf_.push(input);
   size_t batchsize = input.shape(0);
   DataType dtype = input.data_type();
   Device *dev = input.device();
@@ -97,7 +96,10 @@ const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
                             outblob->mutable_data());
       },
       {input.blob()}, {output.blob()});
-  buf_.push(output);
+  if (flag & kTrain) {
+    buf_.push(input);
+    buf_.push(output);
+  }
   return output;
 }
 
@@ -106,6 +108,7 @@ const std::pair<Tensor, vector<Tensor>> CudnnPooling::Backward(
   CHECK_EQ(grad.device()->lang(), kCuda);
   CHECK_EQ(grad.nDim(), 4u);
   vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
   Tensor y = buf_.top();
   buf_.pop();
   Tensor x = buf_.top();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.cc b/src/model/layer/cudnn_softmax.cc
index bc7fe78..85b0c3d 100644
--- a/src/model/layer/cudnn_softmax.cc
+++ b/src/model/layer/cudnn_softmax.cc
@@ -53,13 +53,15 @@ const Tensor CudnnSoftmax::Forward(int flag, const Tensor& input) {
                         &alpha, this->desc_, inblob->data(), &beta, this->desc_,
                         outblob->mutable_data());
   }, {input.blob()}, {output.blob()});
-  buf_.push(output);
+  if (flag & kTrain)
+    buf_.push(output);
   return output;
 }
 
 const std::pair<Tensor, vector<Tensor>> CudnnSoftmax::Backward(
     int flag, const Tensor& grad) {
   vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
   Tensor dx, output = buf_.top();
   buf_.pop();
   dx.ResetLike(grad);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/src/model/layer/dense.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
index 29ff8cb..b349787 100644
--- a/src/model/layer/dense.cc
+++ b/src/model/layer/dense.cc
@@ -45,13 +45,13 @@ void Dense::Setup(const LayerConf &conf) {
 /// \copydoc Layer::Forward(int flag, const Tensor&)
 const Tensor Dense::Forward(int flag, const Tensor &input) {
   Tensor output;
-
   if (transpose_)  // use the transposed version of weight_ for computing
     output = Mult(input, weight_);
   else
     output = Mult(input, weight_.T());
   AddRow(bias_, &output);
-  buf_.push(input);
+  if (flag & kTrain)
+    buf_.push(input);
   return output;
 }
 
@@ -59,6 +59,7 @@ const Tensor Dense::Forward(int flag, const Tensor &input) {
 const std::pair<Tensor, vector<Tensor>> Dense::Backward(int flag,
                                                         const Tensor &grad) {
   vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
   Tensor src_data = buf_.top();
   buf_.pop();
   Tensor db, dw, dx;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/softmax.cc b/src/model/layer/softmax.cc
index 813ebf0..b379fc1 100644
--- a/src/model/layer/softmax.cc
+++ b/src/model/layer/softmax.cc
@@ -25,13 +25,16 @@ void Softmax::Setup(const LayerConf& conf) {
 }
 
 const Tensor Softmax::Forward(int flag, const Tensor& input) {
+  Tensor output;
   if (input.nDim() == 1) {
     Tensor tmp = Reshape(input, Shape{1, input.Size()});
-    buf_.push(SoftMax(tmp, 0));
+    output = SoftMax(tmp, 0);
   } else {
-    buf_.push(SoftMax(input, axis_));
+    output = SoftMax(input, axis_);
   }
-  return buf_.top();
+  if (flag & kTrain)
+    buf_.push(output);
+  return output;
 }
 
 const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,
@@ -43,6 +46,7 @@ const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,
   }
   Tensor input_grad = grad.Clone();
   input_grad.Reshape(Shape{nrow, ncol});
+  CHECK(!buf_.empty());
   Tensor y = buf_.top();
   buf_.pop();
   CHECK(y.shape() == input_grad.shape());

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/test/singa/test_activation.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_activation.cc b/test/singa/test_activation.cc
index 9e34282..2d88121 100644
--- a/test/singa/test_activation.cc
+++ b/test/singa/test_activation.cc
@@ -57,7 +57,7 @@ TEST(Activation, Forward) {
     }
     acti.Setup(conf);
 
-    singa::Tensor out = acti.Forward(0, in);
+    singa::Tensor out = acti.Forward(singa::kTrain, in);
 
     const float* yptr = out.data<const float*>();
     EXPECT_EQ(n, out.Size());
@@ -90,7 +90,7 @@ TEST(Activation, Backward) {
   in.CopyDataFromHostPtr<float>(x, n);
 
   float neg_slope = 0.5f;
-  std::string types[] = {"SIGMOID","TANH","RELU"};  
+  std::string types[] = {"SIGMOID","TANH","RELU"};
   for (int j = 0; j < 3; j++) {
     Activation acti;
     singa::LayerConf conf;
@@ -102,13 +102,13 @@ TEST(Activation, Backward) {
     }
     acti.Setup(conf);
 
-    singa::Tensor out = acti.Forward(0, in);
+    singa::Tensor out = acti.Forward(singa::kTrain, in);
     const float* yptr = out.data<const float*>();
 
     const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
     singa::Tensor out_diff(singa::Shape{n});
     out_diff.CopyDataFromHostPtr<float>(grad, n);
-    const auto in_diff = acti.Backward(0, out_diff);
+    const auto in_diff = acti.Backward(singa::kTrain, out_diff);
     const float* xptr = in_diff.first.data<const float*>();
 
     float* dx = new float[n];

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/test/singa/test_cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_activation.cc b/test/singa/test_cudnn_activation.cc
index ee9f9b5..892b80b 100644
--- a/test/singa/test_cudnn_activation.cc
+++ b/test/singa/test_cudnn_activation.cc
@@ -64,7 +64,7 @@ TEST(TCudnnActivation, Forward) {
     acti.Setup(conf);
     // acti.InitCudnn(n, singa::kFloat32);
 
-    singa::Tensor out = acti.Forward(0, in);
+    singa::Tensor out = acti.Forward(singa::kTrain, in);
     EXPECT_EQ(n, out.Size());
     singa::CppCPU host(0, 1);
     out.ToDevice(&host);
@@ -103,7 +103,7 @@ TEST(TCudnnActivation, Backward) {
     }
     acti.Setup(conf);
     acti.InitCudnn(n, singa::kFloat32);
-    singa::Tensor out = acti.Forward(0, in);
+    singa::Tensor out = acti.Forward(singa::kTrain, in);
     EXPECT_EQ(n, out.Size());
     singa::CppCPU host(0, 1);
     out.ToDevice(&host);
@@ -113,7 +113,7 @@ TEST(TCudnnActivation, Backward) {
                           -1.0, 1.5,  2.5,  -1.5, -2.5};
     singa::Tensor out_diff(singa::Shape{n}, &cuda);
     out_diff.CopyDataFromHostPtr<float>(grad, n);
-    const auto ret = acti.Backward(0, out_diff);
+    const auto ret = acti.Backward(singa::kTrain, out_diff);
     singa::Tensor in_diff = ret.first;
     in_diff.ToDevice(&host);
     const float* xptr = in_diff.data<const float*>();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/test/singa/test_cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_softmax.cc b/test/singa/test_cudnn_softmax.cc
index dcbf1ed..05783e2 100644
--- a/test/singa/test_cudnn_softmax.cc
+++ b/test/singa/test_cudnn_softmax.cc
@@ -55,7 +55,7 @@ TEST(CudnnSoftmax, Forward) {
   sft.Setup(conf);
   sft.InitCudnn(n, singa::kFloat32);
 
-  singa::Tensor out = sft.Forward(0, in);
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
   singa::CppCPU host(0, 1);
   out.ToDevice(&host);
   const float* yptr = out.data<const float*>();
@@ -83,7 +83,7 @@ TEST(CudnnSoftmax, Backward) {
   singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
   softmaxconf->set_axis(axis);
   sft.Setup(conf);
-  singa::Tensor out = sft.Forward(0, in);
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
   singa::CppCPU host(0, 1);
   out.ToDevice(&host);
   const float* yptr = out.data<const float*>();
@@ -91,7 +91,7 @@ TEST(CudnnSoftmax, Backward) {
   const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
   singa::Tensor out_diff(singa::Shape{n}, &cuda);
   out_diff.CopyDataFromHostPtr<float>(grad, n);
-  const auto ret = sft.Backward(0, out_diff);
+  const auto ret = sft.Backward(singa::kTrain, out_diff);
   singa::Tensor in_diff = ret.first;
   in_diff.ToDevice(&host);
   const float* xptr = in_diff.data<const float*>();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fa2ea304/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_softmax.cc b/test/singa/test_softmax.cc
index da2a6ef..6ee8b3f 100644
--- a/test/singa/test_softmax.cc
+++ b/test/singa/test_softmax.cc
@@ -51,7 +51,7 @@ TEST(Softmax, Forward) {
   softmaxconf->set_axis(axis);
   sft.Setup(conf);
 
-  singa::Tensor out = sft.Forward(0, in);
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
   const float* yptr = out.data<const float*>();
   EXPECT_EQ(n, out.Size());
 
@@ -84,13 +84,13 @@ TEST(Softmax, Backward) {
   singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
   softmaxconf->set_axis(axis);
   sft.Setup(conf);
-  singa::Tensor out = sft.Forward(0, in);
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
   const float* yptr = out.data<const float*>();
 
   const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
   singa::Tensor out_diff(singa::Shape{row, col});
   out_diff.CopyDataFromHostPtr<float>(grad, n);
-  const auto in_diff = sft.Backward(0, out_diff);
+  const auto in_diff = sft.Backward(singa::kTrain, out_diff);
   const float* xptr = in_diff.first.data<const float*>();
 
   float* dx = new float[n];
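
The changes in this message all apply one pattern: Forward buffers its tensors
only when the kTrain flag is set, and Backward checks the buffer before
popping. A minimal self-contained sketch of the pattern (ToyLayer and the
scalar Tensor stand-in are illustrative, not from the repository):

    #include <cassert>
    #include <stack>
    #include <utility>
    #include <vector>

    enum Flag { kEval = 0x0, kTrain = 0x1 };  // mirrors the singa flag usage
    using Tensor = float;  // scalar stand-in for singa::Tensor in this sketch

    class ToyLayer {
     public:
      Tensor Forward(int flag, const Tensor& in) {
        Tensor out = 2 * in;     // stand-in computation
        if (flag & kTrain) {     // buffer only if Backward will need it
          buf_.push(in);
          buf_.push(out);
        }
        return out;
      }
      std::pair<Tensor, std::vector<Tensor>> Backward(const Tensor& grad) {
        assert(!buf_.empty());   // Forward(kTrain, ...) must have run first
        buf_.pop();              // buffered output, unused by this toy op
        buf_.pop();              // buffered input, unused by this toy op
        return {2 * grad, {}};   // d(2 * in)/d(in) = 2
      }
     private:
      std::stack<Tensor> buf_;
    };

Without kTrain, Forward leaves buf_ empty, so evaluation passes no longer grow
the stack between iterations.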


[38/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
SINGA-182 Clean math function APIs and implementations

Merge branch 'cuda' from #jinyangturbo.
Clean the CUDA-related code (tensor_math_cuda.h, math_kernel.h and math_kernel.cu)
by unifying the function arguments (names and argument order).
The functions still need to be reordered.
Add Nrm2 for the L2 norm using cblas and cublas.
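
For reference, a hedged sketch of the single BLAS call each backend's Nrm2
reduces to (the wrapper names cpu_l2/gpu_l2 are illustrative, and the cuBLAS
handle and device pointer are assumed to be set up elsewhere; error checking
omitted):

    #include <cblas.h>      // cblas_snrm2
    #include <cublas_v2.h>  // cublasSnrm2

    // CPU: Euclidean norm of n contiguous floats.
    float cpu_l2(int n, const float* x) {
      return cblas_snrm2(n, x, /*incX=*/1);
    }

    // GPU: the same quantity via cuBLAS; x points to device memory and the
    // result is written back to the host variable.
    float gpu_l2(cublasHandle_t handle, int n, const float* x) {
      float result = 0.0f;
      cublasSnrm2(handle, n, x, /*incx=*/1, &result);
      return result;
    }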


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6d69047a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6d69047a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6d69047a

Branch: refs/heads/master
Commit: 6d69047addc46e5c9f381b7e1d4cebd20ce9b2e3
Parents: 564c88a
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Jun 12 12:08:48 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |   2 +
 src/core/tensor/math_kernel.cu     | 656 +++++++++++++++++---------------
 src/core/tensor/math_kernel.h      |  93 ++---
 src/core/tensor/tensor.cc          |  14 +
 src/core/tensor/tensor_math.h      | 140 ++++---
 src/core/tensor/tensor_math_cpp.h  | 227 ++++++-----
 src/core/tensor/tensor_math_cuda.h | 384 +++++++++++++++----
 test/singa/test_tensor_math.cc     | 346 ++++++++---------
 8 files changed, 1092 insertions(+), 770 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 82bbe81..cd750c5 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -173,6 +173,8 @@ class Tensor {
   template <typename SType>
   Tensor &operator/=(const SType x);
 
+  float L2() const;
+
  protected:
   bool transpose_ = false;
   DataType data_type_ = kFloat32;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index aed6add..b618f9b 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -35,36 +35,16 @@
 namespace singa {
 // Cuda Kernel Functions
 namespace cuda {
-__global__ void kernel_softmax_loss(const float *prob, const int *label,
-                                    float *loss, int n, int dim) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    float prob_of_truth = prob[index * dim + label[index]];
-    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
-  }
-}
-
-__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
-                                        int dim, float scale) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    int pos = index * dim + label[index];
-    grad[pos] = (grad[pos] - 1.0f) * scale;
-  }
-}
-
-__global__ void kernel_sum_vec(const float *data, float *sum, int n) {
+__global__ void KernelSum(const size_t n, const float *in, float *out) {
   int THREADS = blockDim.x;
 
   __shared__ float aux[CU1DBLOCK];
   int steps = (n - 1) / THREADS + 1;
-  aux[threadIdx.x] = data[threadIdx.x];
+  aux[threadIdx.x] = in[threadIdx.x];
 
   for (int i = 1; i < steps; ++i) {
     if (threadIdx.x + i * THREADS < n) {
-      aux[threadIdx.x] += data[threadIdx.x + i * THREADS];
+      aux[threadIdx.x] += in[threadIdx.x + i * THREADS];
     }
   }
 
@@ -83,432 +63,484 @@ __global__ void kernel_sum_vec(const float *data, float *sum, int n) {
   }
 
   __syncthreads();
-  *sum = aux[0];
+  *out = aux[0];
 }
 
-__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
-                               int rows, int cols, int stride) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < rows; index += num_threads) {
-    dst_vec_data[index] = 0.0f;
-    for (int k = 0; k < cols; k++) {
-      dst_vec_data[index] += src_mat_data[index * stride + k];
-    }
+__global__ void KernelAdd(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] + in2[i];
   }
 }
 
-__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
-                               int rows, int cols, int stride) {
-  int j = blockIdx.x;
-  int THREADS = blockDim.x;
-  if (j >= cols) {
-    return;
-  }
-
-  __shared__ float aux[CU1DBLOCK];
-  int steps = (rows - 1) / THREADS + 1;
-  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
-  for (int i = 1; i < steps; ++i) {
-    if (threadIdx.x + i * THREADS < rows) {
-      aux[threadIdx.x] +=
-          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
-    }
+__global__ void KernelAdd(const size_t n, const float *in, const float x,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] + x;
   }
+}
 
-  int total_threads = THREADS;
-  __syncthreads();
-  while (total_threads > 1) {
-    int half_point = ((1 + total_threads) >> 1);
-    if (threadIdx.x < half_point) {
-      if (threadIdx.x + half_point < total_threads) {
-        aux[threadIdx.x] += aux[threadIdx.x + half_point];
-      }
-    }
-    __syncthreads();
-    total_threads = ((total_threads + 1) >> 1);
+__global__ void KernelSub(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] - in2[i];
   }
-
-  __syncthreads();
-  dst_vec_data[j] = aux[0];
 }
 
-__global__ void kernel_add_vec_row(const float *src_vec_data,
-                                   const float *src_mat_data,
-                                   float *des_mat_data, int rows, int cols,
-                                   int stride) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int j = blockIdx.y * blockDim.y + threadIdx.y;
-  int num_threads_x = blockDim.x * gridDim.x;
-  int num_threads_y = blockDim.y * gridDim.y;
-  int index = 0;
-  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
-    index = j * stride + i;
-    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+__global__ void KernelExp(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::exp(in[i]);
   }
 }
-__global__ void kernel_add(const float *src1, const float *src2, float *out,
-                           int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    out[index] = src1[index] + src2[index];
+
+__global__ void KernelLog(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::log(in[i]);
   }
 }
 
-__global__ void kernel_sub(const float *src1, const float *src2, float *out,
-                           int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    out[index] = src1[index] - src2[index];
+__global__ void KernelSigmoid(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = 1.0f / (1.0f + expf(-in[i]));
   }
 }
-__global__ void kernel_exp(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::exp(src_data[index]);
+__global__ void KernelSign(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    if (in[i] > 0.0f)
+      out[i] = 1.0f;
+    else if (in[i] < 0.0f)
+      out[i] = -1.0f;
+    else
+      out[i] = 0.0f;
   }
 }
 
-__global__ void kernel_log(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::log(src_data[index]);
+__global__ void KernelClamp(const size_t n, const float low, const float high,
+                            const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    if (in[i] > high)
+      out[i] = high;
+    else if (in[i] < low)
+      out[i] = low;
+    else
+      out[i] = in[i];
   }
 }
 
-__global__ void kernel_sigmoid(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+__global__ void KernelRelu(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = max(in[i], 0.0f);
   }
 }
 
-__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
-                                    int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] * (1.0f - src_data[index]);
+__global__ void KernelAbs(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = max(in[i], -in[i]);
   }
 }
 
-__global__ void kernel_relu(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = max(src_data[index], 0.0f);
+__global__ void KernelTanh(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = tanhf(in[i]);
   }
 }
 
-__global__ void kernel_relu_grad(const float *src_data, float *des_data,
-                                 int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+__global__ void KernelSoftplus(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = logf(1 + expf(in[i]));
   }
 }
-
-__global__ void kernel_tanh(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = tanhf(src_data[index]);
+__global__ void KernelSquare(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] * in[i];
   }
 }
-
-__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
-                                 int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+__global__ void KernelSqrt(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::sqrt(in[i]);
   }
 }
 
-__global__ void kernel_softplus(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = logf(1 + expf(src_data[index]));
+__global__ void KernelPow(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::pow(in1[i], in2[i]);
   }
 }
 
-__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
-                                     int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+__global__ void KernelPow(const size_t n, const float *in, const float x,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::pow(in[i], x);
   }
 }
 
-__global__ void kernel_square(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] * src_data[index];
+__global__ void KernelMult(const size_t n, const float *in1, const float *in2,
+                           float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] * in2[i];
   }
 }
 
-__global__ void kernel_square_grad(const float *src_data, float *des_data,
-                                   int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 2 * src_data[index];
+__global__ void KernelMult(const size_t n, const float *in, const float x,
+                           float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] * x;
   }
 }
 
-__global__ void kernel_sqrt(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::sqrt(src_data[index]);
+__global__ void KernelDiv(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] / in2[i];
   }
 }
-
-__global__ void kernel_pow(const float *src_data_a, const float *src_data_b,
-                           float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::pow(src_data_a[index], src_data_b[index]);
+__global__ void KernelDiv(const size_t n, const float x, const float *in,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = x / in[i];
   }
 }
-
-__global__ void kernel_mult(const float *src_data_a, const float *src_data_b,
-                            float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] * src_data_b[index];
+__global__ static void KernelSet(const size_t n, const float x, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = x;
   }
 }
 
-__global__ void kernel_mult(const float *src_data_a, const float x,
-                            float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] * x;
+__global__ void KernelThreshold(const size_t n, const float x, const float *in,
+                                float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] < x ? 1.0f : 0.0f;
   }
 }
 
-__global__ void kernel_div(const float *src_data_a, const float *src_data_b,
-                           float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] / src_data_b[index];
+__global__ void KernelGE(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
   }
 }
-
-__global__ static void kernel_set_value(float *data, float value, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    data[index] = value;
+__global__ void KernelGT(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] > x ? 1.0f : 0.0f;
   }
 }
-
-__global__ void kernel_threshold(const float *src_data, float *des_data,
-                                 float alpha, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
+__global__ void KernelLE(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] <= x ? 1.0f : 0.0f;
   }
 }
-void sum(int n, const float *in, float *out) {
-  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-  //  here, we only need one block
-  int num_blocks = 1;
 
-  kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
+__global__ void KernelLT(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] < x ? 1.0f : 0.0f;
+  }
 }
 
-void sum_row(int rows, int cols, int stride, const float *in, float *out) {
-  int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-  int num_blocks = cols;
+// ********************************
+// Functions call kernels
+// ********************************
 
-  kernel_sum_row << <num_blocks, threads_per_block>>>
-      (in, out, rows, cols, stride);
+void set(const size_t n, const float v, float *out, cudaStream_t s) {
+  KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
 }
 
-void sum_col(int rows, int cols, int stride, const float *in, float *out) {
-  int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
-  int num_blocks = rows;
+void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
 
-  kernel_sum_col << <num_blocks, threads_per_block>>>
-      (in, out, rows, cols, stride);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void add_row(int rows, int cols, int stride, const float *in_row,
-             const float *in_mat, float *out) {
-  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-  dim3 num_blocks(
-      cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
-      rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
-  kernel_add_vec_row << <num_blocks, threads_per_block>>>
-      (in_row, in_mat, out, rows, cols, stride);
+
+void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void add(int n, const float *a, const float *b, float *out) {
-  kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void sub(int n, const float *a, const float *b, float *out) {
-  kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void exp(int n, const float *in, float *out) {
-  kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+
+void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
 
-void log(int n, const float *in, float *out) {
-  kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
 
-void sigmoid(int n, const float *in, float *out) {
-  kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
+  KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void clamp(const size_t n, const float low, const float high, const float *in,
+           float *out, cudaStream_t s) {
+  KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
 }
 
-void sigmoid_grad(int n, const float *in, float *out) {
-  kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s) {
+  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void relu(int n, const float *in, float *out) {
-  kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s) {
+  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void relu_grad(int n, const float *in, float *out) {
-  kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void tanh(int n, const float *in, float *out) {
-  kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void div(const size_t n, const float x, const float *in, float *out,
+          cudaStream_t s) {
+  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
 }
 
-void tanh_grad(int n, const float *in, float *out) {
-  kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void threshold(const size_t n, const float x, const float *in, float *out,
+               cudaStream_t s) {
+  KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
 }
 
-void softplus(int n, const float *in, float *out) {
-  kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void gt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void ge(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void lt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void le(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 }
 
-void softplus_grad(int n, const float *in, float *out) {
-  kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void square(int n, const float *in, float *out) {
-  kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void square_grad(int n, const float *in, float *out) {
-  kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void sqrt(int n, const float *in, float *out) {
-  kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void pow(int n, const float *a, const float *b, float *out) {
-  kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void mult(int n, const float *a, const float *b, float *out) {
-  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+  //  here, we only need one block
+  int num_blocks = 1;
+  KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
+}
+/*
+void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void mult(int n, const float *a, const float x, float *out) {
-  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
+void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void div(int n, const float *a, const float *b, float *out) {
-  kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void set_value(int n, float v, float *out) {
-  kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
+
+void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void threshold(int n, float alpha, const float *in, float *out) {
-  kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
+void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-// follow the consistency guide for math API
-__global__ void KernelDiv(const size_t num, const float alpha, const float *in,
-                          float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = alpha / in[idx];
+
+__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < rows; index += num_threads) {
+    dst_vec_data[index] = 0.0f;
+    for (int k = 0; k < cols; k++) {
+      dst_vec_data[index] += src_mat_data[index * stride + k];
+    }
   }
 }
 
-__global__ void KernelGE(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
+__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int j = blockIdx.x;
+  int THREADS = blockDim.x;
+  if (j >= cols) {
+    return;
   }
-}
-__global__ void KernelGT(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] > x ? 1.0f : 0.0f;
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (rows - 1) / THREADS + 1;
+  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < rows) {
+      aux[threadIdx.x] +=
+          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
+    }
   }
-}
-__global__ void KernelLE(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] <= x ? 1.0f : 0.0f;
+
+  int total_threads = THREADS;
+  __syncthreads();
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
   }
+
+  __syncthreads();
+  dst_vec_data[j] = aux[0];
 }
 
-__global__ void KernelLT(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] < x ? 1.0f : 0.0f;
+
+__global__ void kernel_add_vec_row(const float *src_vec_data,
+                                   const float *src_mat_data,
+                                   float *des_mat_data, int rows, int cols,
+                                   int stride) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int num_threads_x = blockDim.x * gridDim.x;
+  int num_threads_y = blockDim.y * gridDim.y;
+  int index = 0;
+  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
+    index = j * stride + i;
+    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
   }
 }
 
-__global__ void KernelSet(const size_t num, const float x, float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = x;
+__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
+                                    int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * (1.0f - src_data[index]);
   }
 }
 
-void Set(const size_t num, const float x, float *out, cudaStream_t s) {
-  KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
+
+__global__ void kernel_relu_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+  }
 }
-void Div(const size_t num, float alpha, const float *in, float *out,
-         cudaStream_t s) {
-  KernelDiv << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, alpha, in, out);
+
+__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+  }
 }
 
-void GT(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelGT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+
+__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
+                                     int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
 }
-void GE(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelGE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void KernelSquareGrad(const float *src_data, float *des_data,
+                                   int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 2 * src_data[index];
+  }
 }
-void LT(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelLT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void kernel_softmax_loss(const float *prob, const int *label,
+                                    float *loss, int n, int dim) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    float prob_of_truth = prob[index * dim + label[index]];
+    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
+  }
 }
-void LE(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
+                                        int dim, float scale) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    int pos = index * dim + label[index];
+    grad[pos] = (grad[pos] - 1.0f) * scale;
+  }
 }
+*/
+
 
 }  // namespace cuda
 }  // namespace singa
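
All of the renamed kernels share one launch convention: a grid-stride loop in
the kernel body, launched with ceil(n / CU1DBLOCKF) blocks. A self-contained
CUDA sketch of that convention (the 512 block size and the KernelScale/scale
names are assumptions for illustration; the repository defines its own
CU1DBLOCK):

    #include <cmath>
    #include <cstddef>

    #define CU1DBLOCK 512
    #define CU1DBLOCKF 512.0f

    __global__ void KernelScale(const size_t n, const float a, const float* in,
                                float* out) {
      // grid-stride loop: thread t handles indices t, t+stride, t+2*stride, ...
      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += blockDim.x * gridDim.x) {
        out[i] = a * in[i];
      }
    }

    void scale(const size_t n, const float a, const float* in, float* out,
               cudaStream_t s) {
      // same launch shape as the commit, with the stream passed explicitly
      KernelScale<<<(unsigned)std::ceil(n / CU1DBLOCKF), CU1DBLOCK, 0, s>>>(
          n, a, in, out);
    }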

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5c906a9..d8a58a5 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -31,65 +31,66 @@ namespace singa {
 
 // TODO(wangwei) make all functions templated.
 namespace cuda {
-void sum(int n, const float *in, float *out);
 
-void sum_row(int rows, int cols, int stride, const float *in, float *out);
-
-void sum_col(int rows, int cols, int stride, const float *in, float *out);
-
-void add_row(int rows, int cols, int stride, const float *in_row,
-             const float *in_mat, float *out);
-
-void add(int n, const float *a, const float *b, float *out);
-
-void sub(int n, const float *a, const float *b, float *out);
-
-void exp(int n, const float *in, float *out);
-
-void log(int n, const float *in, float *out);
-
-void sigmoid(int n, const float *in, float *out);
-
-void sigmoid_grad(int n, const float *in, float *out);
-
-void relu(int n, const float *in, float *out);
-
-void relu_grad(int n, const float *in, float *out);
-
-void tanh(int n, const float *in, float *out);
-
-void tanh_grad(int n, const float *in, float *out);
+// 0 input
+void set(const size_t n, const float v, float *out, cudaStream_t s);
+
+// 1 input
+void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+void log(const size_t n, const float *in, float *out, cudaStream_t s);
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+void square(const size_t n, const float *in, float *out, cudaStream_t s);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s);
+void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+void clamp(const size_t n, const float low, const float high, const float *in,
+           float *out, cudaStream_t s);
+
+void pow(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s);
 
-void softplus(int n, const float *in, float *out);
+void add(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s);
 
-void softplus_grad(int n, const float *in, float *out);
+void mult(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
 
-void square(int n, const float *in, float *out);
+void div(const size_t n, const float x, const float *in, float *out,
+         cudaStream_t s);
 
-void square_grad(int n, const float *in, float *out);
+void threshold(const size_t n, const float x, const float *in, float *out,
+               cudaStream_t s);
 
-void sqrt(int n, const float *in, float *out);
+void gt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void ge(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void lt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void le(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
 
-void pow(int n, const float *a, const float *b, float *out);
+// 2 inputs
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void mult(int n, const float *a, const float *b, float *out);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void mult(int n, const float *a, const float x, float *out);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void div(int n, const float *a, const float *b, float *out);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
 
-void set_value(int n, float v, float *out);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void threshold(int n, float alpha, const float *in, float *out);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s);
 
-// follow the consistency guide for math API
-void Div(const size_t num, const float x, const float *in, float *out,
-         cudaStream_t s);
-void Set(const size_t num, const float x, float *out, cudaStream_t s);
-void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 }  // cuda
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index f4e9da2..e62386a 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -219,6 +219,8 @@ GenUnaryScalarArgMemberFn(operator+=, Add);
 GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFn(operator/=, Div);
 
+
+
 // ====================Tensor Operations=======================================
 void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
                     const size_t dst_offset, const size_t src_offset) {
@@ -309,6 +311,18 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
   } while (0)
 
 // =============Element-wise operations====================================
+/// L2 norm; named L2() instead of Nrm2 to avoid a name conflict.
+float Tensor::L2() const {
+  float nrm = 0.0f;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    device_->Exec([&nrm, this](Context *ctx) {
+      DType ret;
+      Nrm2<DType, Lang>(this->Size(), this->blob(), &ret, ctx);
+      nrm = TypeCast<DType, float>(ret);
+    }, {this->blob()}, {});
+  });
+  return nrm;
+}
 template <typename SType>
 void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
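
A short usage sketch of the new accessor, in the style of the tests in this
series (values chosen so the expected norm is exact; illustrative only):

    singa::Tensor t(singa::Shape{4});
    const float dat[] = {1.0f, 2.0f, 2.0f, 0.0f};
    t.CopyDataFromHostPtr<float>(dat, 4);
    float norm = t.L2();  // sqrt(1 + 4 + 4 + 0) = 3.0f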

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index b5d0ba9..b86e1cb 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -48,41 +48,45 @@ namespace singa {
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Blob level math functions.
 
-// =============Element-wise operations====================================
+// **************************************
+// Element-wise functions
+// **************************************
+
 /// out[i] = |in[i]|
 template <typename DType, typename Lang>
 void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
-/// out = in + x
+/// out[i] = in[i] + x
 template <typename DType, typename Lang>
 void Add(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Add Not Implemented";
 }
 
-/// out = in1 + in2
+/// out[i] = in1[i] + in2[i]
 template <typename DType, typename Lang>
 void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Add-Pair Not Implemented";
 }
-/// Element-wise operation, clamp every element into [low, high]
-/// if x>high, then x=high; if x<low, then x=low.
+/// Clamp every element into [low, high]
+/// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
 template <typename DType, typename Lang>
 void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
            Blob *out, Context *ctx) {
   LOG(FATAL) << "Clamp Not Implemented";
 }
 
-/// out = x / in
+/// out[i] = x / in[i]
 template <typename DType, typename Lang>
 void Div(const size_t num, const DType x, const Blob *in, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Div Not Implemented";
 }
 
+/// out[i] = in[i] / x
 template <typename DType, typename Lang>
 void Div(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
@@ -90,21 +94,21 @@ void Div(const size_t num, const Blob *in, const DType x, Blob *out,
   EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
-/// out = in1 / in2
+/// out[i] = in1[i] / in2[i]
 template <typename DType, typename Lang>
 void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
-/// out = in * x
+/// out[i] = in[i] * x
 template <typename DType, typename Lang>
 void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
-/// out = in2 * in2
+/// out[i] = in1[i] * in2[i]
 template <typename DType, typename Lang>
 void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
                  Context *ctx) {
@@ -146,31 +150,32 @@ void GT(const size_t num, const Blob *in, const DType x, Blob *out,
         Context *ctx) {
   LOG(FATAL) << "GT Not Implemented";
 }
-/// Element-wise operation, do v^x for every v from the in tensor
+/// out[i] = pow(in[i], x)
 template <typename DType, typename Lang>
 void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
-/// Element-wise operation, do v^x for every v from the lhs and every x from rhs
+/// out[i]=pow(in1[i], in2[i])
 template <typename DType, typename Lang>
 void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
-/// Element-wise operation, out[i]=max(0, in[i])
+/// out[i]=max(0, in[i])
 template <typename DType, typename Lang>
 void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "ReLU Not Implemented";
 }
 
+/// out[i] = x
 template <typename DType, typename Lang>
 void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Set Not Implemented";
 }
-/// Element-wise operation, out[i]=sigmoid([in[i])
+/// out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
 void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sigmoid Not Implemented";
@@ -181,85 +186,47 @@ template <typename DType, typename Lang>
 void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sign Not Implemented";
 }
-/// Element-wise operation, out[i]=sqrt([in[i])
+/// out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
 void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sqrt Not Implemented";
 }
 
-/// Element-wise operation, out[i]=square([in[i])
+/// out[i]=square(in[i])
 template <typename DType, typename Lang>
 void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Square Not Implemented";
+  EltwiseMult<DType, Lang>(num, in, in, out, ctx);
 }
 
-/// out =  in - x
+/// out[i] =  in[i] - x
 template <typename DType, typename Lang>
 void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   Add<DType, Lang>(num, in, -x, out, ctx);
 }
 
-/// out = in1 - in2
+/// out[i] = in1[i] - in2[i]
 template <typename DType, typename Lang>
 void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Sub-Pair Not Implemented";
 }
+
 /// sum all elements of in into out
 template <typename DType, typename Lang>
 void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Sum Not Implemented";
 }
 
-/// Element-wise operation, out[i]=tanh([in[i])
+/// out[i]=tanh(in[i])
 template <typename DType, typename Lang>
 void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Tanh Not Implemented";
 }
 
-// =========== Matrix operations ===========================================
-/// Add the vector v to every column of A as the column of out
-template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddCol Not Implemented";
-}
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out
-template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddRow Not Implemented";
-}
-/// outer-product.
-/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
-template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
-           Blob *out, Context *ctx) {
-  LOG(FATAL) << "Outer Not Implemented";
-}
-// Do softmax for each row individually
-template <typename DType, typename Lang>
-void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-             Context *ctx) {
-  LOG(FATAL) << "Softmax Not Implemented";
-}
-/// Sum the columns of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-                Context *ctx) {
-  LOG(FATAL) << "SumColumns Not Implemented";
-}
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-             Context *ctx) {
-  LOG(FATAL) << "SumRows Not Implemented";
-}
-
-// ================Random functions===========================================
+// **************************************
+// Random functions
+// **************************************
 /// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
@@ -282,7 +249,10 @@ void Uniform(const size_t num, const float low, const float high, Blob *out,
   LOG(FATAL) << "Uniform Not Implemented";
 }
 
-// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
 /// Return the index of the element with the max value.
 template <typename DType, typename Lang>
 void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
@@ -307,12 +277,19 @@ void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
+/// out = ||in||_2, i.e., the L2 norm.
+template <typename DType, typename Lang>
+void Nrm2(const size_t num, const Blob *in, float *out, Context *ctx) {
+  LOG(FATAL) << "Nrm2 Not Implemented";
+}
+
 /// out *= x
 template <typename DType, typename Lang>
 void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Scale Not Implemented";
 }
 
+/// Inner product of arrays in1 and in2.
 template <typename DType, typename Lang>
 void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
          Context *ctx) {
@@ -346,5 +323,44 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
   LOG(FATAL) << "GEMM Not Implemented";
 }
 
+// **************************************
+// Matrix functions
+// **************************************
+/*
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
+}
+/// outer-product.
+/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
+template <typename DType, typename Lang>
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Outer Not Implemented";
+}
+
+/// Sum the columns of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
+}
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
+}
+*/
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
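
A note on the dispatch pattern in tensor_math.h above: the generic templates
only abort with LOG(FATAL), and each backend supplies full specializations
keyed on the Lang tag type. A minimal, self-contained sketch of the same
pattern (toy types standing in for SINGA's Blob/Context; every name below is
illustrative, not a SINGA API):

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    namespace toy {
    namespace lang { struct Cpp {}; struct Cuda {}; }

    // Generic fallback, analogous to the "Not Implemented" stubs above.
    template <typename DType, typename Lang>
    void Sqrt(std::size_t num, const DType* in, DType* out) {
      std::fprintf(stderr, "Sqrt Not Implemented\n");
      std::abort();
    }

    // Backend specialization, analogous to Sqrt<float, lang::Cpp>.
    template <>
    void Sqrt<float, lang::Cpp>(std::size_t num, const float* in, float* out) {
      for (std::size_t i = 0; i < num; i++) out[i] = std::sqrt(in[i]);
    }
    }  // namespace toy

    int main() {
      float in[3] = {1.f, 4.f, 9.f}, out[3];
      toy::Sqrt<float, toy::lang::Cpp>(3, in, out);  // picks the Cpp version
      std::printf("%g %g %g\n", out[0], out[1], out[2]);  // prints: 1 2 3
      return 0;
    }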

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 2c5c272..0b280a3 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -241,7 +241,7 @@ void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
     outPtr[i] = sqrt(inPtr[i]);
   }
 }
-
+/*
 template <>
 void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
                               Context *ctx) {
@@ -251,6 +251,7 @@ void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
     outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
+*/
 
 template <>
 void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -287,101 +288,6 @@ void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
   }
 }
 
-// =========Matrix operations ================================================
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Blob *A, const Blob *v, Blob *out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
-    }
-  }
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Blob *A, const Blob *v, Blob *out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
-    }
-  }
-}
-template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
-                             const Blob *in2, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t r = 0; r < m; r++) {
-    size_t offset = r * n;
-    for (size_t c = 0; c < n; c++) {
-      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
-    }
-  }
-}
-template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *bPtr = new float[ncol];
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    float denom = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-      bPtr[c] = exp(inPtr[offset + c]);
-      denom += bPtr[c];
-    }
-    for (size_t c = 0; c < ncol; c++) {
-      size_t idx = offset + c;
-      outPtr[idx] = bPtr[c] / denom;
-    }
-  }
-  delete bPtr;
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t c = 0; c < ncol; c++) {
-    outPtr[c] = 0.f;
-  }
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[c] += inPtr[offset + c];
-    }
-  }
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    outPtr[r] = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[r] += inPtr[offset + c];
-    }
-  }
-}
-
 // ===============Random operations==========================================
 template <>
 void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
@@ -440,18 +346,26 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
 
 #ifdef USE_CBLAS
 template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_isamax(num, inPtr, 1);
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_sasum(num, inPtr, 1);
+}
+
+template <>
 void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
                             Blob *out, Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
   cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
 }
-template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                             Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cblas_sscal(num, x, outPtr, 1);
-}
 
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -461,6 +375,19 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
   *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
 }
 template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cblas_sscal(num, x, outPtr, 1);
+}
+template <>
+void Nrm2<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_snrm2(num, inPtr, 1);
+}
+
+template <>
 void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
                             const float alpha, const Blob *A, const Blob *v,
                             const float beta, Blob *out, Context *ctx) {
@@ -587,6 +514,102 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
 }
 
 #endif  // USE_CBLAS
+
+// =========Matrix operations ================================================
+/*
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
+  }
+}
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                             const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
+  }
+}
+template <>
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
+  }
+  delete[] bPtr;
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
+  }
+}
+
+template <>
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
+}
+*/
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
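
One cross-backend detail worth noting here: cblas_isamax (used by the Amax
specialization above) returns a 0-based index, whereas cuBLAS's cublasIsamax
in the Cuda counterpart below reports a 1-based one, which is why only the
Cuda specialization subtracts 1. A side-by-side fragment, with variable names
purely illustrative:

    size_t idx;
    // Cpp backend: the CBLAS result is already 0-based.
    idx = cblas_isamax(num, inPtr, 1);

    // Cuda backend: convert from cuBLAS's 1-based (Fortran-style) convention.
    int i = 0;
    CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &i));
    idx = static_cast<size_t>(i - 1);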

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index f9841a3..e2597d5 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -24,105 +24,336 @@
 #include "./math_kernel.h"
 #include "singa/utils/cuda_utils.h"
 #include "singa/core/common.h"
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include "singa/utils/cuda_utils.h"
 
 namespace singa {
-// =================Elementwise operations===================================
+
+/// out[i] = |in[i]|
+template <>
+void Abs<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::abs(num, inPtr, outPtr, ctx->stream);
+}
+/// out = in + x
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::add(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 + in2
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::add(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
+template <>
+void Clamp<float, lang::Cuda>(const size_t num, const float low,
+                              const float high, const Blob* in, Blob* out,
+                              Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
+}
+/// out = in1 / in2
+template <>
+void Div<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+template <>
+void Div<float, lang::Cuda>(const size_t num, const float x, const Blob* in,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::div(num, x, inPtr, outPtr, ctx->stream);
+}
+
+/// out = in * x
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in,
+                                    const float x, Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::mult(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 * in2
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in1,
+                                    const Blob* in2, Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Base is e. out[i]=e^in[i]
+template <>
+void Exp<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::exp(num, inPtr, outPtr, ctx->stream);
+}
+
+template <>
+void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::ge(num, inPtr, x, outPtr, ctx->stream);
+}
+
+template <>
+void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::gt(num, inPtr, x, outPtr, ctx->stream);
+}
+
+template <>
+void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::le(num, inPtr, x, outPtr, ctx->stream);
+}
+
+/// Natural logarithm, base e (Napier's constant): out[i]=ln(in[i]).
+template <>
+void Log<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::log(num, inPtr, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::lt(num, inPtr, x, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i] = in[i]^x
+template <>
+void Pow<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::pow(num, inPtr, x, outPtr, ctx->stream);
+}
+/// Element-wise operation, out[i] = in1[i]^in2[i]
 template <>
-void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
-                            Blob *out, Context *ctx) {
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cuda::add(num, in1Ptr, in2Ptr, outPtr);
+void Pow<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
 }
 
-// follow the consistency guide of math API
+/// Element-wise operation, out[i]=max(0, in[i])
 template <>
-void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in,
-                            Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::Div(num, x, inPtr, outPtr, ctx->stream);
+void ReLU<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::relu(num, inPtr, outPtr, ctx->stream);
 }
 
+/// out[i] = x
 template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in,
-                                    const float x, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::mult(num, inPtr, x, outPtr);
+void Set<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+                            Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::set(num, x, outPtr, ctx->stream);
 }
+/// Element-wise operation, out[i]=sigmoid(in[i])
 template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
+void Sigmoid<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                                Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
 }
+// out[i] = sign(in[i])
 template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
+void Sign<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sign(num, inPtr, outPtr, ctx->stream);
 }
+
+/// Element-wise operation, out[i]=sqrt(in[i])
+template <>
+void Sqrt<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sqrt(num, inPtr, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i]=in[i]^2
 template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
+void Square<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                               Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::square(num, inPtr, outPtr, ctx->stream);
 }
+/// out = in1 - in2
 template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
+void Sub<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sub(num, inPtr1, inPtr2, outPtr, ctx->stream);
 }
+
+/// sum all elements of input into out
 template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cuda::Set(num, x, outPtr, ctx->stream);
+void Sum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::sum(num, inPtr, out, ctx->stream);
 }
-// TODO(wangwei) optimize using stream
+
+/// Element-wise operation, out[i]=tanh(in[i])
 template <>
-void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out,
-                               Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::square(num, inPtr, outPtr);
+void Tanh<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::tanh(num, inPtr, outPtr, ctx->stream);
 }
-// TODO(wangwei) optimize using stream
+
+// ================Random functions===========================================
+/// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
 template <>
-void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
-                            Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  cuda::sub(num, in1Ptr, in2Ptr, outPtr);
+void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Blob* out,
+                                  Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+  cuda::threshold(num, p, outPtr, outPtr, ctx->stream);
 }
-// sum all elements of input into ret
-// TODO(wangwei) optimize using stream
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
 template <>
-void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
-                            Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::sum(num, inPtr, out);
+void Uniform<float, lang::Cuda>(const size_t num, const float low,
+                                const float high, Blob* out, Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+  cuda::mult(num, outPtr, high - low, outPtr, ctx->stream);
+  cuda::add(num, outPtr, low, outPtr, ctx->stream);
+}
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
+template <>
+void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
+                                 const float std, Blob* out, Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateNormal(rgen, outPtr, num, mean, std));
 }
 
 // =========================Blas operations==================================
+// ref to http://docs.nvidia.com/cuda/cublas
+template <>
+void Amax<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  int idx = 1;
+  CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
+  *out = idx - 1;  // cublas index starts from 1
+}
+
+/// return the index of the element with the min value.
+template <>
+void Amin<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  int idx = 1;
+  CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
+  *out = idx - 1;
+}
+
+/// out = sum_i |in[i]|, i.e., the L1 norm of in
+template <>
+void Asum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
+}
+
+/// out = alpha * in + out
+template <>
+void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
+                             const Blob* in, Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
+}
+
+/// out = \sum_i in1[i] * in2[i]
+template <>
+void Dot<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            float* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
+}
+template <>
+void Nrm2<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                             Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  const float* inPtr = static_cast<const float*>(in->data());
+  CUBLAS_CHECK(cublasSnrm2(handle, num, inPtr, 1, out));
+}
+template <>
+void Scale<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+                              Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CUBLAS_CHECK(cublasSscal(handle, num, &x, outPtr, 1));
+}
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
-                             const size_t ncol, const Blob *M, const Blob *v,
-                             Blob *out, Context *ctx) {
+                             const size_t ncol, const Blob* M, const Blob* v,
+                             Blob* out, Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float *MPtr = static_cast<const float *>(M->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float* MPtr = static_cast<const float*>(M->data());
+  const float* vPtr = static_cast<const float*>(v->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
   if (side_right) {
     CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
                              vPtr, 1, outPtr, ncol));
@@ -133,11 +364,11 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
 }
 template <>
 void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
-                             const float alpha, const Blob *A, const Blob *v,
-                             const float beta, Blob *out, Context *ctx) {
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+                             const float alpha, const Blob* A, const Blob* v,
+                             const float beta, Blob* out, Context* ctx) {
+  const float* APtr = static_cast<const float*>(A->data());
+  const float* vPtr = static_cast<const float*>(v->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   if (!trans)
     CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
@@ -152,16 +383,16 @@ template <>
 void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
                              const size_t nrowA, const size_t ncolB,
                              const size_t ncolA, const float alpha,
-                             const Blob *A, const Blob *B, const float beta,
-                             Blob *C, Context *ctx) {
+                             const Blob* A, const Blob* B, const float beta,
+                             Blob* C, Context* ctx) {
   auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
   auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
   int lda = transA ? nrowA : ncolA;
   int ldb = transB ? ncolA : ncolB;
   int ldc = ncolB;
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *BPtr = static_cast<const float *>(B->data());
-  float *CPtr = static_cast<float *>(C->mutable_data());
+  const float* APtr = static_cast<const float*>(A->data());
+  const float* BPtr = static_cast<const float*>(B->data());
+  float* CPtr = static_cast<float*>(C->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
@@ -171,4 +402,3 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
 
 #endif  // USE_CUDA
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
-
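
The GEMM specialization above absorbs cuBLAS's column-major convention without
moving any data: a row-major matrix reinterpreted as column-major is exactly
its transpose, so row-major C = A*B is obtained by asking cuBLAS for
C^T = B^T * A^T, i.e., by swapping the operand order and the m/n dimensions.
For the non-transposed case (row-major C: m x n, A: m x k, B: k x n) the call
above reduces to this sketch:

    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k,   // dimensions of C^T = B^T * A^T
                &alpha,
                B, n,      // cuBLAS reads B as the n-by-k matrix B^T
                A, k,      // cuBLAS reads A as the k-by-m matrix A^T
                &beta,
                C, n);     // result is C^T, i.e., C in row-major order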


[15/50] [abbrv] incubator-singa git commit: SINGA-181 Add NVCC supporting for .cu files

Posted by zh...@apache.org.
SINGA-181 Add NVCC supporting for .cu files

Use nvcc to compile math_kernel.cu. The output file is cuda_compile_generated_math_kernel.cu.o, which is linked to libsinga_core.so later.
Also fix some bugs/typos in source code.

Fix bugs in the kernel functions by using std::sqrt to disambiguate from cuda::sqrt;
this resolves a link error for libsinga_core.so where the kernel functions were not
included in the link argument list.

Fix compilation bugs.
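
The std::sqrt change is about name lookup rather than numerics: the kernels now
live inside namespace singa::cuda, which also declares a host-side wrapper
named cuda::sqrt, so an unqualified sqrt(...) in a kernel body would resolve to
that wrapper instead of the math function. A reduced sketch of the collision
(wrapper signature abbreviated, purely illustrative):

    #include <cmath>
    #include <cstddef>
    #include <cuda_runtime.h>

    namespace singa {
    namespace cuda {
    // Host-side wrapper, as invoked from tensor_math_cuda.h.
    void sqrt(size_t num, const float* in, float* out /*, stream */);

    __global__ void kernel_sqrt(const float* src, float* des, int n) {
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      int num_threads = blockDim.x * gridDim.x;
      for (; index < n; index += num_threads) {
        // Unqualified sqrt(...) would find singa::cuda::sqrt first;
        // std::sqrt unambiguously picks the math overload.
        des[index] = std::sqrt(src[index]);
      }
    }
    }  // namespace cuda
    }  // namespace singa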


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/668ae167
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/668ae167
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/668ae167

Branch: refs/heads/master
Commit: 668ae1679be1975fd153c30b522797988863dff7
Parents: d680079
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Wed May 25 23:10:43 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 26 14:11:18 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                   |  2 +-
 src/CMakeLists.txt               | 12 ++++++++++--
 src/core/tensor/math_kernel.cu   | 19 +++++++++++--------
 src/core/tensor/math_kernel.h    |  7 ++++++-
 src/model/metric/accuracy.h      |  1 +
 test/singa/test_cudnn_dropout.cc |  4 ++--
 6 files changed, 31 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/668ae167/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8cb42fb..e08fb98 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,6 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
 
 PROJECT(singa)
 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-#message(STATUS "${CMAKE_CXX_FLAGS}")
 
 LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty)
 #message(STATUS "module path: ${CMAKE_MODULE_PATH}")
@@ -12,6 +11,7 @@ IF(UNIX OR APPLE)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
 ENDIF()
 
+#message(STATUS "${CMAKE_CXX_FLAGS}")
 SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
 #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
 INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/668ae167/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 92e7fe5..df8b22b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,8 +18,16 @@ AUX_SOURCE_DIRECTORY(core/device core_source)
 AUX_SOURCE_DIRECTORY(core/memory core_source)
 AUX_SOURCE_DIRECTORY(core/scheduler core_source)
 AUX_SOURCE_DIRECTORY(core/tensor core_source)
-#message(STATUS "CORE ${core_source}")
-ADD_LIBRARY(singa_core SHARED ${core_source})
+FILE(GLOB_RECURSE cuda_source core "*.cu")
+set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
+set(CMAKE_CXX_FLAGS "")
+CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
+#message(STATUS "FLAGS ${CMAKE_CXX_FLAGS}")
+#message(STATUS "CORE ${cuda_source}")
+#message(STATUS "OBJ ${cuda_objs}")
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/core/tensor")
+set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
+ADD_LIBRARY(singa_core SHARED ${core_source} ${cuda_objs})
 TARGET_LINK_LIBRARIES(singa_core ${SINGA_LINKER_LIBS})
 LIST(APPEND SINGA_LINKER_LIBS singa_core)
 #MESSAGE(STATUS "link libs " ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/668ae167/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index 585d65d..30863a1 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -19,9 +19,11 @@
 *
 *************************************************************/
 
+#include "singa_config.h"
 #ifdef USE_CUDA
 #include <cmath>
 #include <algorithm>
+#include <cfloat>
 #include "./math_kernel.h"
 
 #define CU2DBLOCK_X 32
@@ -30,6 +32,7 @@
 #define CU1DBLOCK 1024
 #define CU1DBLOCKF 1024.0
 
+namespace singa{
 // Cuda Kernel Functions
 namespace cuda {
 __global__ void kernel_softmax_loss(const float *prob, const int *label,
@@ -38,7 +41,7 @@ __global__ void kernel_softmax_loss(const float *prob, const int *label,
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
     float prob_of_truth = prob[index * dim + label[index]];
-    loss[index] -= log(max(prob_of_truth, FLT_MIN));
+    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
   }
 }
 
@@ -52,7 +55,7 @@ __global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
   }
 }
 
-__global__ void kernel_sum_vec(float *data, float *sum, int n) {
+__global__ void kernel_sum_vec(const float *data, float *sum, int n) {
   int THREADS = blockDim.x;
 
   __shared__ float aux[CU1DBLOCK];
@@ -149,7 +152,7 @@ __global__ void kernel_exp(const float *src_data, float *des_data, int n) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
-    des_data[index] = exp(src_data[index]);
+    des_data[index] = std::exp(src_data[index]);
   }
 }
 
@@ -157,7 +160,7 @@ __global__ void kernel_log(const float *src_data, float *des_data, int n) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
-    des_data[index] = log(src_data[index]);
+    des_data[index] = std::log(src_data[index]);
   }
 }
 
@@ -242,7 +245,7 @@ __global__ void kernel_square_grad(const float *src_data, float *des_data,
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
-    des_data[index] = 2 * sqrt(src_data[index]);
+    des_data[index] = 2 * src_data[index];
   }
 }
 
@@ -250,7 +253,7 @@ __global__ void kernel_sqrt(const float *src_data, float *des_data, int n) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
-    des_data[index] = sqrt(src_data[index]);
+    des_data[index] = std::sqrt(src_data[index]);
   }
 }
 
@@ -259,7 +262,7 @@ __global__ void kernel_pow(const float *src_data_a, const float *src_data_b,
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
-    des_data[index] = pow(src_data_a[index], src_data_b[index]);
+    des_data[index] = std::pow(src_data_a[index], src_data_b[index]);
   }
 }
 
@@ -331,7 +334,7 @@ void sum_col(int rows, int cols, int stride, const float *in, float *out) {
   int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
   int num_blocks = rows;
 
-  kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data,
+  kernel_sum_col<<<num_blocks, threads_per_block>>>(in, out,
                                                     rows, cols, stride);
 }
 void add_row(int rows, int cols, int stride, const float *in_row,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/668ae167/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 7629ac8..f5da772 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -21,8 +21,11 @@
 #ifndef SRC_CORE_TENSOR__MATH_KERNEL_H_
 #define SRC_CORE_TENSOR__MATH_KERNEL_H_
 
-namespace singa {
 
+#include "singa_config.h"
+#ifdef USE_CUDA
+
+namespace singa {
 /*
   void softmaxloss_forward(int n, int dim, const float *prob,
       const int *label, float *loss);
@@ -77,6 +80,8 @@ void set_value(int n, float v, float *out);
 
 void threshold(int n, float alpha, const float *in, float *out);
 }  // cuda
+
 }  // namespace singa
 
+#endif
 #endif  // SRC_CORE_TENSOR__MATH_KERNEL_H_
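
After this change the header's skeleton looks as follows (declarations
abbreviated to the two shown in the hunk above), so CPU-only builds can
include it without pulling in any CUDA dependency:

    #ifndef SRC_CORE_TENSOR__MATH_KERNEL_H_
    #define SRC_CORE_TENSOR__MATH_KERNEL_H_

    #include "singa_config.h"
    #ifdef USE_CUDA

    namespace singa {
    namespace cuda {
    void set_value(int n, float v, float *out);
    void threshold(int n, float alpha, const float *in, float *out);
    }  // namespace cuda
    }  // namespace singa

    #endif  // USE_CUDA
    #endif  // SRC_CORE_TENSOR__MATH_KERNEL_H_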

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/668ae167/src/model/metric/accuracy.h
----------------------------------------------------------------------
diff --git a/src/model/metric/accuracy.h b/src/model/metric/accuracy.h
index 05c1643..fb23634 100644
--- a/src/model/metric/accuracy.h
+++ b/src/model/metric/accuracy.h
@@ -19,6 +19,7 @@
 #ifndef SINGA_MODEL_METRIC_ACCURACY_H_
 #define SINGA_MODEL_METRIC_ACCURACY_H_
 #include "singa/model/metric.h"
+#include <algorithm>
 namespace singa {
 
 /// Compute the accuracy of the prediction, which is matched against the

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/668ae167/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_dropout.cc b/test/singa/test_cudnn_dropout.cc
index 393d555..e1a6333 100644
--- a/test/singa/test_cudnn_dropout.cc
+++ b/test/singa/test_cudnn_dropout.cc
@@ -21,7 +21,7 @@
 #include "../src/model/layer/cudnn_dropout.h"
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
-//#if CUDNN_MAJOR_VERSION >= 5
+#if CUDNN_MAJOR_VERSION >= 5
 
 #include "gtest/gtest.h"
 
@@ -123,5 +123,5 @@ TEST(CudnnDropout, Backward) {
   EXPECT_FLOAT_EQ(dx[1], dy[1] * GetBitValue(mptr, 1) * scale);
   EXPECT_FLOAT_EQ(dx[7], dy[7] * GetBitValue(mptr, 7) * scale);
 }
-//#endif  // CUDNN_VERSION_MAJOR>=5
+#endif  // CUDNN_MAJOR_VERSION >= 5
 #endif  // USE_CUDNN


[09/50] [abbrv] incubator-singa git commit: SINGA-171 - Create CppDevice and CudaDevice

Posted by zh...@apache.org.
SINGA-171 - Create CppDevice and CudaDevice

Rename Device subclasses based on the programming language and hardware,
e.g., CppCPU indicates the device is a CPU which runs cpp code, CudaGPU
indicates the device is an Nvidia GPU which runs cuda code, and CudaCPU
indicates the device is a CPU which uses cuda to malloc and free pinned
memory for the CudaGPU.

Correspondingly, we rename the lib namespace to lang, and Device::type()
to lang().
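
A minimal usage sketch of the renamed API, assuming only the declarations
visible in the diff below (kCpp is one of the LangType values listed there;
CHECK is the usual glog-style macro, from <glog/logging.h> if not already
pulled in):

    #include "singa/core/device.h"

    void SanityCheck() {
      singa::CppCPU host;                 // was CppDevice; a CPU running cpp code
      CHECK(host.lang() == singa::kCpp);  // lang() replaces the old type()
      host.SetRandSeed(42);
      host.Sync();                        // wait for all submitted operations
    }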


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/9d1bcb42
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/9d1bcb42
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/9d1bcb42

Branch: refs/heads/master
Commit: 9d1bcb429a6f0a79426551a5fd42fdcadbf2f852
Parents: e3da6a5
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu May 19 17:00:01 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 19 17:08:36 2016 +0800

----------------------------------------------------------------------
 include/singa/core/common.h        |   5 +-
 include/singa/core/device.h        |  77 +++++++------
 include/singa/model/layer.h        |   9 +-
 include/singa/utils/cuda.h         |  94 ----------------
 include/singa/utils/cuda_utils.h   |  94 ++++++++++++++++
 src/core/device/cpp_cpu.cc         |  47 ++++++++
 src/core/device/cpp_device.cc      |  47 --------
 src/core/device/cuda_device.cc     | 157 ---------------------------
 src/core/device/cuda_gpu.cc        | 159 +++++++++++++++++++++++++++
 src/core/device/device.cc          |   2 +-
 src/core/tensor/tensor.cc          | 185 +++++++++++++++-----------------
 src/core/tensor/tensor_math.h      | 106 +++++++-----------
 src/core/tensor/tensor_math_cpp.h  |  56 +++++-----
 src/core/tensor/tensor_math_cuda.h |   2 +-
 src/proto/core.proto               |   2 +-
 test/singa/test_cpp_cpu.cc         |  71 ++++++++++++
 test/singa/test_cpp_device.cc      |  71 ------------
 test/singa/test_cudnn_dropout.cc   |   8 +-
 test/singa/test_tensor.cc          |   6 +-
 19 files changed, 588 insertions(+), 610 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 0fa301a..61c1c41 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -32,16 +32,15 @@
 #endif
 
 namespace singa {
-namespace lib {
+namespace lang {
 /// To implement functions using cpp libraries
 typedef struct _Cpp { } Cpp;
 /// To implement functions using cuda libraries
 typedef struct _Cuda { } Cuda;
 /// To implement functions using opencl libraries
 typedef struct _Opencl { } Opencl;
-}  // namespace lib
+}  // namespace lang
 
-typedef unsigned char Byte;
 /// Blob represents a chunk of memory (on device or host) managed by VirtualMemory.
 class Blob {
  public:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 29b7677..a67b564 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -33,33 +33,12 @@ using std::vector;
 using std::string;
 using std::function;
 namespace singa {
-/// The base type of callback argument structure.
-/// The specific arg should inherit from this one.
-class CallbackArg {
- public:
-  template <typename T>
-  T* CastTo() {
-    static_assert(std::is_base_of<CallbackArg, T>::value,
-                  "The casted type must be a sub-class of CallbackArg");
-    return static_cast<T*>(this);
-  }
-};
-/// Type of callback functions for executing tensor ops.
-typedef function<void(CallbackArg*)> CallbackFn;
 
 /// Allocate memory and execute Tensor operations.
 /// There are three types of devices distinguished by their programming
 /// languages, namely cpp, cuda and opencl.
 class Device {
- public:
-  /// Operation has a function, and read/write blobs.
-  typedef struct _Operation {
-    function<void(Context*)> fn;
-    const vector<Blob*> read_blobs;
-    const vector<Blob*> write_blobs;
-  } Operation;
-
- public:
+  public:
   Device() = default;
   /// Constructor with device ID, num of executors (e.g., cuda streams),
   /// max mem size to use (in MB), identifier of scheduler type (default
@@ -92,11 +71,14 @@ class Device {
   /// wait for all operations submitted to this device.
   void Sync();
 
-  DeviceType type() const {
-    return device_type_;
+  /// Return the programming language for this device.
+  LangType lang() const {
+    return lang_;
   }
 
+  /// TODO(wangwei) remove it?
   Device* host() const { return host_; }
+
   int id() const { return id_; }
 
  protected:
@@ -118,18 +100,19 @@ class Device {
   unsigned seed_ = 0;
   // Scheduler* scheduler_ = nullptr;
   // VirtualMemory* vm_ = nullptr;
-  /// could be kCpp, kCuda, kOpencl
-  DeviceType device_type_;
+  /// Programming language type, could be kCpp, kCuda, kOpencl
+  LangType lang_;
   // SafeQueue<Operation> op_queue_;
   // SafeQueue<Operation> op_log_;
   /// The host device
   Device* host_;
 };
 
-// Implement Device functions using cpp.
-class CppDevice : public Device {
+/// Represent a CPU device which may have multiple threads/executors.
+/// It runs cpp code.
+class CppCPU : public Device {
  public:
-  CppDevice(int id, int num_executors = 1,
+  CppCPU(int id = -1, int num_executors = 1,
             string scheduler = "sync", string vm = "gc-only");
 
   void SetRandSeed(unsigned seed) override;
@@ -150,17 +133,17 @@ class CppDevice : public Device {
 };
 
 /// a singleton CppDevice as the host for all devices.
-extern CppDevice hostDeviceSingleton;
+extern CppCPU defaultDevice;
 
 // Implement Device using OpenCL libs.
 // class OpenclDevice : public Device { };
 
 #ifdef USE_CUDA
-// Implement Device using cuda.
-class CudaDevice : public Device {
+// Represent a Nvidia GPU which runs cuda code.
+class CudaGPU : public Device {
  public:
-  ~CudaDevice();
-  CudaDevice(int id, int num_executors = 1, string scheduler = "sync",
+  ~CudaGPU();
+  CudaGPU(int id = -1, int num_executors = 1, string scheduler = "sync",
          string vm = "gc-only");
 
   void SetRandSeed(unsigned seed) override;
@@ -200,11 +183,37 @@ class CudaDevice : public Device {
   Context ctx_;
 };
 
+/// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
+
 #endif  // USE_CUDA
 
 // Implement a CudaHost device, which uses cuda functions for memory
 // malloc/free.
 // class CudaHost : public Device {}
+//
+/// The base type of callback argument structure.
+/// The specific arg should inherit from this one.
+/*
+class CallbackArg {
+ public:
+  template <typename T>
+  T* CastTo() {
+    static_assert(std::is_base_of<CallbackArg, T>::value,
+                  "The casted type must be a sub-class of CallbackArg");
+    return static_cast<T*>(this);
+  }
+};
+/// Type of callback functions for executing tensor ops.
+typedef function<void(CallbackArg*)> CallbackFn;
+public:
+  /// Operation has a function, and read/write blobs.
+  typedef struct _Operation {
+    function<void(Context*)> fn;
+    const vector<Blob*> read_blobs;
+    const vector<Blob*> write_blobs;
+  } Operation;
+
+*/
 }  // namespace singa
 
 #endif  // SINGA_CORE_DEVICE_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 050236a..084c42e 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -36,7 +36,7 @@ class Layer {
   Layer() = default;
 
   /// Set meta data fields from a string representing a proto message.
-  void Setup(const string& proto_str) {
+    void Setup(const string& proto_str) {
     LayerConf conf;
     conf.ParseFromString(proto_str);
     this->Setup(conf);
@@ -55,6 +55,13 @@ class Layer {
   virtual const std::string layer_type() const { return "Unknown"; }
 
   /// Set meta data fields configured in 'conf' (a proto message).
+  /// For some layers that use input tensor shapes to set their parameter
+  /// shapes (e.g., dense layer and convolution layer), users or wrapper
+  /// functions need to configure the necessary fields inside LayerConf.
+  /// After calling Setup, the shape info of parameters can be accessed
+  /// correctly. All other info that depends on input tensors (e.g., batchsize)
+  /// should be set inside Forward(). Internal buffer/fields are set assuming
+  /// batchsize is 1.
   virtual void Setup(const LayerConf& conf) {
     name_ = conf.name();
     for (const auto& spec : conf.param()) param_specs_.push_back(spec);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/include/singa/utils/cuda.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda.h b/include/singa/utils/cuda.h
deleted file mode 100644
index b2bb5c5..0000000
--- a/include/singa/utils/cuda.h
+++ /dev/null
@@ -1,94 +0,0 @@
-// from caffe include/caffe/util/device_alternative.hpp
-
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-//
-// CUDA macros
-//
-
-// CUDA: various checks for different function calls.
-#define CUDA_CHECK(condition) \
-  /* Code block avoids redefinition of cudaError_t error */ \
-  do { \
-    cudaError_t error = condition; \
-    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
-  } while (0)
-
-#define CUBLAS_CHECK(condition) \
-  do { \
-    cublasStatus_t status = condition; \
-    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
-      << cublasGetErrorString(status); \
-  } while (0)
-
-#define CURAND_CHECK(condition) \
-  do { \
-    curandStatus_t status = condition; \
-    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
-      << curandGetErrorString(status); \
-  } while (0)
-
-const char* cublasGetErrorString(cublasStatus_t error) {
-  switch (error) {
-  case CUBLAS_STATUS_SUCCESS:
-    return "CUBLAS_STATUS_SUCCESS";
-  case CUBLAS_STATUS_NOT_INITIALIZED:
-    return "CUBLAS_STATUS_NOT_INITIALIZED";
-  case CUBLAS_STATUS_ALLOC_FAILED:
-    return "CUBLAS_STATUS_ALLOC_FAILED";
-  case CUBLAS_STATUS_INVALID_VALUE:
-    return "CUBLAS_STATUS_INVALID_VALUE";
-  case CUBLAS_STATUS_ARCH_MISMATCH:
-    return "CUBLAS_STATUS_ARCH_MISMATCH";
-  case CUBLAS_STATUS_MAPPING_ERROR:
-    return "CUBLAS_STATUS_MAPPING_ERROR";
-  case CUBLAS_STATUS_EXECUTION_FAILED:
-    return "CUBLAS_STATUS_EXECUTION_FAILED";
-  case CUBLAS_STATUS_INTERNAL_ERROR:
-    return "CUBLAS_STATUS_INTERNAL_ERROR";
-#if CUDA_VERSION >= 6000
-  case CUBLAS_STATUS_NOT_SUPPORTED:
-    return "CUBLAS_STATUS_NOT_SUPPORTED";
-#endif
-#if CUDA_VERSION >= 6050
-  case CUBLAS_STATUS_LICENSE_ERROR:
-    return "CUBLAS_STATUS_LICENSE_ERROR";
-#endif
-  }
-  return "Unknown cublas status";
-}
-
-const char* curandGetErrorString(curandStatus_t error) {
-  switch (error) {
-  case CURAND_STATUS_SUCCESS:
-    return "CURAND_STATUS_SUCCESS";
-  case CURAND_STATUS_VERSION_MISMATCH:
-    return "CURAND_STATUS_VERSION_MISMATCH";
-  case CURAND_STATUS_NOT_INITIALIZED:
-    return "CURAND_STATUS_NOT_INITIALIZED";
-  case CURAND_STATUS_ALLOCATION_FAILED:
-    return "CURAND_STATUS_ALLOCATION_FAILED";
-  case CURAND_STATUS_TYPE_ERROR:
-    return "CURAND_STATUS_TYPE_ERROR";
-  case CURAND_STATUS_OUT_OF_RANGE:
-    return "CURAND_STATUS_OUT_OF_RANGE";
-  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-  case CURAND_STATUS_LAUNCH_FAILURE:
-    return "CURAND_STATUS_LAUNCH_FAILURE";
-  case CURAND_STATUS_PREEXISTING_FAILURE:
-    return "CURAND_STATUS_PREEXISTING_FAILURE";
-  case CURAND_STATUS_INITIALIZATION_FAILED:
-    return "CURAND_STATUS_INITIALIZATION_FAILED";
-  case CURAND_STATUS_ARCH_MISMATCH:
-    return "CURAND_STATUS_ARCH_MISMATCH";
-  case CURAND_STATUS_INTERNAL_ERROR:
-    return "CURAND_STATUS_INTERNAL_ERROR";
-  }
-  return "Unknown curand status";
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/include/singa/utils/cuda_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h
new file mode 100644
index 0000000..b2bb5c5
--- /dev/null
+++ b/include/singa/utils/cuda_utils.h
@@ -0,0 +1,94 @@
+// from caffe include/caffe/util/device_alternative.hpp
+
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+//
+// CUDA macros
+//
+
+// CUDA: various checks for different function calls.
+#define CUDA_CHECK(condition) \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do { \
+    cudaError_t error = condition; \
+    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
+  } while (0)
+
+#define CUBLAS_CHECK(condition) \
+  do { \
+    cublasStatus_t status = condition; \
+    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
+      << cublasGetErrorString(status); \
+  } while (0)
+
+#define CURAND_CHECK(condition) \
+  do { \
+    curandStatus_t status = condition; \
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
+      << curandGetErrorString(status); \
+  } while (0)
+
+const char* cublasGetErrorString(cublasStatus_t error) {
+  switch (error) {
+  case CUBLAS_STATUS_SUCCESS:
+    return "CUBLAS_STATUS_SUCCESS";
+  case CUBLAS_STATUS_NOT_INITIALIZED:
+    return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_ALLOC_FAILED:
+    return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_INVALID_VALUE:
+    return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_ARCH_MISMATCH:
+    return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_MAPPING_ERROR:
+    return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_EXECUTION_FAILED:
+    return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
+#if CUDA_VERSION >= 6000
+  case CUBLAS_STATUS_NOT_SUPPORTED:
+    return "CUBLAS_STATUS_NOT_SUPPORTED";
+#endif
+#if CUDA_VERSION >= 6050
+  case CUBLAS_STATUS_LICENSE_ERROR:
+    return "CUBLAS_STATUS_LICENSE_ERROR";
+#endif
+  }
+  return "Unknown cublas status";
+}
+
+const char* curandGetErrorString(curandStatus_t error) {
+  switch (error) {
+  case CURAND_STATUS_SUCCESS:
+    return "CURAND_STATUS_SUCCESS";
+  case CURAND_STATUS_VERSION_MISMATCH:
+    return "CURAND_STATUS_VERSION_MISMATCH";
+  case CURAND_STATUS_NOT_INITIALIZED:
+    return "CURAND_STATUS_NOT_INITIALIZED";
+  case CURAND_STATUS_ALLOCATION_FAILED:
+    return "CURAND_STATUS_ALLOCATION_FAILED";
+  case CURAND_STATUS_TYPE_ERROR:
+    return "CURAND_STATUS_TYPE_ERROR";
+  case CURAND_STATUS_OUT_OF_RANGE:
+    return "CURAND_STATUS_OUT_OF_RANGE";
+  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+  case CURAND_STATUS_LAUNCH_FAILURE:
+    return "CURAND_STATUS_LAUNCH_FAILURE";
+  case CURAND_STATUS_PREEXISTING_FAILURE:
+    return "CURAND_STATUS_PREEXISTING_FAILURE";
+  case CURAND_STATUS_INITIALIZATION_FAILED:
+    return "CURAND_STATUS_INITIALIZATION_FAILED";
+  case CURAND_STATUS_ARCH_MISMATCH:
+    return "CURAND_STATUS_ARCH_MISMATCH";
+  case CURAND_STATUS_INTERNAL_ERROR:
+    return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+  return "Unknown curand status";
+}
+

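As a usage sketch, the check macros wrap raw CUDA/cuBLAS calls so that any non-success status aborts with a readable message. They assume the CHECK_EQ logging macro is in scope; the calls below are standard CUDA runtime and cuBLAS APIs:

#include "singa/utils/cuda_utils.h"

// Scale a device vector in place: x = alpha * x.
void ScaleOnGpu(float* dptr, int n, float alpha) {
  cublasHandle_t handle;
  CUBLAS_CHECK(cublasCreate(&handle));
  CUBLAS_CHECK(cublasSscal(handle, n, &alpha, dptr, 1));
  CUBLAS_CHECK(cublasDestroy(handle));
}

// Allocate nBytes of device memory, aborting on failure.
void AllocOnGpu(float** dptr, size_t nBytes) {
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(dptr), nBytes));
}
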
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc
new file mode 100644
index 0000000..3287911
--- /dev/null
+++ b/src/core/device/cpp_cpu.cc
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa/core/device.h"
+namespace singa {
+CppCPU defaultDevice(-1, 1);
+CppCPU::CppCPU(int id, int num_executors, string scheduler,
+         string vm) : Device(id, num_executors, scheduler, vm) {
+  lang_ = kCpp;
+  host_ = nullptr;
+}
+
+void CppCPU::SetRandSeed(unsigned seed) {
+  ctx_.random_generator.seed(seed);
+}
+void CppCPU::DoExec(function<void(Context*)>&& fn, int executor) {
+  CHECK_EQ(executor, 0);
+  fn(&ctx_);
+}
+
+void* CppCPU::Malloc(int size) {
+  return malloc(size);
+}
+
+void CppCPU::Free(void* ptr) {
+  free(ptr);
+}
+
+void CppCPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
+                           CopyDirection direction, Context* ctx) {
+  memcpy(dst, src, nBytes);
+}
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/device/cpp_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_device.cc b/src/core/device/cpp_device.cc
deleted file mode 100644
index 763156c..0000000
--- a/src/core/device/cpp_device.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "singa/core/device.h"
-namespace singa {
-CppDevice hostDeviceSingleton(-1, 1);
-CppDevice::CppDevice(int id, int num_executors, string scheduler,
-         string vm) : Device(id, num_executors, scheduler, vm) {
-  device_type_ = kCpp;
-  host_ = nullptr;
-}
-
-void CppDevice::SetRandSeed(unsigned seed) {
-  ctx_.random_generator.seed(seed);
-}
-void CppDevice::DoExec(function<void(Context*)>&& fn, int executor) {
-  CHECK_EQ(executor, 0);
-  fn(&ctx_);
-}
-
-void* CppDevice::Malloc(int size) {
-  return malloc(size);
-}
-
-void CppDevice::Free(void* ptr) {
-  free(ptr);
-}
-
-void CppDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
-                           CopyDirection direction, Context* ctx) {
-  memcpy(dst, src, nBytes);
-}
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/device/cuda_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_device.cc b/src/core/device/cuda_device.cc
deleted file mode 100644
index 9be1a6e..0000000
--- a/src/core/device/cuda_device.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifdef USE_CUDA
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <curand.h>
-#include <chrono>
-
-#include "singa/core/device.h"
-#include "singa/utils/cuda.h"
-namespace singa {
-
-const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
-                                   cudaMemcpyDeviceToHost,
-                                   cudaMemcpyDeviceToDevice};
-
-CudaDevice::~CudaDevice() {
-  if (ctx_.cublas_handle)
-    CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
-  if (ctx_.curand_generator)
-    CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
-#ifdef USE_CUDNN
-  if (ctx_.cudnn_handle) {
-    auto status = cudnnDestroy(ctx_.cudnn_handle);
-    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
-  }
-#endif
-}
-
-CudaDevice::CudaDevice(int id, int num_executors,
-                       string scheduler, string vm)
-    : Device(id, num_executors, scheduler, vm) {
-  device_type_ = kCuda;
-  host_ = nullptr;  // TODO(wangwei) add host device
-  ctx_.stream = NULL;  // use the default sync stream
-  // TODO(wangwei) create one handle for each steam?
-  CUDA_CHECK(cudaSetDevice(FindDevice(0)));
-  // use curandCreateGeneratorHost for CudaHost device
-  CURAND_CHECK(
-      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
-  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
-  SetRandSeed(seed);
-  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
-  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
-  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
-
-#ifdef USE_CUDNN
-  // TODO(wangwei) create one handle for each stream?
-  auto status = cudnnCreate(&ctx_.cudnn_handle);
-  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
-#endif  // USE_CUDNN
-}
-
-void CudaDevice::SetRandSeed(unsigned seed) {
-  CHECK(ctx_.curand_generator);
-  CURAND_CHECK(
-      curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
-}
-
-void CudaDevice::DoExec(function<void(Context*)>&& fn, int executor) {
-  fn(&ctx_);
-}
-
-void CudaDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
-                            CopyDirection direction, Context* ctx) {
-  cudaMemcpy(dst, src, nBytes, copyKind[direction]);
-  // TODO(wangwei) use async copy
-  // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
-}
-
-/// Allocate cpu memory.
-void* CudaDevice::Malloc(int size) {
-  void* ptr = nullptr;
-  CUDA_CHECK(cudaMalloc(&ptr, size));
-  return ptr;
-}
-
-  /// Free cpu memory.
-void CudaDevice::Free(void* ptr) {
-  CHECK_NE(ptr, nullptr);
-  CUDA_CHECK(cudaFree(ptr));
-}
-
-
-// ==========Following code is from Caffe src/caffe/common.cpp=================
-
-void CudaDevice::DeviceQuery() {
-  cudaDeviceProp prop;
-  int device;
-  if (cudaSuccess != cudaGetDevice(&device)) {
-    printf("No cuda device present.\n");
-    return;
-  }
-  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-  LOG(INFO) << "Device id:                     " << device;
-  LOG(INFO) << "Major revision number:         " << prop.major;
-  LOG(INFO) << "Minor revision number:         " << prop.minor;
-  LOG(INFO) << "Name:                          " << prop.name;
-  LOG(INFO) << "Total global memory:           " << prop.totalGlobalMem;
-  LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock;
-  LOG(INFO) << "Total registers per block:     " << prop.regsPerBlock;
-  LOG(INFO) << "Warp size:                     " << prop.warpSize;
-  LOG(INFO) << "Maximum memory pitch:          " << prop.memPitch;
-  LOG(INFO) << "Maximum threads per block:     " << prop.maxThreadsPerBlock;
-  LOG(INFO) << "Maximum dimension of block:    "
-      << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
-      << prop.maxThreadsDim[2];
-  LOG(INFO) << "Maximum dimension of grid:     "
-      << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
-      << prop.maxGridSize[2];
-  LOG(INFO) << "Clock rate:                    " << prop.clockRate;
-  LOG(INFO) << "Total constant memory:         " << prop.totalConstMem;
-  LOG(INFO) << "Texture alignment:             " << prop.textureAlignment;
-  LOG(INFO) << "Concurrent copy and execution: "
-      << (prop.deviceOverlap ? "Yes" : "No");
-  LOG(INFO) << "Number of multiprocessors:     " << prop.multiProcessorCount;
-  LOG(INFO) << "Kernel execution timeout:      "
-      << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
-  return;
-}
-
-bool CudaDevice::CheckDevice(const int device_id) {
-  bool r = ((cudaSuccess == cudaSetDevice(device_id)) &&
-            (cudaSuccess == cudaFree(0)));
-  // reset any error that may have occurred.
-  cudaGetLastError();
-  return r;
-}
-
-int CudaDevice::FindDevice(const int start_id) {
-  int count = 0;
-  CUDA_CHECK(cudaGetDeviceCount(&count));
-  for (int i = start_id; i < count; i++) {
-    if (CheckDevice(i)) return i;
-  }
-  return -1;
-}
-
-
-}  // namespace singa
-#endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
new file mode 100644
index 0000000..8eafc4c
--- /dev/null
+++ b/src/core/device/cuda_gpu.cc
@@ -0,0 +1,159 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef USE_CUDA
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <chrono>
+
+#include "singa/core/device.h"
+#include "singa/utils/cuda_utils.h"
+namespace singa {
+
+const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
+                                   cudaMemcpyDeviceToHost,
+                                   cudaMemcpyDeviceToDevice};
+
+CudaGPU::~CudaGPU() {
+  if (ctx_.cublas_handle)
+    CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
+  if (ctx_.curand_generator)
+    CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
+#ifdef USE_CUDNN
+  if (ctx_.cudnn_handle) {
+    auto status = cudnnDestroy(ctx_.cudnn_handle);
+    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+  }
+#endif
+}
+
+CudaGPU::CudaGPU(int id, int num_executors,
+                       string scheduler, string vm)
+    : Device(id, num_executors, scheduler, vm) {
+  if (id == -1)
+    id = FindDevice(0);
+  lang_ = kCuda;
+  host_ = nullptr;  // TODO(wangwei) add host device
+  ctx_.stream = NULL;  // use the default sync stream
+  // TODO(wangwei) create one handle for each stream?
+  CUDA_CHECK(cudaSetDevice(id));
+  // use curandCreateGeneratorHost for CudaHost device
+  CURAND_CHECK(
+      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+  SetRandSeed(seed);
+  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
+
+#ifdef USE_CUDNN
+  // TODO(wangwei) create one handle for each stream?
+  auto status = cudnnCreate(&ctx_.cudnn_handle);
+  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif  // USE_CUDNN
+}
+
+void CudaGPU::SetRandSeed(unsigned seed) {
+  CHECK(ctx_.curand_generator);
+  CURAND_CHECK(
+      curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
+}
+
+void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) {
+  fn(&ctx_);
+}
+
+void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
+                            CopyDirection direction, Context* ctx) {
+  cudaMemcpy(dst, src, nBytes, copyKind[direction]);
+  // TODO(wangwei) use async copy
+  // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
+}
+
+/// Allocate GPU memory.
+void* CudaGPU::Malloc(int size) {
+  void* ptr = nullptr;
+  CUDA_CHECK(cudaMalloc(&ptr, size));
+  return ptr;
+}
+
+/// Free GPU memory.
+void CudaGPU::Free(void* ptr) {
+  CHECK_NE(ptr, nullptr);
+  CUDA_CHECK(cudaFree(ptr));
+}
+
+
+// ==========Following code is from Caffe src/caffe/common.cpp=================
+
+void CudaGPU::DeviceQuery() {
+  cudaDeviceProp prop;
+  int device;
+  if (cudaSuccess != cudaGetDevice(&device)) {
+    printf("No cuda device present.\n");
+    return;
+  }
+  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+  LOG(INFO) << "Device id:                     " << device;
+  LOG(INFO) << "Major revision number:         " << prop.major;
+  LOG(INFO) << "Minor revision number:         " << prop.minor;
+  LOG(INFO) << "Name:                          " << prop.name;
+  LOG(INFO) << "Total global memory:           " << prop.totalGlobalMem;
+  LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock;
+  LOG(INFO) << "Total registers per block:     " << prop.regsPerBlock;
+  LOG(INFO) << "Warp size:                     " << prop.warpSize;
+  LOG(INFO) << "Maximum memory pitch:          " << prop.memPitch;
+  LOG(INFO) << "Maximum threads per block:     " << prop.maxThreadsPerBlock;
+  LOG(INFO) << "Maximum dimension of block:    "
+      << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
+      << prop.maxThreadsDim[2];
+  LOG(INFO) << "Maximum dimension of grid:     "
+      << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
+      << prop.maxGridSize[2];
+  LOG(INFO) << "Clock rate:                    " << prop.clockRate;
+  LOG(INFO) << "Total constant memory:         " << prop.totalConstMem;
+  LOG(INFO) << "Texture alignment:             " << prop.textureAlignment;
+  LOG(INFO) << "Concurrent copy and execution: "
+      << (prop.deviceOverlap ? "Yes" : "No");
+  LOG(INFO) << "Number of multiprocessors:     " << prop.multiProcessorCount;
+  LOG(INFO) << "Kernel execution timeout:      "
+      << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
+  return;
+}
+
+bool CudaGPU::CheckDevice(const int device_id) {
+  bool r = ((cudaSuccess == cudaSetDevice(device_id)) &&
+            (cudaSuccess == cudaFree(0)));
+  // reset any error that may have occurred.
+  cudaGetLastError();
+  return r;
+}
+
+int CudaGPU::FindDevice(const int start_id) {
+  int count = 0;
+  CUDA_CHECK(cudaGetDeviceCount(&count));
+  for (int i = start_id; i < count; i++) {
+    if (CheckDevice(i)) return i;
+  }
+  return -1;
+}
+
+
+}  // namespace singa
+#endif  // USE_CUDA

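A brief usage sketch for the renamed device class, guarded by USE_CUDA. Passing id = -1 lets the constructor pick the first usable GPU via FindDevice; the scheduler/vm arguments are assumed to take defaults as on CppCPU:

#ifdef USE_CUDA
#include "singa/core/device.h"

void ProbeGpu() {
  singa::CudaGPU gpu(-1, 1);          // resolves a usable device id internally
  gpu.DeviceQuery();                  // logs device properties via LOG(INFO)
  singa::Blob* b = gpu.NewBlob(64);   // 64 bytes of device memory
  gpu.FreeBlob(b);
}
#endif  // USE_CUDA
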
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 205601b..cd860db 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -64,7 +64,7 @@ void Device::CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
 
 void Device::CopyDataFromHostPtr(Blob* dst, const void* src, size_t nBytes,
                                  size_t dst_offset) {
-  auto direct = device_type_ == kCpp ? kHostToHost : kHostToDevice;
+  auto direct = lang_ == kCpp ? kHostToHost : kHostToDevice;
   void* dstptr = reinterpret_cast<char*>(dst->mutable_data()) + dst_offset;
   Exec([this, dstptr, src, nBytes,
         direct](Context* ctx) { CopyToFrom(dstptr, src, nBytes, direct, ctx); },

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index fac846c..185b1f9 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -25,23 +25,20 @@
 namespace singa {
 
 Tensor::~Tensor() {
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
   blob_ = nullptr;
 }
 
-Tensor::Tensor() {
-  device_ = &hostDeviceSingleton;
-}
+Tensor::Tensor() { device_ = &defaultDevice; }
 
 Tensor::Tensor(const Shape& shape, DataType dtype)
-    : data_type_(dtype), device_(&hostDeviceSingleton), shape_(shape) {
-  device_ = &hostDeviceSingleton;
+    : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
+  device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
 Tensor::Tensor(Shape&& shape, DataType dtype)
-    : data_type_(dtype), device_(&hostDeviceSingleton), shape_(shape) {
-  device_ = &hostDeviceSingleton;
+    : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
+  device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
 Tensor::Tensor(const Shape& shape, Device* device, DataType dtype)
@@ -82,8 +79,7 @@ void Tensor::ResetLike(const Tensor& t) {
 
 void Tensor::ReShape(const Shape& shape) {
   if (shape_ != shape) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
     shape_ = shape;
   }
@@ -91,8 +87,7 @@ void Tensor::ReShape(const Shape& shape) {
 
 void Tensor::AsType(DataType type) {
   if (data_type_ != type) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape_) * SizeOf(type));
     data_type_ = type;
   }
@@ -103,17 +98,14 @@ void Tensor::ToDevice(Device* dst) {
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
     tmp.CopyData(*this);
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = tmp.blob_;
     tmp.blob_ = nullptr;
     device_ = dst;
   }
 }
 
-void Tensor::ToHost() {
-  ToDevice(device_->host());
-}
+void Tensor::ToHost() { ToDevice(device_->host()); }
 
 template <typename DType>
 void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) {
@@ -153,8 +145,7 @@ Tensor Tensor::T() const {
 }
 
 Tensor& Tensor::operator=(const Tensor& t) {
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
   data_type_ = t.data_type_;
   shape_ = t.shape_;
@@ -165,8 +156,7 @@ Tensor& Tensor::operator=(const Tensor& t) {
 }
 
 Tensor& Tensor::operator=(Tensor&& t) {
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
   data_type_ = t.data_type_;
   shape_ = std::move(t.shape_);
@@ -177,7 +167,10 @@ Tensor& Tensor::operator=(Tensor&& t) {
 }
 
 #define GenUnaryTensorArgMemberFunction(op, fn) \
-  Tensor& Tensor::op(const Tensor& t) { fn(*this, t, this); return *this; }
+  Tensor& Tensor::op(const Tensor& t) {         \
+    fn(*this, t, this);                         \
+    return *this;                               \
+  }
 
 GenUnaryTensorArgMemberFunction(operator+=, Add);
 GenUnaryTensorArgMemberFunction(operator-=, Sub);
@@ -210,19 +203,19 @@ void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num,
 
   Device *src_dev = src.device(), *dst_dev = dst->device();
   Blob *from = src.blob(), *to = dst->blob();
-  if (dst_dev->type() != src_dev->type()) {
+  if (dst_dev->lang() != src_dev->lang()) {
     // let the non-Cpp device conduct the copy op
-    if (dst_dev->type() == kCpp) {
+    if (dst_dev->lang() == kCpp) {
       src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, dst_offset,
                               src_offset);
-    } else if (src_dev->type() == kCpp) {
+    } else if (src_dev->lang() == kCpp) {
       dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, dst_offset,
                               src_offset);
     } else {
       LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
     }
   } else {
-    auto direct = src_dev->type() == kCpp ? kHostToHost : kDeviceToDevice;
+    auto direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
     src_dev->CopyDataToFrom(to, from, nBytes, direct, dst_offset, src_offset);
   }
 }
@@ -252,49 +245,49 @@ void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num,
     }                                                               \
   } while (0)
 
-/// typedef DType and Dev according to values of type and lib respectively.
-/// type is from DataType, and lib is from DevType.
-/// DType and Dev would be used in __VA_ARGS__.
-#define TYPE_LIB_SWITCH(dtype, DType, dev, Dev, ...)        \
-  do {                                                        \
-    const int _SwitchShift = 3;                               \
-    int _SwitchHash = ((dtype) << _SwitchShift) + (dev);    \
-    switch (_SwitchHash) {                                    \
-      case ((kFloat32 << _SwitchShift) + kCuda): {            \
-        typedef float DType;                                  \
-        typedef lib::Cuda Dev;                                \
-        { __VA_ARGS__ }                                       \
-        break;                                                \
-      }                                                       \
-      case ((kFloat32 << _SwitchShift) + kCpp): {             \
-        typedef float DType;                                  \
-        typedef lib::Cpp Dev;                                 \
-        { __VA_ARGS__ }                                       \
-        break;                                                \
-      }                                                       \
-      case ((kFloat32 << _SwitchShift) + kOpencl): {          \
-        typedef float DType;                                  \
-        typedef lib::Opencl Dev;                              \
-        { __VA_ARGS__ }                                       \
-        break;                                                \
-      }                                                       \
-      default:                                                \
-        LOG(FATAL) << "Unknown combination of data type "     \
-                   << DataType_Name(dtype) << " and library " \
-                   << DeviceType_Name(dev);                    \
-    }                                                         \
+/// typedef DType and Lang according to data type and device programming
+/// language respectively.
+/// type is from DataType, and lang is from LangType.
+/// DType and Lang would be used in __VA_ARGS__.
+#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)       \
+  do {                                                         \
+    const int _SwitchShift = 3;                                \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);     \
+    switch (_SwitchHash) {                                     \
+      case ((kFloat32 << _SwitchShift) + kCuda): {             \
+        typedef float DType;                                   \
+        typedef lang::Cuda Lang;                               \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kCpp): {              \
+        typedef float DType;                                   \
+        typedef lang::Cpp Lang;                                \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kOpencl): {           \
+        typedef float DType;                                   \
+        typedef lang::Opencl Lang;                             \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      default:                                                 \
+        LOG(FATAL) << "Unknown combination of data type "      \
+                   << DataType_Name(dtype) << " and language " \
+                   << LangType_Name(ltype);                    \
+    }                                                          \
   } while (0)
 
-
-#define EltwiseUnaryTensorFn(fn, t, ret)                                   \
-  do {                                                                     \
-    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->type(), Dev, { \
-      ret->device()->Exec(                                               \
-          [t, ret](Context* ctx) {                                         \
-            fn<DType, Dev>(t.Size(), t.blob(), ret->blob(), ctx);          \
-          },                                                               \
-          {t.blob()}, {ret->blob()});                                      \
-    });                                                                    \
+#define EltwiseUnaryTensorFn(fn, t, ret)                               \
+  do {                                                                 \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
+      ret->device()->Exec(                                             \
+          [t, ret](Context* ctx) {                                     \
+            fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);     \
+          },                                                           \
+          {t.blob()}, {ret->blob()});                                  \
+    });                                                                \
   } while (0)
 
 #define GenUnaryTensorFunction(fn)                    \
@@ -329,26 +322,26 @@ void Softmax(const Tensor& t, Tensor* ret, int axis) {
     CHECK_EQ(size % nrow, 0) << "Size = " << size << " nrow = " << nrow;
     ncol = size / nrow;
   }
-  TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->type(), Dev, {
+  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
     ret->device()->Exec(
         [nrow, ncol, t, ret](Context* ctx) {
-          Softmax<DType, Dev>(nrow, ncol, t.blob(), ret->blob(), ctx);
+          Softmax<DType, Lang>(nrow, ncol, t.blob(), ret->blob(), ctx);
         },
         {t.blob()}, {ret->blob()});
-    });
+  });
 }
 
-#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
-  do {                                                                         \
-    TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->type(), Dev, { \
-      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
-      ret->device()->Exec(                                                     \
-          [lhs, rhs, ret](Context* ctx) {                                      \
-            fn<DType, Dev>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),    \
-                           ctx);                                               \
-          },                                                                   \
-          {lhs.blob(), rhs.blob()}, {ret->blob()});                            \
-    });                                                                        \
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                             \
+  do {                                                                       \
+    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {   \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                      \
+      ret->device()->Exec(                                                   \
+          [lhs, rhs, ret](Context* ctx) {                                    \
+            fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), \
+                            ctx);                                            \
+          },                                                                 \
+          {lhs.blob(), rhs.blob()}, {ret->blob()});                          \
+    });                                                                      \
   } while (0)
 
 #define GenBinaryTensorFunction(op, fn)                        \
@@ -369,12 +362,12 @@ GenBinaryTensorFunction(Pow, Pow);
 
 #define EltwiseTensorScalarFn(fn, t, x, ret)                            \
   do {                                                                  \
-    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->type(), Dev, {    \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {  \
       static_assert(std::is_same<SType, DType>::value,                  \
                     "The Scalar type must match the Tensor data type"); \
       ret->device()->Exec(                                              \
           [t, x, ret](Context* ctx) {                                   \
-            fn<DType, Dev>(t.Size(), t.blob(), x, ret->blob(), ctx);    \
+            fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);   \
           },                                                            \
           {t.blob()}, {ret->blob()});                                   \
     });                                                                 \
@@ -424,11 +417,11 @@ void Mult(float alpha, const Tensor& A, float beta, const Tensor& B,
   size_t m = transA ? A.shape()[1] : A.shape()[0], n = 0;
   if (B.shape().size() == 1u) {
     n = C->Size();
-    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->type(), Dev, {
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
       C->device()->Exec(
           [transA, m, n, alpha, A, beta, B, C](Context* ctx) {
-            GEMV<DType, Dev>(transA, m, n, alpha, A.blob(), B.blob(), beta,
-                             C->blob(), ctx);
+            GEMV<DType, Lang>(transA, m, n, alpha, A.blob(), B.blob(), beta,
+                              C->blob(), ctx);
           },
           {A.blob(), B.blob()}, {C->blob()});
     });
@@ -440,11 +433,11 @@ void Mult(float alpha, const Tensor& A, float beta, const Tensor& B,
     CHECK_EQ(C->shape()[0], m);
     CHECK_EQ(A.Size(), m * k);
     CHECK_EQ(B.Size(), n * k);
-    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->type(), Dev, {
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
       C->device()->Exec(
           [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) {
-            GEMM<DType, Dev>(transA, transB, m, n, k, alpha, A.blob(), B.blob(),
-                             beta, C->blob(), ctx);
+            GEMM<DType, Lang>(transA, transB, m, n, k, alpha, A.blob(),
+                              B.blob(), beta, C->blob(), ctx);
           },
           {A.blob(), B.blob()}, {C->blob()});
     });
@@ -452,30 +445,30 @@ void Mult(float alpha, const Tensor& A, float beta, const Tensor& B,
 }
 
 void Bernoulli(float p, Tensor* t) {
-  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->type(), Dev, {
+  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
     t->device()->Exec(
         [p, t](Context* ctx) {
-          Bernoulli<DType, Dev>(t->Size(), p, t->blob(), ctx);
+          Bernoulli<DType, Lang>(t->Size(), p, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });
 }
 
 void Uniform(float low, float high, Tensor* t) {
-  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->type(), Dev, {
+  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
     t->device()->Exec(
         [low, high, t](Context* ctx) {
-          Uniform<DType, Dev>(t->Size(), low, high, t->blob(), ctx);
+          Uniform<DType, Lang>(t->Size(), low, high, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });
 }
 
 void Gaussian(float mean, float std, Tensor* t) {
-  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->type(), Dev, {
+  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
     t->device()->Exec(
         [mean, std, t](Context* ctx) {
-          Gaussian<DType, Dev>(t->Size(), mean, std, t->blob(), ctx);
+          Gaussian<DType, Lang>(t->Size(), mean, std, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });

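To see where the TYPE_LANG_SWITCH dispatch lands, a small host-side sketch, assuming kFloat32 is the default dtype and the tensors live on the default CppCPU; Add, Gaussian and Uniform are the public functions wrapped by the macros above:

#include "singa/core/tensor.h"

void TensorMathExample() {
  using singa::Shape;
  using singa::Tensor;
  Tensor a(Shape{2, 3}), b(Shape{2, 3}), c(Shape{2, 3});
  singa::Gaussian(0.0f, 1.0f, &a);  // fill a with N(0, 1) samples
  singa::Uniform(0.0f, 1.0f, &b);   // fill b with U[0, 1) samples
  singa::Add(a, b, &c);             // dispatches to Add<float, lang::Cpp>
  c += a;                           // member op generated by the macro above
}
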
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index aa520c9..53e979b 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -25,8 +25,8 @@ namespace singa {
 
 /// \file math.h Math functions for linear algebra, neural net and random
 /// operations.
-/// All functions have a template argument, DType for DataType, Lib for the
-/// backend library, e.g., lib::Cublas, lib::Cudnn, etc.
+/// All functions have a template argument, DType for DataType, Lang for the
+/// device programming language, e.g., lang::Cpp, lang::Cuda.
 
 /// Some operations would have many config/hyper-parameters, e.g., Conv, and
 /// these config vary among diff implementations, e.g., cuda/cudnn/opencl.
@@ -45,133 +45,133 @@ class OpConf {
 
 // ================Linear algebra functions====================================
 /// ret[i] = |input[i]|
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Abs(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// sum all elements of input into ret
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Sum(int count, const Blob* input, DType* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret[i] = sign(input[i])
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Sign(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Base is e, Neper number. ret[i]=exp(input[i])
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Exp(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Natural logarithm, the base is e, Neper number. ret[i]=log(input[i]).
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Log(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=sqrt(input[i])
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=tanh(input[i])
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Element-wise operation, ret[i]=max(0, input[i])
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void ReLU(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Element-wise operation, ret[i]=sigmoid(input[i])
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Do softmax for each row individually
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Softmax(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the input tensor
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Pow(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Clamp(int count, DType low, DType high, const Blob* input, Blob* ret,
            Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = input + x
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Add(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret =  input - x
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Sub(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
-  Add<DType, Lib>(count, input, -x, ret, ctx);
+  Add<DType, Lang>(count, input, -x, ret, ctx);
 }
 /// ret = input * x
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void EltwiseMult(int count, const Blob* input, DType x, Blob* ret, Context* ctx)
 {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret = input / x
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Div(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
-  EltwiseMult<DType, Lib>(count, input, DType(1) / x, ret, ctx);
+  EltwiseMult<DType, Lang>(count, input, DType(1) / x, ret, ctx);
 }
 
 /// ret = lhs + rhs
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Add(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs - rhs
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Sub(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs * rhs
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void EltwiseMult(int count, const Blob* lhs, const Blob* rhs, Blob* ret,
           Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs / rhs
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Div(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// outer-product.
 /// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
            Context* ctx) {
   LOG(FATAL) << "Not Implemented";
@@ -179,26 +179,26 @@ void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
 
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void SumRow(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Sum the columns of the input matrix into a vector
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void SumCol(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // TODO(wangwei) unify AddRow and AddCol.
 /// Add the vector v to every row of A as the row of ret
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
             Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Add the vector v to every column of A as the column of ret
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
             Context* ctx) {
   LOG(FATAL) << "Not Implemented";
@@ -207,35 +207,35 @@ void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // ===== Level 1
 /// return the index of the element with the max value.
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Amax(int count, const Blob* input, int* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// return the index of the element with the min value.
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Amin(int count, const Blob* input, int* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret = sum |x| for all x in input
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Asum(int count, const Blob* input, DType* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = alpha * input + ret
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Axpy(int count, DType alpha, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret *= x
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Scale(int count, DType x, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
          Context* ctx) {
   LOG(FATAL) << "Not Implemented";
@@ -244,7 +244,7 @@ void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
 // ===== Level 2
 /// ret = alpha * op(A) * v + beta * ret.
 /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
           DType beta, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
@@ -253,7 +253,7 @@ void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
 // ===== Level 3
 /// ret = alpha * op(A) * op(B) + beta * ret.
 /// op(A) = A if trans = false; A^T otherwise; rows(ret) = m, cols(ret) = n.
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void GEMM(bool transA, bool transB, int m, int n, int k, DType alpha,
           const Blob* A, const Blob* B, DType beta, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
@@ -263,47 +263,23 @@ void GEMM(bool transA, bool transB, int m, int n, int k, DType alpha,
 /// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Bernoulli(int count, float p, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Uniform(int count, float low, float high, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lib>
+template <typename DType, typename Lang>
 void Gaussian(int count, float mean, float std, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
-/* ================Neural net functions=======================================
-template <typename DType, typename Lib>
-void ConvFwd(ConvConf* conf, const Blob* x, const Blob* w, Blob* y,
-             Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void ConvBwdBias(const ConvConf* conf, const Blob* dy, Blob* db, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void PoolFwd(const PoolConf* conf, const Blob* x, Blob* y, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void PoolBwd(const PoolConf* conf, const Blob* y, const Blob* dy, const Blob* x,
-             Blob* dx, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-*/
-
 }  // namespace singa
 
 #endif  // SINGA_CORE_MATH_H_

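The contract of this header: the generic templates LOG(FATAL), and each backend header supplies the real specializations. A hypothetical CPU specialization of Sign, following the same pattern as the shipped kernels in tensor_math_cpp.h:

namespace singa {
// Hypothetical specialization for illustration; the real CPU kernels live in
// tensor_math_cpp.h. out[i] is -1, 0 or +1 depending on the sign of in[i].
template <>
void Sign<float, lang::Cpp>(int count, const Blob* input, Blob* ret,
                            Context* ctx) {
  const float* in = static_cast<const float*>(input->data());
  float* out = static_cast<float*>(ret->mutable_data());
  for (int i = 0; i < count; i++)
    out[i] = static_cast<float>((in[i] > 0.0f) - (in[i] < 0.0f));
}
}  // namespace singa
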
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 2cbc225..b58e3bd 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -25,64 +25,60 @@
 #endif
 
 namespace singa {
-template<>
-void Add<float, lib::Cpp>(int count,
-                     const Blob* lhs,
-                     const Blob* rhs,
-                     Blob* ret,
-                     Context* ctx) {
+template <>
+void Add<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
+                           Blob* ret, Context* ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float*>(ret->mutable_data());
-  const float *lptr = static_cast<const float*>(lhs->data());
-  const float *rptr = static_cast<const float*>(rhs->data());
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* lptr = static_cast<const float*>(lhs->data());
+  const float* rptr = static_cast<const float*>(rhs->data());
   for (int i = 0; i < count; i++) {
     dptr[i] = lptr[i] + rptr[i];
   }
 }
 template <>
-void EltwiseMult<float, lib::Cpp>(int count, const Blob* input, float x, Blob* ret, Context* ctx)
-{
-  float *dptr = static_cast<float*>(ret->mutable_data());
-  const float *lptr = static_cast<const float*>(input->data());
+void EltwiseMult<float, lang::Cpp>(int count, const Blob* input, float x,
+                                   Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* lptr = static_cast<const float*>(input->data());
   for (int i = 0; i < count; i++) {
     dptr[i] = lptr[i] * x;
   }
 }
 
 template <>
-void EltwiseMult<float, lib::Cpp>(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx)
-{
-  float *dptr = static_cast<float*>(ret->mutable_data());
-  const float *lptr = static_cast<const float*>(lhs->data());
-  const float *rptr = static_cast<const float*>(rhs->data());
+void EltwiseMult<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
+                                   Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* lptr = static_cast<const float*>(lhs->data());
+  const float* rptr = static_cast<const float*>(rhs->data());
   for (int i = 0; i < count; i++) {
     dptr[i] = lptr[i] * rptr[i];
   }
 }
 
 template <>
-void Bernoulli<float, lib::Cpp>(int count, float p, Blob* ret,
-                                 Context* ctx) {
+void Bernoulli<float, lang::Cpp>(int count, float p, Blob* ret, Context* ctx) {
   std::bernoulli_distribution distribution(p);
   float* ptr = static_cast<float*>(ret->mutable_data());
-  for (int i = 0; i < count; i ++) {
+  for (int i = 0; i < count; i++) {
     ptr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
   }
 }
 
 template <>
-void Uniform<float, lib::Cpp>(int count, float low, float high, Blob* ret,
+void Uniform<float, lang::Cpp>(int count, float low, float high, Blob* ret,
                                Context* ctx) {
   std::uniform_real_distribution<float> distribution(low, high);
   float* ptr = static_cast<float*>(ret->mutable_data());
-  for (int i = 0; i < count; i ++) {
+  for (int i = 0; i < count; i++) {
     ptr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
 template <>
-void Gaussian<float, lib::Cpp>(int count, float mean, float std, Blob* ret,
-                              Context* ctx) {
+void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob* ret,
+                                Context* ctx) {
   std::normal_distribution<float> distribution(mean, std);
   float* ptr = static_cast<float*>(ret->mutable_data());
   for (int i = 0; i < count; i++) {
@@ -90,14 +86,10 @@ void Gaussian<float, lib::Cpp>(int count, float mean, float std, Blob* ret,
   }
 }
 
-
 #ifdef USE_CBLAS
-template<>
-void Dot<float, lib::Cpp>(int count,
-                     const Blob* lhs,
-                     const Blob* rhs,
-                     float* ret,
-                     Context* ctx) {
+template <>
+void Dot<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
+                           float* ret, Context* ctx) {
   const float* lptr = static_cast<const float*>(lhs->data());
   const float* rptr = static_cast<const float*>(rhs->data());
   *ret = cblas_sdot(count, lptr, 1, rptr, 1);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index c5ea3c4..991e8bb 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -26,7 +26,7 @@ namespace singa {
 
 #ifdef USE_CUDA
 template<>
-void Add<float, lib::Cuda>(int count, const Blob* lhs, const Blob* rhs,
+void Add<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
                         Blob* ret, Context* ctx) {
   /*
   cublasSetStream(ctx->cublas_handle, ctx->stream);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index f99aba4..88d7f12 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -30,7 +30,7 @@ enum DataType {
   kNumDataType = 5;
 }
 
-enum DeviceType {
+enum LangType {
   kCpp = 0;
   kCuda = 1;
   kOpencl = 2;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/test/singa/test_cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_cpu.cc b/test/singa/test_cpp_cpu.cc
new file mode 100644
index 0000000..86654e1
--- /dev/null
+++ b/test/singa/test_cpp_cpu.cc
@@ -0,0 +1,71 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include  "singa/core/device.h"
+#include "singa/proto/core.pb.h"
+
+using singa::CppCPU;
+using singa::Blob;
+TEST(CppCPU, Constructor) {
+  CppCPU dev(0, 1);
+  EXPECT_EQ(0, dev.id());
+}
+
+TEST(CppCPU, MemoryMallocFree) {
+  CppCPU dev(0, 1);
+  Blob* b = dev.NewBlob(4);
+  EXPECT_NE(nullptr, b);
+  EXPECT_EQ(4u, b->size());
+  dev.FreeBlob(b);
+}
+
+TEST(CppCPU, Exec) {
+  CppCPU dev(0, 1);
+  Blob* b = dev.NewBlob(4);
+  int x = 1, y = 3, z = 0;
+  dev.Exec([x, y, &z](singa::Context *ctx) {
+      z = x + y;
+      }, {b}, {b}, false);
+  EXPECT_EQ(x + y, z);
+}
+
+TEST(CppCPU, CopyData) {
+  CppCPU dev(0, 1);
+  Blob* b = dev.NewBlob(4);
+  char s[] = {'a', 'b', 'c', 'x'};
+  dev.CopyDataFromHostPtr(b, s, 4);
+  const char* bstr = static_cast<const char*>(b->data());
+  EXPECT_EQ('a', bstr[0]);
+  EXPECT_EQ('b', bstr[1]);
+  EXPECT_EQ('x', bstr[3]);
+
+  Blob* c = dev.NewBlob(4);
+  dev.CopyDataToFrom(c, b, 4, singa::kHostToHost, 0, 0);
+  const char* cstr = static_cast<const char*>(c->data());
+
+  EXPECT_EQ('a', cstr[0]);
+  EXPECT_EQ('b', cstr[1]);
+  EXPECT_EQ('x', cstr[3]);
+  dev.FreeBlob(b);
+  dev.FreeBlob(c);
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/test/singa/test_cpp_device.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_device.cc b/test/singa/test_cpp_device.cc
deleted file mode 100644
index c302206..0000000
--- a/test/singa/test_cpp_device.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "gtest/gtest.h"
-#include  "singa/core/device.h"
-#include "singa/proto/core.pb.h"
-
-using singa::CppDevice;
-using singa::Blob;
-TEST(CppDevice, Constructor) {
-  CppDevice dev(0, 1);
-  EXPECT_EQ(0, dev.id());
-}
-
-TEST(CppDevice, MemoryMallocFree) {
-  CppDevice dev(0, 1);
-  Blob* b = dev.NewBlob(4);
-  EXPECT_NE(nullptr, b);
-  EXPECT_EQ(4u, b->size());
-  dev.FreeBlob(b);
-}
-
-TEST(CppDevice, Exec) {
-  CppDevice dev(0, 1);
-  Blob* b = dev.NewBlob(4);
-  int x = 1, y =3, z = 0;
-  dev.Exec([x, y, &z](singa::Context *ctx) {
-      z = x + y;
-      }, {b}, {b}, false);
-  EXPECT_EQ(x + y, z);
-}
-
-TEST(CppDevice, CopyData) {
-  CppDevice dev(0, 1);
-  Blob* b = dev.NewBlob(4);
-  char s[] = {'a', 'b', 'c', 'x'};
-  dev.CopyDataFromHostPtr(b, s, 4);
-  const char* bstr = static_cast<const char*>(b->data());
-  EXPECT_EQ('a', bstr[0]);
-  EXPECT_EQ('b', bstr[1]);
-  EXPECT_EQ('x', bstr[3]);
-
-  Blob* c = dev.NewBlob(4);
-  dev.CopyDataToFrom(c, b, 4, singa::kHostToHost, 0, 0);
-  const char* cstr = static_cast<const char*>(c->data());
-
-  EXPECT_EQ('a', cstr[0]);
-  EXPECT_EQ('b', cstr[1]);
-  EXPECT_EQ('x', cstr[3]);
-  dev.FreeBlob(b);
-  dev.FreeBlob(c);
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_dropout.cc b/test/singa/test_cudnn_dropout.cc
index 9913074..5fdc554 100644
--- a/test/singa/test_cudnn_dropout.cc
+++ b/test/singa/test_cudnn_dropout.cc
@@ -48,7 +48,7 @@ TEST(CudnnDropout, Setup) {
 TEST(CudnnDropout, Forward) {
   const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
   size_t n = sizeof(x) / sizeof(float);
-  singa::CudaDevice cuda(0, 1);
+  singa::CudaGPU cuda(0, 1);
   singa::Tensor in(singa::Shape{n}, &cuda);
   in.CopyDataFromHostPtr(x, n);
 
@@ -67,7 +67,7 @@ TEST(CudnnDropout, Forward) {
   for (size_t i = 0; i < n; i++)
     EXPECT_FLOAT_EQ(0, GetBitValue(mptr, i) * (GetBitValue(mptr, i) - 1));
 
-  singa::CppDevice host(0, 1);
+  singa::CppCPU host(0, 1);
   out1.ToDevice(&host);
   const float* outptr1 = out1.data<const float*>();
   EXPECT_EQ(n, out1.Size());
@@ -90,7 +90,7 @@ TEST(CudnnDropout, Forward) {
 TEST(CudnnDropout, Backward) {
   const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
   size_t n = sizeof(x) / sizeof(float);
-  singa::CudaDevice cuda(0, 1);
+  singa::CudaGPU cuda(0, 1);
   singa::Tensor in(singa::Shape{n}, &cuda);
   in.CopyDataFromHostPtr(x, n);
 
@@ -109,7 +109,7 @@ TEST(CudnnDropout, Backward) {
   grad.CopyDataFromHostPtr(dy, n);
 
   const auto ret = drop.Backward(singa::kTrain, grad);
-  singa::CppDevice host(0, 1);
+  singa::CppCPU host(0, 1);
   singa::Tensor in_grad = ret.first;
   in_grad.ToDevice(&host);
   const float* dx = in_grad.data<const float*>();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9d1bcb42/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index 8c3c901..b3f0c6b 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -59,10 +59,10 @@ TEST(TensorClass, AsType) {
 
 TEST(TensorClass, ToDevice) {
   Tensor t(Shape{2,3});
-  EXPECT_EQ(static_cast<Device*>(&singa::hostDeviceSingleton), t.device());
-  singa::CppDevice *dev = new singa::CppDevice(0, 1);
+  EXPECT_EQ(static_cast<Device*>(&singa::defaultDevice), t.device());
+  singa::CppCPU *dev = new singa::CppCPU(0, 1);
   t.ToDevice(dev);
-  EXPECT_NE(static_cast<Device*>(&singa::hostDeviceSingleton), t.device());
+  EXPECT_NE(static_cast<Device*>(&singa::defaultDevice), t.device());
 }
 
 TEST(TensorClass, CopyDataFromHostPtr) {


[21/50] [abbrv] incubator-singa git commit: SINGA-178 Add Convolution layer and Pooling layer

Posted by zh...@apache.org.
SINGA-178 Add Convolution layer and Pooling layer

Minor updates to variable names and InitCudnn arguments.
Fix compiler warnings about signed/unsigned number comparisons.
Format code. All tests pass.

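As a minimal sketch of the warning class fixed here (assuming gtest):
size_t expressions compared against signed int literals trigger
-Wsign-compare, so the expectations use unsigned literals such as 2u.

    #include <vector>
    #include "gtest/gtest.h"

    TEST(Example, SignedUnsignedComparison) {
      std::vector<int> v{1, 2, 3};
      // v.size() is size_t (unsigned); a plain `3` is a signed int and
      // can warn under -Wsign-compare. `3u` keeps both operands unsigned.
      EXPECT_EQ(3u, v.size());
    }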

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/7d149ecf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/7d149ecf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/7d149ecf

Branch: refs/heads/master
Commit: 7d149ecf786f816cf2da47ea9e5bb86f8fecdd6b
Parents: 152056d
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon May 30 16:53:40 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon May 30 18:17:57 2016 +0800

----------------------------------------------------------------------
 include/singa/core/device.h          |  12 +--
 include/singa/core/tensor.h          |   2 +-
 include/singa/model/layer.h          |   7 +-
 include/singa/utils/string.h         |  81 ++++++++++++++++++
 include/singa/utils/tokenizer.h      |  65 --------------
 src/core/tensor/tensor.cc            |   8 +-
 src/model/layer/convolution.cc       |  33 ++++++--
 src/model/layer/convolution.h        |   3 +-
 src/model/layer/cudnn_convolution.cc | 135 ++++++++++++++----------------
 src/model/layer/cudnn_convolution.h  |  11 ++-
 src/model/layer/cudnn_pooling.cc     |  40 ++++-----
 src/model/layer/cudnn_pooling.h      |   2 +-
 src/model/layer/pooling.cc           |   8 +-
 src/model/layer/pooling.h            |   3 +-
 src/proto/model.proto                |  15 ++--
 test/singa/test_cudnn_convolution.cc |  50 +++++------
 test/singa/test_cudnn_pooling.cc     |  26 +++---
 17 files changed, 274 insertions(+), 227 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index a4b3f6d..56eda70 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -77,6 +77,10 @@ class Device {
 
   Device* host() const { return host_;}
 
+  Context* context(int k) {
+    return &ctx_;
+  }
+
   int id() const { return id_; }
 
  protected:
@@ -104,6 +108,8 @@ class Device {
   // SafeQueue<Operation> op_log_;
   /// The host device
   Device* host_;
+  // TODO(wangwei) define multiple contexts, one per executor
+  Context ctx_;
 };
 
 /// Represent a CPU device which may have multiple threads/executors.
@@ -125,9 +131,6 @@ class CppCPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
-
- protected:
-  Context ctx_;
 };
 
 /// a singleton CppDevice as the host for all devices.
@@ -177,9 +180,6 @@ class CudaGPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
-
- protected:
-  Context ctx_;
 };
 
 /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index f51c899..8682bca 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -97,7 +97,7 @@ public:
     return shape_.at(idx);
   }
 
-  int nDim() const { return shape_.size(); }
+  size_t nDim() const { return shape_.size(); }
 
   bool transpose() const { return transpose_; }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index c6a3bd1..82c8edc 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -44,7 +44,7 @@ class Layer {
 
   // ============= Following Functions could be override =====================
   /// Destruct objects created by this layer.
-  virtual ~Layer() {}; 
+  virtual ~Layer() {};
 
  /// Each layer subclass may optionally have a type name.
   /// Used for debugging and logging.
@@ -160,7 +160,10 @@ class Layer {
   const vector<ParamSpec> param_specs() { return param_specs_; }
 
   /// Return the i-th ParamSpec.
-  const ParamSpec& param_specs(int i) { return param_specs_.at(i); }
+  const ParamSpec& param_specs(size_t i) {
+    CHECK_LT(i, param_specs_.size());
+    return param_specs_.at(i);
+  }
 
   /// Return pointers to parameter Tensor s.
   const vector<Tensor*> param_values() { return param_values_; }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/include/singa/utils/string.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/string.h b/include/singa/utils/string.h
new file mode 100644
index 0000000..b739afc
--- /dev/null
+++ b/include/singa/utils/string.h
@@ -0,0 +1,81 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_STRING_H_
+#define SINGA_UTILS_STRING_H_
+
+#include <string>
+#include <algorithm>
+#include "singa/utils/logging.h"
+
+namespace singa {
+inline bool icasecmp(const string& l, const string& r) {
+  return l.size() == r.size() &&
+         equal(l.cbegin(), l.cend(), r.cbegin(),
+               [](string::value_type l1, string::value_type r1) {
+                 return toupper(l1) == toupper(r1);
+               });
+}
+
+inline string ToLowerCase(const string& input) {
+  string out;
+  out.resize(input.size());
+  std::transform(input.begin(), input.end(), out.begin(), ::tolower);
+  return out;
+}
+
+/**
+ * Tokenize a string.
+ *
+ * example:
+ * Tokenizer t("assa,asf;wes", ",;");
+ * string x;
+ * t >> x; // x is assa
+ * t >> x; // x is asf
+ * t >> x; // x is wes
+ * cout << t.Valid(); // prints 0 once all tokens are consumed.
+ */
+
+class Tokenizer {
+ public:
+  Tokenizer(const std::string& str, const std::string& sep): start_(0),
+  sep_(sep), buf_(str) {}
+  Tokenizer & operator>>(std::string& out) {
+    CHECK_LT(start_, buf_.length());
+    int start = start_;
+    auto pos = buf_.find_first_of(sep_, start);
+    if (pos == std::string::npos)
+      pos = buf_.length();
+    start_ = pos + 1;
+    out = buf_.substr(start, pos - start);
+    return *this;
+  }
+  bool Valid() { return start_ < buf_.length(); }
+
+ private:
+  unsigned start_;
+  std::string sep_;
+  const std::string& buf_;
+};
+
+}  // namespace singa
+
+#endif  // SINGA_UTILS_STRING_H_

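A hedged usage sketch of the Tokenizer moved into string.h (hypothetical
input strings; note the class stores a reference, so the tokenized string
must outlive the Tokenizer):

    #include <iostream>
    #include <string>
    #include "singa/utils/string.h"

    int main() {
      // Tokenizer keeps `const std::string& buf_`, so pass a named string,
      // not a temporary, to avoid a dangling reference.
      std::string layers = "conv1,relu;pool1";
      singa::Tokenizer t(layers, ",;");
      std::string token;
      while (t.Valid()) {
        t >> token;
        std::cout << singa::ToLowerCase(token) << "\n";  // conv1, relu, pool1
      }
      return 0;
    }
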
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/include/singa/utils/tokenizer.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/tokenizer.h b/include/singa/utils/tokenizer.h
deleted file mode 100644
index 92c24b6..0000000
--- a/include/singa/utils/tokenizer.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_TOKENIZER_H_
-#define SINGA_UTILS_TOKENIZER_H_
-
-#include <string>
-#include "singa/utils/logging.h"
-
-namespace singa {
-/**
- * Tokenize a string.
- *
- * example:
- * Tokenizer t("assa,asf;wes", ",;");
- * string x;
- * t >> x; // x is assa
- * t >> x; // x is asf
- * t >> x; // x is wes
- * cout << (t >> x); // print 0.
- */
-
-class Tokenizer {
- public:
-  Tokenizer(const std::string& str, const std::string& sep): start_(0),
-  sep_(sep), buf_(str) {}
-  Tokenizer & operator>>(std::string& out) {
-    CHECK_LT(start_, buf_.length());
-    int start = start_;
-    auto pos = buf_.find_first_of(sep_, start);
-    if (pos == std::string::npos)
-      pos = buf_.length();
-    start_ = pos + 1;
-    out = buf_.substr(start, pos);
-    return *this;
-  }
-  bool Valid() { return start_ < buf_.length(); }
-
- private:
-  unsigned start_;
-  std::string sep_;
-  const std::string& buf_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_TOKENIZER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 0e47a4f..fcf42c2 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -562,8 +562,8 @@ void AddColumn(const float alpha, const float beta, const Tensor &v,
     Tensor X = M->T();
     AddRow(v, &X);
   } else {
-    CHECK_EQ(M->nDim(), 2);
-    CHECK_EQ(v.nDim(), 1);
+    CHECK_EQ(M->nDim(), 2u);
+    CHECK_EQ(v.nDim(), 1u);
     size_t nb_row = M->shape(0), nb_col = M->shape(1);
     CHECK_EQ(nb_row, v.Size());
 
@@ -581,8 +581,8 @@ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
     Tensor X = M->T();
     AddColumn(v, &X);
   } else {
-    CHECK_EQ(M->nDim(), 2);
-    CHECK_EQ(v.nDim(), 1);
+    CHECK_EQ(M->nDim(), 2u);
+    CHECK_EQ(v.nDim(), 1u);
     size_t nb_row = M->shape(0), nb_col = M->shape(1);
     CHECK_EQ(nb_col, v.Size());
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/model/layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
index 6406a31..50ee3c8 100644
--- a/src/model/layer/convolution.cc
+++ b/src/model/layer/convolution.cc
@@ -28,32 +28,51 @@ void Convolution::Setup(const LayerConf &conf) {
   ConvolutionConf conv_conf = conf.convolution_conf();
   // kernel_size, pad, and stride are repeated fields.
   if (conv_conf.kernel_size_size() > 0) {
-    kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
+    if (conv_conf.kernel_size_size() == 1) {
+      kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
+    } else {
+      kernel_w_ = conv_conf.kernel_size(0);
+      kernel_h_ = conv_conf.kernel_size(1);
+    }
   } else {
     kernel_w_ = conv_conf.kernel_w();
     kernel_h_ = conv_conf.kernel_h();
   }
-  CHECK_NE(kernel_w_, 0);
-  CHECK_NE(kernel_h_, 0);
+  CHECK_GT(kernel_w_, 0u);
+  CHECK_GT(kernel_h_, 0u);
 
   if (conv_conf.pad_size() > 0) {
-    pad_w_ = pad_h_ = conv_conf.pad(0);
+    if (conv_conf.pad_size() == 1) {
+      pad_w_ = pad_h_ = conv_conf.pad(0);
+    } else {
+      pad_w_ = conv_conf.pad(0);
+      pad_h_ = conv_conf.pad(1);
+    }
   } else {
     pad_w_ = conv_conf.pad_w();
     pad_h_ = conv_conf.pad_h();
   }
+  CHECK_GE(pad_w_, 0u);
+  CHECK_GE(pad_h_, 0u);
 
   if (conv_conf.stride_size() > 0) {
-    stride_w_ = stride_h_ = conv_conf.stride(0);
+    if (conv_conf.stride_size() == 1) {
+      stride_w_ = stride_h_ = conv_conf.stride(0);
+    } else {
+      stride_w_ = conv_conf.stride(0);
+      stride_h_ = conv_conf.stride(1);
+    }
   } else {
     stride_w_ = conv_conf.stride_w();
     stride_h_ = conv_conf.stride_h();
   }
+  CHECK_GT(stride_w_, 0u);
+  CHECK_GT(stride_h_, 0u);
 
   num_filters_ = conv_conf.num_output();
   bias_term_ = conv_conf.bias_term();
 
-  // Shape of src
+  // Shape of input image
   channels_ = conv_conf.channels();
   height_ = conv_conf.height();
   width_ = conv_conf.width();
@@ -68,7 +87,7 @@ void Convolution::Setup(const LayerConf &conf) {
   bias_.Reshape(Shape{num_filters_});
   // Push back params into param_values_
   // Assume the order of param is: weight, bias
-  for (const auto& spec : conf.param()) param_specs_.push_back(spec);
+  for (const auto &spec : conf.param()) param_specs_.push_back(spec);
   param_values_.push_back(&weight_);
   param_values_.push_back(&bias_);
 }

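The conv_height_/conv_width_ members are computed from these fields outside
this hunk; as a sketch under that assumption, the standard 2D convolution
output-size formula is:

    #include <cstddef>
    #include <cstdio>

    // (in + 2*pad - kernel) / stride + 1, per spatial dimension.
    std::size_t OutDim(std::size_t in, std::size_t pad, std::size_t kernel,
                       std::size_t stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      // e.g. a 3x3 input with kernel 3, stride 2, pad 1 gives a 2x2 output,
      // matching the 4-element output in TEST(CudnnConvolution, Forward).
      std::printf("%zu x %zu\n", OutDim(3, 1, 3, 2), OutDim(3, 1, 3, 2));
      return 0;
    }
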
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/model/layer/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h
index a9bf833..477efb3 100644
--- a/src/model/layer/convolution.h
+++ b/src/model/layer/convolution.h
@@ -47,7 +47,6 @@ class Convolution : public Layer {
   size_t stride_w() const { return stride_w_; }
   size_t stride_h() const { return stride_h_; }
   size_t num_filters() const { return num_filters_; }
-  size_t batchsize() const { return batchsize_; }
   size_t channels() const { return channels_; }
   size_t height() const { return height_; }
   size_t width() const { return width_; }
@@ -67,7 +66,7 @@ class Convolution : public Layer {
  protected:
   size_t kernel_w_, pad_w_, stride_w_;
   size_t kernel_h_, pad_h_, stride_h_;
-  size_t batchsize_, channels_, height_, width_;
+  size_t channels_, height_, width_;
   size_t col_height_, col_width_, conv_height_, conv_width_, num_filters_;
   Tensor weight_, bias_;
   // store intermediate data, i.e., input tensor

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index ec7cd6a..922b7e0 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -39,9 +39,9 @@ void CudnnConvolution::Setup(const LayerConf &conf) {
   ConvolutionConf conv_conf = conf.convolution_conf();
   // convert MB to bytes
   workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
-  pref_ = conv_conf.algo_pref();
-  CHECK(pref_ == "fastest" || pref_ == "limited_workspace" ||
-        pref_ == "no_workspace")
+  prefer_ = ToLowerCase(conv_conf.prefer());
+  CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+        prefer_ == "no_workspace")
       << "CudnnConvolution only supports three algorithm preferences: fastest, "
          "limited_workspace and no_workspace";
 }
@@ -52,8 +52,12 @@ void CudnnConvolution::ToDevice(Device *device) {
   workspace_.ToDevice(device);
 }
 
-void CudnnConvolution::InitCudnn(DataType dtype, Device *dev, Context *ctx) {
+void CudnnConvolution::InitCudnn(const Tensor& input) {
   CHECK(!has_init_cudnn_);
+  DataType dtype = input.data_type();
+  Device *dev = input.device();
+  Context *ctx = dev->context(0);
+  size_t batchsize = input.shape(0);
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
@@ -61,10 +65,10 @@ void CudnnConvolution::InitCudnn(DataType dtype, Device *dev, Context *ctx) {
   CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
 
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                         GetCudnnDataType(dtype), batchsize_,
+                                         GetCudnnDataType(dtype), batchsize,
                                          channels_, height_, width_));
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize_,
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
       num_filters_, conv_height_, conv_width_));
   if (bias_term_)
     CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
@@ -88,20 +92,20 @@ void CudnnConvolution::InitCudnn(DataType dtype, Device *dev, Context *ctx) {
   cudnnConvolutionFwdPreference_t fwd_pref;
   cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
   cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-  if (pref_ == "fastest") {
+  if (prefer_ == "fastest") {
     fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
     bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
     bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-  } else if (pref_ == "limited_workspace") {
+  } else if (prefer_ == "limited_workspace") {
     fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
     bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
     bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-  } else if (pref_ == "no_workspace") {
+  } else if (prefer_ == "no_workspace") {
     fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
     bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
     bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
   } else {
-    LOG(FATAL) << "Algorithm preference is not implemented!";
+    LOG(FATAL) << "Preferred algorithm is not available!";
   }
   CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
       ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
@@ -133,51 +137,46 @@ void CudnnConvolution::InitCudnn(DataType dtype, Device *dev, Context *ctx) {
 
 const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
   CHECK_EQ(input.device()->lang(), kCuda);
-  CHECK_EQ(input.shape().size(), 4);
+  CHECK_EQ(input.nDim(), 4u);
   buf_.push(input);
-  batchsize_ = input.shape()[0];
+  size_t batchsize = input.shape()[0];
   DataType dtype = input.data_type();
   Device *dev = input.device();
 
-  if (!has_init_cudnn_) InitCudnn(dtype, dev, dev->context(0));
+  if (!has_init_cudnn_) InitCudnn(input);
 
-  Shape shape{batchsize_, num_filters_, conv_height_, conv_width_};
+  Shape shape{batchsize, num_filters_, conv_height_, conv_width_};
   Tensor output(shape, dev, dtype);
-  float alpha = 1.f, beta = 0.f;
-  output.device()->Exec(
-      [input, output, alpha, beta, this](Context *ctx) {
-        Blob *inblob = input.blob(), *outblob = output.blob(),
-             *wblob = this->weight_.blob();
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
-                                inblob->data(), this->filter_desc_,
-                                wblob->data(), this->conv_desc_, this->fp_alg_,
-                                this->workspace_.blob()->mutable_data(),
-                                this->workspace_count_ * sizeof(float), &beta,
-                                this->y_desc_, outblob->mutable_data());
-      },
-      {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
+  output.device()->Exec([input, output, this](Context *ctx) {
+    Blob *inblob = input.blob(), *outblob = output.blob(),
+         *wblob = this->weight_.blob();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
+                            inblob->data(), this->filter_desc_, wblob->data(),
+                            this->conv_desc_, this->fp_alg_,
+                            this->workspace_.blob()->mutable_data(),
+                            this->workspace_count_ * sizeof(float), &beta,
+                            this->y_desc_, outblob->mutable_data());
+  }, {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
 
   if (bias_term_) {
-    beta = 1.f;
-    output.device()->Exec(
-        [output, alpha, beta, this](Context *ctx) {
-          Blob *outblob = output.blob(), *bblob = this->bias_.blob();
-          cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
-                         bblob->data(), &beta, this->y_desc_,
-                         outblob->mutable_data());
-        },
-        {output.blob(), bias_.blob()}, {output.blob()});
+    output.device()->Exec([output, this](Context *ctx) {
+      float beta = 1.f, alpha = 1.0f;
+      Blob *outblob = output.blob(), *bblob = this->bias_.blob();
+      cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_, bblob->data(),
+                     &beta, this->y_desc_, outblob->mutable_data());
+    }, {output.blob(), bias_.blob()}, {output.blob()});
   }
   return output;
 }
 
 const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
     int flag, const Tensor &grad) {
+  CHECK(has_init_cudnn_);
   CHECK_EQ(grad.device()->lang(), kCuda);
-  CHECK_EQ(grad.shape().size(), 4);
+  CHECK_EQ(grad.nDim(), 4u);
   Tensor src_data = buf_.top();
   buf_.pop();
-  float alpha = 1.f, beta = 0.f;
   vector<Tensor> param_grad;
   Tensor dx;
   dx.ResetLike(src_data);
@@ -187,42 +186,38 @@ const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
 
   // LOG(ERROR) << "backward bias";
   if (bias_term_) {
-    dx.device()->Exec(
-        [grad, db, alpha, beta, this](Context *ctx) {
-          Blob *dyblob = grad.blob(), *dbblob = db.blob();
-          cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
-                                       dyblob->data(), &beta, this->bias_desc_,
-                                       dbblob->mutable_data());
-        },
-        {grad.blob()}, {db.blob()});
+    dx.device()->Exec([grad, db, this](Context *ctx) {
+      Blob *dyblob = grad.blob(), *dbblob = db.blob();
+      float alpha = 1.f, beta = 0.f;
+      cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
+                                   dyblob->data(), &beta, this->bias_desc_,
+                                   dbblob->mutable_data());
+    }, {grad.blob()}, {db.blob()});
   }
   // LOG(ERROR) << "backward w";
-  dx.device()->Exec(
-      [grad, dw, src_data, alpha, beta, this](Context *ctx) {
-        Blob *inblob = src_data.blob(), *dyblob = grad.blob(),
-             *dwblob = dw.blob();
-        cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(),
-            this->y_desc_, dyblob->data(), this->conv_desc_,
-            this->bp_filter_alg_, this->workspace_.blob()->mutable_data(),
-            this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
-            dwblob->mutable_data());
-      },
-      {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
+  dx.device()->Exec([grad, dw, src_data, this](Context *ctx) {
+    Blob *inblob = src_data.blob(), *dyblob = grad.blob(), *dwblob = dw.blob();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardFilter(
+        ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(), this->y_desc_,
+        dyblob->data(), this->conv_desc_, this->bp_filter_alg_,
+        this->workspace_.blob()->mutable_data(),
+        this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
+        dwblob->mutable_data());
+  }, {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
 
   // LOG(ERROR) << "backward src";
-  dx.device()->Exec(
-      [dx, grad, alpha, beta, this](Context *ctx) {
-        Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
-             *dxblob = dx.blob();
-        cudnnConvolutionBackwardData(
-            ctx->cudnn_handle, &alpha, this->filter_desc_, wblob->data(),
-            this->y_desc_, dyblob->data(), this->conv_desc_, this->bp_data_alg_,
-            this->workspace_.blob()->mutable_data(),
-            this->workspace_count_ * sizeof(float), &beta, this->x_desc_,
-            dxblob->mutable_data());
-      },
-      {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
+  dx.device()->Exec([dx, grad, this](Context *ctx) {
+    Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
+         *dxblob = dx.blob();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, this->filter_desc_,
+                                 wblob->data(), this->y_desc_, dyblob->data(),
+                                 this->conv_desc_, this->bp_data_alg_,
+                                 this->workspace_.blob()->mutable_data(),
+                                 this->workspace_count_ * sizeof(float), &beta,
+                                 this->x_desc_, dxblob->mutable_data());
+  }, {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
   param_grad.push_back(dw);
   param_grad.push_back(db);
   return std::make_pair(dx, param_grad);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.h b/src/model/layer/cudnn_convolution.h
index cf04be0..b86c576 100644
--- a/src/model/layer/cudnn_convolution.h
+++ b/src/model/layer/cudnn_convolution.h
@@ -27,6 +27,7 @@
 #include "singa/core/common.h"
 #include "singa/model/layer.h"
 #include "singa/proto/core.pb.h"
+#include "singa/utils/string.h"
 
 namespace singa {
 class CudnnConvolution : public Convolution {
@@ -41,13 +42,15 @@ class CudnnConvolution : public Convolution {
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const LayerConf &conf) override;
-  /// Init cudnn related data structures.
-  void InitCudnn(DataType dtype, Device *dev, Context *ctx);
 
   void ToDevice(Device *device) override;
 
   size_t workspace_byte_limit() { return workspace_byte_limit_; }
-  string pref() { return pref_; }
+  string prefer() { return prefer_; }
+
+ protected:
+  /// Init cudnn related data structures.
+  void InitCudnn(const Tensor& input);
 
  protected:
   bool has_init_cudnn_ = false;
@@ -61,7 +64,7 @@ class CudnnConvolution : public Convolution {
   cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
   size_t workspace_byte_limit_, workspace_count_;
   Tensor workspace_;
-  string pref_;
+  string prefer_;
 };
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.cc b/src/model/layer/cudnn_pooling.cc
index d68bcd2..afbc490 100644
--- a/src/model/layer/cudnn_pooling.cc
+++ b/src/model/layer/cudnn_pooling.cc
@@ -41,17 +41,19 @@ void CudnnPooling::Setup(const LayerConf &conf) {
     nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
 }
 
-void CudnnPooling::InitCudnn(DataType dtype) {
+void CudnnPooling::InitCudnn(const Tensor& input) {
   CHECK(!has_init_cudnn_);
+  DataType dtype = input.data_type();
+  size_t batchsize = input.shape(0);
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
   CUDNN_CHECK(cudnnCreatePoolingDescriptor(&pool_desc_));
 
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                         GetCudnnDataType(dtype), batchsize_,
+                                         GetCudnnDataType(dtype), batchsize,
                                          channels_, height_, width_));
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize_,
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
       channels_, pooled_height_, pooled_width_));
   auto pool_method = CUDNN_POOLING_MAX;
   if (pool_ == PoolingConf_PoolMethod_MAX)
@@ -77,19 +79,19 @@ void CudnnPooling::InitCudnn(DataType dtype) {
 
 const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
   CHECK_EQ(input.device()->lang(), kCuda);
-  CHECK_EQ(input.shape().size(), 4);
+  CHECK_EQ(input.nDim(), 4u);
   buf_.push(input);
-  batchsize_ = input.shape()[0];
+  size_t batchsize = input.shape(0);
   DataType dtype = input.data_type();
   Device *dev = input.device();
-  float alpha = 1.0f, beta = 0.0f;
-  if (!has_init_cudnn_) InitCudnn(dtype);
+  if (!has_init_cudnn_) InitCudnn(input);
 
-  Shape shape{batchsize_, channels_, pooled_height_, pooled_width_};
+  Shape shape{batchsize, channels_, pooled_height_, pooled_width_};
   Tensor output = Tensor(shape, dev, dtype);
   output.device()->Exec(
-      [input, output, alpha, beta, this](Context *ctx) {
+      [input, output, this](Context *ctx) {
         Blob *inblob = input.blob(), *outblob = output.blob();
+        float alpha = 1.0f, beta = 0.0f;
         cudnnPoolingForward(ctx->cudnn_handle, this->pool_desc_, &alpha,
                             this->x_desc_, inblob->data(), &beta, this->y_desc_,
                             outblob->mutable_data());
@@ -102,26 +104,26 @@ const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
 const std::pair<Tensor, vector<Tensor>> CudnnPooling::Backward(
     int flag, const Tensor &grad) {
   CHECK_EQ(grad.device()->lang(), kCuda);
-  CHECK_EQ(grad.shape().size(), 4);
+  CHECK_EQ(grad.nDim(), 4u);
   vector<Tensor> param_grad;
-  Tensor dx;
-  Tensor data = buf_.top();
+  Tensor y = buf_.top();
   buf_.pop();
-  Tensor src_data = buf_.top();
+  Tensor x = buf_.top();
   buf_.pop();
-  dx.ResetLike(src_data);
+  Tensor dx;
+  dx.ResetLike(x);
 
-  float alpha = 1.0f, beta = 0.0f;
   dx.device()->Exec(
-      [dx, grad, src_data, data, alpha, beta, this](Context *ctx) {
-        Blob *dyblob = grad.blob(), *dxblob = dx.blob(),
-             *yblob = data.blob(), *xblob = src_data.blob();
+      [dx, grad, x, y, this](Context *ctx) {
+        Blob *dyblob = grad.blob(), *dxblob = dx.blob(), *yblob = y.blob(),
+             *xblob = x.blob();
+        float alpha = 1.0f, beta = 0.0f;
         cudnnPoolingBackward(ctx->cudnn_handle, this->pool_desc_, &alpha,
                              this->y_desc_, yblob->data(), this->y_desc_,
                              dyblob->data(), this->x_desc_, xblob->data(),
                              &beta, this->x_desc_, dxblob->mutable_data());
       },
-      {grad.blob(), data.blob(), src_data.blob()}, {dx.blob()});
+      {grad.blob(), y.blob(), x.blob()}, {dx.blob()});
 
   return std::make_pair(dx, param_grad);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.h b/src/model/layer/cudnn_pooling.h
index 14bdf40..1a38cd5 100644
--- a/src/model/layer/cudnn_pooling.h
+++ b/src/model/layer/cudnn_pooling.h
@@ -43,7 +43,7 @@ class CudnnPooling : public Pooling {
                                                    const Tensor &grad) override;
 
   /// Init cudnn related data structures.
-  void InitCudnn(DataType dtype);
+  void InitCudnn(const Tensor& input);
 
  private:
   bool has_init_cudnn_ = false;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/model/layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling.cc b/src/model/layer/pooling.cc
index 05c6bc9..2655369 100644
--- a/src/model/layer/pooling.cc
+++ b/src/model/layer/pooling.cc
@@ -30,8 +30,8 @@ void Pooling::Setup(const LayerConf& conf) {
     kernel_w_ = pool_conf.kernel_w();
     kernel_h_ = pool_conf.kernel_h();
   }
-  CHECK_NE(kernel_w_, 0);
-  CHECK_NE(kernel_h_, 0);
+  CHECK_GT(kernel_w_, 0u);
+  CHECK_GT(kernel_h_, 0u);
 
   if (pool_conf.has_pad()) {
     pad_w_ = pad_h_ = pool_conf.pad();
@@ -39,6 +39,8 @@ void Pooling::Setup(const LayerConf& conf) {
     pad_w_ = pool_conf.pad_w();
     pad_h_ = pool_conf.pad_h();
   }
+  CHECK_GE(pad_w_, 0u);
+  CHECK_GE(pad_h_, 0u);
 
   if (pool_conf.has_stride()) {
     stride_w_ = stride_h_ = pool_conf.stride();
@@ -46,6 +48,8 @@ void Pooling::Setup(const LayerConf& conf) {
     stride_w_ = pool_conf.stride_w();
     stride_h_ = pool_conf.stride_h();
   }
+  CHECK_GT(stride_w_, 0u);
+  CHECK_GT(stride_h_, 0u);
 
   pool_ = pool_conf.pool();
   CHECK(pool_ == PoolingConf_PoolMethod_AVE ||

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/model/layer/pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling.h b/src/model/layer/pooling.h
index ce6670d..522b603 100644
--- a/src/model/layer/pooling.h
+++ b/src/model/layer/pooling.h
@@ -46,7 +46,6 @@ class Pooling : public Layer {
   size_t stride_w() const { return stride_w_; }
   size_t stride_h() const { return stride_h_; }
   PoolingConf_PoolMethod pool_method() const { return pool_; }
-  size_t batchsize() const { return batchsize_; }
   size_t channels() const { return channels_; }
   size_t height() const { return height_; }
   size_t width() const { return width_; }
@@ -54,7 +53,7 @@ class Pooling : public Layer {
  protected:
   size_t kernel_w_, pad_w_, stride_w_;
   size_t kernel_h_, pad_h_, stride_h_;
-  size_t batchsize_, channels_, height_, width_, pooled_height_, pooled_width_;
+  size_t channels_, height_, width_, pooled_height_, pooled_width_;
   PoolingConf_PoolMethod pool_;
   // To store the input and output(of forward) tensors
   std::stack<Tensor> buf_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 03ad6ad..66296d5 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -306,7 +306,8 @@ message ConvolutionConf {
   optional uint32 stride_h = 13; // The stride height (2D only)
   optional uint32 stride_w = 14; // The stride width (2D only)
 
-  optional uint32 group = 5 [default = 1]; // The group size for group conv
+  // SINGA: not supported.
+  // optional uint32 group = 5 [default = 1]; // The group size for group conv
 
   optional FillerConf weight_filler = 7; // The filler for the weight
   optional FillerConf bias_filler = 8; // The filler for the bias
@@ -326,20 +327,24 @@ message ConvolutionConf {
   // With (N, C, D, H, W) inputs, and axis == 1, we perform
   // N independent 3D convolutions, sliding (C/g)-channels
   // filters across the spatial axes (D, H, W) of the input.
-  optional int32 axis = 16 [default = 1];
+  // SINGA: not supported.
+  // optional int32 axis = 16 [default = 1];
 
   // Whether to force use of the general ND convolution, even if a specific
   // implementation for blobs of the appropriate number of spatial dimensions
   // is available. (Currently, there is only a 2D-specific convolution
   // implementation; for input blobs with num_axes != 2, this option is
   // ignored and the ND implementation will be used.)
-  optional bool force_nd_im2col = 17 [default = false];
-  // add by xiangrui
+  // SINGA: not supported.
+  // optional bool force_nd_im2col = 17 [default = false];
+
+
+  // SINGA: add by xiangrui
   // cudnn workspace size in MB
   optional int32 workspace_byte_limit = 50 [default = 512];
   // cudnn algorithm preference
   // options: "fastest", "limited_workspace", "no_workspace"
-  optional string algo_pref = 51 [default = "fastest"];
+  optional string prefer = 51 [default = "fastest"];
   // input shape
   optional int32 channels = 52;
   optional int32 height = 53;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_convolution.cc b/test/singa/test_cudnn_convolution.cc
index 0955c82..73359b4 100644
--- a/test/singa/test_cudnn_convolution.cc
+++ b/test/singa/test_cudnn_convolution.cc
@@ -40,31 +40,31 @@ TEST(CudnnConvolution, Setup) {
   convconf->set_bias_term(true);
   // MB
   convconf->set_workspace_byte_limit(256);
-  convconf->set_algo_pref("fastest");
+  convconf->set_prefer("fastest");
   convconf->set_channels(1);
   convconf->set_height(3);
   convconf->set_width(3);
   conv.Setup(conf);
 
-  EXPECT_EQ(2, conv.kernel_h());
-  EXPECT_EQ(2, conv.kernel_w());
-  EXPECT_EQ(1, conv.pad_h());
-  EXPECT_EQ(1, conv.pad_w());
-  EXPECT_EQ(1, conv.stride_h());
-  EXPECT_EQ(1, conv.stride_w());
-  EXPECT_EQ(2, conv.num_filters());
+  EXPECT_EQ(2u, conv.kernel_h());
+  EXPECT_EQ(2u, conv.kernel_w());
+  EXPECT_EQ(1u, conv.pad_h());
+  EXPECT_EQ(1u, conv.pad_w());
+  EXPECT_EQ(1u, conv.stride_h());
+  EXPECT_EQ(1u, conv.stride_w());
+  EXPECT_EQ(2u, conv.num_filters());
   EXPECT_EQ(true, conv.bias_term());
-  EXPECT_EQ(256 << 20, conv.workspace_byte_limit());
-  EXPECT_STREQ("fastest", conv.pref().c_str());
-  EXPECT_EQ(1, conv.channels());
-  EXPECT_EQ(3, conv.height());
-  EXPECT_EQ(3, conv.width());
+  EXPECT_EQ(256u << 20, conv.workspace_byte_limit());
+  EXPECT_STREQ("fastest", conv.prefer().c_str());
+  EXPECT_EQ(1u, conv.channels());
+  EXPECT_EQ(3u, conv.height());
+  EXPECT_EQ(3u, conv.width());
 }
 
 TEST(CudnnConvolution, Forward) {
   const size_t batchsize = 1, c = 1, h = 3, w = 3;
   const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
-                                      6.0f, 7.0f, 8.0f, 9.0f};
+                                          6.0f, 7.0f, 8.0f, 9.0f};
   singa::CudaGPU cuda(0, 1);
   singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * h * w);
@@ -94,7 +94,7 @@ TEST(CudnnConvolution, Forward) {
   convconf->set_bias_term(true);
   // MB
   convconf->set_workspace_byte_limit(256);
-  convconf->set_algo_pref("fastest");
+  convconf->set_prefer("fastest");
   convconf->set_channels(1);
   convconf->set_height(3);
   convconf->set_width(3);
@@ -106,7 +106,7 @@ TEST(CudnnConvolution, Forward) {
   out1.ToDevice(&host);
   const float *outptr1 = out1.data<const float *>();
   // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
-  EXPECT_EQ(4, out1.Size());
+  EXPECT_EQ(4u, out1.Size());
 
   EXPECT_EQ(3.0f, outptr1[0]);
   EXPECT_EQ(7.0f, outptr1[1]);
@@ -118,7 +118,7 @@ TEST(CudnnConvolution, Backward) {
   // src_data
   const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
   const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
-                                              6.0f, 7.0f, 8.0f, 9.0f};
+                                                  6.0f, 7.0f, 8.0f, 9.0f};
   singa::CudaGPU cuda(0, 1);
   singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
@@ -148,7 +148,7 @@ TEST(CudnnConvolution, Backward) {
   convconf->set_num_output(1);
   convconf->set_bias_term(true);
   convconf->set_workspace_byte_limit(256);
-  convconf->set_algo_pref("fastest");
+  convconf->set_prefer("fastest");
   convconf->set_channels(1);
   convconf->set_height(3);
   convconf->set_width(3);
@@ -159,8 +159,10 @@ TEST(CudnnConvolution, Backward) {
 
   // grad
   const size_t grad_h = 2, grad_w = 2;
-  const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
-  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w}, &cuda);
+  const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
+                                                               0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
+                     &cuda);
   grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
 
   const auto ret = conv.Backward(singa::kTrain, grad);
@@ -169,7 +171,7 @@ TEST(CudnnConvolution, Backward) {
   in_grad.ToDevice(&host);
   const float *dx = in_grad.data<const float *>();
   const float *wptr = we;
-  EXPECT_EQ(9, in_grad.Size());
+  EXPECT_EQ(9u, in_grad.Size());
   EXPECT_EQ(dy[0] * wptr[4], dx[0]);
   EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
   EXPECT_EQ(dy[1] * wptr[4], dx[2]);
@@ -190,7 +192,7 @@ TEST(CudnnConvolution, Backward) {
   EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
 
   const float *dwptr = dw.data<const float *>();
-  EXPECT_EQ(9, dw.Size());
+  EXPECT_EQ(9u, dw.Size());
   EXPECT_EQ(dy[3] * x[4], dwptr[0]);
   EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
   EXPECT_EQ(dy[2] * x[4], dwptr[2]);
@@ -201,5 +203,5 @@ TEST(CudnnConvolution, Backward) {
   EXPECT_EQ(dy[1] * x[4], dwptr[6]);
   EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
   EXPECT_EQ(dy[0] * x[4], dwptr[8]);
-}  // USE_CUDNN
-#endif
+}
+#endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7d149ecf/test/singa/test_cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_pooling.cc b/test/singa/test_cudnn_pooling.cc
index 0bfd620..e66f212 100644
--- a/test/singa/test_cudnn_pooling.cc
+++ b/test/singa/test_cudnn_pooling.cc
@@ -43,23 +43,23 @@ TEST(CudnnPooling, Setup) {
   pool.Setup(conf);
 
   EXPECT_EQ(singa::PoolingConf_PoolMethod_MAX, pool.pool_method());
-  EXPECT_EQ(1, pool.kernel_h());
-  EXPECT_EQ(2, pool.kernel_w());
-  EXPECT_EQ(1, pool.pad_h());
-  EXPECT_EQ(0, pool.pad_w());
-  EXPECT_EQ(2, pool.stride_h());
-  EXPECT_EQ(1, pool.stride_w());
-  EXPECT_EQ(1, pool.channels());
-  EXPECT_EQ(3, pool.height());
-  EXPECT_EQ(3, pool.width());
+  EXPECT_EQ(1u, pool.kernel_h());
+  EXPECT_EQ(2u, pool.kernel_w());
+  EXPECT_EQ(1u, pool.pad_h());
+  EXPECT_EQ(0u, pool.pad_w());
+  EXPECT_EQ(2u, pool.stride_h());
+  EXPECT_EQ(1u, pool.stride_w());
+  EXPECT_EQ(1u, pool.channels());
+  EXPECT_EQ(3u, pool.height());
+  EXPECT_EQ(3u, pool.width());
 }
 
 TEST(CudnnPooling, Forward) {
   const size_t batchsize = 1, c = 1, h = 3, w = 3;
   const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
-                                      6.0f, 7.0f, 8.0f, 9.0f};
+                                          6.0f, 7.0f, 8.0f, 9.0f};
   singa::CudaGPU cuda(0, 1);
-  singa::Tensor in(singa::Shape{batchsize, c,  h, w}, &cuda);
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * h * w);
 
   CudnnPooling pool;
@@ -83,7 +83,7 @@ TEST(CudnnPooling, Forward) {
   out1.ToDevice(&host);
   const float *outptr1 = out1.data<const float *>();
   // Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
-  EXPECT_EQ(4, out1.Size());
+  EXPECT_EQ(4u, out1.Size());
   EXPECT_EQ(5.0f, outptr1[0]);
   EXPECT_EQ(6.0f, outptr1[1]);
   EXPECT_EQ(8.0f, outptr1[2]);
@@ -127,7 +127,7 @@ TEST(CudnnPooling, Backward) {
   singa::Tensor in_grad = ret.first;
   in_grad.ToDevice(&host);
   const float *dx = in_grad.data<const float *>();
-  EXPECT_EQ(9, in_grad.Size());
+  EXPECT_EQ(9u, in_grad.Size());
   EXPECT_EQ(0.0f, dx[0]);
   EXPECT_EQ(0.0f, dx[1]);
   EXPECT_EQ(0.0f, dx[2]);



[43/50] [abbrv] incubator-singa git commit: SINGA-184 Add Cross Entropy loss computation

Posted by zh...@apache.org.
SINGA-184 Add Cross Entropy loss computation

Update the softmax cross-entropy layer to support both Cpp and CUDA devices.

Fix bugs in the cross-entropy forward and backward passes; this needs the CUDA version of exp().


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ec17acab
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ec17acab
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ec17acab

Branch: refs/heads/master
Commit: ec17acab49d595fdc48b2dae6f71901b5a4c8191
Parents: efd7b62
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri May 27 17:25:01 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 11:12:05 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h             |  17 +++--
 include/singa/model/loss.h              |  47 ++++++++++++
 src/CMakeLists.txt                      |   3 +-
 src/core/tensor/math_kernel.cu          |  37 +++++++++-
 src/core/tensor/math_kernel.h           |   9 ++-
 src/core/tensor/tensor.cc               |  52 +++++++++----
 src/core/tensor/tensor_math.h           |  24 ++++--
 src/core/tensor/tensor_math_cpp.h       |  50 +++++++++++--
 src/core/tensor/tensor_math_cuda.h      |  41 ++++++++---
 src/model/layer/softmax.cc              |   7 +-
 src/model/loss/cross_entropy.h          | 105 ---------------------------
 src/model/loss/mse.cc                   |  41 +++++++++++
 src/model/loss/mse.h                    |  66 -----------------
 src/model/loss/softmax_cross_entropy.cc |  53 ++++++++++++++
 test/singa/test_cross_entropy.cc        |  64 ++++++++++++++--
 test/singa/test_mse.cc                  |   6 +-
 test/singa/test_softmax.cc              |   9 +--
 17 files changed, 393 insertions(+), 238 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index bb8d7f8..865e1e4 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -239,11 +239,10 @@ Tensor Sum(const Tensor &t, int axis);
 /// if 'axis' is 1, average all columns into a single column
 /// TODO(wangwei) support arbitrary Tensor like numpy.average
 Tensor Average(const Tensor &t, int axis);
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
-/// and shape_[axis]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-Tensor SoftMax(const Tensor &t, int axis = 0);
-void SoftMax(const Tensor &t, int axis, Tensor *ret);
+/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
+Tensor SoftMax(const Tensor &in);
+/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
+void SoftMax(const Tensor &in, Tensor *out);
 
 /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
 /// and shape_[axis+1]*...*shape_[nDim()] columns.
@@ -398,6 +397,14 @@ Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
 void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
 */
 
+/// Compute the cross entropy loss given the prediction probability 'p' and
+/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vectors
+/// or 2-d matrices. The per-sample loss (a 1-d vector) is written into 'p'.
+void ComputeCrossEntropy(const Tensor& t, Tensor* p);
+/// Compute dx given the prediction probability 'p' (p = softmax(x)) and
+/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vectors
+/// or 2-d matrices. dx has the same shape as 'p' and is written into 'p'.
+void SoftmaxCrossEntropyBwd(const Tensor& t, Tensor* p);
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 6a23067..d188de0 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -18,6 +18,7 @@
 
 #ifndef SINGA_MODEL_LOSS_H_
 #define SINGA_MODEL_LOSS_H_
+#include <stack>
 #include "singa/proto/model.pb.h"
 #include "singa/core/tensor.h"
 namespace singa {
@@ -54,6 +55,52 @@ class Loss {
   /// Compute the gradients of the loss values w.r.t. the prediction.
   virtual Tensor Backward() = 0;
 };
+
+
+
+// ============= Mean Squared Error ===========================================
+/// MSE is for mean squared error or squared euclidean distance.
+class MSE : public Loss<Tensor> {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is 0.5/||prediction-target||^2
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is (prediction-target)/batchsize
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., prediction-target
+  std::stack<Tensor> buf_;
+};
+
+
+// =============== Softmax Cross Entropy ======================================
+/// Softmax + cross entropy for multi-category classification
+class SoftmaxCrossEntropy : public Loss<Tensor> {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is -log(p[idx_truth]), where idx_truth is the
+  /// truth category's index and p[] is the per-category probability
+  /// computed from Softmax(prediction).
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is: p[idx] - 1 if idx is the truth category's index; else,
+  /// p[idx]
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., probability for each category and
+  // the target (ground truth)
+  std::stack<Tensor> buf_;
+};
+
 }  // namespace singa
 
 #endif  // SINGA_MODEL_LOSS_H_

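Both loss classes share a stateful protocol: one Forward call buffers its intermediates on buf_, and the matching Backward call consumes them. A hedged usage sketch (the 'prediction' and 'target' Tensors are assumed to exist, shaped {batchsize, dim} and {batchsize, 1} respectively):

singa::SoftmaxCrossEntropy loss_fn;
// Forward returns the per-sample loss, -log(prob of the truth category).
singa::Tensor loss = loss_fn.Forward(prediction, target);
// Backward returns dLoss/dPrediction and pops the buffered intermediates.
singa::Tensor grad = loss_fn.Backward();
// Per the header comment, Average(loss) yields the mean loss over the batch.
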
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 28066de..23cae85 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -21,7 +21,7 @@ AUX_SOURCE_DIRECTORY(core/tensor core_source)
 FILE(GLOB_RECURSE cuda_source core "*.cu")
 set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
 set(CMAKE_CXX_FLAGS "")
-CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
+CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC ")
 #message(STATUS "FLAGS ${CMAKE_CXX_FLAGS}")
 #message(STATUS "CORE ${cuda_source}")
 #message(STATUS "OBJ ${cuda_objs}")
@@ -36,6 +36,7 @@ LIST(APPEND SINGA_LINKER_LIBS singa_core)
 AUX_SOURCE_DIRECTORY(model model_source)
 AUX_SOURCE_DIRECTORY(model/layer model_source)
 AUX_SOURCE_DIRECTORY(model/optimizer model_source)
+AUX_SOURCE_DIRECTORY(model/loss model_source)
 #MESSAGE(STATUS "MODEL ${model_source}")
 ADD_LIBRARY(singa_model SHARED ${model_source})
 TARGET_LINK_LIBRARIES(singa_model ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index aed6add..f12763e 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -485,8 +485,26 @@ __global__ void KernelSet(const size_t num, const float x, float *out) {
   }
 }
 
-void Set(const size_t num, const float x, float *out, cudaStream_t s) {
-  KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
+__global__
+void KernelComputeCrossEntropy(const size_t batchsize, const size_t dim, const float* p,
+    const int* t, float* loss) {
+  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t num_threads = blockDim.x * gridDim.x;
+  for (; sample < batchsize; sample += num_threads) {
+    float prob_of_truth = p[sample * dim + t[sample]];
+    loss[sample] = -std::log(max(prob_of_truth, FLT_MIN));
+  }
+}
+
+__global__
+void KernelSoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim, const float* p,
+    const int* t, float* grad) {
+  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t num_threads = blockDim.x * gridDim.x;
+  for (; sample < batchsize; sample += num_threads) {
+    size_t pos = sample * dim + t[sample];
+    grad[pos] = p[pos] - 1.0f;  // TODO(wangwei) handle p and grad being different blobs
+  }
 }
 void Div(const size_t num, float alpha, const float *in, float *out,
          cudaStream_t s) {
@@ -510,6 +528,21 @@ void LE(const size_t num, const float *in, const float x, float *out,
   KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 }
 
+void ComputeCrossEntropy(size_t batchsize, const size_t dim, const float* p,
+    const int *t, float *loss, cudaStream_t stream) {
+  KernelComputeCrossEntropy<<<ceil(batchsize/CU1DBLOCKF), CU1DBLOCKF>>>(batchsize,
+      dim, p, t, loss);
+}
+
+void Set(const size_t num, const float x, float *out, cudaStream_t s) {
+  KernelSet<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, x, out);
+}
+
+void SoftmaxCrossEntropyBwd(size_t batchsize, const size_t dim, const float* p,
+    const int *t, float *grad, cudaStream_t stream) {
+  KernelSoftmaxCrossEntropyBwd<<<ceil(batchsize/CU1DBLOCKF), CU1DBLOCKF>>>(batchsize,
+      dim, p, t, grad);
+}
 }  // namespace cuda
 }  // namespace singa
 

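Both new kernels use the grid-stride loop idiom, so the launches below with ceil(batchsize/CU1DBLOCKF) blocks cover any batch size. As a reference for what KernelComputeCrossEntropy computes per sample, here is a plain C++ equivalent (a documentation sketch, not part of the patch):

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>

// Host-side reference: one loss value per sample, clamped away from log(0).
void ComputeCrossEntropyRef(size_t batchsize, size_t dim, const float* p,
                            const int* t, float* loss) {
  for (size_t sample = 0; sample < batchsize; sample++) {
    float prob_of_truth = p[sample * dim + t[sample]];
    loss[sample] = -std::log(std::max(prob_of_truth, FLT_MIN));
  }
}
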
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5c906a9..09953e4 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -83,13 +83,20 @@ void set_value(int n, float v, float *out);
 void threshold(int n, float alpha, const float *in, float *out);
 
 // follow the consistency guide for math API
+void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
+                         const float *p, const int *t, float *loss,
+                         cudaStream_t stream);
 void Div(const size_t num, const float x, const float *in, float *out,
          cudaStream_t s);
-void Set(const size_t num, const float x, float *out, cudaStream_t s);
 void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
+void Set(const size_t num, const float x, float *out, cudaStream_t s);
+void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
+                            const float *p, const int *t, float *grad,
+                            cudaStream_t stream);
+
 }  // cuda
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 5ae375c..1ac25c6 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -77,10 +77,9 @@ void Tensor::ResetLike(const Tensor &t) {
   }
 }
 
-void Tensor::Reshape(const Shape &shape) {
-  if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+void Tensor::Reshape(const Shape& shape) {
+  if (Product(shape) != Product(shape_)) {
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
   }
   shape_ = shape;
@@ -403,22 +402,21 @@ Tensor Average(const Tensor &t, int axis) {
   }
 }
 
-Tensor SoftMax(const Tensor &in, int axis) {
+Tensor SoftMax(const Tensor &in) {
   Tensor out(in.shape(), in.device(), in.data_type());
-  SoftMax(in, axis, &out);
+  SoftMax(in, &out);
   return out;
 }
 
-void SoftMax(const Tensor &in, int axis, Tensor *out) {
+void SoftMax(const Tensor &in, Tensor *out) {
+  CHECK_LE(in.nDim(), 2u);
+  Exp(in, out);
   size_t nrow = 1, ncol = in.Size(), size = ncol;
-  CHECK_GE(axis, 0);
-  if (axis > 0) {
-    nrow = Product(in.shape(), 0, axis);
-    CHECK_EQ(size % nrow, 0u) << "Size = " << size << " nrow = " << nrow;
+  if (in.nDim() == 2u) {
+    nrow = in.shape(0);
     ncol = size / nrow;
+    out->Reshape(Shape{nrow, ncol});
   }
-  Exp(in, out);
-  out->Reshape(Shape{nrow, ncol});
   Tensor sum(Shape{nrow}, in.device(), in.data_type());
   SumColumns(*out, &sum);
   DivColumn(sum, out);
@@ -594,6 +592,19 @@ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
     Mult(alpha, one, vmat, beta, M);
   }
 }
+void ComputeCrossEntropy(const Tensor& t, Tensor* p) {
+  CHECK_LE(p->nDim(), 2u);
+  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
+  size_t batchsize = 1;
+  if (p->nDim() == 2u) batchsize = p->shape(0);
+  size_t dim = p->Size() / batchsize;
+  TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
+    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
+      ComputeCrossEntropy<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
+                                       p->blob(), ctx);
+    }, {p->blob(), t.blob()}, {p->blob()});
+  });
+}
 
 template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
   Tensor out(in.shape(), in.device(), in.data_type());
@@ -665,7 +676,20 @@ void MultRow(const Tensor &v, Tensor *M) {
         {M->blob(), v.blob()}, {M->blob()});
   });
 }
-
+void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
+  CHECK_LE(p->nDim(), 2u);
+  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
+  size_t batchsize = 1;
+  if (p->nDim() == 2u)
+    batchsize = p->shape(0);
+  size_t dim = p->Size() / batchsize;
+  TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
+    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
+      SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
+                                          p->blob(), ctx);
+    }, {p->blob(), t.blob()}, {p->blob()});
+  });
+}
 void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); }
 
 void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); }

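The two wrappers added to tensor.cc follow the dispatch recipe used throughout this file: flatten the shape into (batchsize, dim), select the backend via TYPE_LANG_SWITCH, and submit a closure to the device with explicit read and write blob lists so the scheduler can order operations. A stripped-down sketch of that recipe (SomeOp is a placeholder, not a real SINGA function):

TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
  p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
    SomeOp<DType, Lang>(batchsize, dim, p->blob(), t.blob(), p->blob(), ctx);
  }, {p->blob(), t.blob()},   // blobs read by the op
     {p->blob()});            // blobs written by the op
});
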
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index ff865e0..bcf4908 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -110,12 +110,6 @@ void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
-/// Do softmax for each row invidually
-template <typename DType, typename Lang>
-void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
@@ -312,11 +306,14 @@ void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
 
 // ========follow the consistency guide of math API
 
+/// Compute the cross entropy loss for each sample, given the probability 'p'
+/// and the truth labels 't'; follows the consistency guide for the math API.
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
+                         const Blob *p, const Blob *t, Blob *loss,
+                         Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
-/// Divide alpha by each element of 'in'.
 template <typename DType, typename Lang>
 void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
          Context *ctx) {
@@ -364,6 +361,17 @@ void GE(const size_t num, const Blob *in, const DType x, Blob *out,
         Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
+template <typename DType, typename Lang>
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lang>
+void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
+                            const Blob *p, const Blob *t, Blob *grad,
+                            Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
 
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_

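Every function in this header is a generic template whose body is LOG(FATAL) << "Not Implemented"; each backend opts in by providing an explicit specialization (see tensor_math_cpp.h and tensor_math_cuda.h below). A minimal, SINGA-independent illustration of the pattern:

#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct Cpp {};  // backend tag, analogous to lang::Cpp

// Generic template: reaching this means the backend lacks the operation.
template <typename DType, typename Lang>
void Fill(size_t num, DType x, DType *out) {
  std::fprintf(stderr, "Not Implemented\n");
  std::abort();
}

// Explicit specialization supplies the CPU implementation.
template <>
void Fill<float, Cpp>(size_t num, float x, float *out) {
  for (size_t i = 0; i < num; i++) out[i] = x;
}
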
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 693f09c..907c656 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -17,7 +17,9 @@
  */
 #ifndef SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
 #define SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
+
 #include "./tensor_math.h"
+#include <cfloat>
 #include "singa/core/common.h"
 #include <math.h>
 
@@ -210,6 +212,22 @@ void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob *ret,
 
 // follow the consistency guide of math API
 template <>
+void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize,
+                                           const size_t dim, const Blob *p,
+                                           const Blob *t, Blob *loss,
+                                           Context *ctx) {
+  const float *pPtr = static_cast<const float *>(p->data());
+  const float *tPtr = static_cast<const float *>(t->data());
+  float *lossPtr = static_cast<float *>(loss->mutable_data());
+  for (size_t i = 0; i < batchsize; i++) {
+    int truth_idx = static_cast<int>(tPtr[i]);
+    CHECK_GE(truth_idx, 0);
+    float prob_of_truth = pPtr[i * dim + truth_idx];
+    lossPtr[i] = -std::log(std::max(prob_of_truth, FLT_MIN));
+  }
+}
+
+template <>
 void Div<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
                            Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -249,13 +267,6 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
     }
   }
 }
-
-template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
-}
 template <>
 void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
                           Blob *out, Context *ctx) {
@@ -312,9 +323,32 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
               lda, BPtr, ldb, beta, CPtr, ldc);
 }
-
 #endif  // USE_CBLAS
 
+template <>
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
+                                              const size_t dim, const Blob *p,
+                                              const Blob *t,
+                                              Blob *grad, Context *ctx) {
+  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
+  // const float* pPtr = static_cast<const float*>(p->data());
+  const float *tPtr = static_cast<const float *>(t->data());
+  float *gradPtr = static_cast<float *>(grad->mutable_data());
+
+  for (size_t i = 0; i < batchsize; i++) {
+    int truth_idx = static_cast<int>(tPtr[i]);
+    CHECK_GE(truth_idx, 0);
+    gradPtr[i * dim + truth_idx] -= 1.0;
+  }
+}
+
+
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_

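The CHECK_EQ(p, grad) contract above means the CPU backward pass mutates the probability blob in place: for each sample only the entry at the truth index changes. A tiny standalone check of that arithmetic (values chosen to match the tests later in this patch):

#include <cassert>
#include <cstddef>

int main() {
  const size_t batchsize = 1, dim = 4;
  float grad[dim] = {0.25f, 0.25f, 0.25f, 0.25f};  // softmax of equal scores
  const int t[batchsize] = {0};                    // truth index of the sample
  // In-place update, as in SoftmaxCrossEntropyBwd<float, lang::Cpp>:
  for (size_t i = 0; i < batchsize; i++) grad[i * dim + t[i]] -= 1.0f;
  assert(grad[0] == -0.75f && grad[1] == 0.25f);
  return 0;
}
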
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 4a2ba66..c69620c 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -75,6 +75,17 @@ void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret,
 
 // follow the consistency guide of math API
 template <>
+void ComputeCrossEntropy<float, lang::Cuda>(const size_t batchsize,
+                                            const size_t dim, const Blob *p,
+                                            const Blob *t, Blob *loss,
+                                            Context *ctx) {
+  const float *pPtr = static_cast<const float *>(p->data());
+  const int *tPtr = static_cast<const int *>(t->data());
+  float *lossPtr = static_cast<float *>(loss->mutable_data());
+  cuda::ComputeCrossEntropy(batchsize, dim, pPtr, tPtr, lossPtr, ctx->stream);
+}
+
+template <>
 void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
                             Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -82,19 +93,13 @@ void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
   cuda::Div(num, alpha, inPtr, outPtr, ctx->stream);
 }
 
-template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cuda::Set(num, x, outPtr, ctx->stream);
-}
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
                              const size_t ncol, const Blob *M, const Blob *v,
                              Blob *out, Context *ctx) {
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   const float *MPtr = static_cast<const float *>(M->data());
   const float *vPtr = static_cast<const float *>(v->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -121,7 +126,7 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
   const float *APtr = static_cast<const float *>(A->data());
   const float *BPtr = static_cast<const float *>(B->data());
   float *CPtr = static_cast<float *>(C->mutable_data());
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
 }
@@ -155,9 +160,25 @@ void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
   cuda::LT(num, inPtr, x, outPtr, ctx->stream);
 }
 
+template<>
+void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cuda::Set(num, x, outPtr, ctx->stream);
+}
 
-
-
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
+                                               const size_t dim, const Blob *p,
+                                               const Blob *t, Blob *grad,
+                                               Context *ctx) {
+  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
+  const float *pPtr = static_cast<const float *>(p->data());
+  const int *tPtr = static_cast<const int *>(t->data());
+  float *gradPtr = static_cast<float *>(grad->mutable_data());
+  cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
+                               ctx->stream);
+}
 
 }  // namespace singa
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/softmax.cc b/src/model/layer/softmax.cc
index 813ebf0..8af1d76 100644
--- a/src/model/layer/softmax.cc
+++ b/src/model/layer/softmax.cc
@@ -26,10 +26,11 @@ void Softmax::Setup(const LayerConf& conf) {
 
 const Tensor Softmax::Forward(int flag, const Tensor& input) {
   if (input.nDim() == 1) {
-    Tensor tmp = Reshape(input, Shape{1, input.Size()});
-    buf_.push(SoftMax(tmp, 0));
+    buf_.push(SoftMax(input));
   } else {
-    buf_.push(SoftMax(input, axis_));
+    size_t nrow = Product(input.shape(), 0, axis_);
+    const Tensor& tmp = Reshape(input, Shape{nrow, input.Size() / nrow});
+    buf_.push(SoftMax(tmp));
   }
   return buf_.top();
 }

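Forward now folds every axis before axis_ into rows and lets the 2-d SoftMax normalize each row. For instance (hypothetical shapes, not from the patch):

// input.shape() == {2, 3, 4}, axis_ == 2
size_t nrow = 2 * 3;          // Product(input.shape(), 0, axis_) == 6
size_t ncol = 24 / nrow;      // input.Size() / nrow == 4
// SoftMax(Reshape(input, {6, 4})) normalizes each of the 6 rows of length 4.
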
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/loss/cross_entropy.h
----------------------------------------------------------------------
diff --git a/src/model/loss/cross_entropy.h b/src/model/loss/cross_entropy.h
deleted file mode 100644
index 815b795..0000000
--- a/src/model/loss/cross_entropy.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SRC_MODEL_LOSS_CROSS_ENTROPY_H_
-#define SRC_MODEL_LOSS_CROSS_ENTROPY_H_
-#include <stack>
-#include "singa/model/loss.h"
-
-namespace singa {
-
-/// Cross entropy is for cross entropy loss.
-class CrossEntropy : public Loss<Tensor> {
- public:
-  /// Compute the loss values for each sample/instance given the prediction
-  /// and the target, which is sum {-log(prob_of_truth)}
-  /// Users can call Average(const Tensor&) to get the average
-  /// loss value over all samples in the batch.
-  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
-
-  /// Compute the gradients of the loss values w.r.t. the prediction,
-  /// which is: if the entry x corresponds to ground truth,
-  /// then softmax(x) - 1; else, softmax(x)
-  Tensor Backward() override;
-
- private:
-  // to buffer intermediate data, i.e., softmax(prediction), target
-  std::stack<Tensor> buf_;
-};
-
-Tensor CrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
-  CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
-                      << " The calling pattern is [Forward|Evaluate] Backward";
-
-  size_t batchsize = 1;
-  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
-  size_t dim = prediction.Size() / batchsize;
-  // a temporal Softmax layer for forward computation
-//  LayerConf conf; // TODO(kaiping): this is currently commented
-//  Softmax softmax_tmp;
-//  softmax_tmp.Setup(conf);
-//  Tensor softmax = softmax_tmp.Forward(0, prediction);
-
-  Tensor softmax(Shape{batchsize, dim});  // TODO(kaiping): Delete
-//  softmax.SetValue<float>(0.5f); // TODO(kaiping): Delete
-
-  softmax.Reshape(Shape{batchsize, dim});
-  // buffer intermediate data
-  buf_.push(softmax);
-  buf_.push(target);
-
-  // Compute loss for each sample
-  Tensor loss(Shape{batchsize, 1});
-  float * pre_ptr = reinterpret_cast<float*>(softmax.blob()->mutable_data());
-  float * truth_ptr = reinterpret_cast<float*>(target.blob()->mutable_data());
-  float * loss_ptr = reinterpret_cast<float*>(loss.blob()->mutable_data());
-  for (size_t i = 0; i < batchsize; i++) {
-    int ilabel = static_cast<int>(truth_ptr[i]);
-    CHECK_GE(ilabel, 0);
-    float prob_of_truth = pre_ptr[ilabel];
-    loss_ptr[i] = -log(prob_of_truth);
-    pre_ptr += dim;  // change to the next sample
-  }
-  return loss;
-}
-
-Tensor CrossEntropy::Backward() {
-  const Tensor& target = buf_.top();
-  buf_.pop();
-  Tensor softmax = buf_.top();
-  buf_.pop();
-
-  size_t batchsize = 1;
-  if (softmax.nDim() > 1)
-    batchsize = softmax.shape().at(0);
-  size_t dim = softmax.Size() / batchsize;
-  float * truth_ptr = reinterpret_cast<float*>(target.blob()->mutable_data());
-  float * pre_ptr = reinterpret_cast<float*>(softmax.blob()->mutable_data());
-  for (size_t i = 0; i < batchsize; i++) {
-    int ilabel = static_cast<int>(truth_ptr[i]);
-    // CHECK_GE(ilabel, 0);
-    pre_ptr[ilabel] -= 1.0;
-    pre_ptr += dim;  // change to the next sample
-  }
-  return softmax;
-}
-}  // namespace singa
-
-#endif  // SRC_MODEL_LOSS_CROSS_ENTROPY_H_
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/loss/mse.cc
----------------------------------------------------------------------
diff --git a/src/model/loss/mse.cc b/src/model/loss/mse.cc
new file mode 100644
index 0000000..a4bbb72
--- /dev/null
+++ b/src/model/loss/mse.cc
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/loss.h"
+
+namespace singa {
+
+Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward again before calling Backward;"
+                      << " the calling pattern is [Forward|Evaluate] Backward";
+  Tensor t = prediction - target;
+  size_t batchsize = 1;
+  if (t.nDim() > 1) batchsize = t.shape().at(0);
+  size_t dim = t.Size() / batchsize;
+  t.Reshape(Shape{batchsize, dim});
+  buf_.push(t);
+  // TODO(wangwei) use CastType for operator/
+  return Sum(Square(t), 1) * 0.5f;
+}
+
+Tensor MSE::Backward() {
+  Tensor ret = buf_.top();
+  buf_.pop();
+  return ret * (1.0f / ret.shape().at(0));
+}
+}  // namespace singa

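In numbers: for a single sample with prediction {1, 2} and target {0, 0}, Forward returns 0.5 * (1^2 + 2^2) = 2.5 and Backward returns {1, 2} (batchsize 1). A plain C++ restatement of both formulas, independent of the Tensor API (a documentation sketch, not patch code):

#include <cstddef>

// loss[i] = 0.5 * sum_j (p[i][j] - t[i][j])^2;  grad = (p - t) / batchsize
void MseRef(size_t batchsize, size_t dim, const float *p, const float *t,
            float *loss, float *grad) {
  for (size_t i = 0; i < batchsize; i++) {
    loss[i] = 0.0f;
    for (size_t j = 0; j < dim; j++) {
      float d = p[i * dim + j] - t[i * dim + j];
      loss[i] += 0.5f * d * d;
      grad[i * dim + j] = d / batchsize;
    }
  }
}
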
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/loss/mse.h
----------------------------------------------------------------------
diff --git a/src/model/loss/mse.h b/src/model/loss/mse.h
deleted file mode 100644
index 1a022f9..0000000
--- a/src/model/loss/mse.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SINGA_MODEL_LOSS_MSE_H_
-#define SINGA_MODEL_LOSS_MSE_H_
-#include <stack>
-#include "singa/model/loss.h"
-
-namespace singa {
-
-/// MSE is for mean squared error or squared euclidean distance.
-class MSE : public Loss<Tensor> {
- public:
-  /// Compute the loss values for each sample/instance given the prediction
-  /// and the target, which is 0.5/||prediction-target||^2
-  /// Users can call Average(const Tensor&) to get the average
-  /// loss value over all samples in the batch.
-  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
-
-  /// Compute the gradients of the loss values w.r.t. the prediction,
-  /// which is (prediction-target)/batchsize
-  Tensor Backward() override;
-
- private:
-  // to buffer intermediate data, i.e., prediction-target
-  std::stack<Tensor> buf_;
-};
-
-Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
-  CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
-                      << " The calling pattern is [Forward|Evaluate] Backward";
-  Tensor t = prediction - target;
-  size_t batchsize = 1;
-  if (t.nDim() > 1) batchsize = t.shape().at(0);
-  size_t dim = t.Size() / batchsize;
-  t.Reshape(Shape{batchsize, dim});
-  buf_.push(t);
-  // TODO(wangwei) use CastType for operator/
-  return Sum(Square(t), 1) * 0.5f;
-}
-
-Tensor MSE::Backward() {
-  Tensor ret = buf_.top();
-  buf_.pop();
-  return ret * (1.0f / ret.shape().at(0));
-}
-}  // namespace singa
-
-#endif  // SINGA_MODEL_LOSS_H_
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/loss/softmax_cross_entropy.cc
----------------------------------------------------------------------
diff --git a/src/model/loss/softmax_cross_entropy.cc b/src/model/loss/softmax_cross_entropy.cc
new file mode 100644
index 0000000..4ca323a
--- /dev/null
+++ b/src/model/loss/softmax_cross_entropy.cc
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stack>
+#include "singa/model/loss.h"
+
+namespace singa {
+
+
+Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward again before calling Backward;"
+                      << " the calling pattern is [Forward|Evaluate] Backward";
+  size_t batchsize = 1;
+  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
+  size_t dim = prediction.Size() / batchsize;
+  const Tensor& input = Reshape(prediction, Shape{batchsize, dim});
+  Tensor prob = SoftMax(input);
+
+  // buffer intermediate data
+  buf_.push(prob);
+  buf_.push(target);
+  Tensor loss = prob.Clone();
+
+  ComputeCrossEntropy(target, &loss);
+  return loss;
+}
+
+Tensor SoftmaxCrossEntropy::Backward() {
+  const Tensor target = buf_.top();
+  buf_.pop();
+  Tensor prob = buf_.top();
+  buf_.pop();
+  SoftmaxCrossEntropyBwd(target, &prob);
+  return prob;
+}
+}  // namespace singa
+
+

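Note the buffering contract in this file: Forward pushes prob and then target, so Backward pops them in reverse order, and Forward clones prob because ComputeCrossEntropy overwrites its argument. Per sample i and category j, the two methods implement:

  loss[i]  = -log(prob[i][target[i]])
  dx[i][j] = prob[i][j] - (j == target[i] ? 1 : 0)
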
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
index 9bb2321..6b8cb69 100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@ -22,16 +22,15 @@
 #include "gtest/gtest.h"
 #include "singa/core/tensor.h"
 #include "singa/core/device.h"
-#include "../src/model/loss/cross_entropy.h"
+#include "singa/model/loss.h"
+#include "singa_config.h"
 
 using singa::Tensor;
-class TestCrossEntropy : public ::testing::Test {
+class TestSoftmaxCrossEntropy : public ::testing::Test {
  protected:
   virtual void SetUp() {
     p.Reshape(singa::Shape{2, 4});
     t.Reshape(singa::Shape{2, 1});
-    p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
-    t.CopyDataFromHostPtr(tdat, sizeof(pdat) / sizeof(float));
   }
   const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
   const float tdat[2] = {0.0, 2.0};
@@ -39,8 +38,11 @@ class TestCrossEntropy : public ::testing::Test {
   singa::Tensor p, t;
 };
 
-TEST_F(TestCrossEntropy, CppForward) {
-  singa::CrossEntropy cross_entropy;
+TEST_F(TestSoftmaxCrossEntropy, CppForward) {
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  singa::SoftmaxCrossEntropy cross_entropy;
   const Tensor& loss = cross_entropy.Forward(p, t);
   auto ldat = loss.data<const float*>();
 
@@ -49,8 +51,11 @@ TEST_F(TestCrossEntropy, CppForward) {
   EXPECT_FLOAT_EQ(ldat[1], result_test);
 }
 
-TEST_F(TestCrossEntropy, CppBackward) {
-  singa::CrossEntropy cross_entropy;
+TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  singa::SoftmaxCrossEntropy cross_entropy;
   cross_entropy.Forward(p, t);
   const Tensor& grad = cross_entropy.Backward();
 
@@ -64,3 +69,46 @@ TEST_F(TestCrossEntropy, CppBackward) {
   EXPECT_FLOAT_EQ(gdat[6], -0.75);
   EXPECT_FLOAT_EQ(gdat[7], 0.25);
 }
+
+#ifdef USE_CUDA
+
+TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
+  singa::SoftmaxCrossEntropy cross_entropy;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  Tensor loss = cross_entropy.Forward(p, t);
+  loss.ToHost();
+  auto ldat = loss.data<const float*>();
+
+  const float result_test = -log(0.25);
+  EXPECT_FLOAT_EQ(ldat[0], result_test);
+  EXPECT_FLOAT_EQ(ldat[1], result_test);
+}
+
+TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
+  singa::SoftmaxCrossEntropy cross_entropy;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  cross_entropy.Forward(p, t);
+  Tensor grad = cross_entropy.Backward();
+
+  grad.ToHost();
+  auto gdat = grad.data<const float*>();
+  EXPECT_FLOAT_EQ(gdat[0], -0.75);
+  EXPECT_FLOAT_EQ(gdat[1], 0.25);
+  EXPECT_FLOAT_EQ(gdat[2], 0.25);
+  EXPECT_FLOAT_EQ(gdat[3], 0.25);
+  EXPECT_FLOAT_EQ(gdat[4], 0.25);
+  EXPECT_FLOAT_EQ(gdat[5], 0.25);
+  EXPECT_FLOAT_EQ(gdat[6], -0.75);
+  EXPECT_FLOAT_EQ(gdat[7], 0.25);
+}
+#endif  // USE_CUDA

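The expected values follow from the uniform inputs: all eight scores are 0.1, so each softmax row is {0.25, 0.25, 0.25, 0.25}; the per-sample loss is -log(0.25) and the gradient is 0.25 everywhere except 0.25 - 1 = -0.75 at the truth columns (0 and 2). A standalone check of that arithmetic (documentation sketch only):

#include <cmath>
#include <cstdio>

int main() {
  float prob = std::exp(0.1f) / (4.0f * std::exp(0.1f));   // == 0.25f
  std::printf("loss = %f\n", -std::log(prob));             // ~1.386294
  std::printf("grad at truth = %f\n", prob - 1.0f);        // -0.75
  return 0;
}
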
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 67f583c..a6bd1c3 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -22,8 +22,9 @@
 #include "gtest/gtest.h"
 #include "singa/core/tensor.h"
 #include "singa/core/device.h"
-#include "../src/model/loss/mse.h"
+#include "singa/model/loss.h"
 #include "singa_config.h"
+
 using singa::Tensor;
 class TestMSE : public ::testing::Test {
  protected:
@@ -66,6 +67,8 @@ TEST_F(TestMSE, CppBackward) {
     EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
 }
 #endif
+
+#ifdef USE_CUDA
 TEST_F(TestMSE, CudaForward) {
   singa::MSE mse;
   singa::CudaGPU dev;
@@ -98,3 +101,4 @@ TEST_F(TestMSE, CudaBackward) {
   for (size_t i = 0; i < grad.Size(); i++)
     EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
 }
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_softmax.cc b/test/singa/test_softmax.cc
index da2a6ef..09dfcd9 100644
--- a/test/singa/test_softmax.cc
+++ b/test/singa/test_softmax.cc
@@ -55,7 +55,6 @@ TEST(Softmax, Forward) {
   const float* yptr = out.data<const float*>();
   EXPECT_EQ(n, out.Size());
 
-  float* y = new float[n];
   float* sigma = new float[row];
   for (size_t i = 0; i < row; i++)
     sigma[i] = 0.f;
@@ -63,11 +62,9 @@ TEST(Softmax, Forward) {
     sigma[i / col] += exp(x[i]);
   //EXPECT_EQ(0, sigma[1]);
   for (size_t i = 0; i < row; i++)
-    for (size_t j = 0; j < col; j++)
-      y[i * col + j] = exp(x[i * col + j]) / sigma[i];
-  EXPECT_FLOAT_EQ(y[0], yptr[0]);
-  EXPECT_FLOAT_EQ(y[4], yptr[4]);
-  EXPECT_FLOAT_EQ(y[5], yptr[5]);
+    for (size_t j = 0; j < col; j++) {
+      EXPECT_FLOAT_EQ(yptr[i * col + j], exp(x[i * col + j]) / sigma[i]);
+    }
 }
 
 TEST(Softmax, Backward) {



[08/50] [abbrv] incubator-singa git commit: SINGA-172 - Add CMake supporting for Cuda and Cudnn libs

Posted by zh...@apache.org.
SINGA-172 - Add CMake supporting for Cuda and Cudnn libs

Add definition of CUDNN_VERSION_MAJOR


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/e3da6a58
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/e3da6a58
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/e3da6a58

Branch: refs/heads/master
Commit: e3da6a58fd76e630b84c4241fbe7cc0c433a4ab8
Parents: 0b4b2e2
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu May 19 14:37:20 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 19 15:10:57 2016 +0800

----------------------------------------------------------------------
 cmake/Cuda.cmake                 | 1 +
 src/core/device/device.cc        | 2 +-
 src/model/layer/cudnn_dropout.cc | 2 +-
 src/model/layer/cudnn_dropout.h  | 2 +-
 4 files changed, 4 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3da6a58/cmake/Cuda.cmake
----------------------------------------------------------------------
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index 8780fc6..a9ddcb0 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -17,6 +17,7 @@ add_definitions(-DUSE_CUDA)
     include_directories(SYSTEM ${CUDNN_INCLUDE_DIR})
     list(APPEND SINGA_LINKER_LIBS ${CUDNN_LIBRARIES})
     add_definitions(-DUSE_CUDNN)
+    add_definitions(-DCUDNN_VERSION_MAJOR=${CUDNN_VERSION_MAJOR})
 #endif()
 
 include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3da6a58/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 73bb5c1..205601b 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -56,7 +56,7 @@ void Device::CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
       [this, dst, src, nBytes, direct, dst_offset, src_offset](Context* ctx) {
         this->CopyToFrom(
             reinterpret_cast<char*>(dst->mutable_data()) + dst_offset,
-            reinterpret_cast<char*>(src->data()) + src_offset, nBytes,
+            reinterpret_cast<const char*>(src->data()) + src_offset, nBytes,
             direct, ctx);
       },
       {src}, {dst});

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3da6a58/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
index e049ade..65cd8e5 100644
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@ -17,7 +17,7 @@
  */
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
-#if CUDNN_MAJOR_VERSION >= 5
+#if CUDNN_VERSION_MAJOR >= 5
 
 #include "./cudnn_dropout.h"
 #include <cudnn.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3da6a58/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index 647eed2..a7d00e0 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -20,7 +20,7 @@
 #define SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
-#if CUDNN_MAJOR_VERSION >= 5
+#if CUDNN_VERSION_MAJOR >= 5
 #include <cudnn.h>
 #include <utility>
 #include <string>


[32/50] [abbrv] incubator-singa git commit: SINGA-191 Add "autotune" for CudnnConvolution Layer

Posted by zh...@apache.org.
SINGA-191 Add "autotune" for CudnnConvolution Layer

If users choose "autotune", the layer benchmarks the candidate algorithms
and selects them automatically. The following CUDNN functions are used:
  cudnnFindConvolutionForwardAlgorithm,
  cudnnFindConvolutionBackwardFilterAlgorithm,
  cudnnFindConvolutionBackwardDataAlgorithm


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/01aaf490
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/01aaf490
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/01aaf490

Branch: refs/heads/master
Commit: 01aaf49009d2b0c6c62a484b21a1d06bce575e81
Parents: 04e23d1
Author: XiangruiCAI <ca...@gmail.com>
Authored: Sat Jun 4 16:18:47 2016 +0800
Committer: XiangruiCAI <ca...@gmail.com>
Committed: Wed Jun 8 10:35:51 2016 +0800

----------------------------------------------------------------------
 src/model/layer/cudnn_convolution.cc | 180 +++++++++++++++++------------
 test/singa/test_cudnn_convolution.cc | 181 ++++++++++++++++++++++++++++++
 2 files changed, 287 insertions(+), 74 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01aaf490/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index 97aa256..b80c3bd 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -41,9 +41,9 @@ void CudnnConvolution::Setup(const LayerConf &conf) {
   workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
   prefer_ = ToLowerCase(conv_conf.prefer());
   CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-        prefer_ == "no_workspace")
-      << "CudnnConvolution only supports three algorithm preferences: fastest, "
-         "limited_workspace and no_workspace";
+        prefer_ == "no_workspace" || prefer_ == "autotune")
+      << "CudnnConvolution only supports four algorithm preferences: fastest, "
+         "limited_workspace, no_workspace and autotune";
 }
 
 void CudnnConvolution::ToDevice(Device *device) {
@@ -52,7 +52,7 @@ void CudnnConvolution::ToDevice(Device *device) {
   workspace_.ToDevice(device);
 }
 
-void CudnnConvolution::InitCudnn(const Tensor& input) {
+void CudnnConvolution::InitCudnn(const Tensor &input) {
   CHECK(!has_init_cudnn_);
   DataType dtype = input.data_type();
   Device *dev = input.device();
@@ -89,34 +89,54 @@ void CudnnConvolution::InitCudnn(const Tensor& input) {
   LOG(FATAL) << "Not supported CUDNN version = " << CUDNN_VERSION_MAJOR;
 #endif
 
-  cudnnConvolutionFwdPreference_t fwd_pref;
-  cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-  cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-  if (prefer_ == "fastest") {
-    fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-  } else if (prefer_ == "limited_workspace") {
-    fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-  } else if (prefer_ == "no_workspace") {
-    fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+  if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+      prefer_ == "no_workspace") {
+    cudnnConvolutionFwdPreference_t fwd_pref;
+    cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+    cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+    if (prefer_ == "fastest") {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+    } else if (prefer_ == "limited_workspace") {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+    } else {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+    }
+    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+        ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+        workspace_byte_limit_, &fp_alg_));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+        ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+        bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+        ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+        bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+  } else if (prefer_ == "autotune") {
+    const int topk = 1;
+    int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+    cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+    cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+    cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+    CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+        ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+        &num_fp_alg, fp_alg_perf));
+    fp_alg_ = fp_alg_perf[0].algo;
+    CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+        ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+        &num_bp_filt_alg, bp_filt_perf));
+    bp_filter_alg_ = bp_filt_perf[0].algo;
+    CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+        ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+        &num_bp_data_alg, bp_data_perf));
+    bp_data_alg_ = bp_data_perf[0].algo;
   } else {
     LOG(FATAL) << "Preferred algorithm is not available!";
   }
-  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-      ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-      workspace_byte_limit_, &fp_alg_));
-
-  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-      ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-      bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
-  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-      ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-      bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
 
   size_t fp_byte, bp_data_byte, bp_filter_byte;
   CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
@@ -147,25 +167,30 @@ const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
 
   Shape shape{batchsize, num_filters_, conv_height_, conv_width_};
   Tensor output(shape, dev, dtype);
-  output.device()->Exec([input, output, this](Context *ctx) {
-    Blob *inblob = input.blob(), *outblob = output.blob(),
-         *wblob = this->weight_.blob();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
-                            inblob->data(), this->filter_desc_, wblob->data(),
-                            this->conv_desc_, this->fp_alg_,
-                            this->workspace_.blob()->mutable_data(),
-                            this->workspace_count_ * sizeof(float), &beta,
-                            this->y_desc_, outblob->mutable_data());
-  }, {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
+  output.device()->Exec(
+      [input, output, this](Context *ctx) {
+        Blob *inblob = input.blob(), *outblob = output.blob(),
+             *wblob = this->weight_.blob();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
+                                inblob->data(), this->filter_desc_,
+                                wblob->data(), this->conv_desc_, this->fp_alg_,
+                                this->workspace_.blob()->mutable_data(),
+                                this->workspace_count_ * sizeof(float), &beta,
+                                this->y_desc_, outblob->mutable_data());
+      },
+      {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
 
   if (bias_term_) {
-    output.device()->Exec([output, this](Context *ctx) {
-      float beta = 1.f, alpha = 1.0f;
-      Blob *outblob = output.blob(), *bblob = this->bias_.blob();
-      cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_, bblob->data(),
-                     &beta, this->y_desc_, outblob->mutable_data());
-    }, {output.blob(), bias_.blob()}, {output.blob()});
+    output.device()->Exec(
+        [output, this](Context *ctx) {
+          float beta = 1.f, alpha = 1.0f;
+          Blob *outblob = output.blob(), *bblob = this->bias_.blob();
+          cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
+                         bblob->data(), &beta, this->y_desc_,
+                         outblob->mutable_data());
+        },
+        {output.blob(), bias_.blob()}, {output.blob()});
   }
   return output;
 }
@@ -187,38 +212,45 @@ const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
 
   // LOG(ERROR) << "backward bias";
   if (bias_term_) {
-    dx.device()->Exec([grad, db, this](Context *ctx) {
-      Blob *dyblob = grad.blob(), *dbblob = db.blob();
-      float alpha = 1.f, beta = 0.f;
-      cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
-                                   dyblob->data(), &beta, this->bias_desc_,
-                                   dbblob->mutable_data());
-    }, {grad.blob()}, {db.blob()});
+    dx.device()->Exec(
+        [grad, db, this](Context *ctx) {
+          Blob *dyblob = grad.blob(), *dbblob = db.blob();
+          float alpha = 1.f, beta = 0.f;
+          cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
+                                       dyblob->data(), &beta, this->bias_desc_,
+                                       dbblob->mutable_data());
+        },
+        {grad.blob()}, {db.blob()});
   }
   // LOG(ERROR) << "backward w";
-  dx.device()->Exec([grad, dw, src_data, this](Context *ctx) {
-    Blob *inblob = src_data.blob(), *dyblob = grad.blob(), *dwblob = dw.blob();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardFilter(
-        ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(), this->y_desc_,
-        dyblob->data(), this->conv_desc_, this->bp_filter_alg_,
-        this->workspace_.blob()->mutable_data(),
-        this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
-        dwblob->mutable_data());
-  }, {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
+  dx.device()->Exec(
+      [grad, dw, src_data, this](Context *ctx) {
+        Blob *inblob = src_data.blob(), *dyblob = grad.blob(),
+             *dwblob = dw.blob();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardFilter(
+            ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(),
+            this->y_desc_, dyblob->data(), this->conv_desc_,
+            this->bp_filter_alg_, this->workspace_.blob()->mutable_data(),
+            this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
+            dwblob->mutable_data());
+      },
+      {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
 
   // LOG(ERROR) << "backward src";
-  dx.device()->Exec([dx, grad, this](Context *ctx) {
-    Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
-         *dxblob = dx.blob();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, this->filter_desc_,
-                                 wblob->data(), this->y_desc_, dyblob->data(),
-                                 this->conv_desc_, this->bp_data_alg_,
-                                 this->workspace_.blob()->mutable_data(),
-                                 this->workspace_count_ * sizeof(float), &beta,
-                                 this->x_desc_, dxblob->mutable_data());
-  }, {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
+  dx.device()->Exec(
+      [dx, grad, this](Context *ctx) {
+        Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
+             *dxblob = dx.blob();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardData(
+            ctx->cudnn_handle, &alpha, this->filter_desc_, wblob->data(),
+            this->y_desc_, dyblob->data(), this->conv_desc_, this->bp_data_alg_,
+            this->workspace_.blob()->mutable_data(),
+            this->workspace_count_ * sizeof(float), &beta, this->x_desc_,
+            dxblob->mutable_data());
+      },
+      {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
   param_grad.push_back(dw);
   param_grad.push_back(db);
   return std::make_pair(dx, param_grad);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01aaf490/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_convolution.cc b/test/singa/test_cudnn_convolution.cc
index 73359b4..2a17da2 100644
--- a/test/singa/test_cudnn_convolution.cc
+++ b/test/singa/test_cudnn_convolution.cc
@@ -204,4 +204,185 @@ TEST(CudnnConvolution, Backward) {
   EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
   EXPECT_EQ(dy[0] * x[4], dwptr[8]);
 }
+// Tests for prefer=autotune
+TEST(CudnnConvolution_AT, Setup) {
+  CudnnConvolution conv;
+  EXPECT_EQ("CudnnConvolution", conv.layer_type());
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(2);
+  convconf->set_kernel_w(2);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(1);
+  convconf->set_stride_w(1);
+  convconf->set_num_output(2);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  EXPECT_EQ(2u, conv.kernel_h());
+  EXPECT_EQ(2u, conv.kernel_w());
+  EXPECT_EQ(1u, conv.pad_h());
+  EXPECT_EQ(1u, conv.pad_w());
+  EXPECT_EQ(1u, conv.stride_h());
+  EXPECT_EQ(1u, conv.stride_w());
+  EXPECT_EQ(2u, conv.num_filters());
+  EXPECT_EQ(true, conv.bias_term());
+  EXPECT_EQ(256u << 20, conv.workspace_byte_limit());
+  EXPECT_STREQ("autotune", conv.prefer().c_str());
+  EXPECT_EQ(1u, conv.channels());
+  EXPECT_EQ(3u, conv.height());
+  EXPECT_EQ(3u, conv.width());
+}
+
+TEST(CudnnConvolution_AT, Forward) {
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                          6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  // Set weight and bias manually
+  const size_t num_filters = 1;
+  const float we[num_filters * batchsize * h * w] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
+  weight.CopyDataFromHostPtr(we, batchsize * h * w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out1.ToDevice(&host);
+  const float *outptr1 = out1.data<const float *>();
+  // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
+  EXPECT_EQ(4u, out1.Size());
+
+  EXPECT_EQ(3.0f, outptr1[0]);
+  EXPECT_EQ(7.0f, outptr1[1]);
+  EXPECT_EQ(-3.0f, outptr1[2]);
+  EXPECT_EQ(12.0f, outptr1[3]);
+}
+
+TEST(CudnnConvolution_AT, Backward) {
+  // src_data
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                                  6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  // Set weight_ and bias_ manually
+  const size_t num_filters = 1;
+  const float we[num_filters * batchsize * src_h * src_w] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
+                       &cuda);
+  weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
+                                                               0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
+                     &cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
+
+  const auto ret = conv.Backward(singa::kTrain, grad);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToDevice(&host);
+  const float *dx = in_grad.data<const float *>();
+  const float *wptr = we;
+  EXPECT_EQ(9u, in_grad.Size());
+  EXPECT_EQ(dy[0] * wptr[4], dx[0]);
+  EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
+  EXPECT_EQ(dy[1] * wptr[4], dx[2]);
+  EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]);
+  EXPECT_EQ(
+      dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0],
+      dx[4]);
+  EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]);
+  EXPECT_EQ(dy[2] * wptr[4], dx[6]);
+  EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]);
+  EXPECT_EQ(dy[3] * wptr[4], dx[8]);
+
+  singa::Tensor dw = ret.second[0];
+  singa::Tensor db = ret.second[1];
+  dw.ToDevice(&host);
+  db.ToDevice(&host);
+  const float *dbptr = db.data<const float *>();
+  EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
+
+  const float *dwptr = dw.data<const float *>();
+  EXPECT_EQ(9u, dw.Size());
+  EXPECT_EQ(dy[3] * x[4], dwptr[0]);
+  EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
+  EXPECT_EQ(dy[2] * x[4], dwptr[2]);
+  EXPECT_EQ(dy[1] * x[1] + dy[3] * x[7], dwptr[3]);
+  EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[1] * x[2] + dy[2] * x[6] + dy[3] * x[8],
+                  dwptr[4]);
+  EXPECT_EQ(dy[0] * x[1] + dy[2] * x[7], dwptr[5]);
+  EXPECT_EQ(dy[1] * x[4], dwptr[6]);
+  EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
+  EXPECT_EQ(dy[0] * x[4], dwptr[8]);
+}
 #endif  // USE_CUDNN
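
A note on the tests above: the conf sets workspace_byte_limit in MB while
the accessor is checked against bytes (256u << 20), so Setup presumably
converts the limit with a 20-bit shift. A minimal sketch, with illustrative
member names rather than the exact implementation:

    // hypothetical conversion inside CudnnConvolution::Setup(conf)
    workspace_byte_limit_ =
        static_cast<size_t>(conf.convolution_conf().workspace_byte_limit())
        << 20;  // MB -> bytes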


[27/50] [abbrv] incubator-singa git commit: SINGA-174 Add Batch Normalization layer and Local Response Normalization layer.

Posted by zh...@apache.org.
SINGA-174 Add Batch Normalization layer and Local Response Normalization layer.

Implemented Batch Normalization Layer and Local Response Normalization Layer using CuDNN.
Tests passed.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/eadd3f96
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/eadd3f96
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/eadd3f96

Branch: refs/heads/master
Commit: eadd3f969984cd4a94a1953a0b5e87af84ea5dc4
Parents: 64ea206
Author: WANG Ji <ij...@gmail.com>
Authored: Sun May 22 12:39:16 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu Jun 2 13:43:48 2016 +0800

----------------------------------------------------------------------
 include/singa/core/common.h        |   2 +-
 src/model/layer/batchnorm.cc       |  70 +++++++++
 src/model/layer/batchnorm.h        |  84 +++++++++++
 src/model/layer/cudnn_batchnorm.cc | 214 ++++++++++++++++++++++++++
 src/model/layer/cudnn_batchnorm.h  |  60 ++++++++
 src/model/layer/cudnn_lrn.cc       | 114 ++++++++++++++
 src/model/layer/cudnn_lrn.h        |  56 +++++++
 src/model/layer/lrn.cc             |  59 ++++++++
 src/model/layer/lrn.h              |  70 +++++++++
 src/proto/model.proto              |  13 +-
 test/singa/test_cudnn_batchnorm.cc | 257 ++++++++++++++++++++++++++++++++
 test/singa/test_cudnn_lrn.cc       | 205 +++++++++++++++++++++++++
 12 files changed, 1201 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 9d005c4..e6f4c90 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -42,7 +42,7 @@ typedef struct _Cuda { } Cuda;
 typedef struct _Opencl { } Opencl;
 }  // namespace lang
 
-/// Blob reprent a chunk of memory (on device or host) managed by VirtualMemory.
+/// Blob represents a chunk of memory (on device or host) managed by VirtualMemory.
 class Blob {
  public:
   Blob(void* ptr, size_t size) : data_(ptr), size_(size), ref_count_(1) {}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/model/layer/batchnorm.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/batchnorm.cc b/src/model/layer/batchnorm.cc
new file mode 100644
index 0000000..bcd0870
--- /dev/null
+++ b/src/model/layer/batchnorm.cc
@@ -0,0 +1,70 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#include "batchnorm.h"
+
+namespace singa {
+void BatchNorm::Setup(const LayerConf& conf) {
+  Layer::Setup(conf);
+  factor_ = conf.batchnorm_conf().factor();
+  channels_ = conf.batchnorm_conf().channels();
+  height_ = conf.batchnorm_conf().height();
+  width_ = conf.batchnorm_conf().width();
+
+  bnScale_.Reshape(Shape{channels_ * height_ * width_});
+  bnBias_.ResetLike(bnScale_);
+  runningMean_.ResetLike(bnScale_);
+  runningVariance_.ResetLike(bnScale_);
+
+  dbnScale_.ResetLike(bnScale_);
+  dbnBias_.ResetLike(bnBias_);
+  // Push back params into param_values_
+  // Assume the order of param is: bnScale, bnBias, runningMean, runningVariance
+  for (const auto &spec : conf.param()) param_specs_.push_back(spec);
+  param_values_.push_back(&bnScale_);
+  param_values_.push_back(&bnBias_);
+  param_values_.push_back(&runningMean_);
+  param_values_.push_back(&runningVariance_);
+}
+
+void BatchNorm::ToDevice(Device* device) {
+  bnScale_.ToDevice(device);
+  bnBias_.ToDevice(device);
+  dbnScale_.ToDevice(device);
+  dbnBias_.ToDevice(device);
+  runningMean_.ToDevice(device);
+  runningVariance_.ToDevice(device);
+}
+
+const Tensor BatchNorm::Forward(int flag, const Tensor& input) {
+  LOG(FATAL) << "Not implemented";
+  Tensor output;
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> BatchNorm::Backward(
+    int flag, const Tensor& grad) {
+  LOG(FATAL) << "Not implemented";
+  Tensor dx;
+  vector<Tensor> param_grad;
+  return std::make_pair(dx, param_grad);
+}
+
+}  // namespace
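
A minimal sketch of driving the Setup above; the BatchNormConf fields are
defined later in this commit in model.proto, and after Setup the four
parameter tensors sit in param_values_ in the fixed order bnScale, bnBias,
runningMean, runningVariance:

    singa::LayerConf conf;
    singa::BatchNormConf *bn = conf.mutable_batchnorm_conf();
    bn->set_factor(0.9);  // moving-average factor
    bn->set_channels(2);  // input feature shape: 2 x 4 x 4
    bn->set_height(4);
    bn->set_width(4);
    singa::BatchNorm layer;
    layer.Setup(conf);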

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/model/layer/batchnorm.h
----------------------------------------------------------------------
diff --git a/src/model/layer/batchnorm.h b/src/model/layer/batchnorm.h
new file mode 100644
index 0000000..0255179
--- /dev/null
+++ b/src/model/layer/batchnorm.h
@@ -0,0 +1,84 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_MODEL_LAYER_BATCHNORM_H
+#define SINGA_MODEL_LAYER_BATCHNORM_H
+#include "singa/model/layer.h"
+#include "singa/core/common.h"
+#include "singa/proto/core.pb.h"
+#include <stack>
+
+namespace singa {
+class BatchNorm : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override {
+    return "Batch Normalization";
+  }
+
+  /// \copydoc Layer::Setup(const LayerConf&)
+  virtual void Setup(const LayerConf& conf) override;
+
+  const Tensor Forward(int flag, const Tensor& input)
+    override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(
+      int flag, const Tensor& grad) override;
+
+  const float factor() const { return factor_; }
+  const Tensor& bnScale() const { return bnScale_; }
+  const Tensor& bnBias() const { return bnBias_; }
+  const Tensor& runningMean() const { return runningMean_; }
+  const Tensor& runningVariance() const { return runningVariance_; }
+  const size_t channels() const { return channels_; }
+  const size_t height() const { return height_; }
+  const size_t width() const { return width_; }
+  void set_bnScale(Tensor x) {
+    bnScale_.ResetLike(x);
+    bnScale_.CopyData(x);
+  }
+  void set_bnBias(Tensor x) {
+    bnBias_.ResetLike(x);
+    bnBias_.CopyData(x);
+  }
+  void set_runningMean(Tensor x) {
+    runningMean_.ResetLike(x);
+    runningMean_.CopyData(x);
+  }
+  void set_runningVariance(Tensor x) {
+    runningVariance_.ResetLike(x);
+    runningVariance_.CopyData(x);
+  }
+  virtual void ToDevice(Device* device) override;
+
+ protected:
+  float factor_;
+  size_t channels_, height_, width_;
+  Tensor bnScale_, bnBias_;
+  Tensor dbnScale_, dbnBias_;
+  Tensor runningMean_, runningVariance_;
+  // Store intermediate data, i.e., input tensor
+  std::stack<Tensor> buf_;
+  
+};  // class BatchNorm
+}  // namespace
+
+#endif  // SINGA_MODEL_LAYER_BATCHNORM_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/model/layer/cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_batchnorm.cc b/src/model/layer/cudnn_batchnorm.cc
new file mode 100644
index 0000000..8288a41
--- /dev/null
+++ b/src/model/layer/cudnn_batchnorm.cc
@@ -0,0 +1,214 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#include "cudnn_batchnorm.h"
+#ifdef USE_CUDNN
+
+namespace singa {
+
+CudnnBatchNorm::~CudnnBatchNorm() {
+  if (has_init_cudnn_) {
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(shape_desc_));
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(param_desc_));
+  }
+}
+
+void CudnnBatchNorm::ToDevice(Device* device) {
+  BatchNorm::ToDevice(device);
+  resultSaveMean_.ToDevice(device);
+  resultSaveVariance_.ToDevice(device);
+}
+
+void CudnnBatchNorm::Setup(const LayerConf& conf) {
+  BatchNorm::Setup(conf);
+  bnScale_.Reshape(Shape{1,channels_,1,1});
+  bnBias_.ResetLike(bnScale_);
+  dbnScale_.ResetLike(bnScale_);
+  dbnBias_.ResetLike(bnScale_);
+  runningMean_.ResetLike(bnScale_);
+  runningVariance_.ResetLike(bnScale_);
+  resultSaveMean_.ResetLike(bnScale_);
+  resultSaveVariance_.ResetLike(bnScale_);
+}
+
+void CudnnBatchNorm::InitCudnn(const Shape& shape, DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  mode_ = CUDNN_BATCHNORM_SPATIAL;
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&shape_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&param_desc_));
+  CHECK_EQ(shape.size(), 4u);
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(shape_desc_,
+        CUDNN_TENSOR_NCHW,
+        GetCudnnDataType(dtype),
+        shape[0],
+        shape[1],
+        shape[2],
+        shape[3]));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(param_desc_,
+        CUDNN_TENSOR_NCHW,
+        GetCudnnDataType(dtype),
+        1,
+        shape[1],
+        1,
+        1));
+  has_init_cudnn_ = true;
+}
+const Tensor CudnnBatchNorm::Forward(int flag, const Tensor& input) {
+  auto shape = input.shape();
+  auto dtype = input.data_type();
+  Tensor output;
+  if (!has_init_cudnn_)
+    InitCudnn(shape, dtype);
+  // TODO(wangji): check device id of input and params
+  output.ResetLike(input);
+  if ((flag & kTrain) == kTrain) {
+    output.device()->Exec(
+        [=](Context* ctx) {
+          Blob *inBlob = input.blob(), *outBlob = output.blob(),
+            *saveMeanBlob = resultSaveMean_.blob(),
+            *saveVarBlob = resultSaveVariance_.blob(),
+            *runningMeanBlob = runningMean_.blob(),
+            *runningVarBlob = runningVariance_.blob(),
+            *bnScaleBlob = bnScale_.blob(),
+            *bnBiasBlob = bnBias_.blob();
+          const float alpha = 1.0f, beta = 0.0f;
+          double epsilon = CUDNN_BN_MIN_EPSILON;
+          CUDNN_CHECK(cudnnBatchNormalizationForwardTraining(
+              ctx->cudnn_handle,
+              this->mode_,
+              &alpha,
+              &beta,
+              shape_desc_,
+              inBlob->data(),
+              shape_desc_,
+              outBlob->mutable_data(),
+              param_desc_,
+              bnScaleBlob->data(),
+              bnBiasBlob->data(),
+              factor_,
+              runningMeanBlob->mutable_data(),
+              runningVarBlob->mutable_data(),
+              epsilon,
+              saveMeanBlob->mutable_data(),
+              saveVarBlob->mutable_data()));
+        },
+        {input.blob(),
+         bnScale_.blob(),
+         bnBias_.blob()},
+        {output.blob(),
+         runningMean_.blob(),
+         runningVariance_.blob(),
+         resultSaveMean_.blob(),
+         resultSaveVariance_.blob()});
+    buf_.push(input);
+  } else {
+    output.device()->Exec(
+        [=](Context* ctx) {
+          Blob *inBlob = input.blob(), *outBlob = output.blob(),
+            *runningMeanBlob = runningMean_.blob(),
+            *runningVarBlob = runningVariance_.blob(),
+            *bnScaleBlob = bnScale_.blob(),
+            *bnBiasBlob = bnBias_.blob();
+          const float alpha = 1.0f, beta = 0.0f;
+          double epsilon = CUDNN_BN_MIN_EPSILON;
+          CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
+              ctx->cudnn_handle,
+              this->mode_,
+              &alpha,
+              &beta,
+              shape_desc_,
+              inBlob->data(),
+              shape_desc_,
+              outBlob->mutable_data(),
+              param_desc_,
+              bnScaleBlob->data(),
+              bnBiasBlob->data(),
+              runningMeanBlob->data(),
+              runningVarBlob->data(),
+              epsilon));
+        },
+        {input.blob(),
+         bnScale_.blob(),
+         bnBias_.blob(),
+         runningMean_.blob(),
+         runningVariance_.blob()},
+        {output.blob()});
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnBatchNorm::Backward(
+    int flag, const Tensor& grad) {
+  vector <Tensor> param_grad;
+  Tensor dx;
+  if ((flag & kTrain) == kTrain) {
+    Tensor input = buf_.top();
+    buf_.pop();
+    dx.ResetLike(grad);
+    dx.device()->Exec(
+        [=](Context* ctx) {
+          Blob *dyblob = grad.blob(), *dxblob = dx.blob(),
+            *xblob = input.blob(),
+            *bnScaleBlob = bnScale_.blob(),
+            *dbnScaleBlob = dbnScale_.blob(),
+            *dbnBiasBlob = dbnBias_.blob(),
+            *saveMeanBlob = resultSaveMean_.blob(),
+            *saveVarBlob = resultSaveVariance_.blob();
+          const float alpha = 1.0f, beta = .0f;
+          double epsilon = CUDNN_BN_MIN_EPSILON;
+          CUDNN_CHECK(cudnnBatchNormalizationBackward(ctx->cudnn_handle,
+              this->mode_,
+              &alpha,
+              &beta,
+              &alpha,
+              &beta,
+              shape_desc_,
+              xblob->data(),
+              shape_desc_,
+              dyblob->data(),
+              shape_desc_,
+              dxblob->mutable_data(),
+              param_desc_,
+              bnScaleBlob->data(),
+              dbnScaleBlob->mutable_data(),
+              dbnBiasBlob->mutable_data(),
+              epsilon,
+              saveMeanBlob->data(),
+              saveVarBlob->data()));
+
+        },
+        {dx.blob(),
+         grad.blob(),
+         bnScale_.blob(),
+         resultSaveMean_.blob(),
+         resultSaveVariance_.blob()},
+        {dx.blob(),
+         dbnScale_.blob(),
+         dbnBias_.blob()});
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  param_grad.push_back(dbnScale_);
+  param_grad.push_back(dbnBias_);
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace
+
+#endif  // USE_CUDNN
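
The Forward/Backward implementations above follow the asynchronous execution
pattern used throughout this layer code: the lambda captures tensors by value
(cheap, since blobs are shared) and the two blob lists declare which buffers
the lambda reads and which it writes, so the device can order operations
safely. The shape of the pattern, with illustrative names:

    out.device()->Exec(
        [=](singa::Context *ctx) {
          // issue the cuDNN call on ctx->cudnn_handle, reading
          // in.blob()->data() and writing out.blob()->mutable_data()
        },
        {in.blob()},    // blobs read by the lambda
        {out.blob()});  // blobs written by the lambda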

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/model/layer/cudnn_batchnorm.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_batchnorm.h b/src/model/layer/cudnn_batchnorm.h
new file mode 100644
index 0000000..83258d2
--- /dev/null
+++ b/src/model/layer/cudnn_batchnorm.h
@@ -0,0 +1,60 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
+#define SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
+#include "singa_config.h"
+#ifdef USE_CUDNN
+
+#include "batchnorm.h"
+#include "cudnn_utils.h"
+
+namespace singa {
+class CudnnBatchNorm : public BatchNorm {
+ public:
+   ~CudnnBatchNorm();
+   /// \copydoc Layer::layer_type()
+   const std::string layer_type() const override {
+     return "CudnnBatchNorm";
+   }
+
+   void Setup(const LayerConf& conf) override;
+
+   const Tensor Forward(int flag, const Tensor& input)
+     override;
+   const std::pair<Tensor, vector<Tensor>> Backward(
+       int flag, const Tensor& grad) override;
+
+   /// Init cudnn related data structures.
+   void InitCudnn(const Shape& shape, DataType dtype);
+   void ToDevice(Device* device) override;
+
+ private:
+   bool has_init_cudnn_ = false;
+   cudnnBatchNormMode_t mode_;
+   cudnnLRNDescriptor_t lrn_desc_;  // note: unused in this class
+   cudnnTensorDescriptor_t shape_desc_, param_desc_;
+   Tensor resultSaveMean_, resultSaveVariance_;
+   
+}; // class CudnnBatchNorm
+}  // namespace
+
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/model/layer/cudnn_lrn.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_lrn.cc b/src/model/layer/cudnn_lrn.cc
new file mode 100644
index 0000000..ee661b6
--- /dev/null
+++ b/src/model/layer/cudnn_lrn.cc
@@ -0,0 +1,114 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#include "cudnn_lrn.h"
+#ifdef USE_CUDNN
+#include "cudnn_utils.h"
+
+namespace singa {
+CudnnLRN::~CudnnLRN() {
+  if (has_init_cudnn_) {
+    CUDNN_CHECK(cudnnDestroyLRNDescriptor(lrn_desc_));
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(shape_desc_));
+  }
+}
+void CudnnLRN::InitCudnn(const Shape& shape , DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&shape_desc_));
+  CHECK_EQ(shape.size(), 4u);
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(shape_desc_,
+      CUDNN_TENSOR_NCHW,
+      GetCudnnDataType(dtype),
+      shape[0],
+      shape[1],
+      shape[2],
+      shape[3]));
+  CUDNN_CHECK(cudnnCreateLRNDescriptor(&lrn_desc_));
+  CUDNN_CHECK(cudnnSetLRNDescriptor(lrn_desc_,
+        local_size_,
+        alpha_,
+        beta_,
+        k_));
+  has_init_cudnn_ = true;
+}
+const Tensor CudnnLRN::Forward(int flag, const Tensor& input) {
+  auto shape = input.shape();
+  auto dtype = input.data_type();
+  if (!has_init_cudnn_)
+    InitCudnn(shape, dtype);
+  Tensor output;
+  output.ResetLike(input);
+  output.device()->Exec(
+      [=](Context* ctx) {
+        Blob *inblob = input.blob(), *outblob = output.blob();
+        const float alpha = 1.0f, beta = 0.0f;
+        CUDNN_CHECK(cudnnLRNCrossChannelForward(ctx->cudnn_handle,
+            this->lrn_desc_,
+            this->mode_,
+            &alpha,
+            this->shape_desc_,
+            inblob->data(),
+            &beta,
+            this->shape_desc_,
+            outblob->mutable_data()));
+      }, {input.blob()}, {output.blob()});
+  buf_.push(input);
+  buf_.push(output);
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnLRN::Backward(
+    int flag, const Tensor& grad) {
+  vector <Tensor> param_grad;
+  Tensor dx;
+  Tensor output = buf_.top();
+  buf_.pop();
+  Tensor input = buf_.top();
+  buf_.pop();
+  if ((flag & kTrain) == kTrain) {
+    dx.ResetLike(grad);
+    dx.device()->Exec(
+        [=](Context *ctx) {
+          Blob *dyblob = grad.blob(), *dxblob = dx.blob();
+          Blob *yblob = output.blob(), *xblob = input.blob();
+          float alpha = 1.0f, beta = 0.0f;
+          CUDNN_CHECK(cudnnLRNCrossChannelBackward(ctx->cudnn_handle,
+              this->lrn_desc_,
+              this->mode_,
+              &alpha,
+              this->shape_desc_,
+              yblob->data(),
+              this->shape_desc_,
+              dyblob->data(),
+              this->shape_desc_,
+              xblob->data(),
+              &beta,
+              this->shape_desc_,
+              dxblob->mutable_data()));
+        }, {output.blob(), grad.blob(), input.blob()}, {dx.blob()});
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace
+
+#endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/model/layer/cudnn_lrn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_lrn.h b/src/model/layer/cudnn_lrn.h
new file mode 100644
index 0000000..0f650fe
--- /dev/null
+++ b/src/model/layer/cudnn_lrn.h
@@ -0,0 +1,56 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_MODEL_LAYER_CUDNN_LRN_H_
+#define SINGA_MODEL_LAYER_CUDNN_LRN_H_
+#include "singa_config.h"
+#ifdef USE_CUDNN
+
+#include "lrn.h"
+#include "cudnn_utils.h"
+
+namespace singa {
+class CudnnLRN : public LRN {
+ public:
+   ~CudnnLRN();
+   /// \copydoc Layer::layer_type()
+   const std::string layer_type() const override {
+     return "CudnnLRN";
+   }
+
+   const Tensor Forward(int flag, const Tensor& input)
+     override;
+   const std::pair<Tensor, vector<Tensor>> Backward(
+       int flag, const Tensor& grad) override;
+
+   /// Init cudnn related data structures.
+   void InitCudnn(const Shape& shape, DataType dtype);
+
+ private:
+   bool has_init_cudnn_ = false;
+   cudnnLRNMode_t mode_;
+   cudnnLRNDescriptor_t lrn_desc_;
+   cudnnTensorDescriptor_t shape_desc_;
+   
+}; // class CudnnLRN
+}  // namespace
+
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_LRN_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/model/layer/lrn.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/lrn.cc b/src/model/layer/lrn.cc
new file mode 100644
index 0000000..55135f1
--- /dev/null
+++ b/src/model/layer/lrn.cc
@@ -0,0 +1,59 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#include "lrn.h"
+
+namespace singa{
+void LRN::Setup(const LayerConf& conf) {
+  Layer::Setup(conf);
+  local_size_ = conf.lrn_conf().local_size();
+  CHECK_EQ(local_size_ % 2, 1) << "LRN only supports odd values for local_size";
+  k_ = conf.lrn_conf().k();
+  alpha_ = conf.lrn_conf().alpha();
+  beta_ = conf.lrn_conf().beta();
+}
+
+const Tensor LRN::Forward(int flag, const Tensor& input) {
+  //Tensor output;
+  //const float salpha = alpha_ / local_size_;
+  LOG(FATAL) << "Not implemented";
+  /* Tensor APIs that may be needed:
+   * 1. set:
+   *    template <typename Dtype>
+   *    void Set(Dtype val);
+   * 2. axpy
+   * 3. padding
+   */
+  Tensor output;
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> LRN::Backward(
+    int flag, const Tensor& grad) {
+  LOG(FATAL) << "Not implemented";
+  Tensor dx;
+  vector<Tensor> param_grad;
+  return std::make_pair(dx, param_grad);
+}
+
+}  // namespace

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/model/layer/lrn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/lrn.h b/src/model/layer/lrn.h
new file mode 100644
index 0000000..118d062
--- /dev/null
+++ b/src/model/layer/lrn.h
@@ -0,0 +1,70 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_MODEL_LAYER_LRN_H_
+#define SINGA_MODEL_LAYER_LRN_H_
+#include "singa/model/layer.h"
+#include <stack>
+
+namespace singa {
+class LRN : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override {
+    return "LRN";
+  }
+
+  /// \copydoc Layer::Setup(const LayerConf&)
+  void Setup(const LayerConf& conf) override;
+
+  /**
+   * Local Response Normalization (LRN), normalizing across channels:
+   *
+   * @f$ b_i = a_i / x_i^\beta @f$
+   * @f$ x_i = k + \alpha \sum_{j=\max(0,i-n/2)}^{\min(N,i+n/2)} a_j^2 @f$
+   * where n is the size of the local response area, N is the total number
+   * of kernels, @f$a_i@f$ is the activation (after ReLU) of a neuron
+   * convolved with the i-th kernel, and @f$b_i@f$ is the neuron after
+   * normalization.
+   */
+  const Tensor Forward(int flag, const Tensor& input)
+    override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(
+      int flag, const Tensor& grad) override;
+
+  int local_size() const { return local_size_; }
+  float alpha() const { return alpha_; }
+  float beta() const { return beta_; }
+  float k() const { return k_; }
+
+ protected:
+  //!< hyper-parameter: size local response (neighbor) area
+  int local_size_;
+  //!< other hyper-parameters
+  float alpha_, beta_, k_;
+  // store intermediate data, i.e., input tensor
+  std::stack<Tensor> buf_;
+  
+};  // class LRN
+}  // namespace
+
+#endif  // SINGA_MODEL_LAYER_LRN_H_
+
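
As a reading aid for the formula documented above, a minimal single-location
reference of cross-channel LRN (this sketch follows the doc comment as
written; the commented-out salpha = alpha_ / local_size_ in lrn.cc hints that
the implementation may instead scale alpha by the window size, as cuDNN and
Caffe do):

    #include <algorithm>
    #include <cmath>
    // a[] holds the activations of all N channels at one spatial location;
    // n is the (odd) local window size; b[] receives the normalized output.
    void LRNAcrossChannels(const float *a, float *b, int N, int n, float k,
                           float alpha, float beta) {
      for (int i = 0; i < N; i++) {
        float sum = 0.0f;
        for (int j = std::max(0, i - n / 2);
             j <= std::min(N - 1, i + n / 2); j++)
          sum += a[j] * a[j];  // x_i = k + alpha * sum of squared neighbors
        b[i] = a[i] / std::pow(k + alpha * sum, beta);  // b_i = a_i / x_i^beta
      }
    }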

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 16ba62f..d368296 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -231,6 +231,7 @@ message LayerConf {
   // Used in SINGA
   optional DenseConf dense_conf = 201;
   optional MetricConf metric_conf = 200;
+  optional BatchNormConf batchnorm_conf = 202;
 }
 
 // Message that stores hyper-parameters used to apply transformation
@@ -902,14 +903,12 @@ message SPPConf {
   }
   optional uint32 pyramid_height = 1;
   optional PoolMethod pool = 2 [default = MAX]; // The pooling method
-  /*
   enum Engine {
     DEFAULT = 0;
     CAFFE = 1;
     CUDNN = 2;
   }
   optional Engine engine = 6 [default = DEFAULT];
-  */
 }
 
 message PReLUConf {
@@ -921,3 +920,13 @@ message PReLUConf {
  // Whether or not slope parameters are shared across channels.
   optional bool channel_shared = 2 [default = false];
 }
+
+message BatchNormConf {
+  // Used in the moving average computation runningMean =
+  // newMean*factor + runningMean*(1-factor).
+  optional double factor = 1 [default = 0.9];
+  // input shape
+  optional int32 channels = 2;
+  optional int32 height = 3;
+  optional int32 width = 4;
+}
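
The factor field drives the moving-average update quoted in the comment
above. Spelled out as a minimal sketch on plain float arrays (the layer keeps
runningMean/runningVariance as device Tensors, and cuDNN applies this update
internally during forward training via the exponential-average factor):

    // runningMean = newMean * factor + runningMean * (1 - factor)
    void UpdateRunningMean(float *runningMean, const float *newMean,
                           size_t n, double factor) {
      for (size_t i = 0; i < n; i++)
        runningMean[i] = static_cast<float>(
            factor * newMean[i] + (1.0 - factor) * runningMean[i]);
    }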

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/test/singa/test_cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_batchnorm.cc b/test/singa/test_cudnn_batchnorm.cc
new file mode 100644
index 0000000..d38fdaa
--- /dev/null
+++ b/test/singa/test_cudnn_batchnorm.cc
@@ -0,0 +1,257 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+
+#include "../src/model/layer/cudnn_batchnorm.h"
+
+#ifdef USE_CUDNN
+#include "gtest/gtest.h"
+
+using singa::CudnnBatchNorm;
+
+TEST(CudnnBatchNorm, Setup) {
+  CudnnBatchNorm batchnorm;
+  EXPECT_EQ("CudnnBatchNorm", batchnorm.layer_type());
+
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(0.01);
+  batchnorm_conf->set_channels(2);
+  batchnorm_conf->set_height(4);
+  batchnorm_conf->set_width(4);
+  batchnorm.Setup(conf);
+
+  EXPECT_FLOAT_EQ(0.01, batchnorm.factor());
+  EXPECT_EQ(2u, batchnorm.channels());
+  EXPECT_EQ(4u, batchnorm.height());
+  EXPECT_EQ(4u, batchnorm.width());
+}
+
+TEST(CudnnBatchNorm, Forward) {
+  CudnnBatchNorm batchnorm;
+  const float x[] = {
+    0.0736655, 0.0459045, 0.0779517, 0.0771059,
+    0.0586862, 0.0561263, 0.0708457, 0.0977273,
+    0.0405025, -0.170897, 0.0208982, 0.136865,
+    -0.0367905, -0.0618205, -0.0103908, -0.0522777,
+    -0.122161, -0.025427, -0.0718576, -0.185941,
+    0.0166533, 0.178679, -0.0576606, -0.137817,
+    0.150676, 0.153442, -0.0929899, -0.148675,
+    -0.112459, -0.106284, -0.103074, -0.0668811
+  };
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{1,2,4,4}, &cuda);
+  in.CopyDataFromHostPtr(x, 1*2*4*4);
+  const float alpha_[] = {1, 1};
+  singa::Tensor alpha(singa::Shape{1,2,1,1}, &cuda);
+  alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
+
+  const float beta_[] = {0, 0};
+  singa::Tensor beta(singa::Shape{1,2,1,1}, &cuda);
+  beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
+
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(0.9);
+  batchnorm_conf->set_channels(2);
+  batchnorm_conf->set_height(4);
+  batchnorm_conf->set_width(4);
+  batchnorm.Setup(conf);
+
+  batchnorm.ToDevice(&cuda);
+  batchnorm.set_bnScale(alpha);
+  batchnorm.set_bnBias(beta);
+  batchnorm.set_runningMean(beta);
+  batchnorm.set_runningVariance(beta);
+  singa::Tensor out = batchnorm.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out.ToHost();
+  const float *outptr = out.data<const float *>();
+  const auto & shape = out.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+  EXPECT_NEAR(0.637092, outptr[0], 1e-4f);
+  EXPECT_NEAR(0.262057, outptr[1], 1e-4f);
+  EXPECT_NEAR(0.694995, outptr[2], 1e-4f);
+  EXPECT_NEAR(0.683569, outptr[3], 1e-4f);
+  EXPECT_NEAR(0.43473, outptr[4], 1e-4f);
+  EXPECT_NEAR(0.400147, outptr[5], 1e-4f);
+  EXPECT_NEAR(0.598998, outptr[6], 1e-4f);
+  EXPECT_NEAR(0.962152, outptr[7], 1e-4f);
+  EXPECT_NEAR(0.189079, outptr[8], 1e-4f);
+  EXPECT_NEAR(-2.6668, outptr[9], 1e-4f);
+  EXPECT_NEAR(-0.0757632, outptr[10], 1e-4f);
+  EXPECT_NEAR(1.49088, outptr[11], 1e-4f);
+  EXPECT_NEAR(-0.855104, outptr[12], 1e-4f);
+  EXPECT_NEAR(-1.19324, outptr[13], 1e-4f);
+  EXPECT_NEAR(-0.498459, outptr[14], 1e-4f);
+  EXPECT_NEAR(-1.06433, outptr[15], 1e-4f);
+  EXPECT_NEAR(-0.696646, outptr[16], 1e-4f);
+  EXPECT_NEAR(0.185125, outptr[17], 1e-4f);
+  EXPECT_NEAR(-0.238109, outptr[18], 1e-4f);
+  EXPECT_NEAR(-1.27803, outptr[19], 1e-4f);
+  EXPECT_NEAR(0.568704, outptr[20], 1e-4f);
+  EXPECT_NEAR(2.04564, outptr[21], 1e-4f);
+  EXPECT_NEAR(-0.108697, outptr[22], 1e-4f);
+  EXPECT_NEAR(-0.839356, outptr[23], 1e-4f);
+  EXPECT_NEAR(1.79038, outptr[24], 1e-4f);
+  EXPECT_NEAR(1.81559, outptr[25], 1e-4f);
+  EXPECT_NEAR(-0.430738, outptr[26], 1e-4f);
+  EXPECT_NEAR(-0.938335, outptr[27], 1e-4f);
+  EXPECT_NEAR(-0.608203, outptr[28], 1e-4f);
+  EXPECT_NEAR(-0.551921, outptr[29], 1e-4f);
+  EXPECT_NEAR(-0.522658, outptr[30], 1e-4f);
+  EXPECT_NEAR(-0.192746, outptr[31], 1e-4f);
+}
+
+TEST(CudnnBatchNorm, Backward) {
+  CudnnBatchNorm batchnorm;
+  const float x[] = {
+    0.0736655, 0.0459045, 0.0779517, 0.0771059,
+    0.0586862, 0.0561263, 0.0708457, 0.0977273,
+    0.0405025, -0.170897, 0.0208982, 0.136865,
+    -0.0367905, -0.0618205, -0.0103908, -0.0522777,
+    -0.122161, -0.025427, -0.0718576, -0.185941,
+    0.0166533, 0.178679, -0.0576606, -0.137817,
+    0.150676, 0.153442, -0.0929899, -0.148675,
+    -0.112459, -0.106284, -0.103074, -0.0668811
+  };
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor x_tensor(singa::Shape{1,2,4,4}, &cuda);
+  x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
+
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(1);
+  batchnorm_conf->set_channels(2);
+  batchnorm_conf->set_height(4);
+  batchnorm_conf->set_width(4);
+  batchnorm.Setup(conf);
+
+  const float dy[] = {
+    -0.0064714, 0, 0, 0,
+    0, -0.00297655, -0.0195729, 0,
+    0, 0, 0, 0,
+    0, 0, 0, -0.0032594,
+    0, 0, 0, 0,
+    0, 0, 0.0125562, 0,
+    0.00041933, 0.000386108, -0.0074611, 0.0015929,
+    0.00468428, 0.00735506, -0.00682525, 0.00342023
+  };
+
+  singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, &cuda);
+  dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
+  const float alpha_[] = {1, 1};
+  singa::Tensor alpha(singa::Shape{1,2,1,1}, &cuda);
+  alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
+
+  const float beta_[] = {0, 0};
+  singa::Tensor beta(singa::Shape{1,2,1,1}, &cuda);
+  beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
+
+  const float mean_[] = {0.0123405, -0.0622333};
+  singa::Tensor mean(singa::Shape{1,2,1,1}, &cuda);
+  mean.CopyDataFromHostPtr(mean_, 1*2*1*1);
+
+  const float var_[] = {15.9948, 8.68198};
+  singa::Tensor var(singa::Shape{1,2,1,1}, &cuda);
+  var.CopyDataFromHostPtr(var_, 1*2*1*1);
+
+  batchnorm.ToDevice(&cuda);
+  batchnorm.set_bnScale(alpha);
+  batchnorm.set_bnBias(beta);
+  batchnorm.set_runningMean(beta);
+  batchnorm.set_runningVariance(beta);
+  batchnorm.Forward(singa::kTrain, x_tensor);
+  const auto ret = batchnorm.Backward(singa::kTrain, dy_tensor);
+  singa::CppCPU host(0, 1);
+  singa::Tensor dx = ret.first;
+  dx.ToDevice(&host);
+  const float *dxptr = dx.data<const float *>();
+  const auto & shape = dx.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+  EXPECT_NEAR(-0.0528703, dxptr[0], 1e-4f);
+  EXPECT_NEAR(0.0302578, dxptr[1], 1e-4f);
+  EXPECT_NEAR(0.0352178, dxptr[2], 1e-4f);
+  EXPECT_NEAR(0.0350869, dxptr[3], 1e-4f);
+  EXPECT_NEAR(0.032236, dxptr[4], 1e-4f);
+  EXPECT_NEAR(-0.00837157, dxptr[5], 1e-4f);
+  EXPECT_NEAR(-0.2303, dxptr[6], 1e-4f);
+  EXPECT_NEAR(0.0382786, dxptr[7], 1e-4f);
+  EXPECT_NEAR(0.0294217, dxptr[8], 1e-4f);
+  EXPECT_NEAR(-0.00329757, dxptr[9], 1e-4f);
+  EXPECT_NEAR(0.0263874, dxptr[10], 1e-4f);
+  EXPECT_NEAR(0.0443361, dxptr[11], 1e-4f);
+  EXPECT_NEAR(0.0174587, dxptr[12], 1e-4f);
+  EXPECT_NEAR(0.0135847, dxptr[13], 1e-4f);
+  EXPECT_NEAR(0.0215447, dxptr[14], 1e-4f);
+  EXPECT_NEAR(-0.0289709, dxptr[15], 1e-4f);
+  EXPECT_NEAR(-0.0100591, dxptr[16], 1e-4f);
+  EXPECT_NEAR(-0.00895677, dxptr[17], 1e-4f);
+  EXPECT_NEAR(-0.00948587, dxptr[18], 1e-4f);
+  EXPECT_NEAR(-0.0107859, dxptr[19], 1e-4f);
+  EXPECT_NEAR(-0.00847725, dxptr[20], 1e-4f);
+  EXPECT_NEAR(-0.0066309, dxptr[21], 1e-4f);
+  EXPECT_NEAR(0.105131, dxptr[22], 1e-4f);
+  EXPECT_NEAR(-0.0102375, dxptr[23], 1e-4f);
+  EXPECT_NEAR(-0.00312763, dxptr[24], 1e-4f);
+  EXPECT_NEAR(-0.00339895, dxptr[25], 1e-4f);
+  EXPECT_NEAR(-0.0777377, dxptr[26], 1e-4f);
+  EXPECT_NEAR(0.00415871, dxptr[27], 1e-4f);
+  EXPECT_NEAR(0.0327506, dxptr[28], 1e-4f);
+  EXPECT_NEAR(0.0571663, dxptr[29], 1e-4f);
+  EXPECT_NEAR(-0.0720566, dxptr[30], 1e-4f);
+  EXPECT_NEAR(0.0217477, dxptr[31], 1e-4f);
+
+  singa::Tensor dbnScale = ret.second.at(0);
+  dbnScale.ToDevice(&host);
+  const float *dbnScaleptr = dbnScale.data<const float *>();
+  const auto & dbnScaleShape = dbnScale.shape();
+  EXPECT_EQ(4u, dbnScaleShape.size());
+  EXPECT_EQ(1u, dbnScaleShape[0]);
+  EXPECT_EQ(2u, dbnScaleShape[1]);
+  EXPECT_EQ(1u, dbnScaleShape[2]);
+  EXPECT_EQ(1u, dbnScaleShape[3]);
+
+  EXPECT_NEAR(-0.013569f, dbnScaleptr[0], 1e-4f);
+  EXPECT_NEAR(-0.00219431f, dbnScaleptr[1], 1e-4f);
+
+  singa::Tensor dbnBias = ret.second.at(1);
+  dbnBias.ToDevice(&host);
+  const float *dbnBiasptr = dbnBias.data<const float *>();
+  const auto & dbnBiasShape = dbnBias.shape();
+  EXPECT_EQ(4u, dbnBiasShape.size());
+  EXPECT_EQ(1u, dbnBiasShape[0]);
+  EXPECT_EQ(2u, dbnBiasShape[1]);
+  EXPECT_EQ(1u, dbnBiasShape[2]);
+  EXPECT_EQ(1u, dbnBiasShape[3]);
+
+  EXPECT_NEAR(-0.0322803f, dbnBiasptr[0], 1e-4f);
+  EXPECT_NEAR(0.0161278f, dbnBiasptr[1], 1e-4f);
+}
+
+#endif  //  USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eadd3f96/test/singa/test_cudnn_lrn.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_lrn.cc b/test/singa/test_cudnn_lrn.cc
new file mode 100644
index 0000000..390c588
--- /dev/null
+++ b/test/singa/test_cudnn_lrn.cc
@@ -0,0 +1,205 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+
+#include "../src/model/layer/cudnn_lrn.h"
+
+#ifdef USE_CUDNN
+// cuDNN LRN support was added in cuDNN 4
+#if CUDNN_VERSION_MAJOR >=4
+#include "gtest/gtest.h"
+
+using singa::CudnnLRN;
+
+TEST(CudnnLRN, Setup) {
+  CudnnLRN lrn;
+  EXPECT_EQ("CudnnLRN", lrn.layer_type());
+
+  singa::LayerConf conf;
+  singa::LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(conf);
+
+  EXPECT_FLOAT_EQ(1.0, lrn.k());
+  EXPECT_EQ(3, lrn.local_size());
+  EXPECT_FLOAT_EQ(0.1, lrn.alpha());
+  EXPECT_FLOAT_EQ(0.75, lrn.beta());
+}
+
+TEST(CudnnLRN, Forward) {
+  CudnnLRN lrn;
+  const float x[] = {
+    0.00658502, -0.0496967, -0.0333733, -0.0263094,
+    -0.044298, 0.0211638, 0.0829358, -0.0172312,
+    -0.0665471, -0.10017, -0.0750333, -0.104551,
+    -0.00981208, -0.0583349, -0.0751652, 0.011747,
+    0.0151165, 0.0304321, 0.0736639, -0.00652653,
+    0.00962833, 0.169646, -0.044588, -0.00244141,
+    0.0597329, -0.0530868, 0.0124246, 0.108429,
+    0.0451175, 0.0247055, 0.0304345, 0.0179575
+  };
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{1,2,4,4}, &cuda);
+  in.CopyDataFromHostPtr(x, 1*2*4*4);
+
+  singa::LayerConf conf;
+  singa::LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(conf);
+
+  singa::Tensor out = lrn.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out.ToDevice(&host);
+  const float *outptr = out.data<const float *>();
+  const auto & shape = out.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+
+  EXPECT_NEAR(0.00658498f, outptr[0], 1e-6f);
+  EXPECT_NEAR(-0.0496925f, outptr[1], 1e-6f);
+  EXPECT_NEAR(-0.0333678f, outptr[2], 1e-6f);
+  EXPECT_NEAR(-0.0263089f, outptr[3], 1e-6f);
+  EXPECT_NEAR(-0.0442958f, outptr[4], 1e-6f);
+  EXPECT_NEAR(0.0211483f, outptr[5], 1e-6f);
+  EXPECT_NEAR(0.0829174f, outptr[6], 1e-6f);
+  EXPECT_NEAR(-0.0172311f, outptr[7], 1e-6f);
+  EXPECT_NEAR(-0.0665338f, outptr[8], 1e-6f);
+  EXPECT_NEAR(-0.100138f, outptr[9], 1e-6f);
+  EXPECT_NEAR(-0.0750224f, outptr[10], 1e-6f);
+  EXPECT_NEAR(-0.104492f, outptr[11], 1e-6f);
+  EXPECT_NEAR(-0.00981155f, outptr[12], 1e-6f);
+  EXPECT_NEAR(-0.058329f, outptr[13], 1e-6f);
+  EXPECT_NEAR(-0.0751528f, outptr[14], 1e-6f);
+  EXPECT_NEAR(0.0117468f, outptr[15], 1e-6f);
+  EXPECT_NEAR(0.0151164f, outptr[16], 1e-6f);
+  EXPECT_NEAR(0.0304296f, outptr[17], 1e-6f);
+  EXPECT_NEAR(0.0736518f, outptr[18], 1e-6f);
+  EXPECT_NEAR(-0.00652641f, outptr[19], 1e-6f);
+  EXPECT_NEAR(0.00962783f, outptr[20], 1e-6f);
+  EXPECT_NEAR(0.169522f, outptr[21], 1e-6f);
+  EXPECT_NEAR(-0.0445781f, outptr[22], 1e-6f);
+  EXPECT_NEAR(-0.00244139f, outptr[23], 1e-6f);
+  EXPECT_NEAR(0.0597209f, outptr[24], 1e-6f);
+  EXPECT_NEAR(-0.0530697f, outptr[25], 1e-6f);
+  EXPECT_NEAR(0.0124228f, outptr[26], 1e-6f);
+  EXPECT_NEAR(0.108367f, outptr[27], 1e-6f);
+  EXPECT_NEAR(0.045115f, outptr[28], 1e-6f);
+  EXPECT_NEAR(0.024703f, outptr[29], 1e-6f);
+  EXPECT_NEAR(0.0304295f, outptr[30], 1e-6f);
+  EXPECT_NEAR(0.0179573f, outptr[31], 1e-6f);
+}
+
+TEST(CudnnLRN, Backward) {
+  CudnnLRN lrn;
+
+  const float x[] = {
+    0.00658502, -0.0496967, -0.0333733, -0.0263094,
+    -0.044298, 0.0211638, 0.0829358, -0.0172312,
+    -0.0665471, -0.10017, -0.0750333, -0.104551,
+    -0.00981208, -0.0583349, -0.0751652, 0.011747,
+    0.0151165, 0.0304321, 0.0736639, -0.00652653,
+    0.00962833, 0.169646, -0.044588, -0.00244141,
+    0.0597329, -0.0530868, 0.0124246, 0.108429,
+    0.0451175, 0.0247055, 0.0304345, 0.0179575
+  };
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor x_tensor(singa::Shape{1,2,4,4}, &cuda);
+  x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
+
+  const float dy[] = {
+    -0.103178, -0.0326904, 0.293932, 0.355288,
+    -0.0288079, -0.0543308, -0.0668226, 0.0462216,
+    -0.0448064, -0.068982, -0.0509133, -0.0721143,
+    0.0959078, -0.0389037, -0.0510071, -0.178793,
+    0.00428248, -0.001132, -0.19928, 0.011935,
+    0.00622313, 0.143793, 0.0253894, 0.0104906,
+    -0.170673, 0.0283919, 0.00523488, -0.0455003,
+    0.177807, 0.000892812, -0.00113197, 0.00327798
+  };
+
+  singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, &cuda);
+  dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
+
+  singa::LayerConf conf;
+  singa::LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(conf);
+
+  lrn.Forward(singa::kTrain, x_tensor);
+  const auto ret = lrn.Backward(singa::kTrain, dy_tensor);
+  singa::CppCPU host(0, 1);
+  singa::Tensor dx = ret.first;
+  dx.ToDevice(&host);
+  const float *dxptr = dx.data<const float *>();
+  const auto & shape = dx.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+
+  EXPECT_NEAR(-0.103177, dxptr[0], 1e-6f);
+  EXPECT_NEAR(-0.0326837, dxptr[1], 1e-6f);
+  EXPECT_NEAR(0.293844, dxptr[2], 1e-6f);
+  EXPECT_NEAR(0.355269, dxptr[3], 1e-6f);
+  EXPECT_NEAR(-0.0288034, dxptr[4], 1e-6f);
+  EXPECT_NEAR(-0.0543157, dxptr[5], 1e-6f);
+  EXPECT_NEAR(-0.0667802, dxptr[6], 1e-6f);
+  EXPECT_NEAR(0.0462206, dxptr[7], 1e-6f);
+  EXPECT_NEAR(-0.0448215, dxptr[8], 1e-6f);
+  EXPECT_NEAR(-0.0689328, dxptr[9], 1e-6f);
+  EXPECT_NEAR(-0.0508914, dxptr[10], 1e-6f);
+  EXPECT_NEAR(-0.0720598, dxptr[11], 1e-6f);
+  EXPECT_NEAR(0.0959062, dxptr[12], 1e-6f);
+  EXPECT_NEAR(-0.0388931, dxptr[13], 1e-6f);
+  EXPECT_NEAR(-0.0509844, dxptr[14], 1e-6f);
+  EXPECT_NEAR(-0.17879, dxptr[15], 1e-6f);
+  EXPECT_NEAR(0.00428292, dxptr[16], 1e-6f);
+  EXPECT_NEAR(-0.00113432, dxptr[17], 1e-6f);
+  EXPECT_NEAR(-0.199158, dxptr[18], 1e-6f);
+  EXPECT_NEAR(0.0119317, dxptr[19], 1e-6f);
+  EXPECT_NEAR(0.00622216, dxptr[20], 1e-6f);
+  EXPECT_NEAR(0.143491, dxptr[21], 1e-6f);
+  EXPECT_NEAR(0.0253689, dxptr[22], 1e-6f);
+  EXPECT_NEAR(0.0104904, dxptr[23], 1e-6f);
+  EXPECT_NEAR(-0.170617, dxptr[24], 1e-6f);
+  EXPECT_NEAR(0.0283971, dxptr[25], 1e-6f);
+  EXPECT_NEAR(0.00523171, dxptr[26], 1e-6f);
+  EXPECT_NEAR(-0.0454887, dxptr[27], 1e-6f);
+  EXPECT_NEAR(0.177781, dxptr[28], 1e-6f);
+  EXPECT_NEAR(0.000889893, dxptr[29], 1e-6f);
+  EXPECT_NEAR(-0.00113756, dxptr[30], 1e-6f);
+  EXPECT_NEAR(0.00327978, dxptr[31], 1e-6f);
+}
+
+#endif  //  CUDNN_VERSION_MAJOR >= 4
+#endif  //  USE_CUDNN



[36/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
SINGA-182 Clean math function APIs and implementations

Clean tensor.h/.cc, tensor_math.h, and tensor_math_cpp.h:
re-order the functions by (type, name), where type is one of a) element-wise
functions, b) matrix functions, c) random functions, d) BLAS functions.

Implement GEMV using cblas and cublas.
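
For reference, the CPU side of the new GEMV reduces to a single CBLAS call of
this shape (y = alpha * A * x + beta * y for a row-major m x n matrix A; the
SINGA wrapper presumably maps the tensor's transpose flag onto the CblasTrans
argument):

    #include <cblas.h>
    void gemv(const float *A, const float *x, float *y, int m, int n,
              float alpha, float beta) {
      cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, A, /*lda=*/n,
                  x, /*incX=*/1, beta, y, /*incY=*/1);
    }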


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/564c88ad
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/564c88ad
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/564c88ad

Branch: refs/heads/master
Commit: 564c88ad95e976e6067198c832f4fcd9a8878cd7
Parents: 07c49da
Author: wangwei <wa...@gmail.com>
Authored: Fri Jun 10 23:12:09 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        | 396 +++++++++---------
 src/core/tensor/tensor.cc          | 688 ++++++++++++++++----------------
 src/core/tensor/tensor_math.h      | 336 ++++++++--------
 src/core/tensor/tensor_math_cpp.h  | 640 +++++++++++++++--------------
 src/core/tensor/tensor_math_cuda.h | 158 ++++----
 test/singa/test_tensor_math.cc     |  15 +-
 6 files changed, 1131 insertions(+), 1102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index bb8d7f8..82bbe81 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -32,17 +32,6 @@ using std::tuple;
 namespace singa {
 
 typedef vector<size_t> Shape;
-typedef Shape::iterator ShapeIter;
-inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
-  if (len == 0)
-    len = shape.size();
-  CHECK_LE(len, shape.size());
-  size_t v = 1;
-  for (unsigned int i = start; i < len; i++)
-    v *= shape[i];
-  return v;
-}
-
 /// hardcode the width of types defined in DataType
 const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2, sizeof(int),
                              sizeof(char), sizeof(double)};
@@ -65,10 +54,10 @@ class Tensor {
  public:
   ~Tensor();
   Tensor();
-  explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
-  explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
-  Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32);
-  Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32);
+  explicit Tensor(Shape &&shape, const DataType dtype = kFloat32);
+  explicit Tensor(const Shape &shape, const DataType dtype = kFloat32);
+  Tensor(Shape &&shape, Device *dev, const DataType dtype = kFloat32);
+  Tensor(const Shape &shape, Device *dev, const DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(const Tensor &from);
@@ -82,10 +71,10 @@ class Tensor {
 
   Device *device() const { return device_; }
 
-  /// Return immutable Tensor values with given type.
-  template <typename DType>
-  DType data() const {
-    return static_cast<DType>(blob()->data());
+  /// return immutable Tensor values with given type.
+  template <typename SType>
+  SType data() const {
+    return static_cast<SType>(blob()->data());
   }
 
   /// data type, including kFloat16, kFloat32, kInt
@@ -93,7 +82,7 @@ class Tensor {
 
   const Shape &shape() const { return shape_; }
 
-  const size_t shape(size_t idx) const {
+  const size_t shape(const size_t idx) const {
     CHECK_LT(idx, shape_.size());
     return shape_.at(idx);
   }
@@ -102,13 +91,13 @@ class Tensor {
 
   bool transpose() const { return transpose_; }
 
-  /// Return number of total elements
+  /// return number of total elements
   size_t Size() const {
     CHECK_EQ(blob_->size() % SizeOf(data_type_), 0u);
     return blob_->size() / SizeOf(data_type_);
   }
 
-  /// Return memory size (i.e., Bytes)
+  /// return memory size (i.e., Bytes)
   size_t MemSize() const { return blob_->size(); }
 
   /// Reset the tensor shape, it may reallocate blob, if MemSize() changes.
@@ -121,7 +110,7 @@ class Tensor {
   void ResetLike(const Tensor &t);
 
   /// Reset the data type; it would reallocate the blob if the type changes.
-  void AsType(DataType type);
+  void AsType(const DataType type);
 
   /// Reset the device.
   /// If the target device is a different device, then do a deep data copy.
@@ -135,14 +124,14 @@ class Tensor {
   void SetValue(const SType x);
 
   /// For initializing the tensor values, copy 'num' elements.
-  template <typename DType>
-  void CopyDataFromHostPtr(const DType *src, size_t num);
+  template <typename SType>
+  void CopyDataFromHostPtr(const SType *src, const size_t num);
 
   /// Copy data from another Tensor, which may be on a different device.
   /// Meta data would not be copied!
   void CopyData(const Tensor &other);
 
-  /// Return an exactly the same Tensor with data been deep copied.
+  /// return an exact copy of this Tensor, with the data deep copied.
   Tensor Clone() const;
 
   // Tensor operations
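
A minimal sketch of the host-data API above (the buffer and shape are
illustrative, not part of the patch):

    const float buf[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
    Tensor t(Shape{2, 3});            // kFloat32 on the default device
    t.CopyDataFromHostPtr(buf, 6);    // copy 6 elements from host memory
    Tensor c = t.Clone();             // deep copy; c owns a separate blob
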
@@ -152,42 +141,37 @@ class Tensor {
   Tensor T() const;
 
   /// Copy the meta info with data blob shared.
-  Tensor &operator=(const Tensor &t);
+  Tensor &operator=(const Tensor &in);
 
   /// Copy the meta info with data blob shared.
-  Tensor &operator=(Tensor &&t);
+  Tensor &operator=(Tensor &&in);
 
-  Tensor &operator+=(const Tensor &t);
-  // void operator+=(Tensor&& t);
-  Tensor &operator-=(const Tensor &t);
-  // void operator-=(Tensor&& t);
-  Tensor &operator*=(const Tensor &t);
-  // void operator*=(Tensor&& t);
-  Tensor &operator/=(const Tensor &t);
-  // void operator/=(Tensor&& t);
+  Tensor &operator+=(const Tensor &in);
+  // void operator+=(Tensor&& in);
+  Tensor &operator-=(const Tensor &in);
+  // void operator-=(Tensor&& in);
+  Tensor &operator*=(const Tensor &in);
+  // void operator*=(Tensor&& in);
+  Tensor &operator/=(const Tensor &in);
+  // void operator/=(Tensor&& in);
 
   // Scalar operations.
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator+=(DType x);
-
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator-=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator+=(const SType x);
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator*=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator-=(const SType x);
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator/=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator*=(const SType x);
 
-  /// save Tensor into a proto msg
-  // void ToProto(TensorProto* t);
-  /// load Tensor from proto msg
-  // void FromProto(const TensorProto& t);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator/=(const SType x);
 
  protected:
   bool transpose_ = false;
@@ -196,14 +180,29 @@ class Tensor {
   /// Note: blob_ is allocated in a lazy manner to avoid frequent malloc/free.
   /// If you want to get an allocated Blob, use blob() instead of blob_.
   Blob *blob_ = nullptr;
-  Shape shape_;
+  Shape shape_ = {};
 };
 
+typedef Shape::iterator ShapeIter;
+inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
+  if (len == 0) len = shape.size();
+  CHECK_LE(len, shape.size());
+  size_t v = 1;
+  for (unsigned int i = start; i < len; i++) v *= shape[i];
+  return v;
+}
+
 inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
   CHECK_EQ(in1.data_type(), in2.data_type());
   CHECK_EQ(in1.device()->lang(), in2.device()->lang());
 }
 
+template <typename FromType, typename ToType>
+ToType TypeCast(const FromType &x) {
+  // TODO(wangwei) cast fp16; prevent some casts, e.g., float to char
+  return static_cast<ToType>(x);
+}
+
 Tensor Reshape(const Tensor &in, const Shape &s);
 Tensor Reshape(const Tensor &in, Shape &&s);
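
Note that Product's 'len' parameter acts as an exclusive end index rather
than a count; with the defaults it multiplies the whole shape. A minimal
sketch of these helpers (shapes illustrative only):

    Shape s{2, 3, 4};
    size_t n = Product(s);               // 2*3*4 = 24
    size_t m = Product(s, 1);            // len defaults to shape.size(): 3*4 = 12
    Tensor t(s);
    Tensor r = Reshape(t, Shape{6, 4});  // element count preserved
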
 
@@ -212,192 +211,171 @@ Tensor Reshape(const Tensor &in, Shape &&s);
 
 /// Copy 'num' elements of src to dst.
 /// The first 'dst_offset' elements of dst and the first 'src_offset' elements of src are skipped.
-void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                    size_t src_offset = 0, size_t dst_offset = 0);
-
-// ==================Simple Linear Algebra Operations=========================
-Tensor Abs(const Tensor &t);
-Tensor Exp(const Tensor &t);
-Tensor Log(const Tensor &t);
-Tensor ReLU(const Tensor &t);
-Tensor Sigmoid(const Tensor &t);
-Tensor Sign(const Tensor &t);
-Tensor Sqrt(const Tensor &t);
-Tensor Square(const Tensor &t);
-Tensor Tanh(const Tensor &t);
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                    const size_t dst_offset = 0, const size_t src_offset = 0);
+
+// =============Element-wise operations====================================
+Tensor Abs(const Tensor &in);
+Tensor Exp(const Tensor &in);
+Tensor Log(const Tensor &in);
+Tensor ReLU(const Tensor &in);
+Tensor Sigmoid(const Tensor &in);
+Tensor Sign(const Tensor &in);
+Tensor Sqrt(const Tensor &in);
+Tensor Square(const Tensor &in);
+Tensor Tanh(const Tensor &in);
+
+/// Element-wise operation, out[i]=in[i]^x
+template <typename SType>
+Tensor Pow(const Tensor &in, const SType x);
+/// Element-wise operation, out[i]=in[i]^x
+template <typename SType>
+void Pow(const Tensor &in, const SType x, Tensor *out);
+/// Element-wise operation, out[i]=base[i]^exp[i]
+Tensor Pow(const Tensor &base, const Tensor &exp);
+/// Element-wise operation, out[i]=base[i]^exp[i]
+void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
 
+/// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f
 template <typename SType>
-SType Sum(const Tensor &t);
-/// Sum elements in the Tensor, currently only support vector and matrix.
-/// if 'axis' is 0, sum all rows into a single row
-/// if 'axis' is 1, sum all columns into a single column
-/// TODO(wangwei) support arbitrary Tensor like numpy.sum
-Tensor Sum(const Tensor &t, int axis);
+Tensor operator<(const Tensor &in, const SType x);
+template <typename SType>
+void LT(const Tensor &in, const SType x, Tensor *out);
 
-/// Average elements in the Tensor, currently only support vector and matrix.
-/// if 'axis' is 0, average all rows into a single row
-/// if 'axis' is 1, average all columns into a single column
-/// TODO(wangwei) support arbitrary Tensor like numpy.average
-Tensor Average(const Tensor &t, int axis);
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
-/// and shape_[axis]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-Tensor SoftMax(const Tensor &t, int axis = 0);
-void SoftMax(const Tensor &t, int axis, Tensor *ret);
+/// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator<=(const Tensor &in, const SType x);
+template <typename SType>
+void LE(const Tensor &in, const SType x, Tensor *out);
 
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
-/// and shape_[axis+1]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-// Tensor Softmax(const Tensor& t, int axis = -1);
-// void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
-
-/// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator<(const Tensor &t, const DType x);
-template <typename DType>
-void LT(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator<=(const Tensor &t, const DType x);
-template <typename DType>
-void LE(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator>(const Tensor &t, const DType x);
-template <typename DType>
-void GT(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator>=(const Tensor &t, const DType x);
-template <typename DType>
-void GE(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType>
-Tensor Pow(const Tensor &t, DType x);
-/// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType>
-void Pow(const Tensor &t, DType x, Tensor *ret);
-/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-Tensor Pow(const Tensor &base, Tensor exp);
-/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-void Pow(const Tensor &base, const Tensor &exp, Tensor *ret);
+/// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>(const Tensor &in, const SType x);
+template <typename SType>
+void GT(const Tensor &in, const SType x, Tensor *out);
+
+/// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>=(const Tensor &in, const SType x);
+template <typename SType>
+void GE(const Tensor &in, const SType x, Tensor *out);
 
 Tensor operator+(const Tensor &lhs, const Tensor &rhs);
-void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator-(const Tensor &lhs, const Tensor &rhs);
-void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator*(const Tensor &lhs, const Tensor &rhs);
-void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator/(const Tensor &lhs, const Tensor &rhs);
-void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 
-template <typename DType>
-Tensor operator+(const Tensor &t, DType x);
-template <typename DType>
-void Add(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator-(const Tensor &t, DType x);
-template <typename DType>
-void Sub(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator*(const Tensor &t, DType x);
-template <typename DType>
-void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator/(const Tensor &t, DType x);
-template <typename DType>
-void Div(const Tensor &t, DType x, Tensor *ret);
+template <typename SType>
+Tensor operator+(const Tensor &in, const SType x);
+template <typename SType>
+void Add(const Tensor &in, const SType x, Tensor *out);
 
-// ================Blas operations============================================
-// We fix the scalar argument type to be float.
+template <typename SType>
+Tensor operator-(const Tensor &in, const SType x);
+template <typename SType>
+void Sub(const Tensor &in, const SType x, Tensor *out);
 
-// ===== Level 1
-// TODO(wangwei) make amax/amin/asum a member function of tensor
-// void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
-// void Asum(Tensor Context* ctx);
+template <typename SType>
+Tensor operator*(const Tensor &in, const SType x);
+template <typename SType>
+void EltwiseMult(const Tensor &in, const SType x, Tensor *out);
 
-// template <typename DType>
-// void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+/// For each element e of Tensor 'in', compute e / x
+template <typename SType>
+Tensor operator/(const Tensor &in, const SType x);
+/// For each element e of Tensor 'in', compute e / x into out
+template <typename SType>
+void Div(const Tensor &in, const SType x, Tensor *out);
 
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape.  result = A * B
-Tensor Mult(const Tensor &A, const Tensor &B);
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape.  C = A * B
-void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+/// For each element e of Tensor 'in', compute x/e
+template <typename SType>
+Tensor Div(const SType x, const Tensor &in);
+/// For each element e of Tensor 'in', compute x/e into 'out'
+template <typename SType>
+void Div(const SType x, const Tensor &in, Tensor *out);
 
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
-void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs,
-          const float beta, Tensor *C);
+template <typename SType>
+SType Sum(const Tensor &in);
 
-// ================Random operations==========================================
-/// For each element x set x = 1 if random() < p; otherwise x = 1.
-void Bernoulli(float p, Tensor *t);
-/// Fill in Tensor 't' following uniform distribution.
-void Uniform(float low, float high, Tensor *t);
-/// Fill in Tensor 't' following Gaussian distribution.
-void Gaussian(float mean, float std, Tensor *t);
+// ============Matrix (row/column) operations==================================
+/// Average elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, average all rows into a single row
+/// if 'axis' is 1, average all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.average
+Tensor Average(const Tensor &in, const int axis);
+/// Sum elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, sum all rows into a single row
+/// if 'axis' is 1, sum all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.sum
+Tensor Sum(const Tensor &in, const int axis);
+/// Regard the internal data as a 2d matrix with shape_[0]*...*shape_[axis-1]
+/// rows and shape_[axis]*...*shape_[nDim()-1] columns,
+/// and do softmax along each row.
+Tensor SoftMax(const Tensor &in, const int axis = 0);
+void SoftMax(const Tensor &in, const int axis, Tensor *out);
 
-// follow the consistency guide
-// https://issues.apache.org/jira/browse/SINGA-182
-// ============Matrix vector operations=======================================
 /// Add column 'v' with each column of matrix M
 void AddColumn(const Tensor &v, Tensor *M);
-void AddColumn(const float alpha, const float beta, const Tensor &v,
+/// For each column 'c' of matrix out, do c=alpha*v + beta*c
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                Tensor *out);
-/// Sub column 'v' by each column of matrix M
-void SubColumn(const Tensor &v, Tensor *M);
-/// Multiply column 'v' and each column of matrix M; write results into 'out'
-void MultColumn(const Tensor &v, Tensor *M);
-/// Divide column 'v' by each column of matrix M; write results into 'out'
-void DivColumn(const Tensor &v, Tensor *M);
-
 /// Add row 'v' with each row of matrix M; write results into 'out'
 void AddRow(const Tensor &v, Tensor *out);
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
-/// Sub row 'v' by each row of matrix M; write results into 'out'
-void SubRow(const Tensor &v, Tensor *M);
-/// Multiply row 'v' with each row of matrix M; write results into 'out'
-void MultRow(const Tensor &v, Tensor *M);
+/// For each row 'r' of matrix out, do r=alpha*v + beta*r
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M);
+/// Divide each column of matrix M by 'v' element-wise; results overwrite M
+void DivColumn(const Tensor &v, Tensor *M);
 /// Divide each row of matrix M by 'v' element-wise; results overwrite M
 void DivRow(const Tensor &v, Tensor *M);
-
-/// Sum all rows of matrix M into a single row as 'out'
-void SumRows(const Tensor &M, Tensor *out);
+/// Multiply column 'v' with each column of matrix M element-wise; results overwrite M
+void MultColumn(const Tensor &v, Tensor *M);
+/// Multiply row 'v' with each row of matrix M element-wise; results overwrite M
+void MultRow(const Tensor &v, Tensor *M);
+/// Subtract column 'v' from each column of matrix M
+void SubColumn(const Tensor &v, Tensor *M);
+/// Subtract row 'v' from each row of matrix M
+void SubRow(const Tensor &v, Tensor *M);
 /// Sum all columns of matrix M into a single column as 'out'
 void SumColumns(const Tensor &M, Tensor *out);
+/// Sum all rows of matrix M into a single row as 'out'
+void SumRows(const Tensor &M, Tensor *out);
 
-/// For each element x of Tensor 'in', compute alpha/x
+// ================Random operations==========================================
+/// For each element x set x = 1 if random() < p; otherwise x = 0.
 template <typename SType>
-Tensor Div(const SType alpha, const Tensor &in);
+void Bernoulli(const SType p, Tensor *out);
+/// Fill in Tensor 'out' following a Gaussian distribution.
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out);
+/// Fill in Tensor 'out' following a uniform distribution.
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out);
 
-/// For each element x of Tensor 'in', compute alpha/x into 'out'
+// ================Blas operations============================================
+// TODO(wangwei) make amax/amin/asum a member function of tensor
+
+/// out = alpha*in + out
 template <typename SType>
-void Div(const SType alpha, const Tensor &in, Tensor *out);
-
-/*
-/// Multiply each column of the lhs matrix with the rhs column
-Tensor MultColumn(const Tensor &lhs, const Tensor &rhs);
-void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Multiply each row of the lhs matrix with the rhs row
-Tensor MultRow(const Tensor &lhs, const Tensor &rhs);
-void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Div each row of the lhs matrix with the rhs column
-Tensor DivColumn(const Tensor &lhs, const Tensor &rhs);
-void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Divide each row of the lhs matrix by the rhs row
-Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
-void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-*/
+void Axpy(SType alpha, const Tensor &in, Tensor *out);
+
+/// Do matrix-vector multiplication or matrix-matrix multiplication depending
+/// on the Tensor shape.  result = A * B
+Tensor Mult(const Tensor &A, const Tensor &B);
+/// Do matrix-vector multiplication or matrix-matrix multiplication depending
+/// on the Tensor shape.  C = A * B
+void Mult(const Tensor &A, const Tensor &B, Tensor *C);
 
+/// Do matrix-vector multiplication or matrix-matrix multiplication depending
+/// on the Tensor shape. C = alpha * A * B + beta * C
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+          Tensor *C);
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_
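
A rough usage sketch of the reorganized API (values illustrative; only the
float instantiations are exported in this patch):

    Tensor a(Shape{2, 2}), b(Shape{2, 2});
    Gaussian(0.0f, 1.0f, &a);       // fill a ~ N(0, 1)
    Uniform(-1.0f, 1.0f, &b);       // fill b ~ U(-1, 1)
    Tensor c = Abs(a) + b * 0.5f;   // element-wise ops return new Tensors
    Tensor mask = (a > 0.0f);       // 1.f where a[i] > 0, else 0.f
    Tensor d = Div(1.0f, b);        // scalar/Tensor: d[i] = 1 / b[i]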

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 5ae375c..f4e9da2 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -26,61 +26,61 @@ namespace singa {
 
 Tensor::~Tensor() {
   // LOG(ERROR) << "~";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
   blob_ = nullptr;
 }
 
 Tensor::Tensor() { device_ = &defaultDevice; }
 
-Tensor::Tensor(const Shape &shape, DataType dtype)
+Tensor::Tensor(const Shape &shape, const DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape &&shape, DataType dtype)
+Tensor::Tensor(Shape &&shape, const DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Shape &shape, Device *device, DataType dtype)
+Tensor::Tensor(const Shape &shape, Device *device, const DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape &&shape, Device *device, DataType dtype)
+Tensor::Tensor(Shape &&shape, Device *device, const DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Tensor &t)
-    : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
-      blob_(t.blob()), shape_(t.shape_) {
+Tensor::Tensor(const Tensor &in)
+    : transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      blob_(in.blob()),
+      shape_(in.shape_) {
   blob_->IncRefCount();
-  // LOG(ERROR) << "const&";
 }
 
-Tensor::Tensor(Tensor &&t)
-    : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
-      shape_(std::move(t.shape_)) {
-  blob_ = t.blob_;
-  t.blob_ = nullptr;
-  // LOG(ERROR) << "&&";
+Tensor::Tensor(Tensor &&in)
+    : transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      shape_(std::move(in.shape_)) {
+  blob_ = in.blob_;
+  in.blob_ = nullptr;
 }
 
-void Tensor::ResetLike(const Tensor &t) {
-  if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
-    shape_ = t.shape_;
-    device_ = t.device_;
-    data_type_ = t.data_type_;
-    blob_ = device_->NewBlob(t.MemSize());
+void Tensor::ResetLike(const Tensor &in) {
+  if (blob_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+    shape_ = in.shape_;
+    device_ = in.device_;
+    data_type_ = in.data_type_;
+    blob_ = device_->NewBlob(in.MemSize());
   }
 }
 
 void Tensor::Reshape(const Shape &shape) {
   if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
   }
   shape_ = shape;
@@ -88,17 +88,15 @@ void Tensor::Reshape(const Shape &shape) {
 
 void Tensor::Reshape(Shape &&shape) {
   if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
   }
   shape_ = std::move(shape);
 }
 
-void Tensor::AsType(DataType type) {
+void Tensor::AsType(const DataType type) {
   if (data_type_ != type) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape_) * SizeOf(type));
     data_type_ = type;
   }
@@ -109,8 +107,7 @@ void Tensor::ToDevice(Device *dst) {
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
     tmp.CopyData(*this);
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = tmp.blob_;
     tmp.blob_ = nullptr;
     device_ = dst;
@@ -120,7 +117,7 @@ void Tensor::ToDevice(Device *dst) {
 void Tensor::ToHost() { ToDevice(device_->host()); }
 
 template <typename DType>
-void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
+void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num) {
   CHECK_EQ(sizeof(DType), SizeOf(data_type_))
       << "data_type is " << DataType_Name(data_type_)
       << " user given type is of size " << sizeof(DType);
@@ -130,8 +127,8 @@ void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
     LOG(WARNING) << "Copy data from null host ptr";
   }
 }
-template void Tensor::CopyDataFromHostPtr(const float *src, size_t num);
-template void Tensor::CopyDataFromHostPtr(const int *src, size_t num);
+template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num);
+template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num);
 
 void Tensor::CopyData(const Tensor &src) {
   CHECK_EQ(Size(), src.Size());
@@ -162,29 +159,27 @@ Tensor Tensor::T() const {
   return t;
 }
 
-Tensor &Tensor::operator=(const Tensor &t) {
+Tensor &Tensor::operator=(const Tensor &in) {
   // LOG(ERROR) << "= const &";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
-  transpose_ = t.transpose_;
-  data_type_ = t.data_type_;
-  shape_ = t.shape_;
-  device_ = t.device_;
-  blob_ = t.blob();
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  transpose_ = in.transpose_;
+  data_type_ = in.data_type_;
+  shape_ = in.shape_;
+  device_ = in.device_;
+  blob_ = in.blob();
   blob_->IncRefCount();
   return *this;
 }
 
-Tensor &Tensor::operator=(Tensor &&t) {
+Tensor &Tensor::operator=(Tensor &&in) {
   // LOG(ERROR) << "= &&";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
-  transpose_ = t.transpose_;
-  data_type_ = t.data_type_;
-  shape_ = std::move(t.shape_);
-  device_ = t.device_;
-  blob_ = t.blob_;
-  t.blob_ = nullptr;
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  transpose_ = in.transpose_;
+  data_type_ = in.data_type_;
+  shape_ = std::move(in.shape_);
+  device_ = in.device_;
+  blob_ = in.blob_;
+  in.blob_ = nullptr;
   return *this;
 }
 
@@ -200,10 +195,10 @@ Tensor Reshape(const Tensor &in, Shape &&s) {
   return out;
 }
 
-#define GenUnaryTensorArgMemberFn(op, fn)                                \
-  Tensor &Tensor::op(const Tensor &t) {                                        \
-    fn(*this, t, this);                                                        \
-    return *this;                                                              \
+#define GenUnaryTensorArgMemberFn(op, fn) \
+  Tensor &Tensor::op(const Tensor &in) {  \
+    fn(*this, in, this);                  \
+    return *this;                         \
   }
 
 GenUnaryTensorArgMemberFn(operator+=, Add);
@@ -211,12 +206,13 @@ GenUnaryTensorArgMemberFn(operator-=, Sub);
 GenUnaryTensorArgMemberFn(operator*=, EltwiseMult);
 GenUnaryTensorArgMemberFn(operator/=, Div);
 
-#define GenUnaryScalarArgMemberFn(op, fn)                                \
-  template <typename DType> Tensor &Tensor::op(DType x) {                      \
-    fn(*this, x, this);                                                        \
-    return *this;                                                              \
-  }                                                                            \
-  template Tensor &Tensor::op<float>(float x)
+#define GenUnaryScalarArgMemberFn(op, fn) \
+  template <typename DType>               \
+  Tensor &Tensor::op(const DType x) {     \
+    fn(*this, x, this);                   \
+    return *this;                         \
+  }                                       \
+  template Tensor &Tensor::op<float>(const float x)
 
 GenUnaryScalarArgMemberFn(operator-=, Sub);
 GenUnaryScalarArgMemberFn(operator+=, Add);
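
The scalar-argument macros wire the compound-assignment operators to the
free functions, e.g. (a sketch; values illustrative):

    Tensor t(Shape{3});
    t.SetValue(2.0f);
    t += 1.0f;    // expands to Add(t, 1.0f, &t)
    t *= 3.0f;    // expands to EltwiseMult(t, 3.0f, &t)
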
@@ -224,103 +220,105 @@ GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFn(operator/=, Div);
 
 // ====================Tensor Operations=======================================
-void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                    size_t dst_offset, size_t src_offset) {
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                    const size_t dst_offset, const size_t src_offset) {
   auto width = SizeOf(src.data_type());
   CHECK_EQ(width, SizeOf(dst->data_type()));
   size_t nBytes = num * width;
-  dst_offset *= width;
-  src_offset *= width;
-  CHECK_GE(src.MemSize(), src_offset + nBytes);
-  CHECK_GE(dst->MemSize(), dst_offset + nBytes);
+  auto d_offset = dst_offset * width;
+  auto s_offset = src_offset * width;
+  CHECK_GE(src.MemSize(), s_offset + nBytes);
+  CHECK_GE(dst->MemSize(), d_offset + nBytes);
 
   Device *src_dev = src.device(), *dst_dev = dst->device();
   Blob *from = src.blob(), *to = dst->blob();
   if (dst_dev->lang() != src_dev->lang()) {
     // let the non-Cpp device conduct the copy op
     if (dst_dev->lang() == kCpp) {
-      src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, dst_offset,
-                              src_offset);
+      src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, d_offset,
+                              s_offset);
     } else if (src_dev->lang() == kCpp) {
-      dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, dst_offset,
-                              src_offset);
+      dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, d_offset,
+                              s_offset);
     } else {
       LOG(FATAL) << "Mem copy between Cuda and OpenCL devices is not supported";
     }
   } else {
     auto direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
-    src_dev->CopyDataToFrom(to, from, nBytes, direct, dst_offset, src_offset);
+    src_dev->CopyDataToFrom(to, from, nBytes, direct, d_offset, s_offset);
   }
 }
 //============================================================================
 /// typedef DType according to the type value.
 /// DType would be used in the code block __VA_ARGS__.
-#define TYPE_SWITCH(type, DType, ...)                                          \
-  do {                                                                         \
-    switch (type) {                                                            \
-    case kFloat32: {                                                           \
-      typedef float DType;                                                     \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case kInt: {                                                               \
-      typedef int DType;                                                       \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case kChar: {                                                              \
-      typedef char DType;                                                      \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    default:                                                                   \
-      LOG(FATAL) << "Unknow data type = " << DataType_Name(type);              \
-    }                                                                          \
+#define TYPE_SWITCH(type, DType, ...)                               \
+  do {                                                              \
+    switch (type) {                                                 \
+      case kFloat32: {                                              \
+        typedef float DType;                                        \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kInt: {                                                  \
+        typedef int DType;                                          \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kChar: {                                                 \
+        typedef char DType;                                         \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      default:                                                      \
+        LOG(FATAL) << "Unknown data type = " << DataType_Name(type); \
+    }                                                               \
   } while (0)
 
 /// typedef DType and Lang according to data type and device programming
 /// language respectively.
 /// type is from DataType, and lang is from LangType.
 /// DType and Lang would be used in __VA_ARGS__.
-#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)                       \
-  do {                                                                         \
-    const int _SwitchShift = 3;                                                \
-    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);                     \
-    switch (_SwitchHash) {                                                     \
-    case ((kFloat32 << _SwitchShift) + kCuda): {                               \
-      typedef float DType;                                                     \
-      typedef lang::Cuda Lang;                                                 \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case ((kFloat32 << _SwitchShift) + kCpp): {                                \
-      typedef float DType;                                                     \
-      typedef lang::Cpp Lang;                                                  \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case ((kFloat32 << _SwitchShift) + kOpencl): {                             \
-      typedef float DType;                                                     \
-      typedef lang::Opencl Lang;                                               \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    default:                                                                   \
-      LOG(FATAL) << "Unknown combination of data type "                        \
-                 << DataType_Name(dtype) << " and language "                   \
-                 << LangType_Name(ltype);                                      \
-    }                                                                          \
+#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)       \
+  do {                                                         \
+    const int _SwitchShift = 3;                                \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);     \
+    switch (_SwitchHash) {                                     \
+      case ((kFloat32 << _SwitchShift) + kCuda): {             \
+        typedef float DType;                                   \
+        typedef lang::Cuda Lang;                               \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kCpp): {              \
+        typedef float DType;                                   \
+        typedef lang::Cpp Lang;                                \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kOpencl): {           \
+        typedef float DType;                                   \
+        typedef lang::Opencl Lang;                             \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      default:                                                 \
+        LOG(FATAL) << "Unknown combination of data type "      \
+                   << DataType_Name(dtype) << " and language " \
+                   << LangType_Name(ltype);                    \
+    }                                                          \
   } while (0)
 
-template <typename SType> void Tensor::SetValue(const SType x) {
+// =============Element-wise operations====================================
+template <typename SType>
+void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
   auto size = Size();
   auto ptr = blob_;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     // cast x to DType
-    device_->Exec(
-        [size, x, ptr](Context *ctx) { Set<DType, Lang>(size, x, ptr, ctx); },
-        {}, {ptr});
+    device_->Exec([size, x, ptr](Context *ctx) {
+      Set<DType, Lang>(size, x, ptr, ctx);
+    }, {}, {ptr});
   });
 }
 template void Tensor::SetValue<float>(const float x);
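
TYPE_LANG_SWITCH above keys a single switch on a perfect hash of (data type,
language): the language id occupies the low _SwitchShift bits and the data
type the bits above. A standalone sketch of the same trick (the enum values
are assumptions for illustration; the real ones come from the generated
proto headers):

    enum DataType { kFloat32 = 0, kFloat16 = 1, kInt = 2 };
    enum LangType { kCpp = 0, kCuda = 1, kOpencl = 2 };

    const int shift = 3;                    // language ids fit into 3 bits
    int key = (kFloat32 << shift) + kCuda;  // unique per (type, lang) pair
    switch (key) {
      case (kFloat32 << shift) + kCuda: /* float kernels for CUDA */ break;
      case (kFloat32 << shift) + kCpp:  /* float kernels for CPU  */ break;
    }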
@@ -328,21 +326,19 @@ template void Tensor::SetValue<float>(const float x);
 #define EltwiseUnaryTensorFn(fn, t, ret)                               \
   do {                                                                 \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
-      ret->device()->Exec(                                             \
-          [t, ret](Context* ctx) {                                     \
-            fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);     \
-          },                                                           \
-          {t.blob()}, {ret->blob()});                                  \
+      ret->device()->Exec([t, ret](Context * ctx) {                    \
+        fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);         \
+      }, {t.blob()}, {ret->blob()});                                   \
     });                                                                \
   } while (0)
 
-#define GenUnaryTensorFn(fn)                          \
-  Tensor fn(const Tensor &t) {                        \
-    Tensor ret(t.shape(), t.device(), t.data_type()); \
-    auto *retptr = &ret;                              \
-    EltwiseUnaryTensorFn(fn, t, retptr);              \
-    return ret;                                       \
-  }                                                   \
+#define GenUnaryTensorFn(fn)                             \
+  Tensor fn(const Tensor &in) {                          \
+    Tensor ret(in.shape(), in.device(), in.data_type()); \
+    auto *retptr = &ret;                                 \
+    EltwiseUnaryTensorFn(fn, in, retptr);                \
+    return ret;                                          \
+  }                                                      \
   void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); }
 
 GenUnaryTensorFn(Abs);
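
For reference, GenUnaryTensorFn(Abs) expands to roughly the following
(hand-expanded here for readability):

    Tensor Abs(const Tensor &in) {
      Tensor ret(in.shape(), in.device(), in.data_type());
      auto *retptr = &ret;
      EltwiseUnaryTensorFn(Abs, in, retptr);  // queues Abs<DType, Lang> on in's device
      return ret;
    }
    void Abs(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(Abs, in, out); }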
@@ -355,33 +351,89 @@ GenUnaryTensorFn(Sqrt);
 GenUnaryTensorFn(Square);
 GenUnaryTensorFn(Tanh);
 
-// TODO(wangwei) conside async exec
-template <> float Sum<float>(const Tensor &t) {
-  float s = 0.0f;
-  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
-    t.device()->Exec(
-        [t, &s](Context *ctx) {
-          Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx);
-        },
-        {t.blob()}, {});
-  });
-  return s;
-}
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
+  do {                                                                         \
+    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {     \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
+      ret->device()->Exec([lhs, rhs, ret](Context * ctx) {                     \
+        fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), ctx); \
+      }, {lhs.blob(), rhs.blob()}, {ret->blob()});                             \
+    });                                                                        \
+  } while (0)
 
-Tensor Sum(const Tensor &M, int axis) {
-  if (axis == 0) {
-    Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
-    SumRows(M, &out);
-    return out;
-  } else {
-    CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
-    Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
-    SumColumns(M, &out);
-    return out;
+#define GenBinaryTensorFn(op, fn)                              \
+  Tensor op(const Tensor &lhs, const Tensor &rhs) {            \
+    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());    \
+    fn(lhs, rhs, &ret);                                        \
+    return ret;                                                \
+  }                                                            \
+  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
+    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                  \
   }
+
+GenBinaryTensorFn(operator+, Add);
+GenBinaryTensorFn(operator-, Sub);
+GenBinaryTensorFn(operator*, EltwiseMult);
+GenBinaryTensorFn(operator/, Div);
+GenBinaryTensorFn(Pow, Pow);
+
+#define EltwiseTensorScalarFn(fn, t, x, ret)                            \
+  do {                                                                  \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {  \
+      static_assert(std::is_same<SType, DType>::value,                  \
+                    "The Scalar type must match the Tensor data type"); \
+      ret->device()->Exec([t, x, ret](Context * ctx) {                  \
+        fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);       \
+      }, {t.blob()}, {ret->blob()});                                    \
+    });                                                                 \
+  } while (0)
+
+#define GenTensorScalarFn(op, fn)                             \
+  template <typename SType>                                   \
+  Tensor op(const Tensor &in, const SType x) {                \
+    Tensor ret(in.shape(), in.device(), in.data_type());      \
+    fn(in, x, &ret);                                          \
+    return ret;                                               \
+  }                                                           \
+  template <typename SType>                                   \
+  void fn(const Tensor &in, const SType x, Tensor *ret) {     \
+    EltwiseTensorScalarFn(fn, in, x, ret);                    \
+  }                                                           \
+  template Tensor op<float>(const Tensor &in, const float x); \
+  template void fn<float>(const Tensor &in, const float x, Tensor *ret)
+
+GenTensorScalarFn(operator+, Add);
+GenTensorScalarFn(operator-, Sub);
+GenTensorScalarFn(operator*, EltwiseMult);
+GenTensorScalarFn(operator/, Div);
+GenTensorScalarFn(Pow, Pow);
+GenTensorScalarFn(operator<, LT);
+GenTensorScalarFn(operator<=, LE);
+GenTensorScalarFn(operator>, GT);
+GenTensorScalarFn(operator>=, GE);
+template <typename SType>
+Tensor Div(const SType alpha, const Tensor &in) {
+  Tensor out(in.shape(), in.device(), in.data_type());
+  Div(alpha, in, &out);
+  return out;
 }
+template Tensor Div<float>(const float, const Tensor &);
 
-Tensor Average(const Tensor &t, int axis) {
+template <typename SType>
+void Div(const SType alpha, const Tensor &in, Tensor *out) {
+  CheckDataTypeAndLang(in, *out);
+  CHECK(in.shape() == out->shape());
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    // TODO(wangwei) type cast SType to DType;
+    in.device()->Exec([alpha, in, out](Context *ctx) {
+      Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
+    }, {in.blob()}, {out->blob()});
+  });
+}
+template void Div<float>(const float, const Tensor &, Tensor *);
+
+// =============Matrix operations============================================
+Tensor Average(const Tensor &M, int axis) {
   // operator/ only has implementation for float scalar type, hence it is
   // necessary to cast the denominator to a float.
   // TODO(wangwei) implement function for cast scalar type involved in Tensor
@@ -396,10 +448,34 @@ Tensor Average(const Tensor &t, int axis) {
   //    ....
   // }
   if (axis == 0) {
-    return Sum(t, 0) / (1.0f * t.shape().at(0));
+    return Sum(M, 0) / (1.0f * M.shape(0));
   } else {
     CHECK_EQ(axis, 1);
-    return Sum(t, 1) / (1.0f * t.shape().at(1));
+    return Sum(M, 1) / (1.0f * M.shape(1));
+  }
+}
+// TODO(wangwei) consider async exec
+template <>
+float Sum<float>(const Tensor &in) {
+  float s = 0.0f;
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    in.device()->Exec([in, &s](Context *ctx) {
+      Sum<DType, Lang>(in.Size(), in.blob(), &s, ctx);
+    }, {in.blob()}, {});
+  });
+  return s;
+}
+
+Tensor Sum(const Tensor &M, int axis) {
+  if (axis == 0) {
+    Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
+    SumRows(M, &out);
+    return out;
+  } else {
+    CHECK_EQ(axis, 1) << "Sum is not supported over axis = " << axis;
+    Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
+    SumColumns(M, &out);
+    return out;
   }
 }
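
For a 2x3 matrix the axis semantics work out as follows (a sketch; values
illustrative):

    Tensor M(Shape{2, 3});
    Uniform(0.0f, 1.0f, &M);
    Tensor r = Sum(M, 0);      // shape {3}: all rows summed into one row
    Tensor c = Sum(M, 1);      // shape {2}: all columns summed into one column
    Tensor a = Average(M, 0);  // Sum(M, 0) divided by the row count, 2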
 
@@ -424,141 +500,10 @@ void SoftMax(const Tensor &in, int axis, Tensor *out) {
   DivColumn(sum, out);
 }
 
-#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
-  do {                                                                         \
-    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {     \
-      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
-      ret->device()->Exec(                                                     \
-          [lhs, rhs, ret](Context *ctx) {                                      \
-            fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),   \
-                            ctx);                                              \
-          },                                                                   \
-          {lhs.blob(), rhs.blob()}, {ret->blob()});                            \
-    });                                                                        \
-  } while (0)
-
-#define GenBinaryTensorFn(op, fn)                                        \
-  Tensor op(const Tensor &lhs, const Tensor &rhs) {                            \
-    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());                    \
-    fn(lhs, rhs, &ret);                                                        \
-    return ret;                                                                \
-  }                                                                            \
-  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {                 \
-    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                                  \
-  }
-
-GenBinaryTensorFn(operator+, Add);
-GenBinaryTensorFn(operator-, Sub);
-GenBinaryTensorFn(operator*, EltwiseMult);
-GenBinaryTensorFn(operator/, Div);
-GenBinaryTensorFn(Pow, Pow);
-
-#define EltwiseTensorScalarFn(fn, t, x, ret)                                   \
-  do {                                                                         \
-    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {         \
-      static_assert(std::is_same<SType, DType>::value,                         \
-                    "The Scalar type must match the Tensor data type");        \
-      ret->device()->Exec(                                                     \
-          [t, x, ret](Context *ctx) {                                          \
-            fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);          \
-          },                                                                   \
-          {t.blob()}, {ret->blob()});                                          \
-    });                                                                        \
-  } while (0)
-
-#define GenTensorScalarFn(op, fn)                                        \
-  template <typename SType> Tensor op(const Tensor &t, SType x) {              \
-    Tensor ret(t.shape(), t.device(), t.data_type());                          \
-    fn(t, x, &ret);                                                            \
-    return ret;                                                                \
-  }                                                                            \
-  template <typename SType> void fn(const Tensor &t, SType x, Tensor *ret) {   \
-    EltwiseTensorScalarFn(fn, t, x, ret);                                      \
-  }                                                                            \
-  template Tensor op<float>(const Tensor &t, float x);                         \
-  template void fn<float>(const Tensor &t, const float x, Tensor *ret)
-
-GenTensorScalarFn(operator+, Add);
-GenTensorScalarFn(operator-, Sub);
-GenTensorScalarFn(operator*, EltwiseMult);
-GenTensorScalarFn(operator/, Div);
-GenTensorScalarFn(Pow, Pow);
-GenTensorScalarFn(operator<, LT);
-GenTensorScalarFn(operator<=, LE);
-GenTensorScalarFn(operator>, GT);
-GenTensorScalarFn(operator>=, GE);
-
-// ================Blas operations============================================
-Tensor Mult(const Tensor &lhs, const Tensor &rhs) {
-  Tensor ret(Shape{lhs.shape(0), rhs.shape(1)}, lhs.device(), lhs.data_type());
-  Mult(lhs, rhs, &ret);
-  return ret;
-}
-
-void Mult(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {
-  Mult(1.0f, lhs, rhs, 0.0f, ret);
-}
-
-void Mult(const float alpha, const Tensor &A, const Tensor &B, const float beta,
-          Tensor *C) {
-  CHECK_EQ(A.shape().size(), 2u);
-  if (B.nDim() == 1u) {
-    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
-      C->device()->Exec(
-          [alpha, A, beta, B, C](Context *ctx) {
-            GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), alpha,
-                              A.blob(), B.blob(), beta, C->blob(), ctx);
-          },
-          {A.blob(), B.blob()}, {C->blob()});
-    });
-  } else {
-    CHECK(!C->transpose());
-    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
-      C->device()->Exec(
-          [alpha, A, beta, B, C](Context *ctx) {
-            GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0),
-                              B.shape(1), A.shape(1), alpha, A.blob(), B.blob(),
-                              beta, C->blob(), ctx);
-          },
-          {A.blob(), B.blob()}, {C->blob()});
-    });
-  }
-}
-
-void Bernoulli(float p, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [p, t](Context *ctx) {
-          Bernoulli<DType, Lang>(t->Size(), p, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-void Uniform(float low, float high, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [low, high, t](Context *ctx) {
-          Uniform<DType, Lang>(t->Size(), low, high, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-void Gaussian(float mean, float std, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [mean, std, t](Context *ctx) {
-          Gaussian<DType, Lang>(t->Size(), mean, std, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-// ======follow the consistency guide
 void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
 /// Add column 'v' onto each column of matrix M;
-void AddColumn(const float alpha, const float beta, const Tensor &v,
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                Tensor *M) {
   if (M->transpose()) {
     Tensor X = M->T();
@@ -570,15 +515,19 @@ void AddColumn(const float alpha, const float beta, const Tensor &v,
     CHECK_EQ(nb_row, v.Size());
 
     Tensor one(Shape{1, nb_col}, M->device(), M->data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Tensor vmat = Reshape(v, Shape{nb_row, 1});
     Mult(alpha, vmat, one, beta, M);
   }
 }
+template <>
+void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M);
+
 void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
 
 /// For each row 'r' of matrix M, do r = alpha*v + beta*r
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
   if (M->transpose()) {
     Tensor X = M->T();
     AddColumn(v, &X);
@@ -594,29 +543,8 @@ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
     Mult(alpha, one, vmat, beta, M);
   }
 }
-
-template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
-  Tensor out(in.shape(), in.device(), in.data_type());
-  Div(alpha, in, &out);
-  return out;
-}
-
-template Tensor Div<float>(const float, const Tensor &);
-
-template <typename SType>
-void Div(const SType alpha, const Tensor &in, Tensor *out) {
-  CheckDataTypeAndLang(in, *out);
-  CHECK(in.shape() == out->shape());
-  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-    // TODO(wangwei) type cast SType to DType;
-    in.device()->Exec(
-        [alpha, in, out](Context *ctx) {
-          Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
-        },
-        {in.blob()}, {out->blob()});
-  });
-}
-template void Div<float>(const float, const Tensor &, Tensor *);
+template <>
+void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
 
 /// Divide each column of matrix M by 'v' element-wise; results overwrite M
 void DivColumn(const Tensor &v, Tensor *M) {
@@ -640,12 +568,10 @@ void MultColumn(const Tensor &v, Tensor *M) {
   CHECK_EQ(v.Size(), M->shape(0));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
-    v.device()->Exec(
-        [M, v](Context *ctx) {
-          DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(),
-                            v.blob(), M->blob(), ctx);
-        },
-        {M->blob(), v.blob()}, {M->blob()});
+    v.device()->Exec([M, v](Context *ctx) {
+      DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(), v.blob(),
+                        M->blob(), ctx);
+    }, {M->blob(), v.blob()}, {M->blob()});
   });
 }
 
@@ -657,12 +583,10 @@ void MultRow(const Tensor &v, Tensor *M) {
   CHECK_EQ(v.Size(), M->shape(1));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
-    v.device()->Exec(
-        [M, v](Context *ctx) {
-          DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
-                            M->blob(), ctx);
-        },
-        {M->blob(), v.blob()}, {M->blob()});
+    v.device()->Exec([M, v](Context *ctx) {
+      DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
+                        M->blob(), ctx);
+    }, {M->blob(), v.blob()}, {M->blob()});
   });
 }
 
@@ -680,8 +604,8 @@ void SumColumns(const Tensor &M, Tensor *v) {
     size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
     CHECK_EQ(nb_row, v->Size());
 
-    Tensor one(Shape{nb_col, 1}, M.device(), M.data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Tensor one(Shape{nb_col}, M.device(), M.data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Mult(M, one, v);
   }
 }
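
SumColumns above reduces column summation to a single GEMV with a ones
vector: for an m x n matrix M, the per-row sums are just M * 1_n. A sketch
(shapes illustrative):

    Tensor M(Shape{4, 3}), v(Shape{4});
    Uniform(0.0f, 1.0f, &M);
    SumColumns(M, &v);   // internally Mult(M, ones, &v) with ones of Shape{3}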
@@ -695,10 +619,98 @@ void SumRows(const Tensor &M, Tensor *v) {
     size_t nb_row = M.shape(0), nb_col = M.shape(1);
     CHECK_EQ(nb_col, v->Size());
 
-    Tensor one(Shape{nb_row, 1}, M.device(), M.data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Tensor one(Shape{nb_row}, M.device(), M.data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Tensor X = M.T();
     Mult(X, one, v);
   }
 }
+// ====================Random operations=====================================
+template <typename SType>
+void Bernoulli(const SType p, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto prob = TypeCast<SType, DType>(p);
+    out->device()->Exec([prob, out](Context *ctx) {
+      Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Bernoulli<float>(const float p, Tensor *out);
+
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto l = TypeCast<SType, DType>(low);
+    auto h = TypeCast<SType, DType>(high);
+    out->device()->Exec([l, h, out](Context *ctx) {
+      Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Uniform<float>(const float low, const float high, Tensor *out);
+
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto m = TypeCast<SType, DType>(mean);
+    auto s = TypeCast<SType, DType>(std);
+    out->device()->Exec([m, s, out](Context *ctx) {
+      Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Gaussian<float>(const float mean, const float std, Tensor *out);
+
+// ================Blas operations============================================
+template <typename SType>
+void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    auto a = TypeCast<SType, DType>(alpha);
+    out->device()->Exec([a, in, out](Context *ctx) {
+      Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
+    }, {in.blob(), out->blob()}, {out->blob()});
+  });
+}
+template void Axpy<float>(const float alpha, const Tensor &in, Tensor *out);
+
+Tensor Mult(const Tensor &A, const Tensor &B) {
+  Shape s;
+  s.push_back(A.shape(0));
+  if (B.nDim() == 2) s.push_back(B.shape(1));
+  Tensor out(s, A.device(), A.data_type());
+  Mult(A, B, &out);
+  return out;
+}
+
+void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+  Mult(1.0f, A, B, 0.0f, out);
+}
+
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+          Tensor *C) {
+  CHECK_EQ(A.shape().size(), 2u);
+  if (B.nDim() == 1u) {
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+      auto a = TypeCast<SType, DType>(alpha);
+      auto b = TypeCast<SType, DType>(beta);
+      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+        GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
+                          B.blob(), b, C->blob(), ctx);
+      }, {A.blob(), B.blob()}, {C->blob()});
+    });
+  } else {
+    CHECK(!C->transpose());
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+      auto a = TypeCast<SType, DType>(alpha);
+      auto b = TypeCast<SType, DType>(beta);
+      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+        GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+                          A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
+      }, {A.blob(), B.blob()}, {C->blob()});
+    });
+  }
+}
+
 }  // namespace singa
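
A note on the unified Mult above: the GEMV branch fires when the second
operand is 1-D and the GEMM branch when it is 2-D, while SumColumns/SumRows
reuse the same entry point by multiplying with a vector of ones. A minimal
usage sketch (shapes and values are hypothetical; the default host device is
assumed):

  singa::Tensor A(singa::Shape{2, 3});  // 2x3 matrix
  singa::Tensor x(singa::Shape{3});     // vector of length 3
  singa::Tensor B(singa::Shape{3, 4});  // 3x4 matrix
  A.SetValue(1.0f);
  x.SetValue(2.0f);
  B.SetValue(0.5f);

  singa::Tensor y = singa::Mult(A, x);  // 1-D rhs -> GEMV path, y is Shape{2}
  singa::Tensor C = singa::Mult(A, B);  // 2-D rhs -> GEMM path, C is Shape{2, 4}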

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 1bf6fc7..b5d0ba9 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -29,12 +29,14 @@ namespace singa {
 /// device programming language, e.g., kCpp, kCuda
 ///
 /// TODO(wangwei) Clean the functions to make the function APIs consistent:
-/// 1. All function names should be like XxxYyy or XY, i.e., capitablize the first
+/// 1. All function names should be like XxxYyy or XY, i.e., capitalize the
+///    first
 ///    letter.
 /// 2. Order functions based on function name in alphabetical order.
-/// 3. Function arguments order is [const basic type] [const Blob] [mutable Blob].
+/// 3. Function argument order is [const basic type] [const Blob] [mutable
+///    Blob].
 /// 4. Function argument names, use 'num' for total number of elements in
-///    elementwise operations; use 'in1' 'in2' for input blobs; use 'out' for
+///    elementwise operations; use 'in1' 'in2' for input blobs; use 'out' for
 ///    output blob or value. With exceptions for some functions, e.g.,
 ///      Scale(const float alpha, const Blob* in, Blob* out);
 ///    For such cases, use x, v, alpha, etc for scalar types.
@@ -46,262 +48,283 @@ namespace singa {
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Blob level math functions.
 
-
-// ================Linear algebra functions====================================
-/// ret[i] = |input[i]|
+// =============Element-wise operations====================================
+/// out[i] = |in[i]|
 template <typename DType, typename Lang>
 void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
+/// out = in + x
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Set Not Implemented";
+void Add(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add Not Implemented";
 }
 
-/// sum all elements of input into ret
+/// out = in1 + in2
 template <typename DType, typename Lang>
-void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
-  LOG(FATAL) << "Sum Not Implemented";
+void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add-Pair Not Implemented";
 }
-
-/// ret[i] = sign(input[i])
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sign Not Implemented";
+void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Clamp Not Implemented";
 }
 
-/// Base is e, Neper number. ret[i]=exp(input[i])
+/// out = x / in
 template <typename DType, typename Lang>
-void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Exp Not Implemented";
+void Div(const size_t num, const DType x, const Blob *in, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div Not Implemented";
 }
 
-/// Natual logarithm, the base is e, Neper number ret[i]=log(input[i]).
-template <typename DType, typename Lang>
-void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Log Not Implemented";
-}
-/// Element-wise operation, ret[i]=sqrt([input[i])
 template <typename DType, typename Lang>
-void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sqrt Not Implemented";
+void Div(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  CHECK_NE(x, 0.f);
+  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
-/// Element-wise operation, ret[i]=square([input[i])
+/// out = in1 / in2
 template <typename DType, typename Lang>
-void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Square Not Implemented";
+void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
-/// Element-wise operation, ret[i]=tanh([input[i])
+/// out = in * x
 template <typename DType, typename Lang>
-void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Tanh Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult Not Implemented";
 }
-/// Element-wise operation, ret[i]=max(0, input[i])
+
+/// out = in1 * in2
 template <typename DType, typename Lang>
-void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "ReLU Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
-/// Element-wise operation, ret[i]=sigmoid([input[i])
+
+/// Base is e, Napier's constant: out[i]=exp(in[i])
 template <typename DType, typename Lang>
-void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sigmoid Not Implemented";
+void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Exp Not Implemented";
 }
 
-// Do softmax for each row invidually
+/// out[i]=(in[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void Softmax(const size_t nrow, const size_t ncol, const Blob *in, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "Softmax Not Implemented";
+void LE(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "LE Not Implemented";
 }
-
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the input matrix into a vector
+/// Natural logarithm, base e, Napier's constant: out[i]=log(in[i]).
 template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "SumRows Not Implemented";
+void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Log Not Implemented";
 }
-
-/// Sum the columns of the input matrix into a vector
+/// out[i]=(in[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, 
-	        Blob *out, Context *ctx) {
-  LOG(FATAL) << "SumColumns Not Implemented";
+void LT(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "LT Not Implemented";
 }
-
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out 
+/// out[i]=(in[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddRow Not Implemented";
+void GE(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "GE Not Implemented";
 }
-
-/// Add the vector v to every column of A as the column of out
+/// out[i]=(in[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddCol Not Implemented";
+void GT(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "GT Not Implemented";
 }
-
-/// Element-wise operation, do v^x for every v from the input tensor
+/// Element-wise operation, do v^x for every v from the 'in' tensor
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
+void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
-/// Element-wise operation, clamp every element into [low, high]
-/// if x>high, then x=high; if x<low, then x=low.
+/// Element-wise operation, out[i]=max(0, in[i])
 template <typename DType, typename Lang>
-void Clamp(const size_t num, const DType low, const DType high, const Blob *in, 	   Blob *out, Context *ctx) {
-  LOG(FATAL) << "Clamp Not Implemented";
+void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "ReLU Not Implemented";
 }
 
-/// ret = input + x
 template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in, const DType x, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Add Not Implemented";
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Set Not Implemented";
 }
-
-/// ret = lhs + rhs
+/// Element-wise operation, out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Add-Pair Not Implemented";
+void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
-/// ret =  input - x
+/// out[i] = sign(in[i])
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  Add<DType, Lang>(num, in, -x, out, ctx);
+void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sign Not Implemented";
 }
-
-/// ret = lhs - rhs
+/// Element-wise operation, out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sub-Pair Not Implemented";
+void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sqrt Not Implemented";
 }
 
-/// ret = input * x
+/// Element-wise operation, out[i]=square(in[i])
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
-                 Context *ctx) {
-  LOG(FATAL) << "EltwiseMult Not Implemented";
+void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Square Not Implemented";
 }
 
-/// ret = lhs * rhs
+/// out = in - x
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, 
-		 Blob *out, Context *ctx) {
-  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
+void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  Add<DType, Lang>(num, in, -x, out, ctx);
 }
 
-/// ret = input / x
+/// out = in1 - in2
 template <typename DType, typename Lang>
-void Div(const size_t num, const DType x, const Blob *in, 
-	 Blob *out, Context *ctx) { 
-  LOG(FATAL) << "Div Not Implemented";
+void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Sub-Pair Not Implemented";
 }
-
+/// Sum all elements of 'in' into 'out'
 template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  CHECK_NE(x,0.f);
-  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
+void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Sum Not Implemented";
 }
 
-/// ret = lhs / rhs
+/// Element-wise operation, out[i]=tanh(in[i])
 template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Div-Pair Not Implemented";
+void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Tanh Not Implemented";
 }
 
+// =========== Matrix operations ===========================================
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
+}
 /// outer-product.
-/// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
+/// in1 and in2 are vectors of length m and n; out is a matrix of shape m x n
 template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2, 
-	   Blob *out, Context *ctx) {
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
   LOG(FATAL) << "Outer Not Implemented";
 }
-
-/// ret[i]=(input[i]<x)?1.f:0.f
+// Do softmax for each row individually
 template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "LT Not Implemented";
+void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "Softmax Not Implemented";
 }
-/// ret[i]=(input[i]<=x)?1.f:0.f
+/// Sum the columns of the input matrix into a vector
 template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "LE Not Implemented";
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "GT Not Implemented";
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
+}
+
+// ================Random functions===========================================
+/// Each element of 'out' is 1 with probability p and 0 with probability 1-p, 0 <= p <= 1.
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
+template <typename DType, typename Lang>
+void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Bernoulli Not Implemented";
 }
-/// ret[i]=(input[i]>=x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "GE Not Implemented";
+void Gaussian(const size_t num, const float mean, const float std, Blob *out,
+              Context *ctx) {
+  LOG(FATAL) << "Gaussian Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
+template <typename DType, typename Lang>
+void Uniform(const size_t num, const float low, const float high, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "Uniform Not Implemented";
 }
 
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
-// ===== Level 1
-/// return the index of the element with the max value.
+/// Return the index of the element with the max value.
 template <typename DType, typename Lang>
 void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amax Not Implemented";
 }
 
-/// return the index of the element with the min value.
+/// Return the index of the element with the min value.
 template <typename DType, typename Lang>
 void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amin Not Implemented";
 }
-/// ret = sum |x| for all x in input
+/// out = sum |x| for all x in 'in'
 template <typename DType, typename Lang>
 void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Asum Not Implemented";
 }
 
-/// ret = alpha * input + ret
+/// out = alpha * in + out
 template <typename DType, typename Lang>
-void Axpy(const size_t num, const DType alpha, const Blob *in, 
-	  Blob *out, Context *ctx) {
+void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
+          Context *ctx) {
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
-/// ret *= x
+/// out *= x
 template <typename DType, typename Lang>
 void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Scale Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2, 
-	 DType *out, Context *ctx) {
+void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+         Context *ctx) {
   LOG(FATAL) << "Dot Not Implemented";
 }
 
-// ===== Level 2
-/// ret = alpha * op(A) * v + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+/// out = alpha * op(A) * v + beta * out, where op(A) = A or A^T.
+/// 'trans' indicates whether the internal data layout of A is transposed.
 template <typename DType, typename Lang>
-void GEMV(bool trans, const size_t m, const size_t n, const DType alpha, 
-	  const Blob *A, const Blob *v,
-          const DType beta, Blob *out, Context *ctx) {
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+          const Blob *A, const Blob *v, const DType beta, Blob *out,
+          Context *ctx) {
   LOG(FATAL) << "GEMV Not Implemented";
 }
 
@@ -323,34 +346,5 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
   LOG(FATAL) << "GEMM Not Implemented";
 }
 
-
-// ===== Level 3
-
-// ================Random functions===========================================
-/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-// Get the random generator from 'ctx'
-// If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lang>
-void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Bernoulli Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lang>
-void Uniform(const size_t num, const float low, const float high, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "Uniform Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lang>
-void Gaussian(const size_t num, const float mean, const float std, 
-	      Blob *out, Context *ctx) {
-  LOG(FATAL) << "Gaussian Not Implemented";
-}
-
-
-
-
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
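
Every template above is a fallback that aborts with a "<Name> Not
Implemented" message; a backend opts in by specializing the template for its
(DType, Lang) pair, and the Tensor-level functions in tensor.cc pick the
right specialization through TYPE_LANG_SWITCH. A minimal sketch of what a CPU
specialization could look like (the tag type name lang::Cpp is an assumption
here; the real specializations live in tensor_math_cpp.h, listed in this
commit but not shown):

  template <>
  void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
                             Blob *out, Context *ctx) {
    const float *inPtr = static_cast<const float *>(in->data());
    float *outPtr = static_cast<float *>(out->mutable_data());
    // out[i] = in[i] + x for all num elements.
    for (size_t i = 0; i < num; i++) outPtr[i] = inPtr[i] + x;
  }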


[07/50] [abbrv] incubator-singa git commit: SINGA-171 - Create CppDevice and CudaDevice

Posted by zh...@apache.org.
SINGA-171 - Create CppDevice and CudaDevice

Implement CudaDevice.

(zhongle) Fix errors for cudnn and cuda by adding cuda & cudnn libs to singa_linker_libs.

NOTE: set the cudnn include path before the cuda include path, as some platforms ship a cudnn.h in cuda/include that is not the one users configured via CMAKE_XXX_PATH.

Pass test for cudnn dropout. NOTE: make sure all data in cudnn layers are allocated on the device (not on the cpu); you can check memory errors with cuda-memcheck ./program

pass cpplint.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/0b4b2e20
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/0b4b2e20
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/0b4b2e20

Branch: refs/heads/master
Commit: 0b4b2e20f803d1b890f24e6047912282092c156f
Parents: 282712c
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Wed May 18 20:10:45 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 19 14:19:36 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                   |   3 +-
 cmake/Cuda.cmake                 |   7 +-
 include/singa/core/tensor.h      |   5 +-
 include/singa/model/layer.h      |   7 +-
 src/core/device/cpp_device.cc    |   2 +-
 src/core/device/cuda_device.cc   |  15 ++--
 src/core/device/device.cc        |   8 ++-
 src/core/tensor/tensor.cc        |   8 +--
 src/model/layer/cudnn_dropout.cc |  30 +++++---
 src/model/layer/cudnn_dropout.h  |   8 ++-
 test/CMakeLists.txt              |   3 +-
 test/singa/test_cpp_device.cc    |   2 +-
 test/singa/test_cudnn_dropout.cc | 127 ++++++++++++++++++++++++++++++++++
 test/singa/test_dropout.cc       |  16 ++---
 test/singa/test_tensor_math.cc   |  12 ++--
 15 files changed, 201 insertions(+), 52 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8457bf2..2d1a1e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
 PROJECT(singa)
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++11 -DUSE_CUDA -DUSE_CUDNN")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++11")
+#message(STATUS "${CMAKE_CXX_FLAGS}")
 
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty)
 #message(STATUS "module path: ${CMAKE_MODULE_PATH}")

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/cmake/Cuda.cmake
----------------------------------------------------------------------
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index e3338af..8780fc6 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -7,8 +7,9 @@ endif()
 
 set(HAVE_CUDA TRUE)
 message(STATUS "Found cuda_v${CUDA_VERSION}")
-include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
-list(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
+add_definitions(-DUSE_CUDA)
+#message(STATUS "linking: ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}")
+
 
 #if(USE_CUDNN)
 #include(cmake/Modules/Cudnn.cmake)
@@ -18,3 +19,5 @@ list(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CU
     add_definitions(-DUSE_CUDNN)
 #endif()
 
+include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
+list(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 03bf443..359f1ee 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -88,8 +88,8 @@ class Tensor {
 
   /// Return immutable Tensor values with given type.
   template <typename DType>
-  const DType* data() const {
-    return static_cast<const DType*> (blob()->data());
+  DType data() const {
+    return static_cast<DType> (blob()->data());
   }
 
   /// data type, including kFloat16, kFloat32, kInt
@@ -111,6 +111,7 @@ class Tensor {
 
   /// Return number of total elements
   size_t Size() const {
+    CHECK_EQ(blob_->size() % SizeOf(data_type_), 0u);
     return blob_->size() / SizeOf(data_type_);
   }
 
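With this change the template parameter is the full return type, so callers
now spell out the pointer type, e.g.

  const float* ptr = t.data<const float*>();

rather than the previous t.data<float>(); the test updates at the end of this
commit follow that pattern.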

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index a4c4630..050236a 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -16,12 +16,13 @@
  * limitations under the License.
  */
 
-#ifndef SINGA_LAYER_H_
-#define SINGA_LAYER_H_
+#ifndef SINGA_MODEL_LAYER_H_
+#define SINGA_MODEL_LAYER_H_
 
 #include <vector>
 #include <string>
 #include <stack>
+#include <utility>
 #include "singa/core/tensor.h"
 #include "singa/proto/layer.pb.h"
 
@@ -191,4 +192,4 @@ class Layer {
 };
 
 }  // namespace singa
-#endif  // SINGA_LAYER_H_
+#endif  // SINGA_MODEL_LAYER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/src/core/device/cpp_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_device.cc b/src/core/device/cpp_device.cc
index d0e051e..763156c 100644
--- a/src/core/device/cpp_device.cc
+++ b/src/core/device/cpp_device.cc
@@ -44,4 +44,4 @@ void CppDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
                            CopyDirection direction, Context* ctx) {
   memcpy(dst, src, nBytes);
 }
-}
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/src/core/device/cuda_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_device.cc b/src/core/device/cuda_device.cc
index 1f6de60..9be1a6e 100644
--- a/src/core/device/cuda_device.cc
+++ b/src/core/device/cuda_device.cc
@@ -16,11 +16,11 @@
  * limitations under the License.
  */
 #ifdef USE_CUDA
-#include <chrono>
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <curand.h>
+#include <chrono>
 
 #include "singa/core/device.h"
 #include "singa/utils/cuda.h"
@@ -47,10 +47,10 @@ CudaDevice::CudaDevice(int id, int num_executors,
                        string scheduler, string vm)
     : Device(id, num_executors, scheduler, vm) {
   device_type_ = kCuda;
-  host_ = nullptr; // TODO(wangwei) add host device
-  ctx_.stream = NULL; // use the default sync stream
+  host_ = nullptr;  // TODO(wangwei) add host device
+  ctx_.stream = NULL;  // use the default sync stream
   // TODO(wangwei) create one handle for each steam?
-  CUBLAS_CHECK(cublasCreate(&ctx_.cublas_handle));
+  CUDA_CHECK(cudaSetDevice(FindDevice(0)));
   // use curandCreateGeneratorHost for CudaHost device
   CURAND_CHECK(
       curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
@@ -58,6 +58,7 @@ CudaDevice::CudaDevice(int id, int num_executors,
   SetRandSeed(seed);
   // TODO(wangwei) if one generator per stream, then need diff offset per gen?
   CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
 
 #ifdef USE_CUDNN
   // TODO(wangwei) create one handle for each stream?
@@ -86,14 +87,14 @@ void CudaDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
 /// Allocate cpu memory.
 void* CudaDevice::Malloc(int size) {
   void* ptr = nullptr;
-  cudaMalloc(&ptr, size);
+  CUDA_CHECK(cudaMalloc(&ptr, size));
   return ptr;
 }
 
   /// Free cpu memory.
 void CudaDevice::Free(void* ptr) {
   CHECK_NE(ptr, nullptr);
-  cudaFree(ptr);
+  CUDA_CHECK(cudaFree(ptr));
 }
 
 
@@ -152,5 +153,5 @@ int CudaDevice::FindDevice(const int start_id) {
 }
 
 
-}
+}  // namespace singa
 #endif  // USE_CUDA
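
The CUDA_CHECK/CURAND_CHECK macros used above are pulled in via
singa/utils/cuda.h (included at the top of this file); macros of this kind
conventionally evaluate the call once and abort with the decoded error string
on failure. A rough sketch of the idea (names hypothetical, not the actual
SINGA macro):

  #include <cuda_runtime.h>
  #include "singa/utils/logging.h"

  // Run the CUDA call once, then verify it returned cudaSuccess.
  #define CUDA_CHECK_SKETCH(expr)                                          \
    do {                                                                    \
      cudaError_t e = (expr);                                               \
      CHECK_EQ(e, cudaSuccess) << "CUDA error: " << cudaGetErrorString(e);  \
    } while (0)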

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 153637c..73bb5c1 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -54,8 +54,10 @@ void Device::CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
                             int src_offset) {
   this->Exec(
       [this, dst, src, nBytes, direct, dst_offset, src_offset](Context* ctx) {
-        this->CopyToFrom((Byte*)dst->mutable_data() + dst_offset,
-                         (Byte*)src->data() + src_offset, nBytes, direct, ctx);
+        this->CopyToFrom(
+            reinterpret_cast<char*>(dst->mutable_data()) + dst_offset,
+            reinterpret_cast<char*>(src->data()) + src_offset, nBytes,
+            direct, ctx);
       },
       {src}, {dst});
 }
@@ -63,7 +65,7 @@ void Device::CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
 void Device::CopyDataFromHostPtr(Blob* dst, const void* src, size_t nBytes,
                                  size_t dst_offset) {
   auto direct = device_type_ == kCpp ? kHostToHost : kHostToDevice;
-  void* dstptr = (Byte*)dst->mutable_data() + dst_offset;
+  void* dstptr = reinterpret_cast<char*>(dst->mutable_data()) + dst_offset;
   Exec([this, dstptr, src, nBytes,
         direct](Context* ctx) { CopyToFrom(dstptr, src, nBytes, direct, ctx); },
        {}, {dst});
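
The casts here are what make the byte offsets legal: pointer arithmetic on
void* is not valid C++, so the blob pointers are viewed as char* before
dst_offset and src_offset (both byte counts) are applied; swapping the
C-style (Byte*) casts for reinterpret_cast is also what cpplint asks for.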

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 339262e..fac846c 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -71,12 +71,12 @@ Tensor::Tensor(Tensor&& t)
 }
 
 void Tensor::ResetLike(const Tensor& t) {
-  if (blob_ == nullptr || blob_->size() != t.MemSize()) {
+  if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
     if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     shape_ = t.shape_;
     device_ = t.device_;
     data_type_ = t.data_type_;
-    blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+    blob_ = device_->NewBlob(t.MemSize());
   }
 }
 
@@ -121,8 +121,7 @@ void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) {
       << "data_type is " << DataType_Name(data_type_)
       << " user given type is of size " << sizeof(DType);
   if (src != nullptr) {
-    auto direction = device_->type() == kCpp ? kHostToHost : kHostToDevice;
-    device_->CopyDataFromHostPtr(blob(), src, sizeof(DType) * num, direction);
+    device_->CopyDataFromHostPtr(blob(), src, sizeof(DType) * num, 0);
   } else {
     LOG(WARNING) << "Copy data from null host ptr";
   }
@@ -169,6 +168,7 @@ Tensor& Tensor::operator=(Tensor&& t) {
   if (blob_ != nullptr && blob_->DecRefCount() == 0)
     device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
+  data_type_ = t.data_type_;
   shape_ = std::move(t.shape_);
   device_ = t.device_;
   blob_ = t.blob_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
index 4d5f5d5..e049ade 100644
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@ -18,9 +18,14 @@
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
 #if CUDNN_MAJOR_VERSION >= 5
+
 #include "./cudnn_dropout.h"
+#include <cudnn.h>
+#include <chrono>
+
 #include "./cudnn_utils.h"
 #include "singa/utils/logging.h"
+
 namespace singa {
 CudnnDropout::~CudnnDropout() {
   if (drop_desc_ != nullptr)
@@ -29,7 +34,8 @@ CudnnDropout::~CudnnDropout() {
   if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
 }
 
-void CudnnDropout::InitCudnn(int size, DataType dtype, Context* ctx) {
+void CudnnDropout::InitCudnn(int size, DataType dtype, Device* dev,
+                             Context* ctx) {
   CHECK(!has_init_cudnn_);
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
@@ -41,10 +47,17 @@ void CudnnDropout::InitCudnn(int size, DataType dtype, Context* ctx) {
       y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
 
   cudnnDropoutGetStatesSize(ctx->cudnn_handle, &state_size_);
+  state_ = Tensor(Shape{state_size_}, dev, kChar);
   cudnnDropoutGetReserveSpaceSize(x_desc_, &reserve_size_);
+  mask_ = Tensor(Shape{reserve_size_}, dev, kChar);
+  // TODO(wangwei) update for async running,
+  // where reserve_size_ may not available
+  CHECK_EQ(reserve_size_, mask_.MemSize());
+
+  // TODO(wangwei) get seed from ctx or user config?
+  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
   cudnnSetDropoutDescriptor(drop_desc_, ctx->cudnn_handle, 1 - dropout_ratio_,
-                            state_.blob()->mutable_data(), state_size_,
-                            ctx->seed);
+                            state_.blob()->mutable_data(), state_size_, seed);
   has_init_cudnn_ = true;
 }
 
@@ -52,16 +65,13 @@ const Tensor CudnnDropout::Forward(int flag, const Tensor& input) {
   if (flag & kTrain) {
     auto size = input.Size();
     DataType dtype = input.data_type();
+    Device* dev = input.device();
     if (!has_init_cudnn_) {
       input.device()->Exec(
-          [size, dtype, this](Context* ctx) {
-            this->InitCudnn(size, dtype, ctx);
+          [size, dtype, this, dev](Context* ctx) {
+            this->InitCudnn(size, dtype, dev, ctx);
           },
           {}, {this->state_.blob()});
-      mask_.ResetLike(input);
-      // TODO(wangwei) update for async running,
-      // where reserve_size_ may not available
-      CHECK_EQ(reserve_size_, mask_.MemSize());
     }
     Tensor output;
     output.ResetLike(input);
@@ -71,7 +81,7 @@ const Tensor CudnnDropout::Forward(int flag, const Tensor& input) {
                *mblob = mask_.blob();
           cudnnDropoutForward(ctx->cudnn_handle, this->drop_desc_,
                               this->x_desc_, inblob->data(), this->y_desc_,
-                              outblob->mutable_data(), mblob,
+                              outblob->mutable_data(), mblob->mutable_data(),
                               this->reserve_size_);
         },
         {input.blob()}, {output.blob(), mask_.blob()});

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index db0aa15..647eed2 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -21,9 +21,11 @@
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
 #if CUDNN_MAJOR_VERSION >= 5
+#include <cudnn.h>
 #include <utility>
 #include <string>
 #include <vector>
+
 #include "./dropout.h"
 #include "singa/core/common.h"
 #include "singa/model/layer.h"
@@ -41,12 +43,12 @@ class CudnnDropout : public Dropout {
                                                    const Tensor& grad) override;
 
   /// Init cudnn related data structures.
-  void InitCudnn(int size, DataType dtype, Context* ctx);
+  void InitCudnn(int size, DataType dtype, Device* dev, Context* ctx);
 
  private:
   bool has_init_cudnn_ = false;
-  cudnnDropoutDescriptor_t drop_desc_;
-  cudnnTensorDescriptor_t x_desc_, y_desc_;
+  cudnnDropoutDescriptor_t drop_desc_ = nullptr;
+  cudnnTensorDescriptor_t x_desc_ = nullptr, y_desc_ = nullptr;
   size_t state_size_, reserve_size_;
   Tensor state_;
 };

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/test/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f362968..de64abd 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -6,5 +6,6 @@ AUX_SOURCE_DIRECTORY(singa singa_test_source)
 ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source})
 ADD_DEPENDENCIES(test_singa singa_core singa_utils)
 MESSAGE(STATUS "link libs" ${singa_linker_libs})
-TARGET_LINK_LIBRARIES(test_singa gtest singa_core singa_utils proto protobuf)
+TARGET_LINK_LIBRARIES(test_singa gtest singa_core singa_utils proto protobuf
+    ${SINGA_LINKER_LIBS})
 SET_TARGET_PROPERTIES(test_singa PROPERTIES LINK_FLAGS "${LINK_FLAGS} -pthread")

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/test/singa/test_cpp_device.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_device.cc b/test/singa/test_cpp_device.cc
index d2c0149..c302206 100644
--- a/test/singa/test_cpp_device.cc
+++ b/test/singa/test_cpp_device.cc
@@ -34,7 +34,7 @@ TEST(CppDevice, MemoryMallocFree) {
   CppDevice dev(0, 1);
   Blob* b = dev.NewBlob(4);
   EXPECT_NE(nullptr, b);
-  EXPECT_EQ(4, b->size());
+  EXPECT_EQ(4u, b->size());
   dev.FreeBlob(b);
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_dropout.cc b/test/singa/test_cudnn_dropout.cc
new file mode 100644
index 0000000..9913074
--- /dev/null
+++ b/test/singa/test_cudnn_dropout.cc
@@ -0,0 +1,127 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#ifdef USE_CUDNN
+// cudnn dropout is added in cudnn 5
+//#if CUDNN_MAJOR_VERSION >= 5
+
+#include "../src/model/layer/cudnn_dropout.h"
+#include "gtest/gtest.h"
+
+inline bool GetBitValue(const char* x, int pos) {
+  const unsigned char BitMask[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  int idx = pos / 8;
+  int offset = pos % 8;
+  return x[idx] & BitMask[offset];
+}
+
+using singa::CudnnDropout;
+TEST(CudnnDropout, Setup) {
+  CudnnDropout drop;
+  EXPECT_EQ("CudnnDropout", drop.layer_type());
+
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(0.8);
+
+  drop.Setup(conf);
+  EXPECT_EQ(0.8f, drop.dropout_ratio());
+}
+
+TEST(CudnnDropout, Forward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::CudaDevice cuda(0, 1);
+  singa::Tensor in(singa::Shape{n}, &cuda);
+  in.CopyDataFromHostPtr(x, n);
+
+  float pdrop = 0.5;
+  CudnnDropout drop;
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(pdrop);
+  drop.Setup(conf);
+
+  singa::Tensor out1 = drop.Forward(singa::kTrain, in);
+
+  singa::Tensor mask(drop.mask().shape(), drop.mask().data_type());
+  mask.CopyData(drop.mask());
+  const char* mptr = mask.data<const char*>();
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(0, GetBitValue(mptr, i) * (GetBitValue(mptr, i) - 1));
+
+  singa::CppDevice host(0, 1);
+  out1.ToDevice(&host);
+  const float* outptr1 = out1.data<const float*>();
+  EXPECT_EQ(n, out1.Size());
+  float scale = 1.0f / (1.0f - pdrop);
+  // the output value should be 0 or the same as the input
+  EXPECT_EQ(0.f, outptr1[0] * (outptr1[0] - scale * x[0]));
+  EXPECT_EQ(0.f, outptr1[1] * (outptr1[1] - scale * x[1]));
+  EXPECT_EQ(0.f, outptr1[7] * (outptr1[7] - scale * x[7]));
+
+  singa::Tensor out2 = drop.Forward(singa::kEval, in);
+  out2.ToDevice(&host);
+  EXPECT_EQ(n, out2.Size());
+  const float* outptr2 = out2.data<const float*>();
+  // the output value should be the same as the input
+  EXPECT_EQ(x[0], outptr2[0]);
+  EXPECT_EQ(x[1], outptr2[1]);
+  EXPECT_EQ(x[7], outptr2[7]);
+}
+
+TEST(CudnnDropout, Backward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::CudaDevice cuda(0, 1);
+  singa::Tensor in(singa::Shape{n}, &cuda);
+  in.CopyDataFromHostPtr(x, n);
+
+  float pdrop = 0.5;
+  float scale = 1.0f / (1.0f - pdrop);
+
+  CudnnDropout drop;
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(pdrop);
+  drop.Setup(conf);
+  singa::Tensor out1 = drop.Forward(singa::kTrain, in);
+
+  const float dy[] = {4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f};
+  singa::Tensor grad(singa::Shape{n}, &cuda);
+  grad.CopyDataFromHostPtr(dy, n);
+
+  const auto ret = drop.Backward(singa::kTrain, grad);
+  singa::CppDevice host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToDevice(&host);
+  const float* dx = in_grad.data<const float*>();
+
+  singa::Tensor mask(drop.mask().shape(), drop.mask().data_type());
+  mask.CopyData(drop.mask());
+  const char* mptr = mask.data<const char*>();
+
+
+  EXPECT_FLOAT_EQ(dx[0], dy[0] * GetBitValue(mptr, 0) * scale);
+  EXPECT_FLOAT_EQ(dx[1], dy[1] * GetBitValue(mptr, 1) * scale);
+  EXPECT_FLOAT_EQ(dx[7], dy[7] * GetBitValue(mptr, 7) * scale);
+}
+//#endif  // CUDNN_MAJOR_VERSION >= 5
+#endif  // USE_CUDNN
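
The test treats the cuDNN dropout reserve space as a bit mask, one bit per
input element, which is why drop.mask() is copied into a kChar tensor and
read through GetBitValue. As a worked example of the bit arithmetic: pos = 10
gives idx = 10 / 8 = 1 and offset = 10 % 8 = 2, so the check is
x[1] & BitMask[2], i.e. x[1] & 0x04.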

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/test/singa/test_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dropout.cc b/test/singa/test_dropout.cc
index 3190ecd..d648ff8 100644
--- a/test/singa/test_dropout.cc
+++ b/test/singa/test_dropout.cc
@@ -23,7 +23,7 @@
 #include "gtest/gtest.h"
 
 using singa::Dropout;
-TEST(DropoutLayer, Setup) {
+TEST(Dropout, Setup) {
   Dropout drop;
   EXPECT_EQ("Dropout", drop.layer_type());
 
@@ -35,7 +35,7 @@ TEST(DropoutLayer, Setup) {
   EXPECT_EQ(0.8f, drop.dropout_ratio());
 }
 
-TEST(DropoutLayer, Forward) {
+TEST(Dropout, Forward) {
   const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
   size_t n = sizeof(x) / sizeof(float);
   singa::Tensor in(singa::Shape{n});
@@ -51,11 +51,11 @@ TEST(DropoutLayer, Forward) {
 
   singa::Tensor out1 = drop.Forward(singa::kTrain, in);
 
-  const float* mptr = static_cast<const float*>(drop.mask().blob()->data());
+  const float* mptr = drop.mask().data<const float*>();
   for (size_t i = 0; i < n; i++)
     EXPECT_FLOAT_EQ(0, mptr[i] * (mptr[i] - scale));
 
-  const float* outptr1 = static_cast<const float*>(out1.blob()->data());
+  const float* outptr1 = out1.data<const float*>();
   EXPECT_EQ(n, out1.Size());
   // the output value should be 0 or the same as the input
   EXPECT_EQ(0.f, outptr1[0] * (outptr1[0] - scale * x[0]));
@@ -64,14 +64,14 @@ TEST(DropoutLayer, Forward) {
 
   singa::Tensor out2 = drop.Forward(singa::kEval, in);
   EXPECT_EQ(n, out2.Size());
-  const float* outptr2 = static_cast<const float*>(out2.blob()->data());
+  const float* outptr2 = out2.data<const float*>();
   // the output value should be the same as the input
   EXPECT_EQ(x[0], outptr2[0]);
   EXPECT_EQ(x[1], outptr2[1]);
   EXPECT_EQ(x[7], outptr2[7]);
 }
 
-TEST(DropoutLayer, Backward) {
+TEST(Dropout, Backward) {
   const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
   size_t n = sizeof(x) / sizeof(float);
   singa::Tensor in(singa::Shape{n});
@@ -91,9 +91,9 @@ TEST(DropoutLayer, Backward) {
   singa::Tensor grad(singa::Shape{n});
   grad.CopyDataFromHostPtr(dy, n);
 
-  const float* mptr = static_cast<const float*>(drop.mask().blob()->data());
+  const float* mptr = drop.mask().data<const float*>();
   const auto ret = drop.Backward(singa::kTrain, grad);
-  const float* dx = static_cast<const float*>(ret.first.blob()->data());
+  const float* dx = ret.first.data<const float*>();
   EXPECT_FLOAT_EQ(dx[0], dy[0] * (mptr[0] > 0 ? 1.0f : 0.0f) * scale);
   EXPECT_FLOAT_EQ(dx[1], dy[1] * (mptr[1] > 0) * scale);
   EXPECT_FLOAT_EQ(dx[7], dy[7] * (mptr[7] > 0) * scale);
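
These expectations encode inverted dropout: a kept unit is scaled by
1/(1 - p) at training time, so E[out[i]] = (1 - p) * x[i]/(1 - p) + p * 0 =
x[i], and evaluation can pass the input through unscaled, which is exactly
what the kEval assertions in the Forward test verify.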

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0b4b2e20/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index ccd91a0..eee18ec 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -23,7 +23,7 @@ class TestTensorMath : public ::testing::Test {
 TEST_F(TestTensorMath, MemberAddTensor) {
   Tensor aa = a.Clone();
   aa += a;
-  const float* dptr = aa.data<float>();
+  const float* dptr = aa.data<const float*>();
   EXPECT_FLOAT_EQ(2.0f, dptr[0]);
   EXPECT_FLOAT_EQ(4.0f, dptr[1]);
   EXPECT_FLOAT_EQ(6.0f, dptr[2]);
@@ -31,13 +31,13 @@ TEST_F(TestTensorMath, MemberAddTensor) {
   // check p is initialized to 0
   Tensor p(Shape{6});
   p += aa;
-  const float* dptr1 = p.data<float>();
+  const float* dptr1 = p.data<const float*>();
   EXPECT_FLOAT_EQ(2.0f, dptr1[0]);
   EXPECT_FLOAT_EQ(4.0f, dptr1[1]);
   EXPECT_FLOAT_EQ(6.0f, dptr1[2]);
 
   a += b;
-  const float* dptr2 = a.data<float>();
+  const float* dptr2 = a.data<const float*>();
   EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
   EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
   EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
@@ -48,21 +48,21 @@ TEST_F(TestTensorMath, MemberAddTensor) {
 TEST_F(TestTensorMath, AddTensors) {
   Tensor ret(a.shape(), a.device(), a.data_type());
   Add(a, b, &ret);
-  const float* dptr = ret.data<float>();
+  const float* dptr = ret.data<const float*>();
   EXPECT_FLOAT_EQ(2.1f, dptr[0]);
   EXPECT_FLOAT_EQ(4.1f, dptr[1]);
   EXPECT_FLOAT_EQ(6.1f, dptr[2]);
   EXPECT_FLOAT_EQ(12.1f, dptr[5]);
 
   const Tensor d = a + b;
-  const float* dptr2 = d.data<float>();
+  const float* dptr2 = d.data<const float*>();
   EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
   EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
   EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
   EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
 
   Add(a, b, &a);
-  const float* dptr1 = a.data<float>();
+  const float* dptr1 = a.data<const float*>();
   EXPECT_FLOAT_EQ(2.1f, dptr1[0]);
   EXPECT_FLOAT_EQ(4.1f, dptr1[1]);
   EXPECT_FLOAT_EQ(6.1f, dptr1[2]);


[39/50] [abbrv] incubator-singa git commit: SINGA-168 Implement Cpp Math functions APIs

Posted by zh...@apache.org.
SINGA-168 Implement Cpp Math functions APIs

Update error log for tensor_math.h to include the function name, e.g.,
"Foo is not implemented".

Add Tensor Math Cpp Implementation and Test Cases


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/07c49da5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/07c49da5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/07c49da5

Branch: refs/heads/master
Commit: 07c49da5b1ee6582780f5faef6c6bf3418a7a0b6
Parents: 01aaf49
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Fri Jun 3 20:46:16 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 src/core/tensor/tensor_math.h     | 293 +++++++++----------
 src/core/tensor/tensor_math_cpp.h | 508 ++++++++++++++++++++++++---------
 test/singa/test_tensor_math.cc    | 264 ++++++++++++++++-
 3 files changed, 774 insertions(+), 291 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index ff865e0..1bf6fc7 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -50,277 +50,259 @@ namespace singa {
 // ================Linear algebra functions====================================
 /// ret[i] = |input[i]|
 template <typename DType, typename Lang>
-void Abs(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Abs Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Set(int count, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Set Not Implemented";
 }
+
 /// sum all elements of input into ret
 template <typename DType, typename Lang>
-void Sum(int count, const Blob *input, DType *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Sum Not Implemented";
 }
 
 /// ret[i] = sign(input[i])
 template <typename DType, typename Lang>
-void Sign(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sign Not Implemented";
 }
 
 /// Base is e, Napier's constant. ret[i]=exp(input[i])
 template <typename DType, typename Lang>
-void Exp(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Exp Not Implemented";
 }
 
 /// Natural logarithm, base e, Napier's constant: ret[i]=log(input[i]).
 template <typename DType, typename Lang>
-void Log(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Log Not Implemented";
 }
-
 /// Element-wise operation, ret[i]=sqrt([input[i])
 template <typename DType, typename Lang>
-void Sqrt(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sqrt Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=square([input[i])
 template <typename DType, typename Lang>
-void Square(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Square Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=tanh([input[i])
 template <typename DType, typename Lang>
-void Tanh(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Tanh Not Implemented";
 }
 /// Element-wise operation, ret[i]=max(0, input[i])
 template <typename DType, typename Lang>
-void ReLU(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "ReLU Not Implemented";
 }
 /// Element-wise operation, ret[i]=sigmoid([input[i])
 template <typename DType, typename Lang>
-void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
-/// Do softmax for each row invidually
+// Do softmax for each row individually
 template <typename DType, typename Lang>
-void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Softmax(const size_t nrow, const size_t ncol, const Blob *in, 
+	     Blob *out, Context *ctx) {
+  LOG(FATAL) << "Softmax Not Implemented";
 }
 
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, 
+	     Blob *out, Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
 }
 
 /// Sum the columns of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(int nrow, int ncol, const Blob *input, Blob *ret,
-                Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, 
+	        Blob *out, Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
 }
 
 // TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of ret
+/// Add the vector v to every row of A as the row of out 
 template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
-            Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
 }
 
-/// Add the vector v to every column of A as the column of ret
+/// Add the vector v to every column of A as the column of out
 template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
-            Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lang>
-void Pow(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Pow(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Pow(const size_t num, const Blob *in1, const Blob *in2,
+         Blob *out, Context *ctx) {
+  LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Clamp(int count, DType low, DType high, const Blob *input, Blob *ret,
-           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Clamp Not Implemented";
 }
 
 /// ret = input + x
 template <typename DType, typename Lang>
-void Add(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Add(const size_t num, const Blob *in, const DType x,
+         Blob *out, Context *ctx) {
+  LOG(FATAL) << "Add Not Implemented";
 }
+
+/// ret = lhs + rhs
+template <typename DType, typename Lang>
+void Add(const size_t num, const Blob *in1, const Blob *in2,
+         Blob *out, Context *ctx) {
+  LOG(FATAL) << "Add-Pair Not Implemented";
+}
+
 /// ret =  input - x
 template <typename DType, typename Lang>
-void Sub(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  Add<DType, Lang>(count, input, -x, ret, ctx);
+void Sub(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  Add<DType, Lang>(num, in, -x, out, ctx);
 }
-/// ret = input * x
+
+/// ret = lhs - rhs
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob *input, DType x, Blob *ret,
-                 Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sub(const size_t num, const Blob *in1, const Blob *in2,
+         Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sub-Pair Not Implemented";
 }
-/// ret = input / x
+
+/// ret = input * x
 template <typename DType, typename Lang>
-void Div(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  EltwiseMult<DType, Lang>(count, input, DType(1) / x, ret, ctx);
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
-/// ret = lhs + rhs
+/// ret = lhs * rhs
 template <typename DType, typename Lang>
-void Add(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2,
+                 Blob *out, Context *ctx) {
+  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
 
-/// ret = lhs - rhs
+/// ret = input / x
 template <typename DType, typename Lang>
-void Sub(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const DType x, const Blob *in,
+         Blob *out, Context *ctx) {
+  LOG(FATAL) << "Div Not Implemented";
 }
 
-/// ret = lhs * rhs
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob *lhs, const Blob *rhs, Blob *ret,
-                 Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  CHECK_NE(x, 0.f);
+  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
 /// ret = lhs / rhs
 template <typename DType, typename Lang>
-void Div(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const Blob *in1, const Blob *in2,
+         Blob *out, Context *ctx) {
+  LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
 /// outer-product.
 /// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(int m, int n, const Blob *lhs, const Blob *rhs, Blob *ret,
-           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Outer Not Implemented";
 }
 
 /// ret[i]=(input[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void LT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void LT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "LT Not Implemented";
 }
 /// ret[i]=(input[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void LE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void LE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "LE Not Implemented";
 }
 /// ret[i]=(input[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void GT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "GT Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+/// ret[i]=(input[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void GE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "GE Not Implemented";
 }
 
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // ===== Level 1
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(int count, const Blob *input, int *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+  LOG(FATAL) << "Amax Not Implemented";
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(int count, const Blob *input, int *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+  LOG(FATAL) << "Amin Not Implemented";
 }
 /// ret = sum |x| for all x in input
 template <typename DType, typename Lang>
-void Asum(int count, const Blob *input, DType *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Asum Not Implemented";
 }
 
 /// ret = alpha * input + ret
 template <typename DType, typename Lang>
-void Axpy(int count, DType alpha, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Axpy(const size_t num, const DType alpha, const Blob *in,
+          Blob *out, Context *ctx) {
+  LOG(FATAL) << "Axpy Not Implemented";
 }
 
 /// ret *= x
 template <typename DType, typename Lang>
-void Scale(int count, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Scale Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
-         Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Dot(const size_t num, const Blob *in1, const Blob *in2,
+         DType *out, Context *ctx) {
+  LOG(FATAL) << "Dot Not Implemented";
 }
 
 // ===== Level 2
 /// ret = alpha * op(A) * v + beta * ret.
 /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
 template <typename DType, typename Lang>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
-          DType beta, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== Level 3
-
-// ================Random functions===========================================
-/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-// Get the random generator from 'ctx'
-// If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lang>
-void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lang>
-void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lang>
-void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ========follow the consistency guide of math API
-
-template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// Divide alpha by each element of 'in'.
-template <typename DType, typename Lang>
-void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
-         Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+          const Blob *A, const Blob *v, const DType beta, Blob *out,
+          Context *ctx) {
+  LOG(FATAL) << "GEMV Not Implemented";
 }
 
 /// multiply a matrix with a diagonal matrix constructed using values from 'v'.
@@ -328,7 +310,7 @@ void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
 template <typename DType, typename Lang>
 void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG(FATAL) << "DGMM Not Implemented";
 }
 
 /// C = alpha * A * B + beta * C.
@@ -338,32 +320,37 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
           const size_t ncolB, const size_t ncolA, const DType alpha,
           const Blob *A, const Blob *B, const DType beta, Blob *C,
           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG(FATAL) << "GEMM Not Implemented";
 }
-/// ret[i]=(input[i]<x)?1.f:0.f
-template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// ret[i]=(input[i]<=x)?1.f:0.f
+
+
+// ===== Level 3
+
+// ================Random functions===========================================
+/// Each element of ret would be 1 with prob p and 0 with prob 1-p, 0 <= p <= 1.
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Bernoulli Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Uniform(const size_t num, const float low, const float high,
+             Blob *out, Context *ctx) {
+  LOG(FATAL) << "Uniform Not Implemented";
 }
-/// ret[i]=(input[i]>=x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Gaussian(const size_t num, const float mean, const float std,
+              Blob *out, Context *ctx) {
+  LOG(FATAL) << "Gaussian Not Implemented";
 }
 
+
+
+
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_

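For orientation: each stub above is specialized per data type and backend
language, with calls dispatched at compile time via the <DType, Lang> template
arguments. A minimal sketch of that pattern, mirroring the float/CPU
specialization of Add that appears in tensor_math_cpp.h below:

    template <>
    void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
                               Blob *out, Context *ctx) {
      float *outPtr = static_cast<float *>(out->mutable_data());
      const float *inPtr = static_cast<const float *>(in->data());
      for (size_t i = 0; i < num; i++) outPtr[i] = inPtr[i] + x;
    }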
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 693f09c..ec7a892 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -27,195 +27,317 @@
 
 /// TODO(wangwei) Clean the implementations following the comments in
 /// tensor_math.h.
-/// For Blob argument xxx, name its pointer as xxxPtr.
 namespace singa {
+
+template<>
+void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = fabs(inPtr[i]);
+  }
+}
+
 template <>
-void Square<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                              Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *in = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = in[i] * in[i];
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+
+// sum all elements of input into out
+// TODO(wangwei) optimize using omp
+template <>
+void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
+  float s = 0.f;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    s += inPtr[i];
   }
+  *out = s;
 }
 
 template <>
-void Add<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] + rptr[i];
+void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float*>(in->data());
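+  // Note: this returns 1 for positive inputs and 0 otherwise (a step
+  // function rather than a -1/0/1 signum); the MemberSign unit test below
+  // expects exactly this behaviour.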
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; 
   }
 }
 
 template <>
-void Add<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] + x;
+void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = exp(inPtr[i]);
   }
 }
 
 template <>
-void Sub<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] - rptr[i];
+void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = log(inPtr[i]);
   }
 }
 
-// sum all elements of input into ret
-// TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(int count, const Blob *input, float *ret,
-                           Context *ctx) {
-  float s = 0.f;
-  const float *in = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    s += in[i];
+void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_GE(inPtr[i], 0.f);
+    outPtr[i] = sqrt(inPtr[i]);
   }
-  *ret = s;
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *input, float x,
-                                   Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] * x;
+void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                                   Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] * rptr[i];
+void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = tanh(inPtr[i]);
   }
 }
 
 template <>
-void Exp<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = exp(lptr[i]);
+void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
   }
 }
 
 template <>
-void Log<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    CHECK_GT(lptr[i], 0.f);
-    dptr[i] = log(lptr[i]);
+void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
   }
 }
 
 template <>
-void Tanh<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                            Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = tanh(lptr[i]);
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
+             Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+	float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+		float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+			bPtr[c] = exp(inPtr[offset + c]);
+			denom += bPtr[c];
+    }
+		for (size_t c = 0; c < ncol; c++) {
+			size_t idx = offset + c;
+			outPtr[idx] = bPtr[c] / denom;
+		}
   }
+	delete[] bPtr;
 }
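+// NOTE: the Softmax above exponentiates raw inputs, which can overflow for
+// large values; subtracting the per-row max before exp() is the standard
+// numerically stable variant.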
 
 template <>
-void ReLU<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                            Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = (lptr[i] >= 0.f) ? lptr[i] : 0.f;
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
+             Blob *out, Context *ctx) {
+	float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());                              
+	for (size_t r = 0; r < nrow; r++) {
+		size_t offset = r * ncol;
+		outPtr[r] = 0.f;
+		for (size_t c = 0; c < ncol; c++) {
+			outPtr[r] += inPtr[offset + c];
+		}
+	}
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, Context *ctx) {
+	float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());                              
+	for (size_t c = 0; c < ncol; c++) {
+		outPtr[c] = 0.f;
+	}
+	for (size_t r = 0; r < nrow; r++) {
+		size_t offset = r * ncol;
+		for (size_t c = 0; c < ncol; c++) {
+				outPtr[c] += inPtr[offset + c];
+		}
+	}
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+	float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());                              
+  const float *vPtr = static_cast<const float *>(v->data());                              
+	for (size_t r = 0; r < nrow; r++) {
+		size_t offset = r * ncol;
+		for (size_t c = 0; c < ncol; c++) {
+			outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+		}
+	}
+}
+
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+	float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());                              
+  const float *vPtr = static_cast<const float *>(v->data());                              
+	for (size_t r = 0; r < nrow; r++) {
+		size_t offset = r * ncol;
+		for (size_t c = 0; c < ncol; c++) {
+			outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+		}
+	}
+}
+
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob *out, Context *ctx) {
+	float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());                              
+	for (size_t i = 0; i < num; i++) {
+    outPtr[i] = pow(inPtr[i], x);
   }
 }
 
 template <>
-void Sigmoid<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                               Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = 1.f / (1.f + exp(-lptr[i]));
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr= static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
   }
 }
 
 template <>
-void Pow<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = pow(lptr[i], x);
+void Clamp<float, lang::Cpp>(const size_t num, const float low,
+                             const float high, const Blob *in, Blob *out,
+                             Context *ctx) {
+	float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());                              
+	for (size_t i = 0; i < num; i++) {
+		if (inPtr[i] > high) {
+			outPtr[i] = high;
+		}
+		else if (inPtr[i] < low) {
+			outPtr[i] = low;
+		}
+		else {
+			outPtr[i] = inPtr[i];			
+		}
+	}
+}
+
+template <>
+void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x, 
+													 Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] + x;
   }
 }
 
 template <>
-void Pow<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = pow(lptr[i], rptr[i]);
+void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] + in2Ptr[i];
   }
 }
 
 template <>
-void Bernoulli<float, lang::Cpp>(int count, float p, Blob *ret, Context *ctx) {
-  std::bernoulli_distribution distribution(p);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] - in2Ptr[i];
   }
 }
 
 template <>
-void Uniform<float, lang::Cpp>(int count, float low, float high, Blob *ret,
-                               Context *ctx) {
-  std::uniform_real_distribution<float> distribution(low, high);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                                   Blob *out, Context *ctx) {
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] * x;
   }
 }
 
 template <>
-void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob *ret,
-                                Context *ctx) {
-  std::normal_distribution<float> distribution(mean, std);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                                   Blob *out, Context *ctx) {
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] * in2Ptr[i];
   }
 }
 
-// follow the consistency guide of math API
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                                   Blob *out, Context *ctx) {
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(in2Ptr[i], 0.f);
+    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+  }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = alpha / inPtr[i];
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(inPtr[i], 0.f);
+    outPtr[i] = x / inPtr[i];
+  }
 }
+
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
+	float *outPtr= static_cast<float *>(out->mutable_data());
+	const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+	for (size_t r = 0; r < m ; r++) {
+		size_t offset = r * n;
+		for (size_t c = 0; c < n; c++) {
+			outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+		}
+	}
+}
+
 template <>
 void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
                           Blob *out, Context *ctx) {
@@ -227,6 +349,125 @@ void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
 }
 
 template <>
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+	size_t maxPos = 0;
+	float maxVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+	for (size_t i = 0; i < num; i++) {
+		if (i == 0) {
+			maxVal = inPtr[i]; 
+		}
+		else if (inPtr[i] > maxVal) {
+			maxVal = inPtr[i];
+			maxPos = i;
+		}
+	}
+	*out = maxPos;
+}
+
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+	size_t minPos = 0;
+	float minVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+	for (size_t i = 0; i < num; i++) {
+		if (i == 0) {
+			minVal = inPtr[i]; 
+		}
+		else if (inPtr[i] < minVal) {
+			minVal = inPtr[i];
+			minPos = i;
+		}
+	}
+	*out = minPos;
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
+	float sum = 0;
+	const float *inPtr = static_cast<const float *>(in->data());
+	for (size_t i = 0; i < num; i++) {
+		sum += fabs(inPtr[i]);
+	}
+	*out = minPos == 0 && num == 0 ? 0.f : sum, *out = sum;
+
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+	for (size_t i = 0; i < num; i++) {	
+		outPtr[i] += alpha * inPtr[i];
+	}
+}
+
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+	float *outPtr = static_cast<float *>(out->mutable_data());
+	for (size_t i = 0; i < num; i++) {
+		outPtr[i] *= x;
+	}
+}
+
+//template <>
+//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+//         									 float *out, Context *ctx) {
+//	float sum = 0;
+//	const float *in1Ptr = static_cast<const float *>(in1->data());
+//	const float *in2Ptr = static_cast<const float *>(in2->data());
+//	for (size_t i = 0; i < num; i++) {
+//		sum += in1Ptr[i] * in2Ptr[i];
+//	}
+//	*out = sum;
+//}
+
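+// GEMV assumes row-major storage: op(A)(r, c) reads A[r * n + c], or
+// A[c * m + r] when trans is true (i.e., op(A) = A^T with m rows, n cols).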
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+	float *outPtr = static_cast<float *>(out->mutable_data());
+	const float* APtr = static_cast<const float *>(A->data());
+	const float* vPtr = static_cast<const float *>(v->data());
+	for (size_t r = 0; r < m; r++) {
+		float sum = 0; 
+		for (size_t c = 0; c < n; c++) {
+			size_t idx = trans ? c * m + r : r * n + c;	
+			sum += APtr[idx] * vPtr[c];
+		}
+		outPtr[r] = alpha * sum + beta * outPtr[r];
+	}
+}
+
+template <>
 void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
                             const size_t ncol, const Blob *M, const Blob *v,
                             Blob *out, Context *ctx) {
@@ -251,41 +492,35 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
 }
 
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
-}
-template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, Context *ctx) {
+  std::bernoulli_distribution distribution(p);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
   }
 }
 
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+void Uniform<float, lang::Cpp>(const size_t num, const float low, const float high, Blob *out,
+                               Context *ctx) {
+  std::uniform_real_distribution<float> distribution(low, high);
+  float *outPtr= static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean, const float std, Blob *out,
+                                Context *ctx) {
+  std::normal_distribution<float> distribution(mean, std);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
+
 #ifdef USE_CBLAS
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -314,7 +549,6 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
 }
 
 #endif  // USE_CBLAS
-
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 170b96c..823445f 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -11,15 +11,277 @@ protected:
     b.Reshape(singa::Shape{6});
     c.Reshape(singa::Shape{6, 1});
     d.Reshape(singa::Shape{3, 2});
+    e.Reshape(singa::Shape{3, 2});
 
     a.CopyDataFromHostPtr<float>(dat1, 6);
     b.CopyDataFromHostPtr<float>(dat2, 6);
+    e.CopyDataFromHostPtr<float>(dat1, 6);
   }
-  Tensor a, b, c, d;
+  Tensor a, b, c, d, e;
   const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
 };
 
+TEST_F(TestTensorMath, MemberAbs) {
+	Tensor aa = a.Clone();
+	Tensor bb = b.Clone();
+	Tensor cc = aa - bb;
+	const float* dptr = cc.data<const float*>();
+	EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+
+	Tensor p = Abs(cc);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberExp) {
+	Tensor p = Exp(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLog) {
+	Tensor p = Log(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberReLU) {
+	Tensor aa = a.Clone();
+	Tensor cc = aa - 2.0f;
+	const float* dptr = cc.data<const float*>();
+	EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+	Tensor p = ReLU(cc);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSigmoid) {
+	Tensor p = Sigmoid(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(1.0f/(1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+  EXPECT_NEAR(1.0f/(1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f/(1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSign) {
+	Tensor aa = a.Clone();
+	Tensor cc = aa - 2.0f;
+	const float* dptr = cc.data<const float*>();
+	EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+	Tensor p = Sign(cc);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_EQ(0.0f, dptr1[0]);
+  EXPECT_EQ(0.0f, dptr1[1]);
+  EXPECT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberSqrt) {
+	Tensor p = Sqrt(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSquare) {
+	Tensor p = Square(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+  EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+  EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberTanh) {
+	Tensor p = Tanh(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, Sum) {
+	Tensor p1(Shape{1,2});
+	p1 = Sum(e, 0);
+  const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
+	EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
+	
+	Tensor p2(Shape{3,1});
+	p2 = Sum(e, 1);
+  const float *dptr2 = p2.data<const float *>();
+	EXPECT_FLOAT_EQ(3.0f,dptr2[0]);
+	EXPECT_FLOAT_EQ(7.0f,dptr2[1]);
+	EXPECT_FLOAT_EQ(11.0f,dptr2[2]);
+}
+
+TEST_F(TestTensorMath, SoftMax) {
+	Tensor p1(Shape{3,2});
+	p1 = SoftMax(e,0);
+  const float *dptr1 = p1.data<const float *>();
+	float sum = 0;
+	for(int i = 0; i < 6; i++) sum += exp(i+1);
+	EXPECT_NEAR(exp(1)/sum, dptr1[0],1e-5);
+	EXPECT_NEAR(exp(3)/sum, dptr1[2],1e-5);
+	EXPECT_NEAR(exp(5)/sum, dptr1[4],1e-5);
+	EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
+	EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
+	EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
+	
+	Tensor p2(Shape{3,2});
+	p2 = SoftMax(e,1); 
+  const float *dptr2 = p2.data<const float *>();
+	EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
+	EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLT) {
+	Tensor p1 = a < 2.0f;
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+	EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+	EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberLE) {
+	Tensor p1 = a <= 2.0f;
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+	EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+	EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGT) {
+	Tensor p1 = a > 2.0f;
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+	EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+	EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGE) {
+	Tensor p1 = a >= 2.0f;
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+	EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+	EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberPow) {
+	Tensor p1 = Pow(b,3.0f);
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(pow(1.1f,3.0f), dptr1[0]);
+	EXPECT_FLOAT_EQ(pow(2.1f,3.0f), dptr1[1]);
+	EXPECT_FLOAT_EQ(pow(3.1f,3.0f), dptr1[2]);
+
+	//TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the function is complete
+	//Tensor p2 = Pow(a,b);
+	//const float *dptr2 = p2.data<const float *>();
+	//EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+	//EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+	//EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+}
+
+
+TEST_F(TestTensorMath, MemberSub) {
+	Tensor p1 = a - b;
+	const float* dptr1 = p1.data<const float*>();
+	EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberEltwiseMult) {
+	Tensor p1 = a * b;
+	const float* dptr1 = p1.data<const float*>();
+	EXPECT_NEAR(1.0*1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0*2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0*3.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberDiv) {
+	Tensor p1 = a / b;
+	const float* dptr1 = p1.data<const float*>();
+	EXPECT_NEAR(1.0/1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0/2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0/3.1, dptr1[2], 1e-5);
+
+	Tensor p2 = Div(10.0f,b);
+	const float* dptr2 = p2.data<const float*>();
+	EXPECT_NEAR(10.0/1.1, dptr2[0], 1e-5);
+  EXPECT_NEAR(10.0/2.1, dptr2[1], 1e-5);
+  EXPECT_NEAR(10.0/3.1, dptr2[2], 1e-5);
+
+	Tensor p3 = a / 8.0f;
+	const float* dptr3 = p3.data<const float*>();
+	EXPECT_NEAR(1.0/8.0, dptr3[0], 1e-5);
+  EXPECT_NEAR(2.0/8.0, dptr3[1], 1e-5);
+  EXPECT_NEAR(3.0/8.0, dptr3[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberBernoulli) {
+	Tensor p1(Shape{10000});
+	Bernoulli(0.3,&p1);
+	const float* dptr1 = p1.data<const float*>();
+	float sum = 0;
+	for(int i = 0; i < 10000; i++) sum += dptr1[i];
+	float mean = sum/10000;
+	EXPECT_NEAR(mean, 0.3, 1e-2);
+
+	sum = 0;
+	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+	float variance = sum/9999;
+	EXPECT_NEAR(variance, 0.3*0.7, 1e-2);
+}
+
+TEST_F(TestTensorMath, MemberUniform) {
+	Tensor p1(Shape{10000});
+	Uniform(0.1f,0.2f,&p1);
+	const float* dptr1 = p1.data<const float*>();
+	float sum = 0;
+	for(int i = 0; i < 10000; i++) sum += dptr1[i];
+	float mean = sum/10000;
+	EXPECT_NEAR(mean, 0.15f, 1e-3);
+
+	sum = 0;
+	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+	float variance = sum/9999;
+	EXPECT_NEAR(variance, 0.01f/12, 1e-3);
+}
+
+TEST_F(TestTensorMath, MemberGaussian) {
+	Tensor p1(Shape{50000});
+	Gaussian(0.0,1.0,&p1);
+	const float* dptr1 = p1.data<const float*>();
+	float sum = 0;
+	for(int i = 0; i < 50000; i++) sum += dptr1[i];
+	float mean = sum/50000;
+	EXPECT_NEAR(mean, 0.0, 1e-2);
+
+	sum = 0;
+	for(int i = 0; i < 50000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+	float variance = sum/49999;
+	EXPECT_NEAR(variance, 1.0, 1e-2);
+}
+
+
+
 TEST_F(TestTensorMath, MemberAddTensor) {
   Tensor aa = a.Clone();
   aa += a;


[12/50] [abbrv] incubator-singa git commit: SINGA-176 - Add loss and metric base classes

Posted by zh...@apache.org.
SINGA-176 - Add loss and metric base classes

Add loss and metric base classes, and implement MSE as a subclass
of Loss and Accuracy as a subclass of Metric.

Add math functions to support the metric/loss classes.

Draft test files for MSE and Accuracy.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/d6800791
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/d6800791
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/d6800791

Branch: refs/heads/master
Commit: d680079165496da2787064d04daf283f5b3e7bba
Parents: 72923b1
Author: wangwei <wa...@gmail.com>
Authored: Sun May 22 23:12:30 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 26 14:09:53 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |  21 +-
 include/singa/model/loss.h         |  61 +++++
 include/singa/model/metric.h       |  57 +++++
 src/core/tensor/math_kernel.cu     | 421 ++++++++++++++++++++++++++++++++
 src/core/tensor/math_kernel.h      |  82 +++++++
 src/core/tensor/tensor.cc          |  96 +++++++-
 src/core/tensor/tensor_math.h      |  66 +++--
 src/core/tensor/tensor_math_cpp.h  |  54 ++++
 src/core/tensor/tensor_math_cuda.h |  34 ++-
 src/model/loss/mse.h               |  66 +++++
 src/model/metric/accuracy.h        |  82 +++++++
 src/proto/layer.proto              |  13 +-
 test/singa/test_accuracy.cc        |  35 +++
 test/singa/test_mse.cc             |  88 +++++++
 test/singa/test_tensor.cc          |   8 +-
 test/singa/test_tensor_math.cc     |   8 +-
 16 files changed, 1146 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 359f1ee..e560071 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -121,7 +121,7 @@ class Tensor {
   }
 
 /// Reset the tensor shape; it may reallocate the blob if MemSize() changes.
-  void ReShape(const Shape& shape);
+  void Reshape(const Shape& shape);
 
   /// Reset the shape, device, and data type as given tensor.
   /// If blob size changes, then reallocate a new blob. The previous blob would
@@ -138,6 +138,10 @@ class Tensor {
   /// Equivalent to ToDevice(host_dev).
   void ToHost();
 
+  /// Set each element of the tensor to be x
+  template<typename SType>
+  void SetValue(SType x);
+
   /// For init the tensor values, copy 'num' elements.
   template<typename DType>
   void CopyDataFromHostPtr(const DType* src, size_t num);
@@ -223,8 +227,23 @@ Tensor ReLU(const Tensor& t);
 Tensor Sigmoid(const Tensor& t);
 Tensor Sign(const Tensor& t);
 Tensor Sqrt(const Tensor& t);
+Tensor Square(const Tensor& t);
 Tensor Tanh(const Tensor& t);
 
+
+template<typename SType>
+SType Sum(const Tensor& t);
+/// Sum elements in the Tensor; currently only vectors and matrices are supported.
+/// if 'axis' is 0, sum all rows into a single row
+/// if 'axis' is 1, sum all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.sum
+Tensor Sum(const Tensor& t, int axis);
+
+/// Average elements in the Tensor; currently only vectors and matrices are supported.
+/// if 'axis' is 0, average all rows into a single row
+/// if 'axis' is 1, average all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.average
+Tensor Average(const Tensor& t, int axis);
 /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
 /// and shape_[axis+1]*...*shape_[nDim()] columns.
 /// and do softmax along each row.

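As a usage illustration of the axis-based reductions declared above (a hedged
sketch: the shape and values are invented, and the Sum results match the Sum
unit test later in this commit):

    singa::Tensor t(singa::Shape{3, 2});
    const float v[6] = {1, 2, 3, 4, 5, 6};
    t.CopyDataFromHostPtr<float>(v, 6);
    singa::Tensor rowSum = singa::Sum(t, 0);      // one row: {9, 12}
    singa::Tensor colAvg = singa::Average(t, 1);  // one column: {1.5, 3.5, 5.5}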
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
new file mode 100644
index 0000000..6c79e7b
--- /dev/null
+++ b/include/singa/model/loss.h
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LOSS_H_
+#define SINGA_MODEL_LOSS_H_
+#include "singa/proto/layer.pb.h"
+#include "singa/core/tensor.h"
+namespace singa {
+
+/// The base loss class, which declares the APIs for computing the objective
+/// score (loss) for a pair of prediction (from the model) and the target (i.e.
+/// the ground truth). It also computes the gradients of the objective w.r.t.
+/// the prediction. It has similar APIs as Layer.
+template <typename T = Tensor>
+class Loss {
+ public:
+  Loss() = default;
+  void Setup(const string& conf) {
+    LossConf loss;
+    loss.ParseFromString(conf);
+    Setup(loss);
+  }
+
+  /// Set meta fields from user configurations.
+  virtual void Setup(const LossConf& conf) {}
+
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target.
+  virtual Tensor Forward(const Tensor& prediction, const T& target) = 0;
+
+  /// Average loss values for all samples in the mini-batch
+  /// It calls Forward() internally. The calling pattern should be
+  /// [Evaluate|Forward] Backward.
+  float Evaluate(const Tensor& prediction, const T& target) {
+    const Tensor& loss = Forward(prediction, target);
+    return Sum<float>(loss) / (1.0f * loss.Size());
+  }
+
+  /// Compute the gradients of the loss values w.r.t. the prediction.
+  virtual Tensor Backward() = 0;
+};
+}  // namespace singa
+
+#endif  // SINGA_MODEL_LOSS_H_
+
+

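To make the Loss contract concrete, here is a minimal hypothetical subclass
(purely illustrative; the commit's real implementation is the MSE class added
in src/model/loss/mse.h, and a practical loss would also scale/reduce per row):

    // Hypothetical squared-error loss; the class name and the simplified
    // gradient (diff instead of 2 * diff) are illustrative assumptions.
    class SquaredError : public singa::Loss<singa::Tensor> {
     public:
      singa::Tensor Forward(const singa::Tensor& prediction,
                            const singa::Tensor& target) override {
        diff_ = prediction - target;  // cached for Backward()
        return singa::Square(diff_);  // element-wise squared error
      }
      singa::Tensor Backward() override { return diff_; }

     private:
      singa::Tensor diff_;
    };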
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/include/singa/model/metric.h
----------------------------------------------------------------------
diff --git a/include/singa/model/metric.h b/include/singa/model/metric.h
new file mode 100644
index 0000000..6519028
--- /dev/null
+++ b/include/singa/model/metric.h
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_METRIC_H_
+#define SINGA_MODEL_METRIC_H_
+#include "singa/core/tensor.h"
+#include "singa/proto/layer.pb.h"
+namespace singa {
+
+/// The base metric class, which declares the APIs for computing the performance
+/// evaluation metrics given the prediction of the model and the ground truth,
+/// i.e., the target.
+/// The target type is a template argument.  For data samples with a single
+/// label, T could be a 1-d tensor (or vector<int>); if each data sample has
+/// multiple labels, T could be vector<vector<int>>, one vector per sample.
+template <typename T = Tensor>
+class Metric {
+ public:
+  // TODO(wangwei) call Setup using a default MetricConf.
+  Metric() = default;
+  void Setup(const string& conf) {
+    MetricConf metric;
+    metric.ParseFromString(conf);
+    Setup(metric);
+  }
+
+  /// Set meta fields from user configurations.
+  virtual void Setup(const MetricConf& conf) {}
+
+  /// Compute the metric for each data sample
+  virtual Tensor Forward(const Tensor& prediction, const T& target) = 0;
+
+  /// Compute the metric value averaged over all samples (in a batch)
+  float Evaluate(const Tensor& prediction, const T& target) {
+    const Tensor& metric = Forward(prediction, target);
+    return Sum<float>(metric) / (1.0f * metric.Size());
+  }
+};
+
+}  // namespace singa
+
+#endif  // SINGA_MODEL_METRIC_H_

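Usage then mirrors Loss::Evaluate; a hedged sketch, assuming the Accuracy
subclass this commit adds in src/model/metric/accuracy.h takes Tensor targets:

    singa::Accuracy metric;
    float acc = metric.Evaluate(prediction, target);  // averaged over the batch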
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
new file mode 100644
index 0000000..585d65d
--- /dev/null
+++ b/src/core/tensor/math_kernel.cu
@@ -0,0 +1,421 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifdef USE_CUDA
+#include <cmath>
+#include <algorithm>
+#include <cfloat>  // FLT_MIN, referenced in kernel_softmax_loss below
+#include "./math_kernel.h"
+
+#define CU2DBLOCK_X 32
+#define CU2DBLOCK_Y 32
+
+#define CU1DBLOCK 1024
+#define CU1DBLOCKF 1024.0
+
+// Cuda Kernel Functions
+namespace cuda {
+__global__ void kernel_softmax_loss(const float *prob, const int *label,
+                                    float *loss, int n, int dim) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
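+    // Accumulate the negative log-likelihood of the labelled class,
+    // clamping the probability away from zero before taking the log.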
+    float prob_of_truth = prob[index * dim + label[index]];
+    loss[index] -= log(max(prob_of_truth, FLT_MIN));
+  }
+}
+
+__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
+                                        int dim, float scale) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    int pos = index * dim + label[index];
+    grad[pos] = (grad[pos] - 1.0f) * scale;
+  }
+}
+
+__global__ void kernel_sum_vec(float *data, float *sum, int n) {
+  int THREADS = blockDim.x;
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (n - 1) / THREADS + 1;
+  aux[threadIdx.x] = data[threadIdx.x];
+
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < n) {
+      aux[threadIdx.x] += data[threadIdx.x + i * THREADS];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
+  }
+
+  __syncthreads();
+  *sum = aux[0];
+}
+
+__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < rows; index += num_threads) {
+    dst_vec_data[index] = 0.0f;
+    for (int k = 0; k < cols; k++) {
+      dst_vec_data[index] += src_mat_data[index * stride + k];
+    }
+  }
+}
+
+__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int j = blockIdx.x;
+  int THREADS = blockDim.x;
+  if (j >= cols) {
+    return;
+  }
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (rows - 1) / THREADS + 1;
+  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < rows) {
+      aux[threadIdx.x] +=
+          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
+  }
+
+  __syncthreads();
+  dst_vec_data[j] = aux[0];
+}
+
+__global__ void kernel_add_vec_row(const float *src_vec_data,
+                                   const float *src_mat_data,
+                                   float *des_mat_data, int rows, int cols,
+                                   int stride) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int num_threads_x = blockDim.x * gridDim.x;
+  int num_threads_y = blockDim.y * gridDim.y;
+  int index = 0;
+  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
+    index = j * stride + i;
+    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+  }
+}
+
+__global__ void kernel_exp(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = exp(src_data[index]);
+  }
+}
+
+__global__ void kernel_log(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = log(src_data[index]);
+  }
+}
+
+__global__ void kernel_sigmoid(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
+}
+
+__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
+                                    int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * (1.0f - src_data[index]);
+  }
+}
+
+__global__ void kernel_relu(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = max(src_data[index], 0.0f);
+  }
+}
+
+__global__ void kernel_relu_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+  }
+}
+
+__global__ void kernel_tanh(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = tanhf(src_data[index]);
+  }
+}
+
+__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+  }
+}
+
+__global__ void kernel_softplus(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = logf(1 + expf(src_data[index]));
+  }
+}
+
+__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
+                                     int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
+}
+
+__global__ void kernel_square(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * src_data[index];
+  }
+}
+
+__global__ void kernel_square_grad(const float *src_data, float *des_data,
+                                   int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 2 * src_data[index];  // d(x^2)/dx = 2x
+  }
+}
+
+__global__ void kernel_sqrt(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = sqrt(src_data[index]);
+  }
+}
+
+__global__ void kernel_pow(const float *src_data_a, const float *src_data_b,
+                           float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = pow(src_data_a[index], src_data_b[index]);
+  }
+}
+
+__global__ void kernel_mult(const float *src_data_a, const float *src_data_b,
+                            float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data_a[index] * src_data_b[index];
+  }
+}
+
+__global__ void kernel_div(const float *src_data_a, const float *src_data_b,
+                           float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data_a[index] / src_data_b[index];
+  }
+}
+
+__global__ static void kernel_set_value(float *data, float value, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    data[index] = value;
+  }
+}
+
+__global__ void kernel_threshold(const float *src_data, float *des_data,
+                                 float alpha, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
+  }
+}
+
+/*
+void softmaxloss_forward(int n, int dim, const float *prob,
+    const int *label, float *loss) {
+  kernel_softmax_loss<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(prob, label, loss, n,
+      dim);
+}
+
+void softmaxloss_backward(int n, int dim, float scale,
+    const int *label, float *grad) {
+  kernel_softmax_gradient<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(grad, label, n,
+      dim, scale);
+}
+*/
+void sum(int n, const float *in, float *out) {
+  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+  //  here, we only need one block
+  int num_blocks = 1;
+
+  kernel_sum_vec<<<num_blocks, threads_per_block>>>(in, out, n);
+}
+
+void sum_row(int rows, int cols, int stride, const float *in, float *out) {
+  int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
+  int num_blocks = cols;
+
+  kernel_sum_row<<<num_blocks, threads_per_block>>>(in, out, rows, cols,
+                                                    stride);
+}
+
+void sum_col(int rows, int cols, int stride, const float *in, float *out) {
+  int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
+  int num_blocks = rows;
+
+  kernel_sum_col<<<num_blocks, threads_per_block>>>(in, out,
+                                                    rows, cols, stride);
+}
+void add_row(int rows, int cols, int stride, const float *in_row,
+             const float *in_mat, float *out) {
+  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
+  dim3 num_blocks(
+      cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
+      rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
+  kernel_add_vec_row<<<num_blocks, threads_per_block>>>(in_row, in_mat, out,
+                                                        rows, cols, stride);
+}
+
+void exp(int n, const float *in, float *out) {
+  kernel_exp<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void log(int n, const float *in, float *out) {
+  kernel_log<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void sigmoid(int n, const float *in, float *out) {
+  kernel_sigmoid<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void sigmoid_grad(int n, const float *in, float *out) {
+  kernel_sigmoid_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void relu(int n, const float *in, float *out) {
+  kernel_relu<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void relu_grad(int n, const float *in, float *out) {
+  kernel_relu_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void tanh(int n, const float *in, float *out) {
+  kernel_tanh<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void tanh_grad(int n, const float *in, float *out) {
+  kernel_tanh_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void softplus(int n, const float *in, float *out) {
+  kernel_softplus<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void softplus_grad(int n, const float *in, float *out) {
+  kernel_softplus_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void square(int n, const float *in, float *out) {
+  kernel_square<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void square_grad(int n, const float *in, float *out) {
+  kernel_square_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void sqrt(int n, const float *in, float *out) {
+  kernel_sqrt<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void pow(int n, const float *a, const float *b, float *out) {
+  kernel_pow<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+}
+
+void mult(int n, const float *a, const float *b, float *out) {
+  kernel_mult<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+}
+
+void div(int n, const float *a, const float *b, float *out) {
+  kernel_div<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+}
+
+void set_value(int n, float v, float *out) {
+  kernel_set_value<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(out, v, n);
+}
+
+void threshold(int n, float alpha, const float *in, float *out) {
+  kernel_threshold<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, alpha, n);
+}
+}  // namespace cuda
+}  // namespace singa
+
+#endif  // USE_CUDA
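
Two idioms recur in the kernels above. Element-wise kernels use a grid-stride
loop, so any grid size covers any n; the reductions (kernel_sum_vec,
kernel_sum_row) stage values in shared memory and halve the number of active
threads each step. A stripped-down sketch of the grid-stride pattern
(kernel_scale is hypothetical, using the same launch convention as above):

    __global__ void kernel_scale(const float *in, float *out, float alpha,
                                 int n) {
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      int num_threads = blockDim.x * gridDim.x;  // total threads in the grid
      for (; index < n; index += num_threads)    // each thread strides over n
        out[index] = alpha * in[index];
    }
    // Launched like the wrappers above, one thread per element, rounded up:
    // kernel_scale<<<ceil(n / CU1DBLOCKF), CU1DBLOCK>>>(in, out, alpha, n);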

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
new file mode 100644
index 0000000..7629ac8
--- /dev/null
+++ b/src/core/tensor/math_kernel.h
@@ -0,0 +1,82 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#ifndef SRC_CORE_TENSOR_MATH_KERNEL_H_
+#define SRC_CORE_TENSOR_MATH_KERNEL_H_
+
+namespace singa {
+
+/*
+  void softmaxloss_forward(int n, int dim, const float *prob,
+      const int *label, float *loss);
+
+  void softmaxloss_backward(int n, int dim, float scale,
+      const int *label, float *grad);
+*/
+// TODO(wangwei) make all function templates.
+namespace cuda {
+void sum(int n, const float *in, float *out);
+
+void sum_row(int rows, int cols, int stride, const float *in, float *out);
+
+void sum_col(int rows, int cols, int stride, const float *in, float *out);
+
+void add_row(int rows, int cols, int stride, const float *in_row,
+  const float *in_mat, float *out);
+
+void exp(int n, const float *in, float *out);
+
+void log(int n, const float *in, float *out);
+
+void sigmoid(int n, const float *in, float *out);
+
+void sigmoid_grad(int n, const float *in, float *out);
+
+void relu(int n, const float *in, float *out);
+
+void relu_grad(int n, const float *in, float *out);
+
+void tanh(int n, const float *in, float *out);
+
+void tanh_grad(int n, const float *in, float *out);
+
+void softplus(int n, const float *in, float *out);
+
+void softplus_grad(int n, const float *in, float *out);
+
+void square(int n, const float *in, float *out);
+
+void square_grad(int n, const float *in, float *out);
+
+void sqrt(int n, const float *in, float *out);
+
+void pow(int n, const float *a, const float *b, float *out);
+
+void mult(int n, const float *a, const float *b, float *out);
+
+void div(int n, const float *a, const float *b, float *out);
+
+void set_value(int n, float v, float *out);
+
+void threshold(int n, float alpha, const float *in, float *out);
+}  // namespace cuda
+}  // namespace singa
+
+#endif  // SRC_CORE_TENSOR_MATH_KERNEL_H_
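
Note that these wrappers launch kernels, so every pointer they receive,
including the scalar out of cuda::sum, must be device-accessible. A host-side
sketch (device_sum is a hypothetical helper; assumes only the CUDA runtime
API):

    float device_sum(int n, const float* d_in) {
      float* d_out = nullptr;
      cudaMalloc(&d_out, sizeof(float));
      singa::cuda::sum(n, d_in, d_out);  // kernel writes the result to d_out
      float h_out = 0.0f;
      cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
      cudaFree(d_out);
      return h_out;
    }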

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 185b1f9..052f3ff 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -77,7 +77,7 @@ void Tensor::ResetLike(const Tensor& t) {
   }
 }
 
-void Tensor::ReShape(const Shape& shape) {
+void Tensor::Reshape(const Shape& shape) {
   if (shape_ != shape) {
     if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
@@ -119,6 +119,7 @@ void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) {
   }
 }
 template void Tensor::CopyDataFromHostPtr(const float* src, size_t num);
+template void Tensor::CopyDataFromHostPtr(const int* src, size_t num);
 
 void Tensor::CopyData(const Tensor& src) {
   CHECK_EQ(Size(), src.Size());
@@ -279,6 +280,20 @@ void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num,
     }                                                          \
   } while (0)
 
+
+template <typename SType>
+void Tensor::SetValue(SType x) {
+  CHECK_EQ(sizeof(SType), SizeOf(data_type_));
+  auto size = Size();
+  auto ptr = blob_;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    device_->Exec(
+        [size, x, ptr](Context* ctx) { Set<DType, Lang>(size, x, ptr, ctx); },
+        {}, {ptr});
+  });
+}
+
+
 #define EltwiseUnaryTensorFn(fn, t, ret)                               \
   do {                                                                 \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
@@ -305,8 +320,87 @@ GenUnaryTensorFunction(ReLU);
 GenUnaryTensorFunction(Sigmoid);
 GenUnaryTensorFunction(Sign);
 GenUnaryTensorFunction(Sqrt);
+GenUnaryTensorFunction(Square);
 GenUnaryTensorFunction(Tanh);
 
+// TODO(wangwei) consider matrix transpose.
+Tensor SumRows(const Tensor& t) {
+  int ndim = t.shape().size();
+  CHECK_EQ(ndim, 2) << "Cannot do SumRows for Tensor with ndim = " << ndim;
+  size_t nrow = t.shape().at(0), ncol = t.shape().at(1);
+  Tensor ret(Shape{ncol}, t.device(), t.data_type());
+  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
+    ret.device()->Exec(
+        [nrow, ncol, t, ret](Context* ctx) {
+          SumRows<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx);
+        },
+        {t.blob()}, {ret.blob()});
+  });
+  return ret;
+}
+
+// TODO(wangwei) consider matrix transpose.
+Tensor SumColumns(const Tensor& t) {
+  int ndim = t.shape().size();
+  CHECK_EQ(ndim, 2) << "Cannot do SumColumns for Tensor with ndim = " << ndim;
+  CHECK(!t.transpose());  // TODO(wangwei) enable transpose
+  size_t nrow = t.shape().at(0), ncol = t.shape().at(1);
+  Tensor ret(Shape{nrow}, t.device(), t.data_type());
+  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
+    ret.device()->Exec(
+        [nrow, ncol, t, ret](Context* ctx) {
+          SumColumns<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx);
+        },
+        {t.blob()}, {ret.blob()});
+  });
+  return ret;
+}
+
+// TODO(wangwei) consider async exec
+template <>
+float Sum<float>(const Tensor& t) {
+  float s = 0.0f;
+  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
+    t.device()->Exec(
+        [t, &s](Context* ctx) {
+          Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx);
+        },
+        {t.blob()}, {});
+  });
+  return s;
+}
+
+Tensor Sum(const Tensor& t, int axis) {
+  if (axis == 0) {
+    return SumRows(t);
+  } else {
+    CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
+    return SumColumns(t);
+  }
+}
+
+Tensor Average(const Tensor& t, int axis) {
+  // operator/ only has implementation for float scalar type, hence it is
+  // necessary to cast the denominator to a float.
+  // TODO(wangwei) implement function for cast scalar type involved in Tensor
+  // functions. E.g.,
+  // template<S, D>
+  // D CastTo(S x) {
+  //   return D(x);
+  // }
+  // for special types, e.g., fp16:
+  // template<>
+  // fp16 CastTo(float x) {
+  //    ....
+  // }
+  if (axis == 0) {
+    return Sum(t, 0) / (1.0f * t.shape().at(0));
+  } else {
+    CHECK_EQ(axis, 1);
+    return Sum(t, 1) / (1.0f * t.shape().at(1));
+  }
+}
+
 Tensor Softmax(const Tensor& t, int axis) {
   Tensor ret(t.shape(), t.device(), t.data_type());
   Softmax(t, &ret, axis);
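
To pin down the axis convention introduced here: for a 3x2 matrix, Sum(t, 0)
adds the three rows into a length-2 vector and Sum(t, 1) adds the two columns
into a length-3 vector; Average divides by the corresponding dimension. A
sketch against the Tensor API as exercised in the tests below (assumes the
default host device):

    singa::Tensor d(singa::Shape{3, 2});
    const float dat[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
    d.CopyDataFromHostPtr<float>(dat, 6);
    singa::Tensor r0 = singa::Sum(d, 0);      // shape {2}: {9, 12}
    singa::Tensor a1 = singa::Average(d, 1);  // shape {3}: {1.5, 3.5, 5.5}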

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 53e979b..d55e15a 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -50,6 +50,10 @@ void Abs(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
+/// Set every element of ret to x, i.e., ret[i] = x
+template <typename DType, typename Lang>
+void Set(int count, DType x, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
 /// sum all elements of input into ret
 template <typename DType, typename Lang>
 void Sum(int count, const Blob* input, DType* ret, Context* ctx) {
@@ -80,6 +84,12 @@ void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
+/// Element-wise operation, ret[i] = square(input[i])
+template <typename DType, typename Lang>
+void Square(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
 /// Element-wise operation, ret[i] = tanh(input[i])
 template <typename DType, typename Lang>
 void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) {
@@ -102,6 +112,35 @@ void Softmax(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the input matrix into a vector
+template <typename DType, typename Lang>
+void SumRows(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Sum the columns of the input matrix into a vector
+template <typename DType, typename Lang>
+void SumColumns(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of ret
+template <typename DType, typename Lang>
+void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
+            Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Add the vector v to every column of A as the column of ret
+template <typename DType, typename Lang>
+void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
+            Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lang>
 void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
@@ -177,33 +216,6 @@ void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
   LOG(FATAL) << "Not Implemented";
 }
 
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lang>
-void SumRow(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lang>
-void SumCol(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of ret
-template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Add the vector v to every column of A as the column of ret
-template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // ===== Level 1
 /// return the index of the element with the max value.
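
The convention in this header: each op has a generic template that fails with
Not Implemented, and a per-backend specialization keyed on the Lang tag
supplies the real code (tensor_math_cpp.h and tensor_math_cuda.h below). A
minimal sketch of the same mechanism, reusing the header's Blob/Context/lang
types (Scale is a hypothetical op, not part of the patch):

    template <typename DType, typename Lang>
    void Scale(int count, DType x, Blob* ret, Context* ctx) {
      LOG(FATAL) << "Not Implemented";  // reached only without a specialization
    }

    template <>
    void Scale<float, lang::Cpp>(int count, float x, Blob* ret, Context* ctx) {
      // CPU path, selected at compile time by the <float, lang::Cpp> key.
      float* dptr = static_cast<float*>(ret->mutable_data());
      for (int i = 0; i < count; i++) dptr[i] *= x;
    }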

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index b58e3bd..c584b69 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -26,6 +26,16 @@
 
 namespace singa {
 template <>
+void Square<float, lang::Cpp>(int count, const Blob* input,
+                           Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = in[i] * in[i];
+  }
+}
+
+template <>
 void Add<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
                            Blob* ret, Context* ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
@@ -36,6 +46,50 @@ void Add<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
     dptr[i] = lptr[i] + rptr[i];
   }
 }
+
+
+// sum all elements of input into ret
+// TODO(wangwei) optimize using omp
+template <>
+void Sum<float, lang::Cpp>(int count, const Blob* input, float* ret,
+    Context* ctx) {
+  float s = 0.f;
+  const float* in = static_cast<const float*>(input->data());
+  for (int i = 0; i < count; i++) {
+    s += in[i];
+  }
+  *ret = s;
+}
+
+// TODO(wangwei) optimize using omp
+template <>
+void SumRows<float, lang::Cpp>(int nrow, int ncol, const Blob* input, Blob* ret,
+    Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  memset(dptr, 0, ncol * sizeof(float));
+  for (int r = 0; r < nrow; r++) {
+    for (int c = 0; c < ncol; c++) {
+      dptr[c] += in[r * ncol + c];
+    }
+  }
+}
+
+// Sum the columns of the input matrix into a vector
+// TODO(wangwei) optimize using omp
+template <>
+void SumColumns<float, lang::Cpp>(int nrow, int ncol, const Blob* input,
+                                  Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  memset(dptr, 0, nrow * sizeof(float));
+  for (int r = 0; r < nrow; r++) {
+    for (int c = 0; c < ncol; c++) {
+      dptr[r] += in[r * ncol + c];
+    }
+  }
+}
+
 template <>
 void EltwiseMult<float, lang::Cpp>(int count, const Blob* input, float x,
                                    Blob* ret, Context* ctx) {
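
The omp TODOs above map naturally onto OpenMP reductions. A sketch for Sum
only (omp_sum is a hypothetical free function; assumes compilation with
-fopenmp, and is not part of the patch):

    #include <omp.h>

    float omp_sum(int count, const float* in) {
      float s = 0.0f;
      // Each thread accumulates a private partial sum; OpenMP combines them.
      #pragma omp parallel for reduction(+ : s)
      for (int i = 0; i < count; i++) s += in[i];
      return s;
    }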

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 40f9210..2e497d2 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -18,14 +18,14 @@
 
 #ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
 #define  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
-#include "./tensor_math.h"
 #include "singa_config.h"
+#ifdef USE_CUDA
+#include "./tensor_math.h"
+#include "./math_kernel.h"
 #include "singa/core/common.h"
 
-
 namespace singa {
 
-#ifdef USE_CUDA
 template<>
 void Add<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
                         Blob* ret, Context* ctx) {
@@ -38,9 +38,35 @@ void Add<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
   cublasSaxpy(ctx->cublas_handle, 1.0f, rptr, 1, ptr, 1);
   */
 }
+// sum all elements of input into ret
+// TODO(wangwei) optimize using stream
+template <>
+void Sum<float, lang::Cuda>(int count, const Blob* input, float* ret,
+                            Context* ctx) {
+  const float* in = static_cast<const float*>(input->data());
+  cuda::sum(count, in, ret);
+}
+
+// TODO(wangwei) optimize using stream
+template <>
+void SumRows<float, lang::Cuda>(int nrow, int ncol, const Blob* input,
+                                Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  cuda::sum_row(nrow, ncol, ncol, in, dptr);
+}
 
-#endif
+// Sum the columns of the input matrix into a vector
+// TODO(wangwei) optimize using stream
+template <>
+void SumColumns<float, lang::Cuda>(int nrow, int ncol, const Blob* input,
+                                   Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  cuda::sum_col(nrow, ncol, ncol, in, dptr);
+}
 }
 
 
+#endif  // USE_CUDA
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/model/loss/mse.h
----------------------------------------------------------------------
diff --git a/src/model/loss/mse.h b/src/model/loss/mse.h
new file mode 100644
index 0000000..5799f13
--- /dev/null
+++ b/src/model/loss/mse.h
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LOSS_MSE_H_
+#define SINGA_MODEL_LOSS_MSE_H_
+#include <stack>
+#include "singa/model/loss.h"
+
+namespace singa {
+
+/// MSE is for mean squared error or squared Euclidean distance.
+class MSE : public Loss<Tensor> {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is 0.5 * ||prediction - target||^2.
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is (prediction-target)/batchsize
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., prediction-target
+  std::stack<Tensor> buf_;
+};
+
+Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
+                      << " The calling pattern is [Forward|Evaluate] Backward";
+  Tensor t = prediction - target;
+  size_t batchsize = 1;
+  if (t.nDim() > 1) batchsize = t.shape().at(0);
+  size_t dim = t.Size() / batchsize;
+  t.Reshape(Shape{batchsize, dim});
+  buf_.push(t);
+  // TODO(wangwei) use CastTo for operator/
+  // The 0.5 factor matches the doc above and the expectations in test_mse.cc.
+  return Sum(Square(t), 1) / 2.0f;
+}
+
+Tensor MSE::Backward() {
+  const Tensor& ret = buf_.top();
+  buf_.pop();
+  return ret / (1.0f * ret.shape().at(0));
+}
+}  // namespace singa
+
+#endif  // SINGA_MODEL_LOSS_MSE_H_
+
+
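
A worked instance of the two formulas, using the data from test_mse.cc below:
with batchsize 2 and a single differing element (p - t = 0.1 in row 0),
Forward yields per-sample losses {0.5 * 0.1^2, 0} = {0.005, 0}, and Backward
yields (p - t) / 2 elementwise.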

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/model/metric/accuracy.h
----------------------------------------------------------------------
diff --git a/src/model/metric/accuracy.h b/src/model/metric/accuracy.h
new file mode 100644
index 0000000..05c1643
--- /dev/null
+++ b/src/model/metric/accuracy.h
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_METRIC_ACCURACY_H_
+#define SINGA_MODEL_METRIC_ACCURACY_H_
+#include "singa/model/metric.h"
+namespace singa {
+
+/// Compute the accuracy of the prediction, which is matched against the
+/// ground truth labels.
+/// TODO(wangwei) consider multi-label cases.
+class Accuracy : public Metric<Tensor> {
+ public:
+  /// Set meta fields from user configurations.
+  void Setup(const MetricConf& conf) override { top_k_ = conf.top_k(); }
+
+  /// Check the prediction against the target (ground truth) for each data
+  /// sample. The returned Tensor has a float value for each sample, 0 for wrong
+  /// and 1 for correct. Users can call Sum(const Tensor&) / Tensor::Size() to
+  /// get the accuracy.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+ private:
+  /// Match the prediction against the host target labels, sample by sample.
+  Tensor Match(const Tensor& prediction, const vector<int>& target);
+  /// If the ground truth label is in the top k predicted labels, then the
+  /// prediction is correct.
+  size_t top_k_ = 1;
+};
+
+Tensor Accuracy::Match(const Tensor& prediction, const vector<int>& target) {
+  size_t batchsize = target.size();
+  size_t nb_classes = prediction.Size() / batchsize;
+  // each row of prediction is the prob distribution for one sample
+  CHECK_EQ(prediction.shape().at(0), batchsize);
+  const float* prob = prediction.data<const float*>();
+  float* score = new float[batchsize]();  // value-initialized to 0
+  for (size_t b = 0; b < batchsize; b++) {
+    vector<std::pair<float, int>> prob_class;
+    for (size_t c = 0; c < nb_classes; c++) {
+      prob_class.push_back(std::make_pair(prob[b * nb_classes + c], c));
+    }
+    std::partial_sort(prob_class.begin(), prob_class.begin() + top_k_,
+                      prob_class.end(), std::greater<std::pair<float, int>>());
+
+    for (size_t k = 0; k < top_k_; k++)
+      if (prob_class.at(k).second == target.at(b)) score[b] = 1;
+  }
+  Tensor ret(Shape{batchsize});
+  ret.CopyDataFromHostPtr(score, batchsize);
+  delete[] score;
+  return ret;
+}
+
+// TODO(wangwei) consider multi-label cases, where target is of shape
+// nb_samples * nb_classes
+Tensor Accuracy::Forward(const Tensor& prediction, const Tensor& target) {
+  vector<int> target_vec;
+  // TODO(wangwei) copy target to host.
+  const int* target_value = target.data<const int*>();
+  for (size_t i = 0; i < target.Size(); i++)
+    target_vec.push_back(target_value[i]);
+  return Match(prediction, target_vec);
+}
+
+}  // namespace singa
+
+#endif  // SINGA_MODEL_METRIC_ACCURACY_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/proto/layer.proto
----------------------------------------------------------------------
diff --git a/src/proto/layer.proto b/src/proto/layer.proto
index 3d130ea..51225ee 100644
--- a/src/proto/layer.proto
+++ b/src/proto/layer.proto
@@ -157,7 +157,7 @@ message LayerConf {
   // for their implementation. These layers include an Engine type and
   // engine parameter for selecting the implementation.
   // The default for the engine is set by the ENGINE switch at compile-time.
-  optional AccuracyConf accuracy_conf = 102;
+  //optional AccuracyConf accuracy_conf = 102;
   optional ArgMaxConf argmax_conf = 103;
   optional ConcatConf concat_conf = 104;
   optional ContrastiveLossConf contrastive_loss_conf = 105;
@@ -177,6 +177,8 @@ message LayerConf {
   optional InnerProductConf inner_product_conf = 117;
   optional LogConf log_conf = 134;
   optional LRNConf lrn_conf = 118;
+  // Used in SINGA
+  optional MetricConf metric_conf = 200;
   // optional MemoryDataConf memory_data_conf = 119;
   optional MVNConf mvn_conf = 120;
   optional PoolingConf pooling_conf = 121;
@@ -230,10 +232,7 @@ message LossConf {
   optional bool normalize = 2 [default = true];
 }
 
-// Messages that store hyper-parameters used by individual layer types follow, in
-// alphabetical order.
-
-message AccuracyConf {
+message MetricConf {
   // When computing accuracy, count as correct by comparing the true label to
   // the top k scoring classes.  By default, only compare to the top scoring
   // class (i.e. argmax).
@@ -249,6 +248,10 @@ message AccuracyConf {
   // If specified, ignore instances with the given label.
   optional int32 ignore_label = 3;
 }
+// Messages that store hyper-parameters used by individual layer types follow, in
+// alphabetical order.
+
+
 
 message ArgMaxConf {
   // If true produce pairs (argmax, maxval)
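
With this change, metric settings travel in LayerConf as a MetricConf. A
sketch of configuring the Accuracy metric through the generated C++ protobuf
API (set_top_k is the standard generated setter; the value is illustrative):

    singa::MetricConf conf;
    conf.set_top_k(1);  // a sample is correct if its label is in the top 1
    singa::Accuracy acc;
    acc.Setup(conf);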

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/test/singa/test_accuracy.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_accuracy.cc b/test/singa/test_accuracy.cc
new file mode 100644
index 0000000..dc7719b
--- /dev/null
+++ b/test/singa/test_accuracy.cc
@@ -0,0 +1,35 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "../src/model/metric/accuracy.h"
+
+TEST(Accuracy, Compute) {
+  singa::Accuracy acc;
+  singa::Tensor p(singa::Shape{2, 3});
+  singa::Tensor t(singa::Shape{2}, singa::kInt);
+  const float pdat[6] = {0.1, 0.3, 0.6, 0.3, 0.2, 0.5};
+  const int tdat[2] = {1, 2};  // one wrong, one correct
+  p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
+  t.CopyDataFromHostPtr(tdat, sizeof(tdat) / sizeof(int));
+  float a = acc.Evaluate(p, t);
+  EXPECT_FLOAT_EQ(a, 0.5f);
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
new file mode 100644
index 0000000..9056176
--- /dev/null
+++ b/test/singa/test_mse.cc
@@ -0,0 +1,88 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+#include "singa/core/device.h"
+#include "../src/model/loss/mse.h"
+
+using singa::Tensor;
+class TestMSE : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    p.Reshape(singa::Shape{2, 3});
+    t.Reshape(singa::Shape{2, 3});
+    p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
+    t.CopyDataFromHostPtr(tdat, sizeof(tdat) / sizeof(float));
+  }
+  const float pdat[6] = {0.1, 1.1, 2.1, 0.3, 2.2, 1.8};
+  const float tdat[6] = {0.1, 1.1, 2.0, 0.3, 2.2, 1.8};
+
+  singa::Tensor p, t;
+};
+
+TEST_F(TestMSE, CppForward) {
+  singa::MSE mse;
+  const Tensor& loss = mse.Forward(p, t);
+  auto ldat = loss.data<const float*>();
+
+  EXPECT_FLOAT_EQ(ldat[0], 0.005);
+  EXPECT_FLOAT_EQ(ldat[1], 0);
+}
+
+TEST_F(TestMSE, CudaForward) {
+  singa::MSE mse;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  Tensor loss = mse.Forward(p, t);
+
+  loss.ToHost();
+  auto ldat = loss.data<const float*>();
+
+  // Per-sample loss, as in CppForward: 0.5 * ||p_row - t_row||^2.
+  EXPECT_FLOAT_EQ(ldat[0], 0.005);
+  EXPECT_FLOAT_EQ(ldat[1], 0);
+}
+
+TEST_F(TestMSE, CppBackward) {
+  singa::MSE mse;
+  mse.Forward(p, t);
+  const Tensor& grad = mse.Backward();
+
+  auto gdat = grad.data<const float*>();
+
+  // Backward returns (prediction - target) / batchsize, with batchsize = 2.
+  for (size_t i = 0; i < grad.Size(); i++)
+    EXPECT_FLOAT_EQ(gdat[i], (pdat[i] - tdat[i]) / 2.0f);
+}
+
+TEST_F(TestMSE, CudaBackward) {
+  singa::MSE mse;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  mse.Forward(p, t);
+  Tensor grad = mse.Backward();
+  grad.ToHost();
+  auto gdat = grad.data<const float*>();
+
+  for (size_t i = 0; i < grad.Size(); i++)
+    EXPECT_FLOAT_EQ(gdat[i], (pdat[i] - tdat[i]) / 2.0f);
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index b3f0c6b..f9acdb0 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -35,18 +35,18 @@ TEST(TensorTest, TestConstructor) {
 
 TEST(TensorClass, Reshape) {
   Tensor t;
-  t.ReShape(Shape{2,3});
+  t.Reshape(Shape{2,3});
   EXPECT_TRUE((Shape{2,3} == t.shape()));
 
-  t.ReShape(Shape{3,3, 4});
+  t.Reshape(Shape{3,3, 4});
   EXPECT_TRUE((Shape{3,3, 4} == t.shape()));
 
-  t.ReShape(Shape{12});
+  t.Reshape(Shape{12});
   EXPECT_TRUE((Shape{12} == t.shape()));
 
   Tensor o;
   EXPECT_TRUE(o.shape() != t.shape());
-  o.ReShape(Shape{3, 3});
+  o.Reshape(Shape{3, 3});
   EXPECT_TRUE(o.shape() != t.shape());
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index eee18ec..fb7e3e8 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -9,10 +9,10 @@ class TestTensorMath : public ::testing::Test {
   virtual void SetUp() {
     const float dat1[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
     const float dat2[] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
-    a.ReShape(singa::Shape{6});
-    b.ReShape(singa::Shape{6});
-    c.ReShape(singa::Shape{6, 1});
-    d.ReShape(singa::Shape{3, 2});
+    a.Reshape(singa::Shape{6});
+    b.Reshape(singa::Shape{6});
+    c.Reshape(singa::Shape{6, 1});
+    d.Reshape(singa::Shape{3, 2});
 
     a.CopyDataFromHostPtr<float>(dat1, 6);
     b.CopyDataFromHostPtr<float>(dat2, 6);


[30/50] [abbrv] incubator-singa git commit: Merge SINGA-189 with the latest code.

Posted by zh...@apache.org.
Merge SINGA-189 with the latest code.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/7a191650
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/7a191650
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/7a191650

Branch: refs/heads/master
Commit: 7a19165022b254a5d7f5b2a0cd4cdc36d95059e2
Parents: da23e57 fa2ea30
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Jun 3 11:10:15 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Fri Jun 3 11:27:24 2016 +0800

----------------------------------------------------------------------
 include/singa/core/common.h          |   2 +-
 src/model/layer/activation.cc        |  10 +-
 src/model/layer/batchnorm.cc         |  70 ++++++++
 src/model/layer/batchnorm.h          |  84 ++++++++++
 src/model/layer/cudnn_activation.cc  |  13 +-
 src/model/layer/cudnn_batchnorm.cc   | 214 +++++++++++++++++++++++++
 src/model/layer/cudnn_batchnorm.h    |  60 +++++++
 src/model/layer/cudnn_convolution.cc |   3 +-
 src/model/layer/cudnn_lrn.cc         | 118 ++++++++++++++
 src/model/layer/cudnn_lrn.h          |  56 +++++++
 src/model/layer/cudnn_pooling.cc     |   7 +-
 src/model/layer/cudnn_softmax.cc     |   4 +-
 src/model/layer/dense.cc             |  86 ++++++++++
 src/model/layer/dense.h              |  70 ++++++++
 src/model/layer/lrn.cc               |  59 +++++++
 src/model/layer/lrn.h                |  70 ++++++++
 src/model/layer/softmax.cc           |  10 +-
 src/proto/model.proto                |  34 +++-
 test/singa/test_activation.cc        |   8 +-
 test/singa/test_cudnn_activation.cc  |   6 +-
 test/singa/test_cudnn_batchnorm.cc   | 257 ++++++++++++++++++++++++++++++
 test/singa/test_cudnn_lrn.cc         | 205 ++++++++++++++++++++++++
 test/singa/test_cudnn_softmax.cc     |   6 +-
 test/singa/test_dense.cc             | 249 +++++++++++++++++++++++++++++
 test/singa/test_softmax.cc           |   6 +-
 25 files changed, 1674 insertions(+), 33 deletions(-)
----------------------------------------------------------------------



[29/50] [abbrv] incubator-singa git commit: SINGA-189 Generate python outputs of proto files

Posted by zh...@apache.org.
SINGA-189 Generate python outputs of proto files

Remove ProtoBuf.cmake. Add python outputs generated from the proto files, located at build/src.
The proto headers are now at build/include/singa/proto.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/da23e577
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/da23e577
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/da23e577

Branch: refs/heads/master
Commit: da23e577e48e9f52644dabaa844a010a97286b70
Parents: 3e2507b
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Fri Jun 3 01:15:53 2016 +0800
Committer: xiezl <xi...@comp.nus.edu.sg>
Committed: Fri Jun 3 01:15:53 2016 +0800

----------------------------------------------------------------------
 cmake/Dependencies.cmake |   7 ++-
 cmake/ProtoBuf.cmake     | 116 ------------------------------------------
 src/CMakeLists.txt       |  25 ++++++---
 test/CMakeLists.txt      |   1 +
 4 files changed, 26 insertions(+), 123 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/da23e577/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 5505ad6..d3f0b00 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1,6 +1,11 @@
 SET(SINGA_LINKER_LIBS "")
 
-INCLUDE("cmake/ProtoBuf.cmake")
+#INCLUDE("cmake/ProtoBuf.cmake")
+
+FIND_PACKAGE( Protobuf REQUIRED )
+INCLUDE_DIRECTORIES(SYSTEM ${PROTOBUF_INCLUDE_DIR})
+MESSAGE(STATUS "proto libs " ${PROTOBUF_LIBRARIES})
+LIST(APPEND SINGA_LINKER_LIBS ${PROTOBUF_LIBRARIES})
 
 IF(USE_LMDB)
     FIND_PACKAGE(LMDB REQUIRED)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/da23e577/cmake/ProtoBuf.cmake
----------------------------------------------------------------------
diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake
deleted file mode 100644
index 437d136..0000000
--- a/cmake/ProtoBuf.cmake
+++ /dev/null
@@ -1,116 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements.  See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership.  The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-
-# This file is adpated from Caffe cmake/ProtoBuf.cmake.
-# We changed 'caffe' to 'singa'
-
-# Finds Google Protocol Buffers library and compilers and extends
-# the standard cmake script with version and python generation support
-
-find_package( Protobuf REQUIRED )
-include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR})
-MESSAGE(STATUS "proto libs " ${PROTOBUF_LIBRARIES})
-list(APPEND singa_linker_libs ${PROTOBUF_LIBRARIES})
-
-# As of Ubuntu 14.04 protoc is no longer a part of libprotobuf-dev package
-# and should be installed separately as in: sudo apt-get install
-# protobuf-compiler
-if(EXISTS ${PROTOBUF_PROTOC_EXECUTABLE})
-  message(STATUS "Found PROTOBUF Compiler: ${PROTOBUF_PROTOC_EXECUTABLE}")
-else()
-  message(FATAL_ERROR "Could not find PROTOBUF Compiler")
-endif()
-
-#if(PROTOBUF_FOUND)
-#  # fetches protobuf version
-#  caffe_parse_header(${PROTOBUF_INCLUDE_DIR}/google/protobuf/stubs/common.h VERION_LINE GOOGLE_PROTOBUF_VERSION)
-#  string(REGEX MATCH "([0-9])00([0-9])00([0-9])" PROTOBUF_VERSION ${GOOGLE_PROTOBUF_VERSION})
-#  set(PROTOBUF_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
-#  unset(GOOGLE_PROTOBUF_VERSION)
-#endif()
-
-# place where to generate protobuf sources
-set(proto_gen_folder "${PROJECT_BINARY_DIR}/include/singa/proto")
-include_directories("${PROJECT_BINARY_DIR}/include")
-
-set(PROTOBUF_GENERATE_CPP_APPEND_PATH TRUE)
-
-###############################################################################
-# Modification of standard 'protobuf_generate_cpp()' with output dir parameter
-# and python support
-# Usage:
-#   singa_protobuf_generate_cpp_py(<output_dir> <srcs_var> <hdrs_var>
-#                                  <python_var> <proto_files>)
-function(singa_protobuf_generate_cpp_py output_dir srcs_var hdrs_var python_var)
-  if(NOT ARGN)
-    message(SEND_ERROR
-      "Error: singa_protobuf_generate_cpp_py() called without any proto files")
-    return()
-  endif()
-
-  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
-    # Create an include path for each file specified
-    foreach(fil ${ARGN})
-      get_filename_component(abs_fil ${fil} ABSOLUTE)
-      get_filename_component(abs_path ${abs_fil} PATH)
-      list(FIND _protoc_include ${abs_path} _contains_already)
-      if(${_contains_already} EQUAL -1)
-        list(APPEND _protoc_include -I ${abs_path})
-      endif()
-    endforeach()
-  else()
-    set(_protoc_include -I ${CMAKE_CURRENT_SOURCE_DIR})
-  endif()
-
-  if(DEFINED PROTOBUF_IMPORT_DIRS)
-    foreach(dir ${PROTOBUF_IMPORT_DIRS})
-      get_filename_component(abs_path ${dir} ABSOLUTE)
-      list(FIND _protoc_include ${abs_path} _contains_already)
-      if(${_contains_already} EQUAL -1)
-        list(APPEND _protoc_include -I ${abs_path})
-      endif()
-    endforeach()
-  endif()
-
-  set(${srcs_var})
-  set(${hdrs_var})
-  set(${python_var})
-  foreach(fil ${ARGN})
-    get_filename_component(abs_fil ${fil} ABSOLUTE)
-    get_filename_component(fil_we ${fil} NAME_WE)
-
-    list(APPEND ${srcs_var} "${output_dir}/${fil_we}.pb.cc")
-    list(APPEND ${hdrs_var} "${output_dir}/${fil_we}.pb.h")
-    list(APPEND ${python_var} "${output_dir}/${fil_we}_pb2.py")
-
-    add_custom_command(
-      OUTPUT "${output_dir}/${fil_we}.pb.cc"
-             "${output_dir}/${fil_we}.pb.h"
-             "${output_dir}/${fil_we}_pb2.py"
-      COMMAND ${CMAKE_COMMAND} -E make_directory "${output_dir}"
-      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --cpp_out    ${output_dir} ${_protoc_include} ${abs_fil}
-      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${output_dir} ${_protoc_include} ${abs_fil}
-      DEPENDS ${abs_fil}
-      COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM )
-  endforeach()
-
-  set_source_files_properties(${${srcs_var}} ${${hdrs_var}} ${${python_var}} PROPERTIES GENERATED TRUE)
-  set(${srcs_var} ${${srcs_var}} PARENT_SCOPE)
-  set(${hdrs_var} ${${hdrs_var}} PARENT_SCOPE)
-  set(${python_var} ${${python_var}} PARENT_SCOPE)
-endfunction()

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/da23e577/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 28066de..952f7ee 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,8 +1,21 @@
 # generate protobuf sources
 FILE(GLOB proto_files proto/*.proto)
-singa_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files})
-# include python files either to force generation
-ADD_LIBRARY(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
+protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_files})
+protobuf_generate_python(proto_pys ${proto_files})
+INCLUDE_DIRECTORIES("${CMAKE_BINARY_DIR}/include")
+#message(STATUS "include: ${CMAKE_BINARY_DIR} ")
+#message(STATUS "srcs: ${proto_srcs}")
+#message(STATUS "hdrs: ${proto_hdrs}")
+#message(STATUS "pys: ${proto_pys}")
+ADD_LIBRARY(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_pys})
+FOREACH(fil ${proto_hdrs})
+    ADD_CUSTOM_COMMAND(
+        TARGET proto PRE_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_BINARY_DIR}/include/singa/proto"
+        COMMAND ${CMAKE_COMMAND} -E copy ${fil} "${CMAKE_BINARY_DIR}/include/singa/proto"
+        #COMMAND ${CMAKE_COMMAND} -E echo "copy done"
+        )
+ENDFOREACH()
 LIST(APPEND SINGA_LINKER_LIBS proto)
 
 #FILE(GLOB_RECURSE utils_source ${CMAKE_CURRENT_SOURCE_DIR}/utils/ "*.cc")
@@ -19,14 +32,14 @@ AUX_SOURCE_DIRECTORY(core/memory core_source)
 AUX_SOURCE_DIRECTORY(core/scheduler core_source)
 AUX_SOURCE_DIRECTORY(core/tensor core_source)
 FILE(GLOB_RECURSE cuda_source core "*.cu")
-set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
-set(CMAKE_CXX_FLAGS "")
+SET(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
+SET(CMAKE_CXX_FLAGS "")
 CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
 #message(STATUS "FLAGS ${CMAKE_CXX_FLAGS}")
 #message(STATUS "CORE ${cuda_source}")
 #message(STATUS "OBJ ${cuda_objs}")
 include_directories("${CMAKE_CURRENT_SOURCE_DIR}/core/tensor")
-set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
+SET(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
 ADD_LIBRARY(singa_core SHARED ${core_source} ${cuda_objs})
 TARGET_LINK_LIBRARIES(singa_core ${SINGA_LINKER_LIBS})
 LIST(APPEND SINGA_LINKER_LIBS singa_core)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/da23e577/test/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a8b0e29..92401ad 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,5 @@
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
 ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" "gtest/gtest-all.cc")
 
 AUX_SOURCE_DIRECTORY(singa singa_test_source)


[19/50] [abbrv] incubator-singa git commit: SINGA-185 Add CBLAS and GLOG detection for singav1

Posted by zh...@apache.org.
SINGA-185 Add CBLAS and GLOG detection for singav1

Add cblas and glog detection. USE_CBLAS is ON by default.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c6ae7861
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c6ae7861
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c6ae7861

Branch: refs/heads/master
Commit: c6ae7861afd6c65846ed0497cf94fb277e5f4897
Parents: 870d1a9
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Mon May 30 14:17:41 2016 +0800
Committer: xiezl <xi...@comp.nus.edu.sg>
Committed: Mon May 30 14:17:41 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                   |  2 +-
 cmake/Cuda.cmake                 |  2 +-
 cmake/Dependencies.cmake         | 16 ++++++++++++++--
 cmake/Thirdparty/FindCBLAS.cmake | 11 +++++++++++
 cmake/Thirdparty/FindGlog.cmake  | 11 +++++++++++
 cmake/Thirdparty/FindLMDB.cmake  |  2 +-
 6 files changed, 39 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c6ae7861/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d585497..fbe3adc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,7 +17,7 @@ SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
 INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
 
 #OPTION(CPU_ONLY "use GPU libs" OFF)
-OPTION(USE_CBLAS "Use CBlas libs" OFF)
+OPTION(USE_CBLAS "Use CBlas libs" ON)
 OPTION(USE_CUDA "Use Cuda libs" ON)
 OPTION(USE_CUDNN "Use Cudnn libs" ON)
 OPTION(USE_OPENCV "Use opencv" OFF)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c6ae7861/cmake/Cuda.cmake
----------------------------------------------------------------------
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index 4985bb8..a74c82b 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -22,4 +22,4 @@ ENDIF()
 
 INCLUDE_DIRECTORIES(SYSTEM ${CUDA_INCLUDE_DIRS})
 LIST(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
-MESSAGE(STATUS "libs " ${SINGA_LINKER_LIBS})
+#MESSAGE(STATUS "libs " ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c6ae7861/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index e995553..5505ad6 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1,11 +1,12 @@
-SET(Singa_LINKER_LIBS "")
+SET(SINGA_LINKER_LIBS "")
 
 INCLUDE("cmake/ProtoBuf.cmake")
 
 IF(USE_LMDB)
     FIND_PACKAGE(LMDB REQUIRED)
     INCLUDE_DIRECTORIES(SYSTEM ${LMDB_INCLUDE_DIR})
-    LIST(APPEND Singa_LINKER_LIBS ${LMDB_LIBRARIES})
+    LIST(APPEND SINGA_LINKER_LIBS ${LMDB_LIBRARIES})
+    MESSAGE(STATUS "FOUND lmdb at ${LMDB_INCLUDE_DIR}")
 ENDIF()
 
 IF(NOT CPU_ONLY)
@@ -15,6 +16,17 @@ ELSE()
     SET(USE_CUDNN FALSE)
 ENDIF()
 
+IF(USE_CBLAS)
+    FIND_PACKAGE(CBLAS REQUIRED)
+    INCLUDE_DIRECTORIES(SYSTEM ${CBLAS_INCLUDE_DIR})
+    LIST(APPEND SINGA_LINKER_LIBS ${CBLAS_LIBRARIES})
+    MESSAGE(STATUS "FOUND cblas at ${CBLAS_LIBRARIES}")
+ENDIF()
+
+FIND_PACKAGE(Glog REQUIRED)
+INCLUDE_DIRECTORIES(SYSTEM ${GLOG_INCLUDE_DIRS})
+LIST(APPEND SINGA_LINKER_LIBS ${GLOG_LIBRARIES})
+#MESSAGE(STATUS "Found glog at ${GLOG_INCLUDE_DIRS}")
 
 #LIST(APPEND SINGA_LINKER_LIBS "/home/wangwei/local/lib/libopenblas.so")
 #MESSAGE(STATUS "link lib : " ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c6ae7861/cmake/Thirdparty/FindCBLAS.cmake
----------------------------------------------------------------------
diff --git a/cmake/Thirdparty/FindCBLAS.cmake b/cmake/Thirdparty/FindCBLAS.cmake
new file mode 100644
index 0000000..413d1c1
--- /dev/null
+++ b/cmake/Thirdparty/FindCBLAS.cmake
@@ -0,0 +1,11 @@
+
+FIND_PATH(CBLAS_INCLUDE_DIR NAMES cblas.h PATHS "$ENV{CBLAS_DIR}/include")
+FIND_LIBRARY(CBLAS_LIBRARIES NAMES openblas PATHS "$ENV{CBLAS_DIR}/lib")
+
+INCLUDE(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIR CBLAS_LIBRARIES)
+
+IF(CBLAS_FOUND)
+    #    MESSAGE(STATUS "Found cblas at ${CBLAS_INCLUDE_DIR}")
+    MARK_AS_ADVANCED(CBLAS_INCLUDE_DIR CBLAS_LIBRARIES)
+ENDIF()
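
A quick way to confirm that the CBLAS found by this module actually
compiles and links is a one-file smoke test; this sketch is illustrative
only and not part of the commit:

    // cblas_smoke.cc -- build against CBLAS_INCLUDE_DIR/CBLAS_LIBRARIES
    // (e.g. OpenBLAS, which the module searches for by name).
    #include <cblas.h>
    #include <cstdio>

    int main() {
      const float x[3] = {1.f, 2.f, 3.f};
      const float y[3] = {4.f, 5.f, 6.f};
      // dot product: 1*4 + 2*5 + 3*6 = 32
      const float d = cblas_sdot(3, x, 1, y, 1);
      std::printf("dot = %f\n", d);
      return d == 32.f ? 0 : 1;
    }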

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c6ae7861/cmake/Thirdparty/FindGlog.cmake
----------------------------------------------------------------------
diff --git a/cmake/Thirdparty/FindGlog.cmake b/cmake/Thirdparty/FindGlog.cmake
new file mode 100644
index 0000000..c0fdf83
--- /dev/null
+++ b/cmake/Thirdparty/FindGlog.cmake
@@ -0,0 +1,11 @@
+
+FIND_PATH(GLOG_INCLUDE_DIR NAMES glog/logging.h PATHS "$ENV{GLOG_DIR}/include")
+FIND_LIBRARY(GLOG_LIBRARIES NAMES glog)
+
+INCLUDE(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GLOG DEFAULT_MSG GLOG_INCLUDE_DIR GLOG_LIBRARIES)
+
+IF(GLOG_FOUND)
+    #    MESSAGE(STATUS "Found glog at ${GLOG_INCLUDE_DIR}")
+    MARK_AS_ADVANCED(GLOG_INCLUDE_DIR GLOG_LIBRARIES)
+ENDIF()

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c6ae7861/cmake/Thirdparty/FindLMDB.cmake
----------------------------------------------------------------------
diff --git a/cmake/Thirdparty/FindLMDB.cmake b/cmake/Thirdparty/FindLMDB.cmake
index c402d99..cf45e00 100644
--- a/cmake/Thirdparty/FindLMDB.cmake
+++ b/cmake/Thirdparty/FindLMDB.cmake
@@ -6,7 +6,7 @@ INCLUDE(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(LMDB DEFAULT_MSG LMDB_INCLUDE_DIR LMDB_LIBRARIES)
 
 IF(LMDB_FOUND)
-    MESSAGE(STATUS "Found lmdb at $ENV{LMDB_DIR}")
+    MESSAGE(STATUS "Found lmdb at ${LMDB_INCLUDE_DIR}")
     MARK_AS_ADVANCED(LMDB_INCLUDE_DIR LMDB_LIBRARIES)
     
 ENDIF()


[34/50] [abbrv] incubator-singa git commit: SINGA-190 - Add prelu layer and flatten layer

Posted by zh...@apache.org.
SINGA-190 - Add prelu layer and flatten layer

Implement the PReLU and Flatten layers for the CPU version.

Write gtests for the PReLU and Flatten layers.

Pass all tests.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5afd81b7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5afd81b7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5afd81b7

Branch: refs/heads/master
Commit: 5afd81b7f4841b15ce292b5b2e3e26c25a79b912
Parents: 04e23d1
Author: jixin <ji...@comp.nus.edu.sg>
Authored: Wed Jun 8 15:58:09 2016 +0800
Committer: jixin <ji...@comp.nus.edu.sg>
Committed: Sat Jun 11 16:46:59 2016 +0800

----------------------------------------------------------------------
 src/model/layer/flatten.cc |  62 ++++++++
 src/model/layer/flatten.h  |  54 +++++++
 src/model/layer/prelu.cc   | 169 +++++++++++++++++++++
 src/model/layer/prelu.h    |  60 ++++++++
 src/proto/model.proto      | 321 ++++++++++++++++++----------------------
 test/singa/test_flatten.cc | 156 +++++++++++++++++++
 test/singa/test_prelu.cc   | 149 +++++++++++++++++++
 7 files changed, 793 insertions(+), 178 deletions(-)
----------------------------------------------------------------------
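
For reference, PReLU computes f(x) = max(x, 0) + a * min(x, 0) with a
learnable slope a, either shared across channels or kept per channel. A
minimal scalar sketch of the activation (the Tensor implementation
follows below):

    #include <algorithm>

    // PReLU for one element: positive inputs pass through,
    // negative inputs are scaled by the learnable slope `a`.
    inline float prelu(float x, float a) {
      return std::max(x, 0.f) + a * std::min(x, 0.f);
    }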


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5afd81b7/src/model/layer/flatten.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/flatten.cc b/src/model/layer/flatten.cc
new file mode 100644
index 0000000..3ed37fe
--- /dev/null
+++ b/src/model/layer/flatten.cc
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./flatten.h"
+namespace singa {
+
+void Flatten::Setup(const LayerConf &conf) {
+  Layer::Setup(conf);
+  axis_ = conf.flatten_conf().axis();
+}
+
+const Tensor Flatten::Forward(int flag, const Tensor &input) {
+  Tensor output = input;
+  input_shape_ = input.shape();
+  if (!Axis()) {
+    // reshape to 1D
+    size_t dim = output.Size();
+    output.Reshape(Shape {
+      dim
+    });
+    output_shape_ = Shape { dim }
+    ;
+  } else {
+    // reshape to 2D
+    size_t dim1 = 1, dim2;
+    for (int i = 0; i < Axis(); i++)
+      dim1 *= output.shape(i);
+    dim2 = output.Size() / dim1;
+    output.Reshape(Shape {
+      dim1, dim2
+    });
+    output_shape_ = Shape { dim1, dim2 }
+    ;
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor> > Flatten::Backward(int flag,
+                                                           const Tensor &grad) {
+  vector<Tensor> param_grad;
+  Tensor input_grad = grad;
+  input_grad.Reshape(Input_shape());
+  return std::make_pair(input_grad, param_grad);
+}
+
+} // namespace singa
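
In shape terms, Forward collapses the input to prod(shape) when axis_ is
0, and to (prod(shape[0:axis_]), prod(shape[axis_:])) otherwise. A
standalone sketch of that computation, mirroring the branches above:

    #include <cstddef>
    #include <vector>

    // Flattened shape for a given axis, as in Flatten::Forward.
    std::vector<size_t> FlattenShape(const std::vector<size_t> &in, int axis) {
      size_t total = 1;
      for (size_t d : in) total *= d;
      if (axis == 0) return {total};        // reshape to 1D
      size_t dim1 = 1;
      for (int i = 0; i < axis; i++) dim1 *= in[i];
      return {dim1, total / dim1};          // reshape to 2D
    }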

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5afd81b7/src/model/layer/flatten.h
----------------------------------------------------------------------
diff --git a/src/model/layer/flatten.h b/src/model/layer/flatten.h
new file mode 100644
index 0000000..cb36542
--- /dev/null
+++ b/src/model/layer/flatten.h
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_FLATTEN_H_
+#define SRC_MODEL_LAYER_FLATTEN_H_
+#include <utility>
+#include <string>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Flatten : public Layer {
+public:
+  /// \copydoc Layer::layer_type();
+  const std::string layer_type() const override { return "Flatten"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf &conf) override;
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&);
+  const Tensor Forward(int flag, const Tensor &input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor> > Backward(int flag,
+                                                    const Tensor &grad)
+      override;
+
+  const int Axis() const { return axis_; }
+  const Shape Input_shape() const { return input_shape_; }
+  const Shape Output_shape() const { return output_shape_; }
+
+protected:
+  /// flatten layer reshape the input to 2D, one from 0 to axis_-1, one from
+  /// axis_ to end.
+  /// if axis_ is 0, reshape the input to 1D.
+  int axis_;
+  Shape input_shape_, output_shape_;
+};
+}      // namespace singa
+#endif // SRC_MODEL_LAYER_FLATTEN_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5afd81b7/src/model/layer/prelu.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/prelu.cc b/src/model/layer/prelu.cc
new file mode 100644
index 0000000..1d6a2e7
--- /dev/null
+++ b/src/model/layer/prelu.cc
@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./prelu.h"
+namespace singa {
+
+void PReLU::Setup(const LayerConf &conf) {
+  Layer::Setup(conf);
+  channel_shared_ = conf.prelu_conf().channel_shared();
+  format_ = conf.prelu_conf().format();
+  // Push back params into param_values_
+  for (const auto &spec : conf.param())
+    param_specs_.push_back(spec);
+  param_values_.push_back(&a_);
+}
+
+const Tensor PReLU::Forward(int flag, const Tensor &input) {
+  Tensor output;
+  if (!channel_shared_) {
+    size_t n, c, h, w;
+    Tensor temp = (input <= 0.f);
+    if (temp.nDim() == 4) {
+      if (format_ == "NCHW") {
+        n = temp.shape(0);
+        c = temp.shape(1);
+        h = temp.shape(2);
+        w = temp.shape(3);
+        temp.Reshape(Shape {
+          n *c, h *w
+        });
+        Tensor temp_a(Shape {
+          n, c
+        });
+        Uniform(1.f, 1.f, &temp_a);
+        MultRow(a_, &temp_a);
+        temp_a.Reshape(Shape {
+          n *c
+        });
+        MultColumn(temp_a, &temp);
+      } else if (format_ == "NHWC") {
+        n = temp.shape(0);
+        h = temp.shape(1);
+        w = temp.shape(2);
+        c = temp.shape(3);
+        temp.Reshape(Shape {
+          n *h *w, c
+        });
+        MultRow(a_, &temp);
+      } else {
+        LOG(FATAL) << "Incorrect input format for prelu layer.";
+      }
+    } else {
+      LOG(FATAL) << "Incorrect input format for prelu layer.";
+    }
+    output = input * ((input > 0.f) + temp);
+  } else {
+    // share the first param of Tensor A along all channels
+    const float a = a_.data<const float *>()[0];
+    output = input * ((input > 0.f) + (input <= 0.f) * a);
+  }
+  if (flag & kTrain)
+    buf_.push(input);
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor> > PReLU::Backward(int flag,
+                                                         const Tensor &grad) {
+  vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
+  Tensor input_grad, input = buf_.top();
+  buf_.pop();
+  Tensor da;
+  da.ResetLike(a_);
+  if (!channel_shared_) {
+    size_t n, c, h, w;
+    Tensor temp1 = (input <= 0.f);
+    if (temp1.nDim() == 4) {
+      if (format_ == "NCHW") {
+        n = temp1.shape(0);
+        c = temp1.shape(1);
+        h = temp1.shape(2);
+        w = temp1.shape(3);
+        temp1.Reshape(Shape {
+          n *c, h *w
+        });
+        Tensor temp_a(Shape {
+          n, c
+        });
+        Uniform(1.f, 1.f, &temp_a);
+        MultRow(a_, &temp_a);
+        temp_a.Reshape(Shape {
+          n *c
+        });
+        MultColumn(temp_a, &temp1);
+        temp1.Reshape(Shape {
+          n, c, h, w
+        });
+      } else if (format_ == "NHWC") {
+        n = temp1.shape(0);
+        h = temp1.shape(1);
+        w = temp1.shape(2);
+        c = temp1.shape(3);
+        temp1.Reshape(Shape {
+          n *h *w, c
+        });
+        MultRow(a_, &temp1);
+        temp1.Reshape(Shape {
+          n, h, w, c
+        });
+      } else {
+        LOG(FATAL) << "Incorrect input format for prelu layer.";
+      }
+    } else {
+      LOG(FATAL) << "Incorrect input format for prelu layer.";
+    }
+    input_grad = grad * input * ((input > 0.f) + temp1);
+    Tensor temp2 = grad * input * (input <= 0.f), temp3(Shape {
+      n *c
+    });
+    if (format_ == "NCHW") {
+      temp2.Reshape(Shape {
+        n *c, h *w
+      });
+      SumColumns(temp2, &temp3);
+      temp3.Reshape(Shape {
+        n, c
+      });
+      SumRows(temp3, &da);
+    } else if (format_ == "NHWC") {
+      temp2.Reshape(Shape {
+        n *h *w, c
+      });
+      SumRows(temp2, &da);
+    }
+  } else {
+    // share the first param of Tensor A along all channels
+    const float a = a_.data<const float *>()[0];
+    input_grad = grad * input * ((input > 0.f) + (input <= 0.f) * a);
+    Tensor temp = grad * input * (input <= 0.f);
+    float sum = Sum<float>(temp);
+    Uniform(1.f, 1.f, &da);
+    da *= sum;
+  }
+  param_grad.push_back(da);
+  return std::make_pair(input_grad, param_grad);
+}
+
+void PReLU::ToDevice(Device *device) {
+  Layer::ToDevice(device);
+  a_.ToDevice(device);
+}
+
+} // namespace singa
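
The reshapes above exist so that the per-channel slope lines up with rows
or columns of a 2D view; equivalently, the channel of flat element i is
(i / (h * w)) % c under NCHW and i % c under NHWC. A small sketch of that
index mapping (the same indexing used by the tests further below):

    #include <cstddef>
    #include <string>

    // Channel index of flat element i for the two supported formats.
    size_t ChannelOf(size_t i, size_t c, size_t h, size_t w,
                     const std::string &format) {
      if (format == "NCHW") return (i / (h * w)) % c;
      return i % c;  // NHWC
    }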

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5afd81b7/src/model/layer/prelu.h
----------------------------------------------------------------------
diff --git a/src/model/layer/prelu.h b/src/model/layer/prelu.h
new file mode 100644
index 0000000..1a01d98
--- /dev/null
+++ b/src/model/layer/prelu.h
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_PRELU_H_
+#define SINGA_MODEL_LAYER_PRELU_H_
+#include <utility>
+#include <string>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+class PReLU : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+   const std::string layer_type() const override { return "PReLU"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf &conf) override;
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor &input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor> > Backward(int flag,
+                                                    const Tensor &grad)
+      override;
+
+  void ToDevice(Device *device);
+
+  const bool Channel_shared() const { return channel_shared_; }
+  const Tensor A() const { return a_; }
+  const std::string Format() const { return format_; }
+
+  void Set_a(Tensor a) {
+    a_.ResetLike(a);
+    a_.CopyData(a);
+  }
+
+ protected:
+  bool channel_shared_;
+  std::string format_; // format_ has two valid value, i.e. NCHW, NHWC
+  Tensor a_; // shape of a_ is 2D, i.e. (channels, 1)
+  std::stack<Tensor> buf_;
+};
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_PRELU_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5afd81b7/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index d368296..1d1f3cf 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -33,64 +33,59 @@ package singa;
 /// using Python (or C++/Java).
 
 // Specifies the shape (dimensions) of a Blob.
-message BlobShape {
-  repeated int64 dim = 1 [packed = true];
-}
+message BlobShape { repeated int64 dim = 1[packed = true]; }
 
 message BlobProto {
   optional BlobShape shape = 7;
-  repeated float data = 5 [packed = true];
-  repeated float diff = 6 [packed = true];
-  repeated double double_data = 8 [packed = true];
-  repeated double double_diff = 9 [packed = true];
+  repeated float data = 5[packed = true];
+  repeated float diff = 6[packed = true];
+  repeated double double_data = 8[packed = true];
+  repeated double double_diff = 9[packed = true];
 
   // 4D dimensions -- deprecated.  Use "shape" instead.
-  optional int32 num = 1 [default = 0];
-  optional int32 channels = 2 [default = 0];
-  optional int32 height = 3 [default = 0];
-  optional int32 width = 4 [default = 0];
+  optional int32 num = 1[default = 0];
+  optional int32 channels = 2[default = 0];
+  optional int32 height = 3[default = 0];
+  optional int32 width = 4[default = 0];
 }
 
 message FillerConf {
   // The filler type, case insensitive
-  optional string type = 1 [default = 'constant'];
-  optional float value = 2 [default = 0]; // the value in constant filler
-  optional float min = 3 [default = 0]; // the min value in uniform filler
-  optional float max = 4 [default = 1]; // the max value in uniform filler
-  optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
-  optional float std = 6 [default = 1]; // the std value in Gaussian filler
+  optional string type = 1[default = 'constant'];
+  optional float value = 2[default = 0]; // the value in constant filler
+  optional float min = 3[default = 0];   // the min value in uniform filler
+  optional float max = 4[default = 1];   // the max value in uniform filler
+  optional float mean = 5[default = 0];  // the mean value in Gaussian filler
+  optional float std = 6[default = 1];   // the std value in Gaussian filler
   // The expected number of non-zero output weights for a given input in
   // Gaussian filler -- the default -1 means don't perform sparsification.
   /* optional int32 sparse = 7 [default = -1]; */
   // Normalize the filler variance by fan_in, fan_out, or their average.
   // Applies to 'xavier' and 'msra' fillers.
   enum VarianceNorm {
-    FAN_IN = 0;
-    FAN_OUT = 1;
-    AVERAGE = 2;
-  }
-  optional VarianceNorm variance_norm = 8 [default = FAN_IN];
+    FAN_IN = 0; FAN_OUT = 1; AVERAGE = 2;
+  } optional VarianceNorm variance_norm = 8[default = FAN_IN];
 }
 
 /// SINGA message
 message OptimizerConf {
   // case insensitive
-  optional string type = 1 [default = "sgd"];
+  optional string type = 1[default = "sgd"];
 
   // used by RMSprop and Adadelta
-  optional float rho = 2 [default = 0.001];
+  optional float rho = 2[default = 0.001];
 
   // used by Adam and AdamMax
-  optional float beta_1 = 3 [default = 0.9];
-  optional float beta_2 = 4 [default = 0.999];
+  optional float beta_1 = 3[default = 0.9];
+  optional float beta_2 = 4[default = 0.999];
 
   // used by vanilla sgd and nesterov
-  optional float momentum = 5 [default = 0.9];
+  optional float momentum = 5[default = 0.9];
 }
 
 message ConstraintConf {
   // case insensitive to limit the parameter value/gradient scale
-  optional string type = 1 [default = "l2"];
+  optional string type = 1[default = "l2"];
   // e.g., the threshold for limiting the parameter scale.
   optional float threshold = 2;
 }
@@ -98,7 +93,7 @@ message ConstraintConf {
 /// SINGA message
 message RegularizerConf {
   // case insensitive to regularize the parameters, e.g., L2.
-  optional string type = 1 [default = "l2"];
+  optional string type = 1[default = "l2"];
   // e.g., the weight decay for L2 regularizer
   optional float coefficient = 2;
 }
@@ -124,10 +119,10 @@ message ParamSpec {
   */
 
   // The multiplier on the global learning rate for this parameter.
-  optional float lr_mult = 3 [default = 1.0];
+  optional float lr_mult = 3[default = 1.0];
 
   // The multiplier on the global weight decay for this parameter.
-  optional float decay_mult = 4 [default = 1.0];
+  optional float decay_mult = 4[default = 1.0];
 
   // SINGA uses this filed internally. Users just configure the fillers in
   // Layer specific conf message as caffe (style).
@@ -137,14 +132,13 @@ message ParamSpec {
 }
 
 enum Phase {
-  kTrain = 4;
-  kEval = 8;
-}
-// NOTE
-// Update the next available ID when you add a new LayerConf field.
-//
-// LayerConf next available layer-specific ID: 139 (last added: tile_param)
-message LayerConf {
+  kTrain = 4; kEval = 8;
+}
+    // NOTE
+    // Update the next available ID when you add a new LayerConf field.
+    //
+    // LayerConf next available layer-specific ID: 139 (last added: tile_param)
+    message LayerConf {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
   /* repeated string bottom = 3; // the name of each bottom blob */
@@ -248,7 +242,8 @@ message TransformationConf {
   optional uint32 crop_size = 3 [default = 0];
   // mean_file and mean_value cannot be specified at the same time
   optional string mean_file = 4;
-  // if specified can be repeated once (would substract it from all the channels)
+  // if specified can be repeated once (would subtract it from all the
+channels)
   // or can be repeated the same number of times as channels
   // (would subtract them from the corresponding channel)
   repeated float mean_value = 5;
@@ -265,34 +260,33 @@ message LossConf {
   optional int32 ignore_label = 1;
   // If true, normalize each batch across all instances (including spatial
   // dimesions, but not ignored instances); else, divide by batch size only.
-  optional bool normalize = 2 [default = true];
+  optional bool normalize = 2[default = true];
 }
 
 message MetricConf {
   // When computing accuracy, count as correct by comparing the true label to
   // the top k scoring classes.  By default, only compare to the top scoring
   // class (i.e. argmax).
-  optional uint32 top_k = 1 [default = 1];
+  optional uint32 top_k = 1[default = 1];
 
   // The "label" axis of the prediction blob, whose argmax corresponds to the
   // predicted label -- may be negative to index from the end (e.g., -1 for the
   // last axis).  For example, if axis == 1 and the predictions are
   // (N x C x H x W), the label blob is expected to contain N*H*W ground truth
   // labels with integer values in {0, 1, ..., C-1}.
-  optional int32 axis = 2 [default = 1];
+  optional int32 axis = 2[default = 1];
 
   // If specified, ignore instances with the given label.
   optional int32 ignore_label = 3;
 }
-// Messages that store hyper-parameters used by individual layer types follow, in
+// Messages that store hyper-parameters used by individual layer types follow,
+// in
 // alphabetical order.
 
-
-
 message ArgMaxConf {
   // If true produce pairs (argmax, maxval)
-  optional bool out_max_val = 1 [default = false];
-  optional uint32 top_k = 2 [default = 1];
+  optional bool out_max_val = 1[default = false];
+  optional uint32 top_k = 2[default = 1];
   // The axis along which to maximise -- may be negative to index from the
   // end (e.g., -1 for the last axis).
   // By default ArgMaxLayer maximizes over the flattened trailing dimensions
@@ -305,54 +299,51 @@ message ConcatConf {
   // end (e.g., -1 for the last axis).  Other axes must have the
   // same dimension for all the bottom blobs.
   // By default, ConcatLayer concatenates blobs along the "channels" axis (1).
-  optional int32 axis = 2 [default = 1];
+  optional int32 axis = 2[default = 1];
 
   // DEPRECATED: alias for "axis" -- does not support negative indexing.
-  optional uint32 concat_dim = 1 [default = 1];
+  optional uint32 concat_dim = 1[default = 1];
 }
 
 message ContrastiveLossConf {
   // margin for dissimilar pair
-  optional float margin = 1 [default = 1.0];
+  optional float margin = 1[default = 1.0];
   // The first implementation of this cost did not exactly match the cost of
   // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
   // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
   // Hadsell paper. New models should probably use this version.
   // legacy_version = true uses (margin - d^2). This is kept to support /
   // reproduce existing models and results
-  optional bool legacy_version = 2 [default = false];
+  optional bool legacy_version = 2[default = false];
 }
 
 message ConvolutionConf {
   optional uint32 num_output = 1; // The number of outputs for the layer
-  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional bool bias_term = 2[default = true]; // whether to have bias terms
 
   // Pad, kernel size, and stride are all given as a single value for equal
   // dimensions in all spatial dimensions, or once per spatial dimension.
-  repeated uint32 pad = 3; // The padding size; defaults to 0
+  repeated uint32 pad = 3;         // The padding size; defaults to 0
   repeated uint32 kernel_size = 4; // The kernel size
-  repeated uint32 stride = 6; // The stride; defaults to 1
+  repeated uint32 stride = 6;      // The stride; defaults to 1
 
   // For 2D convolution only, the *_h and *_w versions may also be used to
   // specify both spatial dimensions.
-  optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
-  optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
-  optional uint32 kernel_h = 11; // The kernel height (2D only)
-  optional uint32 kernel_w = 12; // The kernel width (2D only)
-  optional uint32 stride_h = 13; // The stride height (2D only)
-  optional uint32 stride_w = 14; // The stride width (2D only)
+  optional uint32 pad_h = 9[default = 0];  // The padding height (2D only)
+  optional uint32 pad_w = 10[default = 0]; // The padding width (2D only)
+  optional uint32 kernel_h = 11;           // The kernel height (2D only)
+  optional uint32 kernel_w = 12;           // The kernel width (2D only)
+  optional uint32 stride_h = 13;           // The stride height (2D only)
+  optional uint32 stride_w = 14;           // The stride width (2D only)
 
   // SINGA: not supported.
   // optional uint32 group = 5 [default = 1]; // The group size for group conv
 
   optional FillerConf weight_filler = 7; // The filler for the weight
-  optional FillerConf bias_filler = 8; // The filler for the bias
+  optional FillerConf bias_filler = 8;   // The filler for the bias
   enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 15 [default = DEFAULT];
+    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
+  } optional Engine engine = 15[default = DEFAULT];
 
   // The axis to interpret as "channels" when performing convolution.
   // Preceding dimensions are treated as independent inputs;
@@ -374,13 +365,12 @@ message ConvolutionConf {
   // SINGA: not supported;
   // optional bool force_nd_im2col = 17 [default = false];
 
-
   // SINGA: add by xiangrui
   // cudnn workspace size in MB
-  optional int32 workspace_byte_limit = 50 [default = 512];
+  optional int32 workspace_byte_limit = 50[default = 512];
   // cudnn algorithm preference
   // options: "fastest", "limited_workspace", "no_workspace"
-  optional string prefer = 51 [default = "fastest"];
+  optional string prefer = 51[default = "fastest"];
   // input shape
   optional int32 channels = 52;
   optional int32 height = 53;
@@ -424,7 +414,7 @@ message DataConf {
 */
 
 message DropoutConf {
-  optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
+  optional float dropout_ratio = 1[default = 0.5]; // dropout ratio
 }
 
 // DummyDataLayer fills any number of arbitrarily shaped blobs with random
@@ -448,16 +438,13 @@ message DummyDataConf {
 
 message EltwiseConf {
   enum EltwiseOp {
-    PROD = 0;
-    SUM = 1;
-    MAX = 2;
-  }
-  optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
+    PROD = 0; SUM = 1; MAX = 2;
+  } optional EltwiseOp operation = 1[default = SUM]; // element-wise operation
   repeated float coeff = 2; // blob-wise coefficient for SUM operation
 
   // Whether to use an asymptotically slower (for >2 inputs) but stabler method
   // of computing the gradient for the PROD operation. (No effect for SUM op.)
-  optional bool stable_prod_grad = 3 [default = true];
+  optional bool stable_prod_grad = 3[default = true];
 }
 
 // Message that stores hyper-parameters used by EmbedLayer
@@ -468,9 +455,9 @@ message EmbedConf {
   // 1 greater than the maximum possible input value.
   optional uint32 input_dim = 2;
 
-  optional bool bias_term = 3 [default = true]; // Whether to use a bias term
-  optional FillerConf weight_filler = 4; // The filler for the weight
-  optional FillerConf bias_filler = 5; // The filler for the bias
+  optional bool bias_term = 3[default = true]; // Whether to use a bias term
+  optional FillerConf weight_filler = 4;       // The filler for the weight
+  optional FillerConf bias_filler = 5;         // The filler for the bias
 
 }
 
@@ -479,21 +466,21 @@ message ExpConf {
   // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
   // Or if base is set to the default (-1), base is set to e,
   // so y = exp(shift + scale * x).
-  optional float base = 1 [default = -1.0];
-  optional float scale = 2 [default = 1.0];
-  optional float shift = 3 [default = 0.0];
+  optional float base = 1[default = -1.0];
+  optional float scale = 2[default = 1.0];
+  optional float shift = 3[default = 0.0];
 }
 
 /// Message that stores hyper-parameters used by FlattenLayer
 message FlattenConf {
   // The first axis to flatten: all preceding axes are retained in the output.
   // May be negative to index from the end (e.g., -1 for the last axis).
-  optional int32 axis = 1 [default = 1];
+  optional int32 axis = 1[default = 1];
 
   // The last axis to flatten: all following axes are retained in the output.
   // May be negative to index from the end (e.g., the default -1 for the last
   // axis).
-  optional int32 end_axis = 2 [default = -1];
+  optional int32 end_axis = 2[default = -1];
 }
 
 /*
@@ -519,11 +506,10 @@ message HDF5OutputConf {
 
 message HingeLossConf {
   enum Norm {
-    L1 = 1;
-    L2 = 2;
+    L1 = 1; L2 = 2;
   }
-  // Specify the Norm to use L1 or L2
-  optional Norm norm = 1 [default = L1];
+      // Specify the Norm to use L1 or L2
+      optional Norm norm = 1[default = L1];
 }
 
 /*
@@ -566,29 +552,29 @@ message InfogainLossConf {
 
 message InnerProductConf {
   optional uint32 num_output = 1; // The number of outputs for the layer
-  optional bool bias_term = 2 [default = true]; // whether to have bias terms
-  optional FillerConf weight_filler = 3; // The filler for the weight
-  optional FillerConf bias_filler = 4; // The filler for the bias
+  optional bool bias_term = 2[default = true]; // whether to have bias terms
+  optional FillerConf weight_filler = 3;       // The filler for the weight
+  optional FillerConf bias_filler = 4;         // The filler for the bias
 
   // The first axis to be lumped into a single inner product computation;
   // all preceding axes are retained in the output.
   // May be negative to index from the end (e.g., -1 for the last axis).
-  optional int32 axis = 5 [default = 1];
+  optional int32 axis = 5[default = 1];
 }
 
 message DenseConf {
   optional uint32 num_output = 1; // The number of outputs for the layer
-  optional bool bias_term = 2 [default = true]; // whether to have bias terms
-  optional FillerConf weight_filler = 3; // The filler for the weight
-  optional FillerConf bias_filler = 4; // The filler for the bias
+  optional bool bias_term = 2[default = true]; // whether to have bias terms
+  optional FillerConf weight_filler = 3;       // The filler for the weight
+  optional FillerConf bias_filler = 4;         // The filler for the bias
 
   // The first axis to be lumped into a single inner product computation;
   // all preceding axes are retained in the output.
   // May be negative to index from the end (e.g., -1 for the last axis).
-  optional int32 axis = 5 [default = 1];
+  optional int32 axis = 5[default = 1];
 
   optional uint32 num_input = 20; // The number of inputs for the layer
-  optional bool transpose = 21 [default = false]; // whether transpose or not
+  optional bool transpose = 21[default = false]; // whether transpose or not
 }
 
 // Message that stores hyper-parameters used by LogLayer
@@ -596,22 +582,20 @@ message LogConf {
   // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
   // Or if base is set to the default (-1), base is set to e,
   // so y = ln(shift + scale * x) = log_e(shift + scale * x)
-  optional float base = 1 [default = -1.0];
-  optional float scale = 2 [default = 1.0];
-  optional float shift = 3 [default = 0.0];
+  optional float base = 1[default = -1.0];
+  optional float scale = 2[default = 1.0];
+  optional float shift = 3[default = 0.0];
 }
 
 // Message that stores hyper-parameters used by LRNLayer
 message LRNConf {
-  optional uint32 local_size = 1 [default = 5];
-  optional float alpha = 2 [default = 1.];
-  optional float beta = 3 [default = 0.75];
+  optional uint32 local_size = 1[default = 5];
+  optional float alpha = 2[default = 1.];
+  optional float beta = 3[default = 0.75];
   enum NormRegion {
-    ACROSS_CHANNELS = 0;
-    WITHIN_CHANNEL = 1;
-  }
-  optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
-  optional float k = 5 [default = 1.];
+    ACROSS_CHANNELS = 0; WITHIN_CHANNEL = 1;
+  } optional NormRegion norm_region = 4[default = ACROSS_CHANNELS];
+  optional float k = 5[default = 1.];
 }
 
 message MemoryDataConf {
@@ -623,33 +607,30 @@ message MemoryDataConf {
 
 message MVNConf {
   // This parameter can be set to false to normalize mean only
-  optional bool normalize_variance = 1 [default = true];
+  optional bool normalize_variance = 1[default = true];
 
   // This parameter can be set to true to perform DNN-like MVN
-  optional bool across_channels = 2 [default = false];
+  optional bool across_channels = 2[default = false];
 
   // Epsilon for not dividing by zero while normalizing variance
-  optional float eps = 3 [default = 1e-9];
+  optional float eps = 3[default = 1e-9];
 }
 
 message PoolingConf {
   enum PoolMethod {
-    MAX = 0;
-    AVE = 1;
-    STOCHASTIC = 2;
-  }
-  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
+    MAX = 0; AVE = 1; STOCHASTIC = 2;
+  } optional PoolMethod pool = 1[default = MAX]; // The pooling method
   // Pad, kernel size, and stride are all given as a single value for equal
   // dimensions in height and width or as Y, X pairs.
-  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
-  optional uint32 pad_h = 9 [default = 0]; // The padding height
-  optional uint32 pad_w = 10 [default = 0]; // The padding width
-  optional uint32 kernel_size = 2; // The kernel size (square)
-  optional uint32 kernel_h = 5; // The kernel height
-  optional uint32 kernel_w = 6; // The kernel width
-  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
-  optional uint32 stride_h = 7; // The stride height
-  optional uint32 stride_w = 8; // The stride width
+  optional uint32 pad = 4[default = 0];    // The padding size (equal in Y, X)
+  optional uint32 pad_h = 9[default = 0];  // The padding height
+  optional uint32 pad_w = 10[default = 0]; // The padding width
+  optional uint32 kernel_size = 2;         // The kernel size (square)
+  optional uint32 kernel_h = 5;            // The kernel height
+  optional uint32 kernel_w = 6;            // The kernel width
+  optional uint32 stride = 3[default = 1]; // The stride (equal in Y, X)
+  optional uint32 stride_h = 7;            // The stride height
+  optional uint32 stride_w = 8;            // The stride width
   /*
   enum Engine {
     DEFAULT = 0;
@@ -660,20 +641,20 @@ message PoolingConf {
   */
   // If global_pooling then it will pool over the size of the bottom by doing
   // kernel_h = bottom->height and kernel_w = bottom->width
-  optional bool global_pooling = 12 [default = false];
+  optional bool global_pooling = 12[default = false];
   // Shape of source
   optional int32 channels = 50;
   optional int32 height = 51;
   optional int32 width = 52;
   // whether to propagate nan
-  optional bool nan_prop = 53 [default = false];
+  optional bool nan_prop = 53[default = false];
 }
 
 message PowerConf {
   // PowerLayer computes outputs y = (shift + scale * x) ^ power.
-  optional float power = 1 [default = 1.0];
-  optional float scale = 2 [default = 1.0];
-  optional float shift = 3 [default = 0.0];
+  optional float power = 1[default = 1.0];
+  optional float scale = 2[default = 1.0];
+  optional float shift = 3[default = 0.0];
 }
 /*
 message PythonConf {
@@ -684,7 +665,8 @@ message PythonConf {
   // string, dictionary in Python dict format, JSON, etc. You may parse this
   // string in `setup` method and use it in `forward` and `backward`.
   optional string param_str = 3 [default = ''];
-  // Whether this PythonLayer is shared among worker solvers during data parallelism.
+  // Whether this PythonLayer is shared among worker solvers during data
+parallelism.
   // If true, each worker solver sequentially run forward from this layer.
   // This value should be set true if you are using it as a data layer.
   optional bool share_in_parallel = 4 [default = false];
@@ -694,13 +676,8 @@ message PythonConf {
 // Message that stores hyper-parameters used by ReductionLayer
 message ReductionConf {
   enum ReductionOp {
-    SUM = 1;
-    ASUM = 2;
-    SUMSQ = 3;
-    MEAN = 4;
-  }
-
-  optional ReductionOp operation = 1 [default = SUM]; // reduction operation
+    SUM = 1; ASUM = 2; SUMSQ = 3; MEAN = 4;
+  } optional ReductionOp operation = 1[default = SUM]; // reduction operation
 
   // The first axis to reduce to a scalar -- may be negative to index from the
   // end (e.g., -1 for the last axis).
@@ -715,9 +692,9 @@ message ReductionConf {
   // If axis == 0 (the default), the output Blob always has the empty shape
   // (count 1), performing reduction across the entire input --
   // often useful for creating new loss functions.
-  optional int32 axis = 2 [default = 0];
+  optional int32 axis = 2[default = 0];
 
-  optional float coeff = 3 [default = 1.0]; // coefficient for output
+  optional float coeff = 3[default = 1.0]; // coefficient for output
 }
 
 // Message that stores hyper-parameters used by ReLULayer
@@ -727,7 +704,7 @@ message ReLUConf {
   // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
   // improve neural network acoustic models. In ICML Workshop on Deep Learning
   // for Audio, Speech, and Language Processing.
-  optional float negative_slope = 1 [default = 0];
+  optional float negative_slope = 1[default = 0];
   /*
   enum Engine {
     DEFAULT = 0;
@@ -798,58 +775,50 @@ message ReshapeConf {
   //   reshape_param { shape { dim: 2  dim: 1  dim: 8  }  }
   //   reshape_param { shape { dim: 1 }  axis: 1  num_axes: 0 }
   //
-  optional int32 axis = 2 [default = 0];
-  optional int32 num_axes = 3 [default = -1];
+  optional int32 axis = 2[default = 0];
+  optional int32 num_axes = 3[default = -1];
 }
 
 message SigmoidConf {
   enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 1 [default = DEFAULT];
+    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
+  } optional Engine engine = 1[default = DEFAULT];
 }
 
 message SliceConf {
   // The axis along which to slice -- may be negative to index from the end
   // (e.g., -1 for the last axis).
   // By default, SliceLayer concatenates blobs along the "channels" axis (1).
-  optional int32 axis = 3 [default = 1];
+  optional int32 axis = 3[default = 1];
   repeated uint32 slice_point = 2;
 
   // DEPRECATED: alias for "axis" -- does not support negative indexing.
-  optional uint32 slice_dim = 1 [default = 1];
+  optional uint32 slice_dim = 1[default = 1];
 }
 
-// Message that stores hyper-parameters used by SoftmaxLayer, SoftmaxWithLossLayer
+// Message that stores hyper-parameters used by SoftmaxLayer,
+// SoftmaxWithLossLayer
 message SoftmaxConf {
   enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 1 [default = DEFAULT];
+    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
+  } optional Engine engine = 1[default = DEFAULT];
 
   // The axis along which to perform the softmax -- may be negative to index
   // from the end (e.g., -1 for the last axis).
   // Any other axes will be evaluated as independent softmaxes.
-  optional int32 axis = 2 [default = 1];
+  optional int32 axis = 2[default = 1];
 }
 
 message TanHConf {
   enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 1 [default = DEFAULT];
+    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
+  } optional Engine engine = 1[default = DEFAULT];
 }
 
 // Message that stores hyper-parameters used by TileLayer
 message TileConf {
   // The index of the axis to tile.
-  optional int32 axis = 1 [default = 1];
+  optional int32 axis = 1[default = 1];
 
   // The number of copies (tiles) of the blob to output.
   optional int32 tiles = 2;
@@ -857,7 +826,7 @@ message TileConf {
 
 // Message that stores hyper-parameters used by ThresholdLayer
 message ThresholdConf {
-  optional float threshold = 1 [default = 0]; // Strictly positive values
+  optional float threshold = 1[default = 0]; // Strictly positive values
 }
 
 /*
@@ -897,18 +866,12 @@ message WindowDataConf {
 
 message SPPConf {
   enum PoolMethod {
-    MAX = 0;
-    AVE = 1;
-    STOCHASTIC = 2;
-  }
-  optional uint32 pyramid_height = 1;
-  optional PoolMethod pool = 2 [default = MAX]; // The pooling method
+    MAX = 0; AVE = 1; STOCHASTIC = 2;
+  } optional uint32 pyramid_height = 1;
+  optional PoolMethod pool = 2[default = MAX]; // The pooling method
   enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 6 [default = DEFAULT];
+    DEFAULT = 0; CAFFE = 1; CUDNN = 2;
+  } optional Engine engine = 6[default = DEFAULT];
 }
 
 message PReLUConf {
@@ -918,13 +881,15 @@ message PReLUConf {
   // Initial value of a_i. Default is a_i=0.25 for all i.
   optional FillerConf filler = 1;
   // Whether or not slope paramters are shared across channels.
-  optional bool channel_shared = 2 [default = false];
+  optional bool channel_shared = 2[default = false];
+  // format of the input. Default is NCHW.
+  optional string format = 50[default = "NCHW"];
 }
 
 message BatchNormConf {
   // Used in the moving average computation runningMean =
   // newMean*factor + runningMean*(1-factor).
-  optional double factor = 1 [default = 0.9];
+  optional double factor = 1[default = 0.9];
   // input shape
   optional int32 channels = 2;
   optional int32 height = 3;
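
The new PReLUConf fields are consumed through the generated C++ API; a
layer is configured roughly as follows (mirroring the gtest code below):

    singa::LayerConf conf;
    singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
    preluconf->set_channel_shared(false);
    preluconf->set_format("NHWC");  // format field added by this commit
    // prelu.Setup(conf);           // as in test_prelu.cc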

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5afd81b7/test/singa/test_flatten.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_flatten.cc b/test/singa/test_flatten.cc
new file mode 100644
index 0000000..906e4b8
--- /dev/null
+++ b/test/singa/test_flatten.cc
@@ -0,0 +1,156 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/flatten.h"
+#include "gtest/gtest.h"
+
+using singa::Flatten;
+TEST(Flatten, Setup) {
+  Flatten flt;
+  EXPECT_EQ("Flatten", flt.layer_type());
+
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(1);
+
+  flt.Setup(conf);
+  EXPECT_EQ(1, flt.Axis());
+}
+
+TEST(Flatten, ForwardCPU) {
+  const float x[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -4.f, 1.5f, -1.5f, 0.f, -0.5f,
+                      -2.f, -1.f };
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Shape s = { 2, 1, 3, 2 };
+  singa::Tensor in(s);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  int axis = 3;
+  Flatten flt;
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(axis);
+  flt.Setup(conf);
+
+  singa::Tensor out = flt.Forward(singa::kTrain, in);
+  EXPECT_EQ(n, out.Size());
+  EXPECT_EQ(6, out.shape(0));
+  EXPECT_EQ(2, out.shape(1));
+  const float *yptr = out.data<const float *>();
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(x[i], yptr[i]);
+}
+
+TEST(Flatten, BackwardCPU) {
+  // directly use input as the output_grad for backward
+  // note that only the shape of input really matters
+  const float dy[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -4.f, 1.5f, -1.5f, 0.f, -0.5f,
+                       -2.f, -1.f };
+  size_t n = sizeof(dy) / sizeof(float);
+  singa::Tensor in(singa::Shape {
+    2, 1, 3, 2
+  });
+  in.CopyDataFromHostPtr<float>(dy, n);
+
+  int axis = 2;
+  Flatten flt;
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(axis);
+  flt.Setup(conf);
+
+  singa::Tensor temp = flt.Forward(singa::kTrain, in);
+  const auto out = flt.Backward(singa::kTrain, temp);
+  const float *xptr = out.first.data<const float *>();
+  EXPECT_EQ(n, out.first.Size());
+  EXPECT_EQ(2, out.first.shape(0));
+  EXPECT_EQ(1, out.first.shape(1));
+  EXPECT_EQ(3, out.first.shape(2));
+  EXPECT_EQ(2, out.first.shape(3));
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+}
+
+#ifdef USE_CUDA
+TEST(Flatten, ForwardGPU) {
+  const float x[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -4.f, 1.5f, -1.5f, 0.f, -0.5f,
+                      -2.f, -1.f };
+  size_t n = sizeof(x) / sizeof(float);
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape {
+    2, 1, 3, 2
+  },
+                   &cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  int axis = 3;
+  Flatten flt;
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(axis);
+  flt.Setup(conf);
+
+  singa::Tensor out = flt.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out.ToDevice(&host);
+  EXPECT_EQ(n, out.Size());
+  EXPECT_EQ(6, out.shape(0));
+  EXPECT_EQ(2, out.shape(1));
+  const float *yptr = out.data<const float *>();
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(x[i], yptr[i]);
+}
+
+TEST(Flatten, BackwardGPU) {
+  // directly use input as the output_grad for backward
+  // note that only the shape of input really matters
+  const float dy[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -4.f, 1.5f, -1.5f, 0.f, -0.5f,
+                       -2.f, -1.f };
+  size_t n = sizeof(dy) / sizeof(float);
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape {
+    2, 1, 3, 2
+  },
+                   &cuda);
+  in.CopyDataFromHostPtr<float>(dy, n);
+
+  int axis = 2;
+  Flatten flt;
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(axis);
+  flt.Setup(conf);
+
+  singa::Tensor out = flt.Forward(singa::kTrain, in);
+  const auto ret = flt.Backward(singa::kTrain, out);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_diff = ret.first;
+  in_diff.ToDevice(&host);
+  const float *xptr = in_diff.data<const float *>();
+  EXPECT_EQ(n, in_diff.Size());
+  EXPECT_EQ(2, in_diff.shape(0));
+  EXPECT_EQ(1, in_diff.shape(1));
+  EXPECT_EQ(3, in_diff.shape(2));
+  EXPECT_EQ(2, in_diff.shape(3));
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+}
+#endif // USE_CUDA
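
As a worked example of the shape math these tests assert: input shape
{2, 1, 3, 2} with axis = 3 flattens to (2 * 1 * 3, 2) = (6, 2); with
axis = 2 the forward pass yields (2 * 1, 3 * 2) = (2, 6), and Backward
restores {2, 1, 3, 2} from the recorded input_shape_.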

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5afd81b7/test/singa/test_prelu.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_prelu.cc b/test/singa/test_prelu.cc
new file mode 100644
index 0000000..2dde9e9
--- /dev/null
+++ b/test/singa/test_prelu.cc
@@ -0,0 +1,149 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/prelu.h"
+#include "gtest/gtest.h"
+#include "singa_config.h"
+
+using singa::PReLU;
+TEST(PReLU, Setup) {
+  PReLU prelu;
+  EXPECT_EQ("PReLU", prelu.layer_type());
+
+  singa::LayerConf conf;
+  singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+  preluconf->set_channel_shared(true);
+  preluconf->set_format("NHWC");
+
+  prelu.Setup(conf);
+  EXPECT_EQ(true, prelu.Channel_shared());
+  EXPECT_EQ("NHWC", prelu.Format());
+}
+
+TEST(PReLU, ForwardCPU) {
+  const float x[] = { 1.f, 2.f, 3.f, -2.f, -3.f, -1.f, -1.f, 2.f, -1.f, -2.f,
+                      -2.f, -1.f };
+  size_t n = sizeof(x) / sizeof(float);
+  size_t batchsize = 2, c = 3, h = 2, w = 1;
+  singa::Tensor in(singa::Shape {
+    batchsize, h, w, c
+  });
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  PReLU prelu;
+  singa::LayerConf conf;
+  singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+  preluconf->set_channel_shared(false);
+  preluconf->set_format("NHWC");
+  prelu.Setup(conf);
+
+  const float neg_slope[] = { 0.25f, 0.5f, 0.75f };
+  singa::Tensor a(singa::Shape {
+    c
+  });
+  a.CopyDataFromHostPtr<float>(neg_slope, c);
+  prelu.Set_a(a);
+
+  singa::Tensor out = prelu.Forward(singa::kTrain, in);
+  const float *yptr = out.data<const float *>();
+  EXPECT_EQ(n, out.Size());
+
+  float *y = new float[n];
+  size_t div_factor = prelu.Channel_shared() ? c : 1;
+  if (prelu.Format() == "NCHW") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+    }
+  } else if (prelu.Format() == "NHWC") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+    }
+  }
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(y[i], yptr[i]);
+}
+
+TEST(PReLU, BackwardCPU) {
+  const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f, -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t batchsize = 2, c = 3, h = 2, w = 1;
+  singa::Tensor in(singa::Shape {
+    batchsize, c, h, w
+  });
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  PReLU prelu;
+  singa::LayerConf conf;
+  singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+  preluconf->set_channel_shared(false);
+  preluconf->set_format("NCHW");
+  prelu.Setup(conf);
+
+  const float neg_slope[] = { 0.25f, 0.5f, 0.75f };
+  singa::Tensor a(singa::Shape {
+    c
+  });
+  a.CopyDataFromHostPtr<float>(neg_slope, c);
+  prelu.Set_a(a);
+
+  singa::Tensor out = prelu.Forward(singa::kTrain, in);
+
+  const float grad[] = { 1.f, 2.f, -2.f, -1.f, -1.f, -3.f, 2.f, -2.f, 1.f, 1.f,
+                         -2.f, 0.f };
+  singa::Tensor out_diff(singa::Shape {
+    batchsize, c, h, w
+  });
+  out_diff.CopyDataFromHostPtr<float>(grad, n);
+  const auto ret = prelu.Backward(singa::kTrain, out_diff);
+  const float *xptr = ret.first.data<const float *>();
+  const float *aptr = ret.second.at(0).data<const float *>();
+  float *dx = new float[n];
+  size_t div_factor = prelu.Channel_shared() ? c : 1;
+  size_t params = prelu.Channel_shared() ? 1 : c;
+  float da[] = { 0.f, 0.f, 0.f };
+  if (prelu.Format() == "NCHW") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      dx[i] = grad[i] *
+              (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+    }
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      da[pos] += grad[i] * std::min(x[i], 0.f);
+    }
+  } else if (prelu.Format() == "NHWC") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      dx[i] = grad[i] *
+              (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+    }
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      da[pos] += grad[i] * std::min(x[i], 0.f);
+    }
+  }
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+  for (size_t i = 0; i < params; i++)
+    EXPECT_FLOAT_EQ(da[i], aptr[i]);
+  delete[] dx;
+}
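
For reference, the element-wise rule these tests encode, with a_c denoting the slope of channel c (a single shared slope when channel_shared is true), is

    y = max(x, 0) + a_c * min(x, 0)

and the slope gradient accumulates da_c += dy * min(x, 0) over every position that maps to channel c.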


[18/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
SINGA-182 Clean math function APIs and implementations

Implement GEMM/DGMM to support summing rows/columns and the add/sub/mult/div row/column operations.

All tests pass.

Format the code and update the consistency guide used for code cleaning.

Add a compile guard for USE_CBLAS.
TODO: detect CBLAS via CMake and set USE_CBLAS accordingly.
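
For background on the GEMM route: summing all rows of a matrix reduces to a single matrix-vector product with a vector of ones, and DGMM (e.g., cublasSdgmm on the GPU) scales each row or column of a matrix by the entries of a vector, which covers the mult-row/column cases. A minimal sketch of the rows-sum idea, assuming row-major float storage and a CBLAS installation (the function name is illustrative, not part of SINGA's API):

    #include <cblas.h>
    #include <vector>

    // Sum all rows of the nrow x ncol matrix M into 'out' (length ncol):
    // out = M^T * ones, computed with a single SGEMV call.
    void SumRowsViaGemv(int nrow, int ncol, const float *M, float *out) {
      std::vector<float> ones(nrow, 1.0f);
      cblas_sgemv(CblasRowMajor, CblasTrans, nrow, ncol, 1.0f, M, ncol,
                  ones.data(), 1, 0.0f, out, 1);
    }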


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/870d1a97
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/870d1a97
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/870d1a97

Branch: refs/heads/master
Commit: 870d1a97e19061f3f42b9cf907874609f7158231
Parents: fbd5219
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri May 27 20:31:41 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon May 30 13:24:51 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                     |   1 +
 cmake/Cuda.cmake                   |   1 +
 cmake/Dependencies.cmake           |   4 +
 cmake/Templates/singa_config.h.in  |   4 +-
 include/singa/core/tensor.h        | 257 +++++++-------
 include/singa/utils/cuda_utils.h   |  60 ++--
 src/core/device/cpp_cpu.cc         |  13 +-
 src/core/device/cuda_gpu.cc        |  10 +-
 src/core/device/device.cc          |   1 -
 src/core/tensor/math_kernel.cu     |  26 ++
 src/core/tensor/math_kernel.h      |  15 +-
 src/core/tensor/tensor.cc          | 610 +++++++++++++++++++-------------
 src/core/tensor/tensor_math.h      | 160 +++++----
 src/core/tensor/tensor_math_cpp.h  | 157 +++++---
 src/core/tensor/tensor_math_cuda.h | 117 ++++--
 test/singa/test_cpp_math.cc        |  25 --
 test/singa/test_mse.cc             |  26 +-
 test/singa/test_tensor.cc          |   2 -
 test/singa/test_tensor_math.cc     | 447 ++++++++++++++++++++++-
 19 files changed, 1325 insertions(+), 611 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e08fb98..d585497 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,6 +17,7 @@ SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
 INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
 
 #OPTION(CPU_ONLY "use GPU libs" OFF)
+OPTION(USE_CBLAS "Use CBlas libs" OFF)
 OPTION(USE_CUDA "Use Cuda libs" ON)
 OPTION(USE_CUDNN "Use Cudnn libs" ON)
 OPTION(USE_OPENCV "Use opencv" OFF)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Cuda.cmake
----------------------------------------------------------------------
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index 19d4e27..4985bb8 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -22,3 +22,4 @@ ENDIF()
 
 INCLUDE_DIRECTORIES(SYSTEM ${CUDA_INCLUDE_DIRS})
 LIST(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
+MESSAGE(STATUS "libs " ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index ae28073..e995553 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -14,3 +14,7 @@ ELSE()
     SET(USE_CUDA FALSE)
     SET(USE_CUDNN FALSE)
 ENDIF()
+
+
+#LIST(APPEND SINGA_LINKER_LIBS "/home/wangwei/local/lib/libopenblas.so")
+#MESSAGE(STATUS "link lib : " ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Templates/singa_config.h.in
----------------------------------------------------------------------
diff --git a/cmake/Templates/singa_config.h.in b/cmake/Templates/singa_config.h.in
index e0f7328..5e8b32d 100644
--- a/cmake/Templates/singa_config.h.in
+++ b/cmake/Templates/singa_config.h.in
@@ -4,7 +4,9 @@
 // Binaries directory
 #define BINARY_FOLDER "${PROJECT_BINARY_DIR}"
 
-#cmakedefine CPU_ONLY 
+#cmakedefine CPU_ONLY
+
+#cmakedefine USE_CBLAS
 // cuda
 #cmakedefine USE_CUDA
 

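Sources can then use the generated macro as a compile guard; a minimal sketch of the pattern, with a naive loop as the fallback when CBLAS is absent (the function is illustrative, not SINGA code):

    #ifdef USE_CBLAS
    #include <cblas.h>
    #endif

    float DotProduct(const float *a, const float *b, int n) {
    #ifdef USE_CBLAS
      return cblas_sdot(n, a, 1, b, 1);  // optimized BLAS path
    #else
      float s = 0.0f;                    // portable fallback
      for (int i = 0; i < n; i++) s += a[i] * b[i];
      return s;
    #endif
    }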
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index e560071..f51c899 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -33,22 +33,22 @@ namespace singa {
 
 typedef vector<size_t> Shape;
 typedef Shape::iterator ShapeIter;
-inline size_t Product(const Shape& shape, int start = 0, size_t len = 0) {
+inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
   if (len == 0)
     len = shape.size();
   CHECK_LE(len, shape.size());
   size_t v = 1;
-  for (unsigned int i = start; i < len; i ++)
+  for (unsigned int i = start; i < len; i++)
     v *= shape[i];
   return v;
 }
 
 /// hardcode the width of types defined in DataType
 const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2, sizeof(int),
-                          sizeof(char), sizeof(double)};
+                             sizeof(char), sizeof(double)};
 inline size_t SizeOf(DataType t) {
   static_assert(kNumDataType == sizeof(kDataWidth) / sizeof(size_t),
-      "Num of data types not match num of data width");
+                "Num of data types not match num of data width");
   CHECK_GT(kNumDataType, t);
   return kDataWidth[t];
 }
@@ -62,52 +62,44 @@ inline size_t SizeOf(DataType t) {
 /// then it must be set up correctly (shape, device). Otherwise, runtime errors
 /// like segmentation faults would happen. Only simple type/device checks are conducted.
 class Tensor {
- public:
+public:
   ~Tensor();
   Tensor();
-  explicit Tensor(Shape&& shape, DataType dtype = kFloat32);
-  explicit Tensor(const Shape& shape, DataType dtype = kFloat32);
-  Tensor(Shape&& shape, Device* dev, DataType dtype = kFloat32);
-  Tensor(const Shape& shape, Device* dev, DataType dtype = kFloat32);
+  explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
+  explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
+  Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32);
+  Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
-  Tensor(const Tensor& from);
+  Tensor(const Tensor &from);
   /// Copy Tensor to share the internal data.  No deep copy.
-  Tensor(Tensor&& from);
+  Tensor(Tensor &&from);
 
   /// For functions in xx_math.cc to access the blob.
   /// Users should not operate against Blob directly.
   /// blob_ is allocated in constructors.
-  Blob* blob() const {
-    return blob_;
-  }
+  Blob *blob() const { return blob_; }
 
-  Device* device() const {
-    return device_;
-  }
+  Device *device() const { return device_; }
 
   /// Return immutable Tensor values with given type.
-  template <typename DType>
-  DType data() const {
-    return static_cast<DType> (blob()->data());
+  template <typename DType> DType data() const {
+    return static_cast<DType>(blob()->data());
   }
 
   /// data type, including kFloat16, kFloat32, kInt
-  const DataType data_type() const {
-    return data_type_;
-  }
+  const DataType data_type() const { return data_type_; }
 
-  const Shape& shape() const {
-    return shape_;
-  }
+  const Shape &shape() const { return shape_; }
 
-  int nDim() const {
-    return shape_.size();
+  const size_t shape(size_t idx) const {
+    CHECK_LT(idx, shape_.size());
+    return shape_.at(idx);
   }
 
-  bool transpose() const {
-    return transpose_;
-  }
+  int nDim() const { return shape_.size(); }
+
+  bool transpose() const { return transpose_; }
 
   /// Return number of total elements
   size_t Size() const {
@@ -116,39 +108,37 @@ class Tensor {
   }
 
   /// Return memory size (i.e., Bytes)
-  size_t MemSize() const {
-    return blob_->size();
-  }
+  size_t MemSize() const { return blob_->size(); }
 
   /// Reset the tensor shape; it may reallocate the blob if MemSize() changes.
-  void Reshape(const Shape& shape);
+  void Reshape(const Shape &shape);
+  void Reshape(Shape &&shape);
 
   /// Reset the shape, device, and data type as given tensor.
   /// If blob size changes, then reallocate a new blob. The previous blob would
   /// be deleted.
-  void ResetLike(const Tensor& t);
+  void ResetLike(const Tensor &t);
 
   /// Reset the data type; it would reallocate the blob if the type changes.
   void AsType(DataType type);
 
   /// Reset the device.
   /// If the target device is a diff device, then do deep data copy.
-  void ToDevice(Device* dev);
+  void ToDevice(Device *dev);
 
   /// Equivalent to ToDevice(host_dev).
   void ToHost();
 
   /// Set each element of the tensor to be x
-  template<typename SType>
-  void SetValue(SType x);
+  template <typename SType> void SetValue(const SType x);
 
   /// For initializing the tensor values, copy 'num' elements.
-  template<typename DType>
-  void CopyDataFromHostPtr(const DType* src, size_t num);
+  template <typename DType>
+  void CopyDataFromHostPtr(const DType *src, size_t num);
 
   /// Copy data from another Tensor which may be on a diff device.
   /// Meta data would not be copied!
-  void CopyData(const Tensor& other);
+  void CopyData(const Tensor &other);
 
   /// Return an exactly the same Tensor with data been deep copied.
   Tensor Clone();
@@ -160,135 +150,124 @@ class Tensor {
   Tensor T() const;
 
   /// Copy the meta info with data blob shared.
-  Tensor& operator=(const Tensor& t);
+  Tensor &operator=(const Tensor &t);
 
   /// Copy the meta info with data blob shared.
-  Tensor& operator=(Tensor&& t);
+  Tensor &operator=(Tensor &&t);
 
-
-  Tensor& operator+=(const Tensor& t);
+  Tensor &operator+=(const Tensor &t);
   // void operator+=(Tensor&& t);
-  Tensor& operator-=(const Tensor& t);
+  Tensor &operator-=(const Tensor &t);
   // void operator-=(Tensor&& t);
-  Tensor& operator*=(const Tensor& t);
+  Tensor &operator*=(const Tensor &t);
   // void operator*=(Tensor&& t);
-  Tensor& operator/=(const Tensor& t);
+  Tensor &operator/=(const Tensor &t);
   // void operator/=(Tensor&& t);
 
   // Scalar operations.
 
   /// T is a scalar type
-  template<typename DType>
-  Tensor& operator+=(DType x);
+  template <typename DType> Tensor &operator+=(DType x);
 
   /// T is a scalar type
-  template <typename DType>
-  Tensor& operator-=(const DType x);
+  template <typename DType> Tensor &operator-=(const DType x);
 
   /// T is a scalar type
-  template <typename DType>
-  Tensor& operator*=(const DType x);
+  template <typename DType> Tensor &operator*=(const DType x);
 
   /// T is a scalar type
-  template <typename DType>
-  Tensor& operator/=(const DType x);
+  template <typename DType> Tensor &operator/=(const DType x);
 
   /// save Tensor into a proto msg
   // void ToProto(TensorProto* t);
   /// load Tensor from proto msg
   // void FromProto(const TensorProto& t);
 
- protected:
+protected:
   bool transpose_ = false;
   DataType data_type_ = kFloat32;
-  Device* device_ = nullptr;
+  Device *device_ = nullptr;
   /// Note: blob_ is allocated in a lazy manner to avoid frequent malloc/free.
   /// If you want to get an allocated Blob, use blob() instead of blob_.
-  Blob* blob_ = nullptr;
+  Blob *blob_ = nullptr;
   Shape shape_;
 };
 
+inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
+  CHECK_EQ(in1.data_type(), in2.data_type());
+  CHECK_EQ(in1.device()->lang(), in2.device()->lang());
+}
+
+Tensor Reshape(const Tensor &in, const Shape &s);
+Tensor Reshape(const Tensor &in, Shape &&s);
+
 // For tensors with sparse content, e.g., missing columns or rows.
 // class SparseTensor : public Tensor {};
 
 /// Copy 'num' elements of src to dst.
 /// The first 'src_offset' ('dst_offset') elements will be skipped.
-void CopyDataToFrom(Tensor* dst,
-              const Tensor& src,
-              size_t num,
-              size_t src_offset = 0,
-              size_t dst_offset = 0);
+void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
+                    size_t src_offset = 0, size_t dst_offset = 0);
 
 // ==================Simple Linear Algebra Operations=========================
-Tensor Abs(const Tensor& t);
-Tensor Exp(const Tensor& t);
-Tensor Log(const Tensor& t);
-Tensor ReLU(const Tensor& t);
-Tensor Sigmoid(const Tensor& t);
-Tensor Sign(const Tensor& t);
-Tensor Sqrt(const Tensor& t);
-Tensor Square(const Tensor& t);
-Tensor Tanh(const Tensor& t);
-
-
-template<typename SType>
-SType Sum(const Tensor& t);
+Tensor Abs(const Tensor &t);
+Tensor Exp(const Tensor &t);
+Tensor Log(const Tensor &t);
+Tensor ReLU(const Tensor &t);
+Tensor Sigmoid(const Tensor &t);
+Tensor Sign(const Tensor &t);
+Tensor Sqrt(const Tensor &t);
+Tensor Square(const Tensor &t);
+Tensor Tanh(const Tensor &t);
+
+template <typename SType> SType Sum(const Tensor &t);
 /// Sum elements in the Tensor; currently only vector and matrix are supported.
 /// if 'axis' is 0, sum all rows into a single row
 /// if 'axis' is 1, sum all columns into a single column
 /// TODO(wangwei) support arbitrary Tensor like numpy.sum
-Tensor Sum(const Tensor& t, int axis);
+Tensor Sum(const Tensor &t, int axis);
 
 /// Average elements in the Tensor; currently only vector and matrix are supported.
 /// if 'axis' is 0, average all rows into a single row
 /// if 'axis' is 1, average all columns into a single column
 /// TODO(wangwei) support arbitrary Tensor like numpy.average
-Tensor Average(const Tensor&t, int axis);
+Tensor Average(const Tensor &t, int axis);
 /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
 /// and shape_[axis+1]*...*shape_[nDim()] columns.
 /// and does softmax along each row.
-Tensor Softmax(const Tensor& t, int axis = -1);
-void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
+Tensor Softmax(const Tensor &t, int axis = -1);
+void Softmax(const Tensor &t, Tensor *ret, int axis = -1);
 
 /// Element-wise operation, ret[i]=t[i]^x
-template<typename DType>
-Tensor Pow(const Tensor& t, DType x);
+template <typename DType> Tensor Pow(const Tensor &t, DType x);
 /// Element-wise operation, ret[i]=t[i]^x
-template<typename DType>
-void Pow(const Tensor& t, DType x, Tensor* ret);
+template <typename DType> void Pow(const Tensor &t, DType x, Tensor *ret);
 /// Element-wise operation, ret[i]=base[i]^exp[i]
-Tensor Pow(const Tensor& base, Tensor exp);
+Tensor Pow(const Tensor &base, Tensor exp);
 /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-void Pow(const Tensor& base, const Tensor& exp, Tensor* ret);
+void Pow(const Tensor &base, const Tensor &exp, Tensor *ret);
 
-Tensor operator+(const Tensor& lhs, const Tensor& rhs);
-void Add(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
-Tensor operator-(const Tensor& lhs, const Tensor& rhs);
-void Sub(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
-Tensor operator*(const Tensor& lhs, const Tensor& rhs);
-void EltwiseMult(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
-Tensor operator/(const Tensor& lhs, const Tensor& rhs);
-void Div(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
+Tensor operator+(const Tensor &lhs, const Tensor &rhs);
+void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+Tensor operator-(const Tensor &lhs, const Tensor &rhs);
+void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+Tensor operator*(const Tensor &lhs, const Tensor &rhs);
+void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+Tensor operator/(const Tensor &lhs, const Tensor &rhs);
+void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
 
-template <typename DType>
-Tensor operator+(const Tensor& t, DType x);
-template <typename DType>
-void Add(const Tensor& t, DType x, Tensor* ret);
+template <typename DType> Tensor operator+(const Tensor &t, DType x);
+template <typename DType> void Add(const Tensor &t, DType x, Tensor *ret);
 
-template <typename DType>
-Tensor operator-(const Tensor& t, DType x);
-template <typename DType>
-void Sub(const Tensor& t, DType x, Tensor* ret);
+template <typename DType> Tensor operator-(const Tensor &t, DType x);
+template <typename DType> void Sub(const Tensor &t, DType x, Tensor *ret);
 
+template <typename DType> Tensor operator*(const Tensor &t, DType x);
 template <typename DType>
-Tensor operator*(const Tensor& t, DType x);
-template <typename DType>
-void EltwiseMult(const Tensor& t, DType x, Tensor* ret);
+void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
 
-template <typename DType>
-Tensor operator/(const Tensor& t, DType x);
-template <typename DType>
-void Div(const Tensor& t, DType x, Tensor* ret);
+template <typename DType> Tensor operator/(const Tensor &t, DType x);
+template <typename DType> void Div(const Tensor &t, DType x, Tensor *ret);
 
 // ================Blas operations============================================
 // We fix the scalar argument type to be float.
@@ -302,27 +281,59 @@ void Div(const Tensor& t, DType x, Tensor* ret);
 // void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
 
 /// Do matrix-vector multiplication or matrix-matrix multiplication depending
-/// on the Tensor shape.  ret = lhs * rhs
-Tensor Mult(const Tensor& lhs, const Tensor& rhs);
+/// on the Tensor shape.  result = A * B
+Tensor Mult(const Tensor &A, const Tensor &B);
 /// Do matrix-vector multiplication or matrix-matrix multiplication depending
-/// on the Tensor shape.  ret = lhs * rhs
-void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
+/// on the Tensor shape.  C = A * B
+void Mult(const Tensor &A, const Tensor &B, Tensor *C);
 
 /// Do matrix-vector multiplication or matrix-matrix multiplication depending
-/// on the Tensor shape.  ret = alpha lhs * rhs + beta * ret
-Tensor Mult(float alpha, const Tensor& lhs, float beta, const Tensor& rhs);
-/// Do matrix vector multipication or matrix matrix multiplication depdending
 /// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
-void Mult(float alpha, const Tensor& lhs, float beta, const Tensor& rhs,
-    Tensor* C);
+void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs,
+          const float beta, Tensor *C);
 
 // ================Random operations==========================================
 /// For each element x set x = 1 if random() < p; otherwise x = 0.
-void Bernoulli(float p, Tensor* t);
+void Bernoulli(float p, Tensor *t);
 /// Fill in Tensor 't' following uniform distribution.
-void Uniform(float low, float high, Tensor* t);
+void Uniform(float low, float high, Tensor *t);
 /// Fill in Tensor 't' following Gaussian distribution.
-void Gaussian(float mean, float std, Tensor* t);
+void Gaussian(float mean, float std, Tensor *t);
+
+// follow the consistency guide
+// ============Matrix vector operations=======================================
+/// Add vector 'v' to each column of matrix M, in place
+void AddColumn(const Tensor &v, Tensor *M);
+void AddColumn(const float alpha, const float beta, const Tensor &v,
+               Tensor *out);
+/// Subtract vector 'v' from each column of matrix M, in place
+void SubColumn(const Tensor &v, Tensor *M);
+/// Multiply each column of matrix M element-wise by vector 'v', in place
+void MultColumn(const Tensor &v, Tensor *M);
+/// Divide each column of matrix M element-wise by vector 'v', in place
+void DivColumn(const Tensor &v, Tensor *M);
+
+/// Add vector 'v' to each row of matrix 'out', in place
+void AddRow(const Tensor &v, Tensor *out);
+void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
+/// Subtract vector 'v' from each row of matrix M, in place
+void SubRow(const Tensor &v, Tensor *M);
+/// Multiply each row of matrix M element-wise by vector 'v', in place
+void MultRow(const Tensor &v, Tensor *M);
+/// Divide each row of matrix M element-wise by vector 'v', in place
+void DivRow(const Tensor &v, Tensor *M);
+
+/// Sum all rows of matrix M into a single row as 'out'
+void SumRows(const Tensor &M, Tensor *out);
+/// Sum all columns of matrix M into a single column as 'out'
+void SumColumns(const Tensor &M, Tensor *out);
+
+/// For each element x of Tensor 'in', compute alpha/x
+template <typename SType> Tensor Div(const SType alpha, const Tensor &in);
+
+/// For each element x of Tensor 'in', compute alpha/x into 'out'
+template <typename SType>
+void Div(const SType alpha, const Tensor &in, Tensor *out);
 
 }  // namespace singa
 

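To make the new matrix-vector API concrete, a usage sketch against the declarations above (CPU device; the commented values follow from the stated semantics, not from a recorded run):

    #include "singa/core/tensor.h"

    singa::Tensor M(singa::Shape{2, 3});
    const float mdata[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
    M.CopyDataFromHostPtr(mdata, 6);

    singa::Tensor v(singa::Shape{3});
    const float vdata[] = {10.f, 20.f, 30.f};
    v.CopyDataFromHostPtr(vdata, 3);

    singa::AddRow(v, &M);      // rows become {11, 22, 33} and {14, 25, 36}

    singa::Tensor s(singa::Shape{2});
    singa::SumColumns(M, &s);  // s = {11+22+33, 14+25+36} = {66, 75}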
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/include/singa/utils/cuda_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h
index 076d0d1..17eb683 100644
--- a/include/singa/utils/cuda_utils.h
+++ b/include/singa/utils/cuda_utils.h
@@ -8,33 +8,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-//
-// CUDA macros
-//
-
-// CUDA: various checks for different function calls.
-#define CUDA_CHECK(condition) \
-  /* Code block avoids redefinition of cudaError_t error */ \
-  do { \
-    cudaError_t error = condition; \
-    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
-  } while (0)
-
-#define CUBLAS_CHECK(condition) \
-  do { \
-    cublasStatus_t status = condition; \
-    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
-      << cublasGetErrorString(status); \
-  } while (0)
-
-#define CURAND_CHECK(condition) \
-  do { \
-    curandStatus_t status = condition; \
-    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
-      << curandGetErrorString(status); \
-  } while (0)
-
-const char* cublasGetErrorString(cublasStatus_t error) {
+inline const char* cublasGetErrorString(cublasStatus_t error) {
   switch (error) {
   case CUBLAS_STATUS_SUCCESS:
     return "CUBLAS_STATUS_SUCCESS";
@@ -64,7 +38,7 @@ const char* cublasGetErrorString(cublasStatus_t error) {
   return "Unknown cublas status";
 }
 
-const char* curandGetErrorString(curandStatus_t error) {
+inline const char* curandGetErrorString(curandStatus_t error) {
   switch (error) {
   case CURAND_STATUS_SUCCESS:
     return "CURAND_STATUS_SUCCESS";
@@ -95,5 +69,33 @@ const char* curandGetErrorString(curandStatus_t error) {
   }
   return "Unknown curand status";
 }
-#endif
+
+//
+// CUDA macros
+//
+
+// CUDA: various checks for different function calls.
+#define CUDA_CHECK(condition) \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do { \
+    cudaError_t error = condition; \
+    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
+  } while (0)
+
+#define CUBLAS_CHECK(condition) \
+  do { \
+    cublasStatus_t status = condition; \
+    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
+      << cublasGetErrorString(status); \
+  } while (0)
+
+#define CURAND_CHECK(condition) \
+  do { \
+    curandStatus_t status = condition; \
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
+      << curandGetErrorString(status); \
+  } while (0)
+
+
+#endif  // USE_CUDA
 #endif  // SINGA_UTILS_CUDA_UTILS_H_

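A usage sketch for the relocated macros; any CUDA runtime call returning cudaError_t can be wrapped the same way:

    float *d = nullptr;
    CUDA_CHECK(cudaMalloc(&d, 1024 * sizeof(float)));
    CUDA_CHECK(cudaMemset(d, 0, 1024 * sizeof(float)));
    CUDA_CHECK(cudaFree(d));

The error-string helpers are now inline, which avoids duplicate-symbol link errors when this header is included from multiple translation units.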
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc
index 28b0da4..44f614a 100644
--- a/src/core/device/cpp_cpu.cc
+++ b/src/core/device/cpp_cpu.cc
@@ -33,13 +33,18 @@ void CppCPU::DoExec(function<void(Context*)>&& fn, int executor) {
 }
 
 void* CppCPU::Malloc(int size) {
-  void *ptr = malloc(size);
-  memset(ptr, 0, size);
-  return ptr;
+  if (size > 0) {
+    void *ptr = malloc(size);
+    memset(ptr, 0, size);
+    return ptr;
+  } else {
+    return nullptr;
+  }
 }
 
 void CppCPU::Free(void* ptr) {
-  free(ptr);
+  if (ptr != nullptr)
+    free(ptr);
 }
 
 void CppCPU::CopyToFrom(void* dst, const void* src, size_t nBytes,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 0ba05fb..5d4e1ed 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -89,15 +89,17 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
 /// Allocate cpu memory.
 void* CudaGPU::Malloc(int size) {
   void* ptr = nullptr;
-  CUDA_CHECK(cudaMalloc(&ptr, size));
-  CUDA_CHECK(cudaMemset(ptr, 0, size));
+  if (size > 0) {
+    CUDA_CHECK(cudaMalloc(&ptr, size));
+    CUDA_CHECK(cudaMemset(ptr, 0, size));
+  }
   return ptr;
 }
 
   /// Free cpu memory.
 void CudaGPU::Free(void* ptr) {
-  CHECK_NE(ptr, nullptr);
-  CUDA_CHECK(cudaFree(ptr));
+  if (ptr != nullptr)
+    CUDA_CHECK(cudaFree(ptr));
 }
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index ede3fda..1d3c446 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -35,7 +35,6 @@ void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
 Blob* Device::NewBlob(int size) {
   if (size > 0) {
     void* ptr = Malloc(size);
-    // memset(ptr, 0, size);
     return new Blob(ptr, size);
   } else {
     return nullptr;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index e67ea7b..88041b1 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -450,6 +450,32 @@ void set_value(int n, float v, float *out) {
 void threshold(int n, float alpha, const float *in, float *out) {
   kernel_threshold<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, alpha, n);
 }
+
+
+// follow the consistency guide for math API
+__global__ void KernelDiv(const size_t num, const float alpha, const float *in,
+                          float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = alpha / in[idx];
+  }
+}
+
+__global__ void KernelSet(const size_t num, const float x, float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = x;
+  }
+}
+
+void Div(const size_t num, float alpha, const float *in, float *out,
+         cudaStream_t s) {
+  KernelDiv<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, alpha, in, out);
+}
+
+void Set(const size_t num, const float x, float *out, cudaStream_t s) {
+  KernelSet<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, x, out);
+}
 }  // namespace cuda
 }  // namespace singa
 

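A host-side usage sketch for the new wrappers (using CUDA_CHECK from cuda_utils.h). Note that both wrappers accept a cudaStream_t but do not yet forward it to the launch configuration above, so the kernels run on the default stream:

    float *out = nullptr;
    CUDA_CHECK(cudaMalloc(&out, 256 * sizeof(float)));
    singa::cuda::Set(256, 1.0f, out, 0);       // out[i] = 1.0f
    singa::cuda::Div(256, 2.0f, out, out, 0);  // out[i] = 2.0f / out[i] = 2.0f
    CUDA_CHECK(cudaFree(out));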
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index b016007..925346e 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -28,13 +28,7 @@
 /// TODO(wangwei) Clean the function APIs as commented in tensor_math.h
 ///  Add 'Context *ctx' as an argument of all cuda functions.
 namespace singa {
-/*
-  void softmaxloss_forward(int n, int dim, const float *prob,
-      const int *label, float *loss);
 
-  void softmaxloss_backward(int n, int dim, float scale,
-      const int *label, float *grad);
-*/
 // TODO(wangwei) make all functions templated.
 namespace cuda {
 void sum(int n, const float *in, float *out);
@@ -44,7 +38,7 @@ void sum_row(int rows, int cols, int stride, const float *in, float *out);
 void sum_col(int rows, int cols, int stride, const float *in, float *out);
 
 void add_row(int rows, int cols, int stride, const float *in_row,
-  const float *in_mat, float *out);
+             const float *in_mat, float *out);
 
 void add(int n, const float *a, const float *b, float *out);
 
@@ -87,7 +81,12 @@ void div(int n, const float *a, const float *b, float *out);
 void set_value(int n, float v, float *out);
 
 void threshold(int n, float alpha, const float *in, float *out);
-}  // cuda
+
+// follow the consistency guide for math API
+void Div(const size_t num, const float x, const float *in, float *out,
+         cudaStream_t s);
+void Set(const size_t num, const float x, float *out, cudaStream_t s);
+} // cuda
 
 }  // namespace singa
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 052f3ff..0e47a4f 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -25,51 +25,51 @@
 namespace singa {
 
 Tensor::~Tensor() {
-  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  // LOG(ERROR) << "~";
+  if (blob_ != nullptr && blob_->DecRefCount() == 0)
+    device_->FreeBlob(blob_);
   blob_ = nullptr;
 }
 
 Tensor::Tensor() { device_ = &defaultDevice; }
 
-Tensor::Tensor(const Shape& shape, DataType dtype)
+Tensor::Tensor(const Shape &shape, DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape&& shape, DataType dtype)
+Tensor::Tensor(Shape &&shape, DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Shape& shape, Device* device, DataType dtype)
+Tensor::Tensor(const Shape &shape, Device *device, DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape&& shape, Device* device, DataType dtype)
+Tensor::Tensor(Shape &&shape, Device *device, DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Tensor& t)
-    : transpose_(t.transpose_),
-      data_type_(t.data_type_),
-      device_(t.device_),
-      blob_(t.blob()),
-      shape_(t.shape_) {
+Tensor::Tensor(const Tensor &t)
+    : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
+      blob_(t.blob()), shape_(t.shape_) {
   blob_->IncRefCount();
+  // LOG(ERROR) << "const&";
 }
 
-Tensor::Tensor(Tensor&& t)
-    : transpose_(t.transpose_),
-      data_type_(t.data_type_),
-      device_(t.device_),
+Tensor::Tensor(Tensor &&t)
+    : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
       shape_(std::move(t.shape_)) {
   blob_ = t.blob_;
   t.blob_ = nullptr;
+  // LOG(ERROR) << "&&";
 }
 
-void Tensor::ResetLike(const Tensor& t) {
+void Tensor::ResetLike(const Tensor &t) {
   if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0)
+      device_->FreeBlob(blob_);
     shape_ = t.shape_;
     device_ = t.device_;
     data_type_ = t.data_type_;
@@ -77,28 +77,40 @@ void Tensor::ResetLike(const Tensor& t) {
   }
 }
 
-void Tensor::Reshape(const Shape& shape) {
-  if (shape_ != shape) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+void Tensor::Reshape(const Shape &shape) {
+  if (Product(shape_) != Product(shape)) {
+    if (blob_ != nullptr && blob_->DecRefCount() == 0)
+      device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
-    shape_ = shape;
   }
+  shape_ = shape;
+}
+
+void Tensor::Reshape(Shape &&shape) {
+  if (Product(shape_) != Product(shape)) {
+    if (blob_ != nullptr && blob_->DecRefCount() == 0)
+      device_->FreeBlob(blob_);
+    blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
+  }
+  shape_ = std::move(shape);
 }
 
 void Tensor::AsType(DataType type) {
   if (data_type_ != type) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0)
+      device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape_) * SizeOf(type));
     data_type_ = type;
   }
 }
 
-void Tensor::ToDevice(Device* dst) {
+void Tensor::ToDevice(Device *dst) {
   // TODO(wangwei) the comparison is very strict. May compare against device ID?
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
     tmp.CopyData(*this);
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0)
+      device_->FreeBlob(blob_);
     blob_ = tmp.blob_;
     tmp.blob_ = nullptr;
     device_ = dst;
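
With the relaxed condition above, Reshape keeps the existing blob whenever the element count is unchanged; a behavioral sketch:

    singa::Tensor t(singa::Shape{2, 3});
    t.Reshape(singa::Shape{3, 2});  // Product unchanged: blob reused, data preserved
    t.Reshape(singa::Shape{4, 2});  // Product changes: a fresh (zeroed) blob is allocated

The free functions Reshape(in, s) introduced in this commit build on the same member, returning a tensor that shares the blob with 'in' when the element count matches.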
@@ -108,7 +120,7 @@ void Tensor::ToDevice(Device* dst) {
 void Tensor::ToHost() { ToDevice(device_->host()); }
 
 template <typename DType>
-void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) {
+void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
   CHECK_EQ(sizeof(DType), SizeOf(data_type_))
       << "data_type is " << DataType_Name(data_type_)
       << " user given type is of size " << sizeof(DType);
@@ -118,10 +130,10 @@ void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) {
     LOG(WARNING) << "Copy data from null host ptr";
   }
 }
-template void Tensor::CopyDataFromHostPtr(const float* src, size_t num);
-template void Tensor::CopyDataFromHostPtr(const int* src, size_t num);
+template void Tensor::CopyDataFromHostPtr(const float *src, size_t num);
+template void Tensor::CopyDataFromHostPtr(const int *src, size_t num);
 
-void Tensor::CopyData(const Tensor& src) {
+void Tensor::CopyData(const Tensor &src) {
   CHECK_EQ(Size(), src.Size());
   CHECK(blob_ != nullptr);
   // Do copy only if the src's blob is already initialized.
@@ -139,14 +151,21 @@ Tensor Tensor::Clone() {
 
 Tensor Tensor::T() const {
   CHECK_EQ(shape_.size(), 2u);
-  Tensor t(*this);
+  Tensor t;
+  t.device_ = device_;
+  t.data_type_ = data_type_;
   t.transpose_ = ~transpose_;
-  std::swap(t.shape_[0], t.shape_[1]);
+  t.shape_.push_back(shape_[1]);
+  t.shape_.push_back(shape_[0]);
+  t.blob_ = blob_;
+  blob_->IncRefCount();
   return t;
 }
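
T() now builds the transposed view explicitly instead of copy-then-swap, but the result still shares the underlying blob; a sketch of what that implies:

    singa::Tensor m(singa::Shape{2, 3});
    singa::Tensor mt = m.T();  // shape {3, 2}, transpose flag flipped
    // mt is a view: it holds the same blob as m (reference count incremented),
    // so writes through m remain visible through mt.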
 
-Tensor& Tensor::operator=(const Tensor& t) {
-  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+Tensor &Tensor::operator=(const Tensor &t) {
+  // LOG(ERROR) << "= const &";
+  if (blob_ != nullptr && blob_->DecRefCount() == 0)
+    device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
   data_type_ = t.data_type_;
   shape_ = t.shape_;
@@ -156,8 +175,10 @@ Tensor& Tensor::operator=(const Tensor& t) {
   return *this;
 }
 
-Tensor& Tensor::operator=(Tensor&& t) {
-  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+Tensor &Tensor::operator=(Tensor &&t) {
+  // LOG(ERROR) << "= &&";
+  if (blob_ != nullptr && blob_->DecRefCount() == 0)
+    device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
   data_type_ = t.data_type_;
   shape_ = std::move(t.shape_);
@@ -167,10 +188,22 @@ Tensor& Tensor::operator=(Tensor&& t) {
   return *this;
 }
 
-#define GenUnaryTensorArgMemberFunction(op, fn) \
-  Tensor& Tensor::op(const Tensor& t) {         \
-    fn(*this, t, this);                         \
-    return *this;                               \
+Tensor Reshape(const Tensor &in, const Shape &s) {
+  Tensor out(in);
+  out.Reshape(s);
+  return out;
+}
+
+Tensor Reshape(const Tensor &in, Shape &&s) {
+  Tensor out(in);
+  out.Reshape(std::move(s));
+  return out;
+}
+
+#define GenUnaryTensorArgMemberFunction(op, fn)                                \
+  Tensor &Tensor::op(const Tensor &t) {                                        \
+    fn(*this, t, this);                                                        \
+    return *this;                                                              \
   }
 
 GenUnaryTensorArgMemberFunction(operator+=, Add);
@@ -178,13 +211,12 @@ GenUnaryTensorArgMemberFunction(operator-=, Sub);
 GenUnaryTensorArgMemberFunction(operator*=, EltwiseMult);
 GenUnaryTensorArgMemberFunction(operator/=, Div);
 
-#define GenUnaryScalarArgMemberFunction(op, fn) \
-  template <typename DType>                     \
-  Tensor& Tensor::op(DType x) {                 \
-    fn(*this, x, this);                         \
-    return *this;                               \
-  }                                             \
-  template Tensor& Tensor::op<float>(float x)
+#define GenUnaryScalarArgMemberFunction(op, fn)                                \
+  template <typename DType> Tensor &Tensor::op(DType x) {                      \
+    fn(*this, x, this);                                                        \
+    return *this;                                                              \
+  }                                                                            \
+  template Tensor &Tensor::op<float>(float x)
 
 GenUnaryScalarArgMemberFunction(operator-=, Sub);
 GenUnaryScalarArgMemberFunction(operator+=, Add);
@@ -192,7 +224,7 @@ GenUnaryScalarArgMemberFunction(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFunction(operator/=, Div);
 
 // ====================Tensor Operations=======================================
-void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num,
+void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
                     size_t dst_offset, size_t src_offset) {
   auto width = SizeOf(src.data_type());
   CHECK_EQ(width, SizeOf(dst->data_type()));
@@ -223,94 +255,93 @@ void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num,
 //============================================================================
 /// typedef DType according to type value.
 /// DType would be used in the code block __VA_ARGS__.
-#define TYPE_SWITCH(type, DType, ...)                               \
-  do {                                                              \
-    switch (type) {                                                 \
-      case kFloat32: {                                              \
-        typedef float DType;                                        \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      case kInt: {                                                  \
-        typedef int DType;                                          \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      case kChar: {                                                 \
-        typedef char DType;                                         \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      default:                                                      \
-        LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
-    }                                                               \
+#define TYPE_SWITCH(type, DType, ...)                                          \
+  do {                                                                         \
+    switch (type) {                                                            \
+    case kFloat32: {                                                           \
+      typedef float DType;                                                     \
+      { __VA_ARGS__ }                                                          \
+      break;                                                                   \
+    }                                                                          \
+    case kInt: {                                                               \
+      typedef int DType;                                                       \
+      { __VA_ARGS__ }                                                          \
+      break;                                                                   \
+    }                                                                          \
+    case kChar: {                                                              \
+      typedef char DType;                                                      \
+      { __VA_ARGS__ }                                                          \
+      break;                                                                   \
+    }                                                                          \
+    default:                                                                   \
+      LOG(FATAL) << "Unknow data type = " << DataType_Name(type);              \
+    }                                                                          \
   } while (0)
 
 /// typedef DType and Lang according to data type and device programming
 /// language respectively.
 /// type is from DataType, and lang is from LangType.
 /// DType and Lang would be used in __VA_ARGS__.
-#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)       \
-  do {                                                         \
-    const int _SwitchShift = 3;                                \
-    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);     \
-    switch (_SwitchHash) {                                     \
-      case ((kFloat32 << _SwitchShift) + kCuda): {             \
-        typedef float DType;                                   \
-        typedef lang::Cuda Lang;                               \
-        { __VA_ARGS__ }                                        \
-        break;                                                 \
-      }                                                        \
-      case ((kFloat32 << _SwitchShift) + kCpp): {              \
-        typedef float DType;                                   \
-        typedef lang::Cpp Lang;                                \
-        { __VA_ARGS__ }                                        \
-        break;                                                 \
-      }                                                        \
-      case ((kFloat32 << _SwitchShift) + kOpencl): {           \
-        typedef float DType;                                   \
-        typedef lang::Opencl Lang;                             \
-        { __VA_ARGS__ }                                        \
-        break;                                                 \
-      }                                                        \
-      default:                                                 \
-        LOG(FATAL) << "Unknown combination of data type "      \
-                   << DataType_Name(dtype) << " and language " \
-                   << LangType_Name(ltype);                    \
-    }                                                          \
+#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)                       \
+  do {                                                                         \
+    const int _SwitchShift = 3;                                                \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);                     \
+    switch (_SwitchHash) {                                                     \
+    case ((kFloat32 << _SwitchShift) + kCuda): {                               \
+      typedef float DType;                                                     \
+      typedef lang::Cuda Lang;                                                 \
+      { __VA_ARGS__ }                                                          \
+      break;                                                                   \
+    }                                                                          \
+    case ((kFloat32 << _SwitchShift) + kCpp): {                                \
+      typedef float DType;                                                     \
+      typedef lang::Cpp Lang;                                                  \
+      { __VA_ARGS__ }                                                          \
+      break;                                                                   \
+    }                                                                          \
+    case ((kFloat32 << _SwitchShift) + kOpencl): {                             \
+      typedef float DType;                                                     \
+      typedef lang::Opencl Lang;                                               \
+      { __VA_ARGS__ }                                                          \
+      break;                                                                   \
+    }                                                                          \
+    default:                                                                   \
+      LOG(FATAL) << "Unknown combination of data type "                        \
+                 << DataType_Name(dtype) << " and language "                   \
+                 << LangType_Name(ltype);                                      \
+    }                                                                          \
   } while (0)
 
-
-template <typename SType>
-void Tensor::SetValue(SType x) {
+template <typename SType> void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
   auto size = Size();
   auto ptr = blob_;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    // cast x to DType
     device_->Exec(
-        [size, x, ptr](Context* ctx) { Set<DType, Lang>(size, x, ptr, ctx); },
+        [size, x, ptr](Context *ctx) { Set<DType, Lang>(size, x, ptr, ctx); },
         {}, {ptr});
   });
 }
-
-
-#define EltwiseUnaryTensorFn(fn, t, ret)                               \
-  do {                                                                 \
-    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
-      ret->device()->Exec(                                             \
-          [t, ret](Context* ctx) {                                     \
-            fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);     \
-          },                                                           \
-          {t.blob()}, {ret->blob()});                                  \
-    });                                                                \
+template void Tensor::SetValue<float>(const float x);
+
+#define EltwiseUnaryTensorFn(fn, t, ret)                                       \
+  do {                                                                         \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {         \
+      ret->device()->Exec(                                                     \
+          [t, ret](Context *ctx) {                                             \
+            fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);             \
+          },                                                                   \
+          {t.blob()}, {ret->blob()});                                          \
+    });                                                                        \
   } while (0)
 
-#define GenUnaryTensorFunction(fn)                    \
-  Tensor fn(const Tensor& t) {                        \
-    Tensor ret(t.shape(), t.device(), t.data_type()); \
-    auto* retptr = &ret;                              \
-    EltwiseUnaryTensorFn(fn, t, retptr);              \
-    return ret;                                       \
+#define GenUnaryTensorFunction(fn)                                             \
+  Tensor fn(const Tensor &t) {                                                 \
+    Tensor ret(t.shape(), t.device(), t.data_type());                          \
+    auto *retptr = &ret;                                                       \
+    EltwiseUnaryTensorFn(fn, t, retptr);                                       \
+    return ret;                                                                \
   }
 
 GenUnaryTensorFunction(Abs);
@@ -323,63 +354,33 @@ GenUnaryTensorFunction(Sqrt);
 GenUnaryTensorFunction(Square);
 GenUnaryTensorFunction(Tanh);
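
A quick sketch of the generated element-wise functions, each returning a new tensor on the same device:

    singa::Tensor t(singa::Shape{3});
    const float d[] = {-1.f, 0.f, 2.f};
    t.CopyDataFromHostPtr(d, 3);
    singa::Tensor a = singa::Abs(t);   // {1, 0, 2}
    singa::Tensor r = singa::ReLU(t);  // {0, 0, 2}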
 
-// TODO(wangwei) consider matrix transpose.
-Tensor SumRows(const Tensor& t) {
-  int ndim = t.shape().size();
-  CHECK_EQ(ndim, 2) << "Cannot do SumRows for Tensor with ndim = " << ndim;
-  size_t nrow = t.shape().at(0), ncol = t.shape().at(1);
-  Tensor ret(Shape{ncol}, t.device(), t.data_type());
-  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
-    ret.device()->Exec(
-        [nrow, ncol, t, ret](Context* ctx) {
-          SumRows<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx);
-        },
-        {t.blob()}, {ret.blob()});
-  });
-  return ret;
-}
-
-// TODO(wangwei) consider matrix transpose.
-Tensor SumColumns(const Tensor& t) {
-  int ndim = t.shape().size();
-  CHECK_EQ(ndim, 2) << "Cannot do SumColumns for Tensor with ndim = " << ndim;
-  CHECK(!t.transpose());  // TODO(wangwei) enable transpose
-  size_t nrow = t.shape().at(0), ncol = t.shape().at(1);
-  Tensor ret(Shape{nrow}, t.device(), t.data_type());
-  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
-    ret.device()->Exec(
-        [nrow, ncol, t, ret](Context* ctx) {
-          SumColumns<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx);
-        },
-        {t.blob()}, {ret.blob()});
-  });
-  return ret;
-}
-
 // TODO(wangwei) consider async exec
-template<>
-float Sum<float>(const Tensor& t)  {
+template <> float Sum<float>(const Tensor &t) {
   float s = 0.0f;
   TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
-      t.device()->Exec(
-        [t, &s](Context* ctx) {
-        Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx);
+    t.device()->Exec(
+        [t, &s](Context *ctx) {
+          Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx);
         },
         {t.blob()}, {});
-      });
+  });
   return s;
 }
 
-Tensor Sum(const Tensor& t, int axis) {
+Tensor Sum(const Tensor &M, int axis) {
   if (axis == 0) {
-    return SumRows(t);
+    Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
+    SumRows(M, &out);
+    return out;
   } else {
     CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
-    return SumColumns(t);
+    Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
+    SumColumns(M, &out);
+    return out;
   }
 }
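
A worked example of the axis convention, for M = [[1, 2, 3], [4, 5, 6]]:

    singa::Tensor M(singa::Shape{2, 3});
    const float md[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
    M.CopyDataFromHostPtr(md, 6);
    singa::Tensor r = singa::Sum(M, 0);  // shape {3}: {1+4, 2+5, 3+6} = {5, 7, 9}
    singa::Tensor c = singa::Sum(M, 1);  // shape {2}: {1+2+3, 4+5+6} = {6, 15}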
 
-Tensor Average(const Tensor& t, int axis) {
+Tensor Average(const Tensor &t, int axis) {
   // operator/ only has implementation for float scalar type, hence it is
   // necessary to cast the denominator to a float.
   // TODO(wangwei) implement a function to cast the scalar type involved in Tensor
@@ -401,13 +402,13 @@ Tensor Average(const Tensor& t, int axis) {
   }
 }
 
-Tensor Softmax(const Tensor& t, int axis) {
+Tensor Softmax(const Tensor &t, int axis) {
   Tensor ret(t.shape(), t.device(), t.data_type());
   Softmax(t, &ret, axis);
   return ret;
 }
 
-void Softmax(const Tensor& t, Tensor* ret, int axis) {
+void Softmax(const Tensor &t, Tensor *ret, int axis) {
   int nrow = 1, ncol = t.Size(), size = ncol;
   CHECK_GE(axis, -1);
   CHECK_GT(t.shape().size(), 0u);
@@ -418,34 +419,34 @@ void Softmax(const Tensor& t, Tensor* ret, int axis) {
   }
   TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
     ret->device()->Exec(
-        [nrow, ncol, t, ret](Context* ctx) {
+        [nrow, ncol, t, ret](Context *ctx) {
           Softmax<DType, Lang>(nrow, ncol, t.blob(), ret->blob(), ctx);
         },
         {t.blob()}, {ret->blob()});
   });
 }
 
-#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                             \
-  do {                                                                       \
-    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {   \
-      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                      \
-      ret->device()->Exec(                                                   \
-          [lhs, rhs, ret](Context* ctx) {                                    \
-            fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), \
-                            ctx);                                            \
-          },                                                                 \
-          {lhs.blob(), rhs.blob()}, {ret->blob()});                          \
-    });                                                                      \
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
+  do {                                                                         \
+    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {     \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
+      ret->device()->Exec(                                                     \
+          [lhs, rhs, ret](Context *ctx) {                                      \
+            fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),   \
+                            ctx);                                              \
+          },                                                                   \
+          {lhs.blob(), rhs.blob()}, {ret->blob()});                            \
+    });                                                                        \
   } while (0)
 
-#define GenBinaryTensorFunction(op, fn)                        \
-  Tensor op(const Tensor& lhs, const Tensor& rhs) {            \
-    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());    \
-    fn(lhs, rhs, &ret);                                        \
-    return ret;                                                \
-  }                                                            \
-  void fn(const Tensor& lhs, const Tensor& rhs, Tensor* ret) { \
-    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                  \
+#define GenBinaryTensorFunction(op, fn)                                        \
+  Tensor op(const Tensor &lhs, const Tensor &rhs) {                            \
+    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());                    \
+    fn(lhs, rhs, &ret);                                                        \
+    return ret;                                                                \
+  }                                                                            \
+  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {                 \
+    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                                  \
   }
 
 GenBinaryTensorFunction(operator+, Add);
@@ -454,32 +455,30 @@ GenBinaryTensorFunction(operator*, EltwiseMult);
 GenBinaryTensorFunction(operator/, Div);
 GenBinaryTensorFunction(Pow, Pow);
 
-#define EltwiseTensorScalarFn(fn, t, x, ret)                            \
-  do {                                                                  \
-    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {  \
-      static_assert(std::is_same<SType, DType>::value,                  \
-                    "The Scalar type must match the Tensor data type"); \
-      ret->device()->Exec(                                              \
-          [t, x, ret](Context* ctx) {                                   \
-            fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);   \
-          },                                                            \
-          {t.blob()}, {ret->blob()});                                   \
-    });                                                                 \
+#define EltwiseTensorScalarFn(fn, t, x, ret)                                   \
+  do {                                                                         \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {         \
+      static_assert(std::is_same<SType, DType>::value,                         \
+                    "The Scalar type must match the Tensor data type");        \
+      ret->device()->Exec(                                                     \
+          [t, x, ret](Context *ctx) {                                          \
+            fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);          \
+          },                                                                   \
+          {t.blob()}, {ret->blob()});                                          \
+    });                                                                        \
   } while (0)
 
-#define GenTensorScalarFunction(op, fn)                \
-  template <typename SType>                            \
-  Tensor op(const Tensor& t, SType x) {                \
-    Tensor ret(t.shape(), t.device(), t.data_type());  \
-    fn(t, x, &ret);                                    \
-    return ret;                                        \
-  }                                                    \
-  template <typename SType>                            \
-  void fn(const Tensor& t, SType x, Tensor* ret) {     \
-    EltwiseTensorScalarFn(fn, t, x, ret);              \
-  }                                                    \
-  template Tensor op<float>(const Tensor& t, float x); \
-  template void fn<float>(const Tensor& t, const float x, Tensor* ret)
+#define GenTensorScalarFunction(op, fn)                                        \
+  template <typename SType> Tensor op(const Tensor &t, SType x) {              \
+    Tensor ret(t.shape(), t.device(), t.data_type());                          \
+    fn(t, x, &ret);                                                            \
+    return ret;                                                                \
+  }                                                                            \
+  template <typename SType> void fn(const Tensor &t, SType x, Tensor *ret) {   \
+    EltwiseTensorScalarFn(fn, t, x, ret);                                      \
+  }                                                                            \
+  template Tensor op<float>(const Tensor &t, float x);                         \
+  template void fn<float>(const Tensor &t, const float x, Tensor *ret)
 
 GenTensorScalarFunction(operator+, Add);
 GenTensorScalarFunction(operator-, Sub);
@@ -488,83 +487,216 @@ GenTensorScalarFunction(operator/, Div);
 GenTensorScalarFunction(Pow, Pow);
 
 // ================Blas operations============================================
-Tensor Mult(const Tensor& lhs, const Tensor& rhs) {
-  Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());
+Tensor Mult(const Tensor &lhs, const Tensor &rhs) {
+  Tensor ret(Shape{lhs.shape(0), rhs.shape(1)}, lhs.device(), lhs.data_type());
   Mult(lhs, rhs, &ret);
   return ret;
 }
 
-void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret) {
-  Mult(1, lhs, 1, rhs, ret);
+void Mult(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {
+  Mult(1.0f, lhs, rhs, 0.0f, ret);
 }
 
-Tensor Mult(float alpha, const Tensor& A, float beta, const Tensor& B) {
-  Tensor ret(A.shape(), A.device(), A.data_type());
-  Mult(alpha, A, beta, B, &ret);
-  return ret;
-}
-
-void Mult(float alpha, const Tensor& A, float beta, const Tensor& B,
-          Tensor* C) {
+void Mult(const float alpha, const Tensor &A, const Tensor &B, const float beta,
+          Tensor *C) {
   CHECK_EQ(A.shape().size(), 2u);
-  bool transA = A.transpose();
-  size_t m = transA ? A.shape()[1] : A.shape()[0], n = 0;
-  if (B.shape().size() == 1u) {
-    n = C->Size();
+  if (B.nDim() == 1u) {
     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
       C->device()->Exec(
-          [transA, m, n, alpha, A, beta, B, C](Context* ctx) {
-            GEMV<DType, Lang>(transA, m, n, alpha, A.blob(), B.blob(), beta,
-                              C->blob(), ctx);
+          [alpha, A, beta, B, C](Context *ctx) {
+            GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), alpha,
+                              A.blob(), B.blob(), beta, C->blob(), ctx);
           },
           {A.blob(), B.blob()}, {C->blob()});
     });
   } else {
     CHECK(!C->transpose());
-    bool transB = B.transpose();
-    size_t k = transB ? B.shape()[1] : B.shape()[0];
-    n = C->shape()[1];
-    CHECK_EQ(C->shape()[0], m);
-    CHECK_EQ(A.Size(), m * k);
-    CHECK_EQ(B.Size(), n * k);
     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
       C->device()->Exec(
-          [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) {
-            GEMM<DType, Lang>(transA, transB, m, n, k, alpha, A.blob(),
-                              B.blob(), beta, C->blob(), ctx);
+          [alpha, A, beta, B, C](Context *ctx) {
+            GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0),
+                              B.shape(1), A.shape(1), alpha, A.blob(), B.blob(),
+                              beta, C->blob(), ctx);
           },
           {A.blob(), B.blob()}, {C->blob()});
     });
   }
 }
 
-void Bernoulli(float p, Tensor* t) {
+void Bernoulli(float p, Tensor *t) {
   TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
     t->device()->Exec(
-        [p, t](Context* ctx) {
+        [p, t](Context *ctx) {
           Bernoulli<DType, Lang>(t->Size(), p, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });
 }
 
-void Uniform(float low, float high, Tensor* t) {
+void Uniform(float low, float high, Tensor *t) {
   TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
     t->device()->Exec(
-        [low, high, t](Context* ctx) {
+        [low, high, t](Context *ctx) {
           Uniform<DType, Lang>(t->Size(), low, high, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });
 }
 
-void Gaussian(float mean, float std, Tensor* t) {
+void Gaussian(float mean, float std, Tensor *t) {
   TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
     t->device()->Exec(
-        [mean, std, t](Context* ctx) {
+        [mean, std, t](Context *ctx) {
           Gaussian<DType, Lang>(t->Size(), mean, std, t->blob(), ctx);
         },
         {}, {t->blob()}, true);
   });
 }
+
+// ====== follow the consistency guide
+void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
+/// Add vector 'v' to every column of matrix M:
+/// M[i][j] = alpha * v[i] + beta * M[i][j].
+void AddColumn(const float alpha, const float beta, const Tensor &v,
+               Tensor *M) {
+  if (M->transpose()) {
+    Tensor X = M->T();
+    AddRow(v, &X);
+  } else {
+    CHECK_EQ(M->nDim(), 2);
+    CHECK_EQ(v.nDim(), 1);
+    size_t nb_row = M->shape(0), nb_col = M->shape(1);
+    CHECK_EQ(nb_row, v.Size());
+
+    Tensor one(Shape{1, nb_col}, M->device(), M->data_type());
+    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Tensor vmat = Reshape(v, Shape{nb_row, 1});
+    Mult(alpha, vmat, one, beta, M);
+  }
+}
+void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
+
+/// Add vector 'v' to every row of matrix M:
+/// M[i][j] = alpha * v[j] + beta * M[i][j].
+void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
+  if (M->transpose()) {
+    Tensor X = M->T();
+    AddColumn(v, &X);
+  } else {
+    CHECK_EQ(M->nDim(), 2);
+    CHECK_EQ(v.nDim(), 1);
+    size_t nb_row = M->shape(0), nb_col = M->shape(1);
+    CHECK_EQ(nb_col, v.Size());
+
+    Tensor one(Shape{nb_row, 1}, M->device(), M->data_type());
+    one.SetValue(1.0f);
+    Tensor vmat = Reshape(v, Shape{1, nb_col});
+    Mult(alpha, one, vmat, beta, M);
+  }
+}
+
+template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
+  Tensor out(in.shape(), in.device(), in.data_type());
+  Div(alpha, in, &out);
+  return out;
+}
+
+template Tensor Div<float>(const float, const Tensor &);
+
+template <typename SType>
+void Div(const SType alpha, const Tensor &in, Tensor *out) {
+  CheckDataTypeAndLang(in, *out);
+  CHECK(in.shape() == out->shape());
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    // TODO(wangwei) type cast SType to DType;
+    in.device()->Exec(
+        [alpha, in, out](Context *ctx) {
+          Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
+        },
+        {in.blob()}, {out->blob()});
+  });
+}
+template void Div<float>(const float, const Tensor &, Tensor *);
+
+/// Divide each column of matrix M element-wise by vector 'v', in place.
+void DivColumn(const Tensor &v, Tensor *M) {
+  Tensor inv;
+  TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });
+  MultColumn(inv, M);
+}
+
+/// Divide each row of matrix M element-wise by vector 'v', in place.
+void DivRow(const Tensor &v, Tensor *M) {
+  Tensor inv;
+  TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });
+  MultRow(inv, M);
+}
+
+/// Multiply each column of matrix M element-wise by vector 'v', in place.
+void MultColumn(const Tensor &v, Tensor *M) {
+  CHECK(!M->transpose()) << "Not supported yet";
+  CHECK_EQ(M->nDim(), 2);
+  CHECK_EQ(v.nDim(), 1);
+  CHECK_EQ(v.Size(), M->shape(0));
+  CheckDataTypeAndLang(*M, v);
+  TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
+    v.device()->Exec(
+        [M, v](Context *ctx) {
+          DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(),
+                            v.blob(), M->blob(), ctx);
+        },
+        {M->blob(), v.blob()}, {M->blob()});
+  });
+}
+
+/// Multiply each row of matrix M element-wise by vector 'v', in place.
+void MultRow(const Tensor &v, Tensor *M) {
+  CHECK(!M->transpose()) << "Not supported yet";
+  CHECK_EQ(M->nDim(), 2);
+  CHECK_EQ(v.nDim(), 1);
+  CHECK_EQ(v.Size(), M->shape(1));
+  CheckDataTypeAndLang(*M, v);
+  TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
+    v.device()->Exec(
+        [M, v](Context *ctx) {
+          DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
+                            M->blob(), ctx);
+        },
+        {M->blob(), v.blob()}, {M->blob()});
+  });
+}
+
+void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); }
+
+void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); }
+
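+/// Sum the columns of matrix M into vector v: v[i] = sum_j M[i][j];
+/// v must have M.shape(0) elements.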
+void SumColumns(const Tensor &M, Tensor *v) {
+  if (M.transpose()) {
+    Tensor X = M.T();
+    SumRows(X, v);
+  } else {
+    CHECK_EQ(M.nDim(), 2);
+    CHECK_EQ(v->nDim(), 1);
+    size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
+    CHECK_EQ(nb_row, v->Size());
+
+    Tensor one(Shape{nb_col, 1}, M.device(), M.data_type());
+    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Mult(M, one, v);
+  }
+}
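+/// Sum the rows of matrix M into vector v: v[j] = sum_i M[i][j];
+/// v must have M.shape(1) elements.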
+void SumRows(const Tensor &M, Tensor *v) {
+  if (M.transpose()) {
+    Tensor X = M.T();
+    SumColumns(X, v);
+  } else {
+    CHECK_EQ(M.nDim(), 2);
+    CHECK_EQ(v->nDim(), 1);
+    size_t nb_row = M.shape(0), nb_col = M.shape(1);
+    CHECK_EQ(nb_col, v->Size());
+
+    Tensor one(Shape{nb_row, 1}, M.device(), M.data_type());
+    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Tensor X = M.T();
+    Mult(X, one, v);
+  }
+}
 }  // namespace singa
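
A quick sketch (not part of the commit; default CPU device and float
tensors assumed) of how the unified Mult and the matrix/vector helpers
above compose:

  using namespace singa;
  Tensor A(Shape{2, 3}), B(Shape{3, 4}), x(Shape{3}), y(Shape{2});
  Gaussian(0.0f, 1.0f, &A);
  Gaussian(0.0f, 1.0f, &B);
  Gaussian(0.0f, 1.0f, &x);
  Tensor C = Mult(A, B);  // 2x4 result; dispatches to GEMM internally
  Mult(A, x, &y);         // rhs is 1-D, so Mult dispatches to GEMV
  AddColumn(y, &C);       // C[i][j] += y[i], via a rank-1 GEMM update

Note that the Mult(lhs, rhs) overload that returns a new tensor shapes
its result as {lhs.shape(0), rhs.shape(1)}, so the matrix-vector case
must go through the overload that takes a preallocated output tensor.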

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index b53d4cb..98d91bf 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -39,178 +39,184 @@ namespace singa {
 ///      Scale(const float alpha, const Blob* in, Blob* out);
 ///    For such cases, use x, v, alpha, etc for scalar types.
 ///    For blas functions, follow the blas style for argument names.
+///    Use 'M' and 'v' for matrix and vector tensors in functions involving both
+///    matrices and vectors.
+/// 5. For Blob argument xxx, name its raw pointer as xxxPtr.
+/// 6. Pass the 'cudaStream_t s' to every function in math_kernel.h
+/// 7. Use size_t for the number of elements, rows or columns.
+/// 8. Use the same name for the Tensor and Blob level math functions.
 
 
 // ================Linear algebra functions====================================
 /// ret[i] = |input[i]|
 template <typename DType, typename Lang>
-void Abs(int count, const Blob* input, Blob* ret, Context* ctx) {
+void Abs(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Set(int count, DType x, Blob* ret, Context* ctx) {
+void Set(int count, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// sum all elements of input into ret
 template <typename DType, typename Lang>
-void Sum(int count, const Blob* input, DType* ret, Context* ctx) {
+void Sum(int count, const Blob *input, DType *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret[i] = sign(input[i])
 template <typename DType, typename Lang>
-void Sign(int count, const Blob* input, Blob* ret, Context* ctx) {
+void Sign(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Base is e (Euler's number); ret[i]=exp(input[i])
 template <typename DType, typename Lang>
-void Exp(int count, const Blob* input, Blob* ret, Context* ctx) {
+void Exp(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Natural logarithm with base e (Euler's number); ret[i]=log(input[i]).
 template <typename DType, typename Lang>
-void Log(int count, const Blob* input, Blob* ret, Context* ctx) {
+void Log(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=sqrt([input[i])
 template <typename DType, typename Lang>
-void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) {
+void Sqrt(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=square([input[i])
 template <typename DType, typename Lang>
-void Square(int count, const Blob* input, Blob* ret, Context* ctx) {
+void Square(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=tanh([input[i])
 template <typename DType, typename Lang>
-void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) {
+void Tanh(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Element-wise operation, ret[i]=max(0, input[i])
 template <typename DType, typename Lang>
-void ReLU(int count, const Blob* input, Blob* ret, Context* ctx) {
+void ReLU(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Element-wise operation, ret[i]=sigmoid([input[i])
 template <typename DType, typename Lang>
-void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) {
+void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Do softmax for each row individually
 template <typename DType, typename Lang>
-void Softmax(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+void SumRows(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Sum the columns of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+void SumColumns(int nrow, int ncol, const Blob *input, Blob *ret,
+                Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // TODO(wangwei) unify AddRow and AddCol.
 /// Add the vector v to every row of A as the row of ret
 template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
+void AddRow(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
+            Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Add the vector v to every column of A as the column of ret
 template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
+void AddCol(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
+            Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
-
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lang>
-void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Pow(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Pow(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Clamp(int count, DType low, DType high, const Blob* input, Blob* ret,
-           Context* ctx) {
+void Clamp(int count, DType low, DType high, const Blob *input, Blob *ret,
+           Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = input + x
 template <typename DType, typename Lang>
-void Add(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Add(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret =  input - x
 template <typename DType, typename Lang>
-void Sub(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Sub(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   Add<DType, Lang>(count, input, -x, ret, ctx);
 }
 /// ret = input * x
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob* input, DType x, Blob* ret, Context* ctx)
-{
+void EltwiseMult(int count, const Blob *input, DType x, Blob *ret,
+                 Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret = input / x
 template <typename DType, typename Lang>
-void Div(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Div(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   EltwiseMult<DType, Lang>(count, input, DType(1) / x, ret, ctx);
 }
 
 /// ret = lhs + rhs
 template <typename DType, typename Lang>
-void Add(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Add(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs - rhs
 template <typename DType, typename Lang>
-void Sub(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Sub(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs * rhs
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob* lhs, const Blob* rhs, Blob* ret,
-          Context* ctx) {
+void EltwiseMult(int count, const Blob *lhs, const Blob *rhs, Blob *ret,
+                 Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs / rhs
 template <typename DType, typename Lang>
-void Div(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Div(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// outer-product.
 /// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
-           Context* ctx) {
+void Outer(int m, int n, const Blob *lhs, const Blob *rhs, Blob *ret,
+           Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
@@ -218,36 +224,36 @@ void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
 // ===== Level 1
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(int count, const Blob* input, int* ret, Context* ctx) {
+void Amax(int count, const Blob *input, int *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(int count, const Blob* input, int* ret, Context* ctx) {
+void Amin(int count, const Blob *input, int *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret = sum |x| for all x in input
 template <typename DType, typename Lang>
-void Asum(int count, const Blob* input, DType* ret, Context* ctx) {
+void Asum(int count, const Blob *input, DType *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = alpha * input + ret
 template <typename DType, typename Lang>
-void Axpy(int count, DType alpha, const Blob* input, Blob* ret, Context* ctx) {
+void Axpy(int count, DType alpha, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret *= x
 template <typename DType, typename Lang>
-void Scale(int count, DType x, Blob* ret, Context* ctx) {
+void Scale(int count, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
-         Context* ctx) {
+void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+         Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
@@ -255,56 +261,64 @@ void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
 /// ret = alpha * op(A) * v + beta * ret.
 /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
 template <typename DType, typename Lang>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
-          DType beta, Blob* ret, Context* ctx) {
+void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
+          DType beta, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // ===== Level 3
-/// ret = alpha * op(A) * op(B) + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(ret) = m, cols(ret) = n.
-template <typename DType, typename Lang>
-void GEMM(bool transA, bool transB, int m, int n, int k, DType alpha,
-          const Blob* A, const Blob* B, DType beta, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
 
 // ================Random functions===========================================
 /// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
-void Bernoulli(int count, float p, Blob* ret, Context* ctx) {
+void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
-void Uniform(int count, float low, float high, Blob* ret, Context* ctx) {
+void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void Gaussian(int count, float mean, float std, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/*Some operations would have many config/hyper-parameters, e.g., Conv, and
-these config vary among diff implementations, e.g., cuda/cudnn/opencl.
-To separate the modules, we pass a OpConf pointer to the Tensor Op function.
-The specific fields are implemented by inheriting OpConf, and casting the
-pointer between the base and the sub-class.
-class OpConf {
- public:
-  template <typename T>
-  T* CastTo() {
-    static_assert(std::is_base_of<OpConf, T>::value,
-                  "The cast type must be a sub-class of OpConf");
-    return static_cast<T*>(this);
-  }
-};
-*/
-}  // namespace singa
+void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ======== follow the consistency guide of the math API
+
+template <typename DType, typename Lang>
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Divide alpha by each element of 'in'.
+template <typename DType, typename Lang>
+void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Multiply a matrix with a diagonal matrix constructed using values from 'v'.
+/// If side_right is true, compute M * diag(v); otherwise compute diag(v) * M.
+template <typename DType, typename Lang>
+void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
+          const Blob *M, const Blob *v, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// C = alpha * A * B + beta * C.
+/// transA indicates whether the internal data layout of A is transposed.
+template <typename DType, typename Lang>
+void GEMM(const bool transA, const bool transB, const size_t nrowA,
+          const size_t ncolB, const size_t ncolA, const DType alpha,
+          const Blob *A, const Blob *B, const DType beta, Blob *C,
+          Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+} // namespace singa
 
 #endif  // SINGA_CORE_MATH_H_


[24/50] [abbrv] incubator-singa git commit: SINGA-184 Add Cross Entropy loss computation

Posted by zh...@apache.org.
SINGA-184 Add Cross Entropy loss computation

Implement Cross Entropy loss.
Passes cpplint.py; the test compiles.
Todo: check test.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/efd7b627
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/efd7b627
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/efd7b627

Branch: refs/heads/master
Commit: efd7b627bacb4acd6a3322468350f2b5399f725b
Parents: 3e2507b
Author: kaiping <ka...@comp.nus.edu.sg>
Authored: Fri May 27 12:09:30 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Tue May 31 22:14:09 2016 +0800

----------------------------------------------------------------------
 src/model/loss/cross_entropy.h   | 105 ++++++++++++++++++++++++++++++++++
 test/singa/test_cross_entropy.cc |  66 +++++++++++++++++++++
 2 files changed, 171 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/efd7b627/src/model/loss/cross_entropy.h
----------------------------------------------------------------------
diff --git a/src/model/loss/cross_entropy.h b/src/model/loss/cross_entropy.h
new file mode 100644
index 0000000..815b795
--- /dev/null
+++ b/src/model/loss/cross_entropy.h
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LOSS_CROSS_ENTROPY_H_
+#define SRC_MODEL_LOSS_CROSS_ENTROPY_H_
+#include <stack>
+#include "singa/model/loss.h"
+
+namespace singa {
+
+/// CrossEntropy computes the cross-entropy loss and its gradient.
+class CrossEntropy : public Loss<Tensor> {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is sum {-log(prob_of_truth)}
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is: if the entry x corresponds to ground truth,
+  /// then softmax(x) - 1; else, softmax(x)
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., softmax(prediction), target
+  std::stack<Tensor> buf_;
+};
+
+Tensor CrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward twice in a row without Backward."
+                      << " The calling pattern is [Forward|Evaluate] Backward";
+
+  size_t batchsize = 1;
+  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
+  size_t dim = prediction.Size() / batchsize;
+  // a temporary Softmax layer for forward computation
+//  LayerConf conf; // TODO(kaiping): this is currently commented
+//  Softmax softmax_tmp;
+//  softmax_tmp.Setup(conf);
+//  Tensor softmax = softmax_tmp.Forward(0, prediction);
+
+  Tensor softmax(Shape{batchsize, dim});  // TODO(kaiping): Delete
+//  softmax.SetValue<float>(0.5f); // TODO(kaiping): Delete
+
+  softmax.Reshape(Shape{batchsize, dim});
+  // buffer intermediate data
+  buf_.push(softmax);
+  buf_.push(target);
+
+  // Compute loss for each sample
+  Tensor loss(Shape{batchsize, 1});
+  float * pre_ptr = reinterpret_cast<float*>(softmax.blob()->mutable_data());
+  float * truth_ptr = reinterpret_cast<float*>(target.blob()->mutable_data());
+  float * loss_ptr = reinterpret_cast<float*>(loss.blob()->mutable_data());
+  for (size_t i = 0; i < batchsize; i++) {
+    int ilabel = static_cast<int>(truth_ptr[i]);
+    CHECK_GE(ilabel, 0);
+    float prob_of_truth = pre_ptr[ilabel];
+    loss_ptr[i] = -log(prob_of_truth);
+    pre_ptr += dim;  // change to the next sample
+  }
+  return loss;
+}
+
+Tensor CrossEntropy::Backward() {
+  Tensor target = buf_.top();  // copy; pop() would invalidate a reference
+  buf_.pop();
+  Tensor softmax = buf_.top();
+  buf_.pop();
+
+  size_t batchsize = 1;
+  if (softmax.nDim() > 1)
+    batchsize = softmax.shape().at(0);
+  size_t dim = softmax.Size() / batchsize;
+  float * truth_ptr = reinterpret_cast<float*>(target.blob()->mutable_data());
+  float * pre_ptr = reinterpret_cast<float*>(softmax.blob()->mutable_data());
+  for (size_t i = 0; i < batchsize; i++) {
+    int ilabel = static_cast<int>(truth_ptr[i]);
+    // CHECK_GE(ilabel, 0);
+    pre_ptr[ilabel] -= 1.0;
+    pre_ptr += dim;  // change to the next sample
+  }
+  return softmax;
+}
+}  // namespace singa
+
+#endif  // SRC_MODEL_LOSS_CROSS_ENTROPY_H_
+
+
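
For a concrete check of the two formulas above: with dim = 4 and a
uniform softmax output of 0.25 per class (the setup used by the unit
test below), each sample gives

  loss    = -log(p_truth) = -log(0.25) ~= 1.3863
  dL/dx_j = p_j - [j == truth]  =>  -0.75 at the truth entry, 0.25 elsewhere

which are exactly the constants asserted in test_cross_entropy.cc.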

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/efd7b627/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
new file mode 100644
index 0000000..9bb2321
--- /dev/null
+++ b/test/singa/test_cross_entropy.cc
@@ -0,0 +1,66 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+#include "singa/core/device.h"
+#include "../src/model/loss/cross_entropy.h"
+
+using singa::Tensor;
+class TestCrossEntropy : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    p.Reshape(singa::Shape{2, 4});
+    t.Reshape(singa::Shape{2, 1});
+    p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
+    t.CopyDataFromHostPtr(tdat, sizeof(tdat) / sizeof(float));
+  }
+  const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+  const float tdat[2] = {0.0, 2.0};
+
+  singa::Tensor p, t;
+};
+
+TEST_F(TestCrossEntropy, CppForward) {
+  singa::CrossEntropy cross_entropy;
+  const Tensor& loss = cross_entropy.Forward(p, t);
+  auto ldat = loss.data<const float*>();
+
+  const float result_test = -log(0.25);
+  EXPECT_FLOAT_EQ(ldat[0], result_test);
+  EXPECT_FLOAT_EQ(ldat[1], result_test);
+}
+
+TEST_F(TestCrossEntropy, CppBackward) {
+  singa::CrossEntropy cross_entropy;
+  cross_entropy.Forward(p, t);
+  const Tensor& grad = cross_entropy.Backward();
+
+  auto gdat = grad.data<const float*>();
+  EXPECT_FLOAT_EQ(gdat[0], -0.75);
+  EXPECT_FLOAT_EQ(gdat[1], 0.25);
+  EXPECT_FLOAT_EQ(gdat[2], 0.25);
+  EXPECT_FLOAT_EQ(gdat[3], 0.25);
+  EXPECT_FLOAT_EQ(gdat[4], 0.25);
+  EXPECT_FLOAT_EQ(gdat[5], 0.25);
+  EXPECT_FLOAT_EQ(gdat[6], -0.75);
+  EXPECT_FLOAT_EQ(gdat[7], 0.25);
+}


[02/50] [abbrv] incubator-singa git commit: SINGA-170 Add Dropout layer and CudnnDropout layer

Posted by zh...@apache.org.
SINGA-170 Add Dropout layer and CudnnDropout layer

Passes compilation.
There is a link error for cudnn.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/99e0d24d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/99e0d24d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/99e0d24d

Branch: refs/heads/master
Commit: 99e0d24d90fa1c588d73f87f402dfb0ac36ca8a7
Parents: 02851fa
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon May 16 21:40:24 2016 +0800
Committer: wangwei <wa...@gmail.com>
Committed: Tue May 17 00:40:24 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                     |   7 +-
 include/singa/core/common.h        |  29 ++++-
 include/singa/core/device.h        |   4 +-
 include/singa/core/tensor.h        |  62 ++++++-----
 include/singa/model/layer.h        | 190 +++++++++++++++++++++++++-------
 include/singa/model/param.h        |  97 ----------------
 src/CMakeLists.txt                 |   7 +-
 src/core/device/device.cc          |   4 +-
 src/core/tensor/tensor.cc          | 107 ++++++++++--------
 src/core/tensor/tensor_math.h      |  11 +-
 src/core/tensor/tensor_math_cpp.h  |  29 +++++
 src/core/tensor/tensor_math_cuda.h |  24 ++--
 src/model/layer/conv.cc            |  27 -----
 src/model/layer/cudnn_dropout.cc   | 106 ++++++++++++++++++
 src/model/layer/cudnn_dropout.h    |  54 +++++++++
 src/model/layer/cudnn_utils.h      |  83 ++++++++++++++
 src/model/layer/dropout.cc         |  60 ++++++++++
 src/model/layer/dropout.h          |  49 ++++++++
 src/model/layer/layer.cc           |  30 -----
 src/proto/core.proto               |   3 +-
 src/proto/layer.proto              |  10 +-
 test/singa/test_dropout.cc         |  29 +++++
 test/singa/test_tensor.cc          |  10 +-
 23 files changed, 722 insertions(+), 310 deletions(-)
----------------------------------------------------------------------
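
For reference while reading this commit, a minimal sketch of what an
(inverted) dropout forward pass computes, written with the tensor API
from SINGA-167; the names and the scaling choice are illustrative, not
the committed dropout.cc:

  // keep each unit with probability 1 - dropout_ratio, and rescale so
  // the expected activation matches between training and testing
  Tensor mask(x.shape(), x.device(), x.data_type());
  Bernoulli(1.0f - dropout_ratio, &mask);
  float scale = 1.0f / (1.0f - dropout_ratio);
  Tensor y = x * mask * scale;  // operator* is element-wise EltwiseMult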


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 67a82e5..dd92d03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
 PROJECT(singa)
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++11")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++11 -DUSE_CUDA -DUSE_CUDNN")
 
 # Flags
 IF(UNIX OR APPLE)
@@ -10,12 +10,13 @@ ENDIF()
 # Includes
 SET(singa_include_dir ${PROJECT_SOURCE_DIR}/include)
 INCLUDE_DIRECTORIES(${singa_include_dir} ${PROJECT_BINARY_DIR})
+INCLUDE_DIRECTORIES("/home/wangwei/local/cudnn5/include" "/usr/local/cuda/include")
 
 
 SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
-SET(singa_linker_lib)
-LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH})
+SET(singa_linker_lib cudnn)
+LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH} "/home/wangwei/local/cudnn5/lib64/")
 
 INCLUDE(cmake/ProtoBuf.cmake)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 1d73f67..4d783fb 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -18,9 +18,18 @@
 
 #ifndef SINGA_CORE_COMMON_H_
 #define SINGA_CORE_COMMON_H_
-
+#include <random>
+#include <chrono>
 #include "singa/utils/logging.h"
 
+#ifdef USE_CUDA
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#endif
+#endif
+
 namespace singa {
 namespace lib {
 /// To implement functions using cpp libraries
@@ -37,10 +46,10 @@ typedef unsigned char Byte;
 /// Blob represents a chunk of memory (on device or host) managed by VirtualMemory.
 class Blob {
  public:
-  Blob(void* ptr, int size) : data_(ptr), size_(size), ref_count_(1) {}
+  Blob(void* ptr, size_t size) : data_(ptr), size_(size), ref_count_(1) {}
   void* mutable_data() const { return data_; }
   const void* data() const { return data_; }
-  int size() const { return size_; }
+  size_t size() const { return size_; }
   int IncRefCount() {
     ref_count_++;
     return ref_count_;
@@ -54,11 +63,21 @@ class Blob {
 
  private:
   void* data_ = nullptr;
-  int size_ = 0;
+  size_t size_ = 0;
   int ref_count_ = 0;
 };
 
-class Context {};
+typedef struct _Context {
+  std::mt19937 random_generator;
+  unsigned long long seed;
+#ifdef USE_CUDA
+  cublasHandle_t cublas_handle;
+  cudaStream_t stream;
+#ifdef USE_CUDNN
+  cudnnHandle_t cudnn_handle;
+#endif
+#endif
+} Context;
 
 }  // namespace singa
 #endif  // SINGA_CORE_COMMON_H_
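
The point of widening Context is that backend kernels can pull their
stream, handles and RNG state from it instead of keeping globals. A
hedged sketch (this specialization is illustrative, not committed code)
of a CPP kernel drawing from the per-device generator:

  template <>
  void Uniform<float, lib::Cpp>(int count, float low, float high,
                                Blob *ret, Context *ctx) {
    std::uniform_real_distribution<float> dist(low, high);
    float *ptr = static_cast<float *>(ret->mutable_data());
    for (int i = 0; i < count; i++)
      ptr[i] = dist(ctx->random_generator);  // mt19937 held by Context
  }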

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index fa30d6d..f3bb5a2 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -79,8 +79,8 @@ class Device {
   void CopyDataFromHostPtr(Blob* dst, const void* src, size_t size);
   /// Submit the operation to the device, which may execute it right now or
   /// delay it depending on the scheduler.
-  void Submit(function<void(Context*)> fn, const vector<Blob*> read_blobs,
-              const vector<Blob*> write_blobs);
+  void Exec(function<void(Context*)> fn, const vector<Blob*> read_blobs,
+              const vector<Blob*> write_blobs, bool use_rand_generator = false);
 
   // Wait for one event.
   // void WaitFor();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 4278078..4807123 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -31,25 +31,23 @@ using std::vector;
 using std::tuple;
 namespace singa {
 
-typedef vector<int> Shape;
-inline int Product(Shape shape) {
-  if (shape.size() == 0)
-    return 0;
-  return Product(shape.begin(), shape.end());
-}
-
-inline int Product(vector<int>::iterator begin, vector<int>::iterator end) {
-  CHECK(begin != end);
-  int v = 1;
-  for (auto it = being; it < end; it++)
-    v* = *it;
+typedef vector<size_t> Shape;
+typedef Shape::iterator ShapeIter;
+inline size_t Product(const Shape& shape, int start = 0, size_t len = 0) {
+  if (len == 0)
+    len = shape.size();
+  CHECK_LE(len, shape.size());
+  size_t v = 1;
+  for (unsigned int i = start; i < len; i ++)
+    v *= shape[i];
   return v;
 }
 
 /// hardcode the width of types defined in DataType
-const int kDataWidth[] = {4, 2, 4, 1};
-inline int SizeOf(DataType t) {
-  static_assert(kNumDataType == sizeof(kDataWidth) / sizeof(int),
+const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2, sizeof(int),
+                          sizeof(char), sizeof(double)};
+inline size_t SizeOf(DataType t) {
+  static_assert(kNumDataType == sizeof(kDataWidth) / sizeof(size_t),
       "Num of data types not match num of data width");
   CHECK_GT(kNumDataType, t);
   return kDataWidth[t];
@@ -112,18 +110,23 @@ class Tensor {
   }
 
   /// Return number of total elements
-  int Size() const {
+  size_t Size() const {
     return blob_->size() / SizeOf(data_type_);
   }
 
   /// Return memory size (i.e., Bytes)
-  int MemSize() const {
+  size_t MemSize() const {
     return blob_->size();
   }
 
   /// Reset the tensor shape, it may reallocate blob, if MemSize() changes.
   void ReShape(const Shape& shape);
 
+  /// Reset the shape, device, and data type as given tensor.
+  /// If blob size changes, then reallocate a new blob. The previous blob would
+  /// be deleted.
+  void ResetLike(const Tensor& t);
+
   /// Reset the data type, it would reallocate blob if type changes.
   void AsType(DataType type);
 
@@ -136,7 +139,7 @@ class Tensor {
 
   /// For init the tensor values, copy 'num' elements.
   template<typename DType>
-  void CopyDataFromHostPtr(const DType* src, int num);
+  void CopyDataFromHostPtr(const DType* src, size_t num);
 
   /// Copy data from another Tensor which may be on a diff device.
   /// Meta data would not be copied!
@@ -207,17 +210,17 @@ class Tensor {
 /// The first 'src_offset' ('dst_offset') elements will be skipped.
 void CopyData(Tensor* dst,
               const Tensor& src,
-              int num,
-              int src_offset = 0,
-              int dst_offset = 0);
+              size_t num,
+              size_t src_offset = 0,
+              size_t dst_offset = 0);
 
 /// Copy 'nBytes' bytes of src data to dst.
 /// The first 'src_offset' ('dst_offset') bytes will be skipped.
 void CopyRawData(Tensor* dst,
               const Tensor& src,
-              int nBytes,
-              int src_offset = 0,
-              int dst_offset = 0);
+              size_t nBytes,
+              size_t src_offset = 0,
+              size_t dst_offset = 0);
 
 // ==================Simple Linear Algebra Operations=========================
 Tensor Abs(const Tensor& t);
@@ -306,15 +309,15 @@ void Mult(DType alpha, const Tensor& lhs, DType beta, const Tensor& rhs,
 // tempalte<typename DType> T Dot(const Tensor& lhs, const Tensor& rhs);
 
 //================Random operations==========================================
-/// For each element x set x = 0 if random() < p; otherwise x = 1.
-Tensor Bernoulli(float p, Blob* t);
+/// For each element x, set x = 1 if random() < p; otherwise x = 0.
+void Bernoulli(float p, Tensor* t);
 /// Fill in Tensor 't' following uniform distribution.
-Tensor Uniform(float low, DType high, Blob* t);
+void Uniform(float low, float high, Tensor* t);
 /// Fill in Tensor 't' following Gaussian distribution.
-Tensor Gaussian(float mean, DType std, Blob* t);
+void Gaussian(float mean, float std, Tensor* t);
 
 //================Neural Net operations======================================
-// following API of cudnn, e.g., conv, pool, lrn, batchnorm, softmax
+/* following API of cudnn, e.g., conv, pool, lrn, batchnorm, softmax
 void ConvFwd(const ConvConf& conf, const Tensor& x, const Tensor& w, Tensor* y);
 void ConvBwdBias(const ConvConf& conf, const Tensor& dy, Tensor* db);
 void ConvBwdFilter(const ConvConf& conf, const Tensor& dy, const Tensor& x,
@@ -325,6 +328,7 @@ void PoolFwd(const PoolConf& conf, const Tensor& x, Tensor* y,
              Tensor* mask = nullptr);
 void PoolBwd(const PoolConf& conf, const Tensor& y, const Tensor& dy,
              const Tensor& x, Tensor* dx);
+*/
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_
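
A quick sanity sketch of the reworked Product helper (behaviour as
committed; the kFloat32 enum name is an assumption, since core.proto is
not shown here):

  Shape s{4, 3, 2};
  size_t n = Product(s);         // 4 * 3 * 2 = 24 elements
  size_t inner = Product(s, 1);  // 3 * 2 = 6, the dims after the first
  size_t bytes = n * SizeOf(kFloat32);  // 24 * sizeof(float) = 96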

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 7b9b6d4..48fc58f 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -21,6 +21,7 @@
 
 #include <vector>
 #include <string>
+#include <stack>
 #include "singa/core/tensor.h"
 #include "singa/proto/layer.pb.h"
 
@@ -28,14 +29,10 @@ namespace singa {
 
 /// The base layer class.
 /// Generally, a layer conducts feature transformation against a set of Tensor
-/// to generate a set of Tensor. Each layer may have some parameters represented
-/// by Param instances.
+/// to generate a set of Tensor. Each layer may have some parameters.
 class Layer {
  public:
   Layer() = default;
-  /// Each layer sub-class would optionaly have a type name.
-  /// Used for debugging and logging.
-  virtual const std::string layer_type() const { return "Unknown"; }
 
   /// Set meta data fields from a string representing a proto message.
   void Setup(const string& proto_str) {
@@ -44,68 +41,183 @@ class Layer {
     this->Setup(conf);
   }
 
+  // ============= The following functions can be overridden ================
+  /// Destruct the objecst created by this layer.
+  virtual ~Layer() {
+    for (Tensor * t : param_values_) {
+      delete t;
+    }
+  }
+
+  /// Each layer sub-class would optionally have a type name.
+  /// Used for debugging and logging.
+  virtual const std::string layer_type() const { return "Unknown"; }
+
   /// Set meta data fields configured in 'conf' (a proto message).
   virtual void Setup(const LayerConf& conf) {
     name_ = conf.name();
+    for (const auto& spec : conf.param())
+      param_specs_.push_back(spec);
+    // TODO(wangwei) load param values from checkpoint blobs.
   }
 
-  /// Do feature transformation for given 'input' Tensor.
-  /// It is the forward pass for feed-forward nets and rnn nets.
+  /// Do feature transformation for the given 'input' tensor (denoted as x).
   /// 'flag' is either kPhaseTrain or kPhaseTest for feed-forward nets, and
-  /// would be used for phases of training other nets.
-  /// It will return a set of Tensor.
-  virtual const vector<Tensor> ComputeFeature(int flag,
-                                              const vector<Tensor>& input) {
-    return vector<Tensor>{};
-  }
-  /// Compute gradients of parameters of this layer.
-  /// It would also compute the gradients for other layers, e.g., the
-  /// preceding layers in topology order. It would return an empty vector if
-  /// this layer does not need to compute gradients for other layers.
-  /// 'flag' is either kPhaseTrain or kPhaseTest for feed-forward nets, and
-  /// would be used for phases of training other nets.
-  /// 'input' is a vector of Tensor for gradients from other layers.
-  virtual const vector<Tensor> ComputeGradient(int flag,
-                                               const vector<Tensor>& input) {
-    return vector<Tensor>{};
+  /// would be used for other phases when training other nets. For example, when
+  /// training RBM, we may create an alias of this function as ComputeFeature
+  /// where flag could be kPositivePhase and kNegativePhase.
+  /// It will return a Tensor (denoted as y).
+  /// If the 'input' or 'output' is required for computing the gradients in
+  /// Backward(), then push them into the states_ stack.
+  virtual const Tensor Forward(int flag, const Tensor& input) {
+    LOG(FATAL) << "Not implemented";
+    Tensor t;
+    return t;
+  }
+
+  /// \copydoc Forward(int flag, const Tensor& input)
+  /// Accept multiple input tensors and generate multiple output tensors.
+  virtual const vector<Tensor> Forward(int flag, const vector<Tensor>& inputs) {
+    vector<Tensor> ret;
+    if (inputs.size() == 1)
+      ret.push_back(Forward(flag, inputs.at(0)));
+    else
+      LOG(FATAL) << "Not implemented";
+    return ret;
+  }
+
+  /// Compute gradients of this layer.
+  /// Specifically, there are two types of gradients:
+  /// 1. gradients of preceding layers, i.e., dx.
+  /// 2. gradients of parameters of this layer.
+  /// 1 and 2 are returned as a pair of vector<Tensor>
+  /// 1 is an empty tensor if there is no preceding layer or there is no need to
+  /// compute dx (e.g., x is from a data layer); 2 is empty if this layer has no
+  /// parameters.
+  /// 'flag' is either kTrainPhase or kTestPhase for feed-forward nets, and
+  /// would be used for other phases when training other nets.
+  /// 'grad' is a Tensor for gradient (dy) from the upper layer.
+  /// Some layers use the 'input' or 'output' of Forward to compute the
+  /// gradients of parameters; Backward() pops that state data back out.
+  /// It is useful for RNN layers, where the same layer is used multiple
+  /// times just like unrolling the layer.
+  virtual const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                           const Tensor& grad) {
+    LOG(FATAL) << "Not implemented!";
+    Tensor t;
+    return std::make_pair(t, vector<Tensor>{});
+  }
+
+  /// \copydoc Backward(int, const vector<Tensor>&)
+  /// For Forward(int, const vector<Tensor>&)
+  virtual const std::pair<vector<Tensor>, vector<Tensor>> Backward(
+      int flag, const vector<Tensor>& grads) {
+    vector<Tensor> input_grad, param_grad;
+    if (grads.size() == 1u) {
+      auto ret = Backward(flag, grads.at(0));
+      input_grad.push_back(ret.first);
+      param_grad = ret.second;
+    } else  {
+      LOG(FATAL) << "Not implemented";
+    }
+    return std::make_pair(input_grad, param_grad);
   }
-  // return <dx>  <dw (ParamGrad)>
 
-  /// Move the layer (including its parameters and other Tensor) onto the given
-  /// device
+  /// Move the layer (including its parameters and other internal Tensor) onto
+  /// the given device
   virtual void ToDevice(Device* device) {
-    // for (auto p : params_)
-      // p->ToDevice(device);
+    for (auto p : param_values_) p->ToDevice(device);
   }
 
-  /// Set the data type of Tensor s in this layer.
+  /// Set the data type of Tensor in this layer.
   virtual void AsType(DataType dtype) {
-  //     for (auto p : params_)
-  //     p->AsType(dtype);
+    for (auto p : param_values_) p->AsType(dtype);
   }
 
-  /// Serialize the layer info, including params)_, into a LayerConf message.
-  virtual std::string ToProto(LayerConf* conf) const {
+  /// Serialize the layer info (including params) into a LayerConf proto message
+  virtual void ToProto(LayerConf* conf) const {
     conf->set_name(name_);
+    for (const auto& spec: param_specs_) {
+      ParamSpec* p = conf->add_param();
+      p->CopyFrom(spec);
+    }
+    // TODO(wangwei) add param values into conf;
   }
 
+  // ========================================================================
+
   /// Serialize the layer info, including params_, into a string representing
   /// a LayerParameter message.
-  std::string ToProtoStr() const;
+  std::string ToProtoStr() const {
+    LayerConf conf;
+    ToProto(&conf);
+    string str;
+    conf.SerializeToString(&str);
+    return str;
+  }
+  /// Return specs/configuration of all parameter instances of this layer.
+  /// \ref ParamSpec.
+  const vector<ParamSpec> param_specs() {
+    return param_specs_;
+  }
 
-  /// Return all Param instances of this layer.
-  /// Each layer could cache the Param objects.
-  /// To save memory of , it can also create it when this function
-  /// is called
-  const vector<Param*> GetParam();
+  /// Return the i-th ParamSpec.
+  const ParamSpec& param_specs(int i) {
+    return param_specs_.at(i);
+  }
+
+  /// Return pointers to parameter Tensor s.
+  const vector<Tensor*> param_values() {
+    return param_values_;
+  }
+
+  /// Return a pointer to the 'i'-th parameter Tensor.
+  Tensor* param_value(size_t i) {
+    CHECK_LT(i, param_values_.size());
+    return param_values_[i];
+  }
+
+  /// Return names of all parameters.
+  const vector<string> param_names() {
+    vector<string> pname;
+    for (const auto& spec: param_specs_)
+      pname.push_back(spec.name());
+    return pname;
+  }
+
+  /// Return the 'i'-th parameter name.
+  const string& param_name(size_t i) {
+    CHECK_LT(i, param_specs_.size());
+    return param_specs_.at(i).name();
+  }
 
   /// Each layer instance would optionally have a name.
   /// Used for debugging and logging.
   const std::string name() const { return name_; }
 
+  /*
+  std::stack<Tensor> states() const {
+    return states_;
+  }
+  */
+
  protected:
   std::string name_;
+  vector<Tensor*> param_values_;
+  vector<ParamSpec> param_specs_;
+  /// Used to store input or output of Forward(), which would be used in
+  /// Backward.  Rules:
+  /// 1. push the 'input' or 'output' into states_ if the flag of Forward() is
+  ///    for training.
+  /// 2. pop data out in Backward().
+  /// TODO(wangwei) enable this feature for rnn layers.
+  // std::stack<Tensor*> states_;
 };
 
+// ===========================================================================
+// Order layer sub-classes based on alphabetical order of the first letter.
+// ===========================================================================
+
+
 }  // namespace singa
 #endif  // SINGA_LAYER_H_
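
A minimal layer honouring the new Forward/Backward contract could look
like the sketch below (the Scale class is illustrative only, not part
of this commit):

  class Scale : public Layer {
   public:
    const std::string layer_type() const override { return "Scale"; }
    const Tensor Forward(int flag, const Tensor& input) override {
      return input * 2.0f;  // y = 2x; nothing needs to be buffered
    }
    const std::pair<Tensor, vector<Tensor>> Backward(
        int flag, const Tensor& grad) override {
      // dx = 2 * dy; no parameters, so the second element stays empty
      return std::make_pair(grad * 2.0f, vector<Tensor>{});
    }
  };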

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/include/singa/model/param.h
----------------------------------------------------------------------
diff --git a/include/singa/model/param.h b/include/singa/model/param.h
deleted file mode 100644
index b859b1c..0000000
--- a/include/singa/model/param.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_MODEL_PARAM_H_
-#define SINGA_MODEL_PARAM_H_
-#include "singa/core/tensor.h"
-#include <vector>
-#include <string>
-using std::vector;
-using std::string;
-namespace singa {
-/// Base Param class for storing set of parameters, e.g., a weight matrix or a
-/// bias vector.
-/// It includes multiple Tensor s for parameter values, gradients, etc.
-class Param {
- public:
-  ~Param();
-  Param(const ParamSpec& conf);
-  Param(Param&& p);
-  Param(const Param& p);
-  void operator=(Param&& p);
-  void operator=(const Param& p);
-
-  Tensor& value() {
-    return value_;
-  }
-
-  Tensor& grad() {
-    return grad_;
-  }
-
-  void set_value(const Tensor& t) {
-    value_ = t;
-  }
-
-  void set_value(Tensor&& t) {
-    value_ = std::move(t);
-  }
-
-  void set_grad(const Tensor& t) {
-    isGradValid_ = true;
-    grad_ = t;
-  }
-
-  void set_grad(Tensor&& t) {
-    grad_ = std::move(t);
-  }
-
-  // void Compress();
-  // string ToString();
-
- protected:
-  string name_;
-  Tensor value_;
-  float lr_mult_ = 1.0f, decay_mult_ = 1.0f;
-};
-
-class ParamGrad {
-// return grad tensor or data to recover the grad tensor, e.g., if W = U * V
-// then, ParamGrad could just store U and V. provide func for serialize and
-// deserialize.
-};
-
-// updater just copy the ParamGrad to a device and submit ops to that device, e.g.,
-// add grad; check update_condition; apply sgd; copy back.
-// consider rpc (no rmda).
-
-Param* CreateParam(string type) {
-  Param* p = nullptr;
-  if (type == "default")
-    p = new Param();
-  else
-    LOG(FATAL) << "Currently param type " << type << " is not implemented."
-               << "Pls use the 'default' type";
-  return p;
-}
-#endif  // SINGA_MODEL_PARAM_H_
-
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d8bec8d..e2e923e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -15,7 +15,12 @@ FILE(GLOB_RECURSE core_source ${CMAKE_CURRENT_SOURCE_DIR}/core/ "*.cc")
 ADD_LIBRARY(singa_core SHARED ${core_source})
 TARGET_LINK_LIBRARIES(singa_core ${singa_linker_libs})
 list(APPEND singa_linker_libs singa_core)
-MESSAGE(STATUS "link libs " ${singa_linker_libs})
+#MESSAGE(STATUS "link libs " ${singa_linker_libs})
+
+FILE(GLOB_RECURSE model_source ${CMAKE_CURRENT_SOURCE_DIR}/model/ "*.cc")
+ADD_LIBRARY(singa_model SHARED ${model_source})
+TARGET_LINK_LIBRARIES(singa_model ${singa_linker_libs})
+list(APPEND singa_linker_libs singa_model)
 
 #ADD_LIBRARY(singa_layer SHARED ${LAYER_SOURCE})
 #ADD_LIBRARY(singa_model SHARED ${MODEL_SOURCE})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 4976a32..b2a8705 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -25,8 +25,8 @@ Device::Device(int id, int num_executors, string scheduler, string vm)
   vm_ = nullptr;
 }
 
-void Device::Submit(function<void(Context*)> fn, const vector<Blob*> read_blobs,
-                    const vector<Blob*> write_blobs) {
+void Device::Exec(function<void(Context*)> fn, const vector<Blob*> read_blobs,
+                    const vector<Blob*> write_blobs, bool use_rand_generator) {
   fn(nullptr);
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 51b785e..8352b48 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -20,6 +20,7 @@
 #include "./tensor_math_cpp.h"
 #include "./tensor_math_cuda.h"
 #include "./tensor_math_opencl.h"
+#include <utility>
 
 namespace singa {
 
@@ -69,6 +70,16 @@ Tensor::Tensor(Tensor&& t)
   t.blob_ = nullptr;
 }
 
+void Tensor::ResetLike(const Tensor& t) {
+  if (blob_->size() != t.MemSize()) {
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+    shape_ = t.shape_;
+    device_ = t.device_;
+    data_type_ = t.data_type_;
+    blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+  }
+}
+
 void Tensor::ReShape(const Shape& shape) {
   if (shape_ != shape) {
     if (blob_ != nullptr && blob_->DecRefCount() == 0)
@@ -105,7 +116,7 @@ void Tensor::ToHost() {
 }
 
 template<typename DType>
-void Tensor::CopyDataFromHostPtr(const DType* src, int num) {
+void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) {
   CHECK_EQ(sizeof(DType), SizeOf(data_type_)) << "data_type is "
                                               << DataType_Name(data_type_)
                                               << " user given type is of size "
@@ -115,7 +126,7 @@ void Tensor::CopyDataFromHostPtr(const DType* src, int num) {
   else
     LOG(WARNING) << "Copy data from null host ptr";
 }
-template void Tensor::CopyDataFromHostPtr(const float* src, int num);
+template void Tensor::CopyDataFromHostPtr(const float* src, size_t num);
 
 void Tensor::CopyData(const Tensor& src) {
   CHECK_EQ(Size(), src.Size());
@@ -134,10 +145,10 @@ Tensor Tensor::Clone() {
 }
 
 Tensor Tensor::T() const {
-  CHECK_EQ(shape_.size(), 2);
+  CHECK_EQ(shape_.size(), 2u);
   Tensor t(*this);
   t.transpose_ = !transpose_;
-  std::swap(shape_[0], shape_[1]);
+  std::swap(t.shape_[0], t.shape_[1]);
   return t;
 }
 
@@ -185,21 +196,21 @@ GenUnaryScalarArgMemberFunction(operator/=, Div);
 // ====================Tensor Operations=======================================
 void CopyData(Tensor* dst,
               const Tensor& src,
-              int num,
-              int dst_offset,
-              int src_offset) {
+              size_t num,
+              size_t dst_offset,
+              size_t src_offset) {
   CHECK_GE(src.Size(), src_offset + num);
   CHECK_GE(dst->Size(), dst_offset + num);
-  int width = SizeOf(src.data_type());
+  auto width = SizeOf(src.data_type());
   CHECK_EQ(width, SizeOf(dst->data_type()));
   CopyRawData(dst, src, num * width, dst_offset * width, src_offset * width);
 }
 
 void CopyRawData(Tensor* dst,
               const Tensor& src,
-              int nBytes,
-              int dst_offset,
-              int src_offset) {
+              size_t nBytes,
+              size_t dst_offset,
+              size_t src_offset) {
   CHECK_GE(src.MemSize(), src_offset + nBytes);
   CHECK_GE(dst->MemSize(), dst_offset + nBytes);
   Device* src_dev = src.device(), *dst_dev = dst->device();
@@ -286,7 +297,7 @@ void CopyRawData(Tensor* dst,
 #define EltwiseUnaryTensorFn(fn, t, ret)                                   \
   do {                                                                     \
     TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, { \
-      ret->device()->Submit(                                               \
+      ret->device()->Exec(                                               \
           [t, ret](Context* ctx) {                                         \
             fn<DType, Lib>(t.Size(), t.blob(), ret->blob(), ctx);          \
           },                                                               \
@@ -320,14 +331,14 @@ Tensor Softmax(const Tensor& t, int axis) {
 void Softmax(const Tensor& t, Tensor* ret, int axis) {
   int nrow = 1, ncol = t.Size(), size = ncol;
   CHECK_GE(axis, -1);
-  CHECK_GT(t.shape().size(), 0);
+  CHECK_GT(t.shape().size(), 0u);
   if (axis > -1) {
-    nrow = Product(t.shape().begin(), t.shape().begin() + axis + 1);
+    nrow = Product(t.shape(), 0, axis + 1);
     CHECK_EQ(size % nrow, 0) << "Size = " << size << " nrow = " << nrow;
     ncol = size / nrow;
   }
   TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, {
-    ret->device()->Submit(
+    ret->device()->Exec(
         [nrow, ncol, t, ret](Context* ctx) {
           Softmax<DType, Lib>(nrow, ncol, t.blob(), ret->blob(), ctx);
         },
@@ -338,8 +349,8 @@ void Softmax(const Tensor& t, Tensor* ret, int axis) {
 #define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
   do {                                                                         \
     TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->device_lib(), Lib, { \
-      ret->device()->Submit(                                                   \
-          CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                    \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
+      ret->device()->Exec(                                                     \
           [lhs, rhs, ret](Context* ctx) {                                      \
             fn<DType, Lib>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),    \
                            ctx);                                               \
@@ -364,28 +375,28 @@ GenBinaryTensorFunction(operator*, EltwiseMult);
 GenBinaryTensorFunction(operator/, Div);
 GenBinaryTensorFunction(Pow, Pow);
 
-#define EltwiseTensorScalarFn(fn, t, x, ret)                                \
-  do {                                                                      \
-    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, {  \
-      ret->device()->Submit(                                                \
-          static_assert(typeid(x) == typeid(DType),                         \
-                        "The Scalar type must match the Tensor data type"); \
-          [t, x, ret](Context* ctx) {                                       \
-            fn<DType, Lib>(t.Size(), t.blob(), x, ret->blob(), ctx);        \
-          },                                                                \
-          {t.blob()}, {ret->blob()});                                       \
-    });                                                                     \
+#define EltwiseTensorScalarFn(fn, t, x, ret)                               \
+  do {                                                                     \
+    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, { \
+      static_assert(std::is_same<SType, DType>::value,                             \
+                    "The Scalar type must match the Tensor data type");    \
+      ret->device()->Exec(                                                 \
+          [t, x, ret](Context* ctx) {                                      \
+            fn<DType, Lib>(t.Size(), t.blob(), x, ret->blob(), ctx);       \
+          },                                                               \
+          {t.blob()}, {ret->blob()});                                      \
+    });                                                                    \
   } while (0)
 
 #define GenTensorScalarFunction(op, fn)                \
-  template <typename DType>                                \
-  Tensor op(const Tensor& t, DType x) {                    \
+  template <typename SType>                            \
+  Tensor op(const Tensor& t, SType x) {                \
     Tensor ret(t.shape(), t.device(), t.data_type());  \
     fn(t, x, &ret);                                    \
     return ret;                                        \
   }                                                    \
-  template <typename DType>                                \
-  void fn(const Tensor& t, DType x, Tensor* ret) {   \
+  template <typename SType>                            \
+  void fn(const Tensor& t, SType x, Tensor* ret) {     \
     EltwiseTensorScalarFn(fn, t, x, ret);              \
   }                                                    \
   template Tensor op<float>(const Tensor& t, float x); \
@@ -424,15 +435,15 @@ template Tensor Mult<float>(float alpha, const Tensor& lhs, float beta,
 template <typename SType>
 void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B, Tensor* C)
 {
-  CHECK_EQ(A.shape().size(), 2);
+  CHECK_EQ(A.shape().size(), 2u);
   bool transA = A.transpose();
-  int m = transA ? A.shape()[1] : A.shape()[0], n = 0;
-  if (B.shape().size() == 1) {
+  size_t m = transA ? A.shape()[1] : A.shape()[0], n = 0;
+  if (B.shape().size() == 1u) {
     n = C->Size();
     TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
       static_assert(std::is_same<SType, DType>::value,
         "The scalar type must be the same as the tensor data type");
-      C->device()->Submit(
+      C->device()->Exec(
         [transA, m, n, alpha, A, beta, B, C](Context* ctx) {
         GEMV<DType, Lib>(transA, m, n, alpha, A.blob(),
           B.blob(), beta, C->blob(), ctx);
@@ -442,7 +453,7 @@ void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B, Tensor* C)
   } else {
     CHECK(!C->transpose());
     bool transB = B.transpose();
-    int k = transB ? B.shape()[1] : B.shape()[0];
+    size_t k = transB ? B.shape()[1] : B.shape()[0];
     n = C->shape()[1];
     CHECK_EQ(C->shape()[0], m);
     CHECK_EQ(A.Size(), m * k);
@@ -450,7 +461,7 @@ void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B, Tensor* C)
     TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
         static_assert(std::is_same<SType, DType>::value,
           "The scalar type must be the same as the tensor data type");
-        C->device()->Submit(
+        C->device()->Exec(
           [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) {
           GEMM<DType, Lib>(transA, transB, m, n, k, alpha, A.blob(),
             B.blob(), beta, C->blob(), ctx);
@@ -468,7 +479,7 @@ template void Mult<float>(float alpha, const Tensor& lhs, float beta,
 void Conv(const OpConf* conf, const Tensor& input, const Tensor& W,
           const Tensor& b, Tensor* ret) {
   TYPE_LIB_SWITCH(input.data_type(), DType, input.device()->nn_lib(), Lib, {
-    ret->device()->Submit(
+    ret->device()->Exec(
         [conf, input, W, b, ret](Context* ctx) {
           Conv<DType, Lib>(conf, input.blob(), W.blob(), b.blob(), ret->blob(),
                            ctx);
@@ -477,33 +488,33 @@ void Conv(const OpConf* conf, const Tensor& input, const Tensor& W,
   });
 }
 */
-void Bernoulli(float threshold, Tensor* t) {
+void Bernoulli(float p, Tensor* t) {
   TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
-    t->device()->Submit(
-        [threshold, t](Context* ctx) {
-          Bernoulli<DType, Lib>(t->Size(), threshold, t->blob(), ctx);
+    t->device()->Exec(
+        [p, t](Context* ctx) {
+          Bernoulli<DType, Lib>(t->Size(), p, t->blob(), ctx);
         },
-        {}, {t->blob()});
+        {}, {t->blob()}, true);
   });
 }
 
 void Uniform(float low, float high, Tensor* t) {
   TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
-    t->device()->Submit(
+    t->device()->Exec(
         [low, high, t](Context* ctx) {
           Uniform<DType, Lib>(t->Size(), low, high, t->blob(), ctx);
         },
-        {}, {t->blob()});
+        {}, {t->blob()}, true);
   });
 }
 
 void Gaussian(float mean, float std, Tensor* t) {
   TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
-    t->device()->Submit(
+    t->device()->Exec(
         [mean, std, t](Context* ctx) {
           Gaussian<DType, Lib>(t->Size(), mean, std, t->blob(), ctx);
         },
-        {}, {t->blob()});
+        {}, {t->blob()}, true);
   });
 }
 }  // namespace singa
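
A usage sketch of the unified Mult above (an untested illustration): it
computes C = alpha * A * B + beta * C, where a 1-D B takes the GEMV path and
a 2-D B the GEMM path:

    #include "singa/core/tensor.h"

    void MultSketch() {
      singa::Tensor A(singa::Shape{2, 3});
      singa::Tensor x(singa::Shape{3}), y(singa::Shape{2});
      singa::Mult(1.0f, A, 0.0f, x, &y);   // GEMV: y = A * x

      singa::Tensor B(singa::Shape{3, 4}), C(singa::Shape{2, 4});
      singa::Mult(1.0f, A, 0.0f, B, &C);   // GEMM: C = A * B
    }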

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index a4f68e3..aa520c9 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -96,6 +96,12 @@ void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
+/// Do softmax for each row individually
+template <typename DType, typename Lib>
+void Softmax(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lib>
 void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
@@ -258,7 +264,7 @@ void GEMM(bool transA, bool transB, int m, int n, int k, DType alpha,
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lib>
-void Bernoulli(int count, float threshold, Blob* ret, Context* ctx) {
+void Bernoulli(int count, float p, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 // The random generator should be extracted from ctx.
@@ -274,7 +280,7 @@ void Gaussian(int count, float mean, float std, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
-// ================Neural net functions=======================================
+/* ================Neural net functions=======================================
 template <typename DType, typename Lib>
 void ConvFwd(ConvConf* conf, const Blob* x, const Blob* w, Blob* y,
              Context* ctx) {
@@ -296,6 +302,7 @@ void PoolBwd(const PoolConf* conf, const Blob* y, const Blob* dy, const Blob* x,
              Blob* dx, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
+*/
 
 }  // namespace singa
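
The file above relies on generic-template-plus-specialization dispatch: the
unspecialized function aborts with "Not Implemented", and each backend
supplies a specialization that is selected at compile time. A standalone
sketch of the pattern (illustrative names, not SINGA code):

    #include <cstdio>
    #include <cstdlib>

    struct Cpp {};   // stand-ins for singa's lib::Cpp / lib::Cuda tags
    struct Cuda {};

    template <typename DType, typename Lib>
    void Square(int count, const DType* in, DType* out) {
      std::fprintf(stderr, "Not Implemented\n");  // plays the role of LOG(FATAL)
      std::abort();
    }

    template <>
    void Square<float, Cpp>(int count, const float* in, float* out) {
      for (int i = 0; i < count; i++) out[i] = in[i] * in[i];
    }

    int main() {
      float in[3] = {1, 2, 3}, out[3];
      Square<float, Cpp>(3, in, out);   // resolved at compile time
      std::printf("%.0f %.0f %.0f\n", out[0], out[1], out[2]);  // 1 4 9
      return 0;
    }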
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index a953085..9e7ed30 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -40,6 +40,35 @@ void Add<float, lib::Cpp>(int count,
   }
 }
 
+template <>
+void Bernoulli<float, lib::Cpp>(int count, float p, Blob* ret,
+                                 Context* ctx) {
+  std::bernoulli_distribution distribution(p);
+  float* ptr = static_cast<float*>(ret->mutable_data());
+  for (int i = 0; i < count; i++) {
+    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
+}
+
+template <>
+void Uniform<float, lib::Cpp>(int count, float low, float high, Blob* ret,
+                               Context* ctx) {
+  std::uniform_real_distribution<float> distribution(low, high);
+  float* ptr = static_cast<float*>(ret->mutable_data());
+  for (int i = 0; i < count; i++) {
+    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
+}
+
+template <>
+void Gaussian<float, lib::Cpp>(int count, float mean, float std, Blob* ret,
+                              Context* ctx) {
+  std::normal_distribution<float> distribution(mean, std);
+  float* ptr = static_cast<float*>(ret->mutable_data());
+  for (int i = 0; i < count; i++) {
+    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
+}
 #ifdef USE_CBLAS
 template<>
 void Dot<float, lib::Cpp>(int count,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index e1c72d8..c5ea3c4 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -28,24 +28,16 @@ namespace singa {
 template<>
 void Add<float, lib::Cuda>(int count, const Blob* lhs, const Blob* rhs,
                         Blob* ret, Context* ctx) {
-  cublasSetStream(ctx->handle, ctx->stream);
-  cublasScopy(ctx->handle, count, lhs->data(), 1, ret->mutable_data(), 1);
-  cublasSaxpy(ctx->handle, 1.0f, rhs->data(), 1, ret->mutable_data(), 1);
+  /*
+  cublasSetStream(ctx->cublas_handle, ctx->stream);
+  const float* lptr = static_cast<const float*>(lhs->data());
+  const float* rptr = static_cast<const float*>(rhs->data());
+  float* ptr = static_cast<float*>(ret->mutable_data());
+  cublasScopy(ctx->cublas_handle, count, lptr, 1, ptr, 1);
+  cublasSaxpy(ctx->cublas_handle, 1.0f, rptr, 1, ptr, 1);
+  */
 }
 
-#ifdef USE_CUDNN
-template<>
-void Conv<float, lib::Cudnn>(const OpConf *conf,
-          const Blob* input,
-          const Blob* W,
-          const Blob* b,
-          Blob* ret,
-          Context* ctx) {
-  // auto conv_conf = conf->CastTo<ConvConf>();
-  // conv op
-}
-
-#endif
 #endif
 }
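
If the commented block above is revived, note that the cuBLAS v2 API takes
the element count and a pointer to alpha; a sketch of the intended calls
(assuming ctx->cublas_handle is a cublasHandle_t):

    // const float alpha = 1.0f;
    // cublasScopy(ctx->cublas_handle, count, lptr, 1, ptr, 1);
    // cublasSaxpy(ctx->cublas_handle, count, &alpha, rptr, 1, ptr, 1);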
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/model/layer/conv.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/conv.cc b/src/model/layer/conv.cc
deleted file mode 100644
index d1a7d2c..0000000
--- a/src/model/layer/conv.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-namespace singa {
-
-
-
-
-
-
-}  /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
new file mode 100644
index 0000000..926ccb9
--- /dev/null
+++ b/src/model/layer/cudnn_dropout.cc
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef USE_CUDNN
+// cudnn dropout is added in cudnn 5
+//#if CUDNN_MAJOR_VERSION >= 5
+#include "./cudnn_utils.h"
+#include "./cudnn_dropout.h"
+#include "singa/utils/logging.h"
+namespace singa {
+CudnnDropout::~CudnnDropout() {
+  if (drop_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyDropoutDescriptor(drop_desc_));
+  if (x_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+void CudnnDropout::InitCudnn(int size, DataType dtype, Context* ctx) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  CUDNN_CHECK(cudnnCreateDropoutDescriptor(&drop_desc_));
+
+  int dim[] = {size};
+  int stride[] = {1};
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(x_desc_, GetCudnnDataType(dtype), 1,
+      dim, stride));
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(y_desc_, GetCudnnDataType(dtype), 1,
+      dim, stride));
+
+  cudnnDropoutGetStatesSize(ctx->cudnn_handle, &state_size_);
+  cudnnDropoutGetReserveSpaceSize(x_desc_, &reserve_size_);
+  cudnnSetDropoutDescriptor(drop_desc_, ctx->cudnn_handle, dropout_ratio_,
+    state_.blob()->mutable_data(),
+    state_size_, ctx->seed);
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnDropout::Forward(int flag, const Tensor& input) {
+  if (flag & kTrain) {
+    auto size = input.Size();
+    DataType dtype = input.data_type();
+    if (!has_init_cudnn_) {
+      input.device()->Exec(
+          [size, dtype, this](Context* ctx) {
+          this->InitCudnn(size, dtype, ctx);
+          },
+          {}, {state_.blob()});
+      mask_.ResetLike(input);
+      CHECK_EQ(reserve_size_, mask_.MemSize());
+    }
+    Tensor out;
+    out.ResetLike(input);
+    Blob *inblob = input.blob(), *outblob = out.blob(), *mblob = mask_.blob();
+    out.device()->Exec(
+        [inblob, outblob, mblob, this](Context* ctx) {
+        // cuDNN expects the raw reserve-space buffer, not the Blob wrapper
+        cudnnDropoutForward(
+            ctx->cudnn_handle, this->drop_desc_, this->x_desc_, inblob->data(),
+            this->y_desc_, outblob->mutable_data(), mblob->mutable_data(),
+            this->reserve_size_);
+        },
+        {inblob}, {mblob, outblob});
+    return out;
+  } else {
+    return input;
+  }
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnDropout::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor dx;
+  if (flag & kTrain) {
+    dx.ResetLike(grad);
+    Blob *dyblob = grad.blob(), *dxblob = dx.blob(), *mblob = mask_.blob();
+    dx.device()->Exec(
+        [dyblob, dxblob, mblob, this](Context* ctx) {
+        cudnnDropoutBackward(ctx->cudnn_handle, this->drop_desc_,
+            this->y_desc_, dyblob->data(), this->x_desc_,
+            dxblob->mutable_data(), mblob->mutable_data(),
+            this->reserve_size_);
+        },
+        {dyblob, mblob}, {dxblob});
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+//#endif  // CUDNN_VERSION_MAJOR>=5
+#endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
new file mode 100644
index 0000000..0a19214
--- /dev/null
+++ b/src/model/layer/cudnn_dropout.h
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_
+#define SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_
+#ifdef USE_CUDNN
+// cudnn dropout is added in cudnn 5
+//#if CUDNN_MAJOR_VERSION >= 5
+
+#include "singa/model/layer.h"
+#include "singa/core/common.h"
+#include "singa/proto/core.pb.h"
+#include "./dropout.h"
+
+namespace singa {
+class CudnnDropout : public Dropout {
+ public:
+  ~CudnnDropout();
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "CudnnDropout"; }
+
+  const Tensor Forward(int flag, const Tensor& input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(
+      int flag, const Tensor& grad) override;
+
+  /// Init cudnn related data structures.
+  void InitCudnn(int size, DataType dtype, Context* ctx);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnDropoutDescriptor_t drop_desc_;
+  cudnnTensorDescriptor_t x_desc_, y_desc_;
+  size_t state_size_, reserve_size_;
+  Tensor state_;
+};
+}  // namespace
+//#endif  // CUDNN_VERSION_MAJOR>=5
+#endif  // USE_CUDNN
+#endif // SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/model/layer/cudnn_utils.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_utils.h b/src/model/layer/cudnn_utils.h
new file mode 100644
index 0000000..735ec13
--- /dev/null
+++ b/src/model/layer/cudnn_utils.h
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_CUDNN_BASE_H_
+#define SINGA_MODEL_LAYER_CUDNN_BASE_H_
+#ifdef USE_CUDNN
+#include "singa/proto/core.pb.h"
+#include "singa/utils/logging.h"
+#include <cudnn.h>
+namespace singa {
+inline cudnnDataType_t GetCudnnDataType(DataType dtype) {
+  cudnnDataType_t ret;
+  switch (dtype) {
+    case kFloat32:
+      ret = CUDNN_DATA_FLOAT;
+      break;
+    case kDouble:
+      ret = CUDNN_DATA_DOUBLE;
+      break;
+    case kFloat16:
+      ret = CUDNN_DATA_HALF;
+      break;
+    default:
+      LOG(FATAL) << "The data type " << DataType_Name(dtype)
+                 << " is not supported by cudnn";
+  }
+  return ret;
+}
+
+#define CUDNN_CHECK(condition) \
+  do { \
+    cudnnStatus_t status = condition; \
+    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " "\
+      << cudnnGetErrorString(status); \
+  } while (0)
+
+/*
+inline const char* cudnnGetErrorString(cudnnStatus_t status) {
+  switch (status) {
+    case CUDNN_STATUS_SUCCESS:
+      return "CUDNN_STATUS_SUCCESS";
+    case CUDNN_STATUS_NOT_INITIALIZED:
+      return "CUDNN_STATUS_NOT_INITIALIZED";
+    case CUDNN_STATUS_ALLOC_FAILED:
+      return "CUDNN_STATUS_ALLOC_FAILED";
+    case CUDNN_STATUS_BAD_PARAM:
+      return "CUDNN_STATUS_BAD_PARAM";
+    case CUDNN_STATUS_INTERNAL_ERROR:
+      return "CUDNN_STATUS_INTERNAL_ERROR";
+    case CUDNN_STATUS_INVALID_VALUE:
+      return "CUDNN_STATUS_INVALID_VALUE";
+    case CUDNN_STATUS_ARCH_MISMATCH:
+      return "CUDNN_STATUS_ARCH_MISMATCH";
+    case CUDNN_STATUS_MAPPING_ERROR:
+      return "CUDNN_STATUS_MAPPING_ERROR";
+    case CUDNN_STATUS_EXECUTION_FAILED:
+      return "CUDNN_STATUS_EXECUTION_FAILED";
+    case CUDNN_STATUS_NOT_SUPPORTED:
+      return "CUDNN_STATUS_NOT_SUPPORTED";
+    case CUDNN_STATUS_LICENSE_ERROR:
+      return "CUDNN_STATUS_LICENSE_ERROR";
+  }
+  return "Unknown cudnn status";
+}
+*/
+
+}  // namespace singa
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_BASE_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/model/layer/dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.cc b/src/model/layer/dropout.cc
new file mode 100644
index 0000000..f0fe25b
--- /dev/null
+++ b/src/model/layer/dropout.cc
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./dropout.h"
+namespace singa {
+
+void Dropout::Setup(const LayerConf& conf) {
+  Layer::Setup(conf);
+  dropout_ratio_ = conf.dropout_conf().dropout_ratio();
+}
+
+const Tensor Dropout::Forward(int flag, const Tensor& input) {
+  Tensor out;
+  if (flag & kTrain) {
+    mask_.ResetLike(input);
+    // set mask_[i] = 1 with probability 1 - dropout_ratio_
+    Bernoulli(1 - dropout_ratio_, &mask_);
+    mask_ *= 1.0f / (1.0f - dropout_ratio_);
+    out = input * mask_;
+  } else {
+    out = input;
+  }
+  return out;
+}
+
+const std::pair<Tensor, vector<Tensor>> Dropout::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor input_grad;
+  if (flag & kTrain) {
+    // note mask is already scaled by 1/(1-dropout_ratio_)
+    input_grad = grad * mask_;
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  return std::make_pair(input_grad, param_grad);
+}
+
+void Dropout::ToDevice(Device* device) {
+  Layer::ToDevice(device);
+  mask_.ToDevice(device);
+}
+
+}  // namespace singa
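
The Forward above implements inverted dropout: each mask element is 1 with
probability 1 - dropout_ratio_ (via Bernoulli) and is pre-scaled by
1 / (1 - dropout_ratio_), so the expected output equals the input and no
rescaling is needed at evaluation time. A standalone numeric check
(plain C++, illustrative only):

    #include <cstdio>
    #include <random>

    int main() {
      const float p = 0.25f;                      // dropout_ratio_
      std::mt19937 gen(42);
      std::bernoulli_distribution keep(1 - p);    // cf. Bernoulli(1 - p, &mask_)
      const int n = 1000000;
      double sum = 0;
      for (int i = 0; i < n; i++) {
        float mask = keep(gen) ? 1.0f / (1 - p) : 0.0f;  // inverted scaling
        sum += mask * 1.0f;                       // input value of 1.0
      }
      std::printf("mean = %f (expect ~1.0)\n", sum / n);
      return 0;
    }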

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/model/layer/dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.h b/src/model/layer/dropout.h
new file mode 100644
index 0000000..de349a5
--- /dev/null
+++ b/src/model/layer/dropout.h
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_DROPOUT_H_
+#define SINGA_MODEL_LAYER_DROPOUT_H_
+#include "singa/model/layer.h"
+namespace singa {
+class Dropout : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "Dropout"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf& conf) override;
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  /// if flag is kTrain, then do dropout with given dropout_ratio;
+  /// otherwise if it is kEval, copy input directly to the output
+  /// TODO(wangwei) There are different implementations, e.g., Caffe's vs
+  /// <a href="https://github.com/nitishsrivastava/deepnet/blob/master/deepnet/fastdropoutnet.py">fast dropout</a>
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  void ToDevice(Device* device) override;
+
+ protected:
+  /// the probability of setting each element to 0.
+  float dropout_ratio_;
+  Tensor mask_;
+};
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_DROPOUT_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/model/layer/layer.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/layer.cc b/src/model/layer/layer.cc
deleted file mode 100644
index 0e83cde..0000000
--- a/src/model/layer/layer.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "singa/model/layer.h"
-
-namespace singa {
-const vector<Tensor> ComputeFeature(int flag, const vector<Tensor>& input) {
-  const vector<Blob*> input_blobs;
-
-}
-
-void ComputeFeature(int flag, const vector<Tensor>& input) {
-  const vector<Blob*> input_blobs;
-
-}
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index c137186..f366ed0 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -26,7 +26,8 @@ enum DataType {
   kFloat16 = 1;
   kInt = 2;
   kChar = 3;
-  kNumDataType = 4;
+  kDouble = 4;
+  kNumDataType = 5;
 }
 
 enum LibType {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/src/proto/layer.proto
----------------------------------------------------------------------
diff --git a/src/proto/layer.proto b/src/proto/layer.proto
index 0fbbb5d..3d130ea 100644
--- a/src/proto/layer.proto
+++ b/src/proto/layer.proto
@@ -98,11 +98,15 @@ message ParamSpec {
   // The multiplier on the global weight decay for this parameter.
   optional float decay_mult = 4 [default = 1.0];
 
-  // SINGA field for creating diff Param, e.g. SparseParam or CompressableParam
-  // Curently only have a default param implementation.
-  optional string type = 20 [default = "default"];
+  // SINGA uses this field internally. Users just configure the fillers in
+  // the layer-specific conf message, following caffe's style.
+  optional FillerConf filler = 20;
 }
 
+enum Phase {
+  kTrain = 4;
+  kEval = 8;
+}
 // NOTE
 // Update the next available ID when you add a new LayerConf field.
 //
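
The Phase values are distinct powers of two, so a single int flag can be
tested with bitwise AND, as the dropout layers above do with 'flag & kTrain'.
A minimal C++ sketch (the enum values mirror this proto):

    #include <cstdio>

    enum Phase { kTrain = 4, kEval = 8 };

    void Run(int flag) {
      if (flag & kTrain) std::printf("training path\n");
      if (flag & kEval)  std::printf("evaluation path\n");
    }

    int main() {
      Run(kTrain);   // prints "training path"
      Run(kEval);    // prints "evaluation path"
      return 0;
    }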

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/test/singa/test_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dropout.cc b/test/singa/test_dropout.cc
new file mode 100644
index 0000000..cfe9d73
--- /dev/null
+++ b/test/singa/test_dropout.cc
@@ -0,0 +1,29 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "../src/model/layer/dropout.h"
+
+
+TEST(TestDropoutLayer, Setup) {
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/99e0d24d/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index 86200a8..ae20823 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -6,19 +6,19 @@ using singa::Device;
 
 TEST(TensorTest, TestConstructor) {
   singa::Tensor float_t(singa::Shape{2,3});
-  EXPECT_EQ(6, float_t.Size());
+  EXPECT_EQ(6u, float_t.Size());
   EXPECT_EQ(sizeof(float) * 6, float_t.MemSize());
   EXPECT_EQ(singa::kFloat32, float_t.data_type());
   auto s = float_t.shape();
-  EXPECT_EQ(s[0], 2);
-  EXPECT_EQ(s[1], 3);
+  EXPECT_EQ(s[0], 2u);
+  EXPECT_EQ(s[1], 3u);
 
   EXPECT_NE(float_t.device(), nullptr);
 
   singa::Tensor float16_t(Shape{2,3}, singa::kFloat16);
   EXPECT_EQ(singa::kFloat16, float16_t.data_type());
-  EXPECT_EQ(6, float16_t.Size());
-  EXPECT_EQ(12, float16_t.blob()->size());
+  EXPECT_EQ(6u, float16_t.Size());
+  EXPECT_EQ(12u, float16_t.blob()->size());
 
   singa::Tensor x(float16_t);
   EXPECT_EQ(float16_t.Size(), x.Size());
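
The new 'u' suffixes keep both EXPECT_EQ arguments unsigned now that Size()
and the shape entries are size_t, which silences -Wsign-compare warnings. A
minimal standalone illustration (hypothetical test, not part of this commit):

    #include <cstddef>
    #include "gtest/gtest.h"

    TEST(UnsignedLiteral, MatchesSizeT) {
      std::size_t n = 6;
      EXPECT_EQ(6u, n);   // unsigned literal, same signedness on both sides
    }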


[10/50] [abbrv] incubator-singa git commit: SINGA-177 Add fully cmake supporting for the compilation of singa_v1

Posted by zh...@apache.org.
SINGA-177 Add fully cmake supporting for the compilation of singa_v1

Implement the whole compilation system with cmake.
Fix the duplicated-compilation bugs.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/611554f8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/611554f8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/611554f8

Branch: refs/heads/master
Commit: 611554f88a9d321f706ab987d8fa81b8bd6d3733
Parents: 9d1bcb4
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Sun May 22 03:40:33 2016 +0800
Committer: xiezl <xi...@comp.nus.edu.sg>
Committed: Mon May 23 17:05:29 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                    | 35 ++++++++++++++++++-------------
 cmake/Cuda.cmake                  | 30 +++++++++++++--------------
 cmake/Dependencies.cmake          | 16 ++++++++++++++
 cmake/Templates/singa_config.h.in | 16 ++++++++++++++
 cmake/Thirdparty/FindCUDNN.cmake  | 38 +++++++++++++++++-----------------
 cmake/Thirdparty/FindLMDB.cmake   | 12 +++++++++++
 src/CMakeLists.txt                | 28 +++++++++++++++++--------
 test/CMakeLists.txt               |  5 ++---
 8 files changed, 120 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/611554f8/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d1a1e6..8cb42fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,9 +1,10 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+
 PROJECT(singa)
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++11")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 #message(STATUS "${CMAKE_CXX_FLAGS}")
 
-list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty)
+LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty)
 #message(STATUS "module path: ${CMAKE_MODULE_PATH}")
 
 # Flags
@@ -11,22 +12,28 @@ IF(UNIX OR APPLE)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
 ENDIF()
 
-INCLUDE(cmake/ProtoBuf.cmake)
-INCLUDE(cmake/Cuda.cmake)
-# Includes
-SET(SINGA_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
-include_directories(${SINGA_INCLUDE_DIR})
-#INCLUDE_DIRECTORIES(${singa_include_dir} ${PROJECT_BINARY_DIR})
-#INCLUDE_DIRECTORIES("/home/wangwei/local/cudnn5/include" "/usr/local/cuda/include")
+SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
+#message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
+INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
 
+#OPTION(CPU_ONLY "use GPU libs" OFF)
+OPTION(USE_CUDA "Use Cuda libs" ON)
+OPTION(USE_CUDNN "Use Cudnn libs" ON)
+OPTION(USE_OPENCV "Use opencv" OFF)
+OPTION(USE_LMDB "Use LMDB libs" OFF)
 
-SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
-SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
-#SET(singa_linker_lib cudnn)
-#LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH} "/home/wangwei/local/cudnn5/lib64/")
+INCLUDE("cmake/Dependencies.cmake")
+ADD_DEFINITIONS(-DUSE_CMAKE)
 
-#message(STATUS "include_dir: ${SINGA_INCLUDE_DIR}")
+CONFIGURE_FILE (
+    "${PROJECT_SOURCE_DIR}/cmake/Templates/singa_config.h.in"
+    "${PROJECT_BINARY_DIR}/singa_config.h")
 
+#set(SINGA_CONFIGURE_SRC "${PROJECT_BINARY_DIR}/singa_config.h")
+#LIST(APPEND SRCS ${SINGA_CONFIGURE_SRCS} ${PROJECT_BINARY_DIR}/singa_config.h)
+
+SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
+SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(test)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/611554f8/cmake/Cuda.cmake
----------------------------------------------------------------------
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index a9ddcb0..19d4e27 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -1,24 +1,24 @@
 
-find_package(CUDA 5.5 QUIET)
+FIND_PACKAGE(CUDA 5.5 QUIET)
 
-if(NOT CUDA_FOUND)
+IF(NOT CUDA_FOUND)
     return()
-endif()
+ENDIF()
 
-set(HAVE_CUDA TRUE)
-message(STATUS "Found cuda_v${CUDA_VERSION}")
-add_definitions(-DUSE_CUDA)
+SET(HAVE_CUDA TRUE)
+MESSAGE(STATUS "Found cuda_v${CUDA_VERSION}")
+#ADD_DEFINITIONS(-DUSE_CUDA)
 #message(STATUS "linking: ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}")
 
 
-#if(USE_CUDNN)
+IF(USE_CUDNN)
 #include(cmake/Modules/Cudnn.cmake)
-    find_package(CUDNN REQUIRED)
-    include_directories(SYSTEM ${CUDNN_INCLUDE_DIR})
-    list(APPEND SINGA_LINKER_LIBS ${CUDNN_LIBRARIES})
-    add_definitions(-DUSE_CUDNN)
-    add_definitions(-DCUDNN_VERSION_MAJOR=${CUDNN_VERSION_MAJOR})
-#endif()
+    FIND_PACKAGE(CUDNN REQUIRED)
+    INCLUDE_DIRECTORIES(SYSTEM ${CUDNN_INCLUDE_DIR})
+    LIST(APPEND SINGA_LINKER_LIBS ${CUDNN_LIBRARIES})
+    #ADD_DEFINITIONS(-DUSE_CUDNN)
+    #ADD_DEFINITIONS(-DCUDNN_VERSION_MAJOR=${CUDNN_VERSION_MAJOR})
+ENDIF()
 
-include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
-list(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
+INCLUDE_DIRECTORIES(SYSTEM ${CUDA_INCLUDE_DIRS})
+LIST(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/611554f8/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
new file mode 100644
index 0000000..ae28073
--- /dev/null
+++ b/cmake/Dependencies.cmake
@@ -0,0 +1,16 @@
+SET(Singa_LINKER_LIBS "")
+
+INCLUDE("cmake/ProtoBuf.cmake")
+
+IF(USE_LMDB)
+    FIND_PACKAGE(LMDB REQUIRED)
+    INCLUDE_DIRECTORIES(SYSTEM ${LMDB_INCLUDE_DIR})
+    LIST(APPEND Singa_LINKER_LIBS ${LMDB_LIBRARIES})
+ENDIF()
+
+IF(NOT CPU_ONLY)
+    INCLUDE("cmake/Cuda.cmake")
+ELSE()
+    SET(USE_CUDA FALSE)
+    SET(USE_CUDNN FALSE)
+ENDIF()

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/611554f8/cmake/Templates/singa_config.h.in
----------------------------------------------------------------------
diff --git a/cmake/Templates/singa_config.h.in b/cmake/Templates/singa_config.h.in
new file mode 100644
index 0000000..e0f7328
--- /dev/null
+++ b/cmake/Templates/singa_config.h.in
@@ -0,0 +1,16 @@
+// Source directory
+#define SOURCE_FOLDER "${PROJECT_SOURCE_DIR}"
+
+// Binaries directory
+#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"
+
+#cmakedefine CPU_ONLY 
+// cuda
+#cmakedefine USE_CUDA
+
+#cmakedefine USE_CUDNN
+#cmakedefine CUDNN_VERSION_MAJOR @CUDNN_VERSION_MAJOR@
+
+// lmdb
+#cmakedefine USE_LMDB
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/611554f8/cmake/Thirdparty/FindCUDNN.cmake
----------------------------------------------------------------------
diff --git a/cmake/Thirdparty/FindCUDNN.cmake b/cmake/Thirdparty/FindCUDNN.cmake
index faf98d9..eefab9d 100644
--- a/cmake/Thirdparty/FindCUDNN.cmake
+++ b/cmake/Thirdparty/FindCUDNN.cmake
@@ -1,33 +1,33 @@
 
-find_path(CUDNN_INCLUDE_DIR NAME "cudnn.h" PATHS "$ENV{CMAKE_INCLUDE_PATH}")
-find_library(CUDNN_LIBRARIES NAME "libcudnn.so" PATHS "$ENV{CMAKE_LIBRARY_PATH}")
+FIND_PATH(CUDNN_INCLUDE_DIR NAME "cudnn.h" PATHS "$ENV{CMAKE_INCLUDE_PATH}")
+FIND_LIBRARY(CUDNN_LIBRARIES NAME "libcudnn.so" PATHS "$ENV{CMAKE_LIBRARY_PATH}")
 
 #message("cudnn include path:${CUDNN_INCLUDE_DIR}  lib path: ${CUDNN_LIBRARIES}")
 #message("env include path:$ENV{CUDNN_DIR} next: $ENV{CMAKE_INCLUDE_PATH}")
-include(FindPackageHandleStandardArgs)
+INCLUDE(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(CUDNN DEFAULT_MSG CUDNN_INCLUDE_DIR CUDNN_LIBRARIES)
 
-if(CUDNN_FOUND)
-    file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
-    string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
+IF(CUDNN_FOUND)
+    FILE(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+    STRING(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
         CUDNN_VERSION_MAJOR "${CUDNN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
+    STRING(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
         CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
-    string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
+    STRING(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
         CUDNN_VERSION_MINOR "${CUDNN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
+    STRING(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
         CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
-    string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
+    STRING(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
         CUDNN_VERSION_PATCH "${CUDNN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
+    STRING(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
         CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
 
-    if(NOT CUDNN_VERSION_MAJOR)
-        set(CUDNN_VERSION "???")
-    else()
-        set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
-    endif()
-    message(STATUS "Found Cudnn_v${CUDNN_VERSION} at ${CUDNN_INCLUDE_DIR}")
-    mark_as_advanced(CUDNN_INCLUDE_DIR CUDNN_LIBRARIES)
+    IF(NOT CUDNN_VERSION_MAJOR)
+        SET(CUDNN_VERSION "???")
+    ELSE()
+        SET(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
+    ENDIF()
+    MESSAGE(STATUS "Found Cudnn_v${CUDNN_VERSION} at ${CUDNN_INCLUDE_DIR}")
+    MARK_AS_ADVANCED(CUDNN_INCLUDE_DIR CUDNN_LIBRARIES)
 
-endif()
+ENDIF()

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/611554f8/cmake/Thirdparty/FindLMDB.cmake
----------------------------------------------------------------------
diff --git a/cmake/Thirdparty/FindLMDB.cmake b/cmake/Thirdparty/FindLMDB.cmake
new file mode 100644
index 0000000..c402d99
--- /dev/null
+++ b/cmake/Thirdparty/FindLMDB.cmake
@@ -0,0 +1,12 @@
+
+FIND_PATH(LMDB_INCLUDE_DIR NAMES lmdb.h PATHS "$ENV{LMDB_DIR}/include")
+FIND_LIBRARY(LMDB_LIBRARIES NAMES lmdb PATHS "$ENV{LMDB_DIR}/lib")
+
+INCLUDE(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LMDB DEFAULT_MSG LMDB_INCLUDE_DIR LMDB_LIBRARIES)
+
+IF(LMDB_FOUND)
+    MESSAGE(STATUS "Found lmdb at $ENV{LMDB_DIR}")
+    MARK_AS_ADVANCED(LMDB_INCLUDE_DIR LMDB_LIBRARIES)
+    
+ENDIF()

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/611554f8/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e2e923e..39383bd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,26 +1,36 @@
 # generate protobuf sources
-file(GLOB proto_files proto/*.proto)
+FILE(GLOB proto_files proto/*.proto)
 singa_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files})
 # include python files to force their generation
-add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
-list(APPEND singa_linker_libs proto)
+ADD_LIBRARY(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
+LIST(APPEND singa_linker_libs proto)
 
-FILE(GLOB_RECURSE utils_source ${CMAKE_CURRENT_SOURCE_DIR}/utils/ "*.cc")
+#FILE(GLOB_RECURSE utils_source ${CMAKE_CURRENT_SOURCE_DIR}/utils/ "*.cc")
+AUX_SOURCE_DIRECTORY(utils utils_source)
+#message(STATUS "UTILS ${utils_source}")
 ADD_LIBRARY(singa_utils SHARED ${utils_source})
 TARGET_LINK_LIBRARIES(singa_utils ${singa_linker_libs})
-list(APPEND singa_linker_libs singa_utils)
+LIST(APPEND singa_linker_libs singa_utils)
 
 
-FILE(GLOB_RECURSE core_source ${CMAKE_CURRENT_SOURCE_DIR}/core/ "*.cc")
+#FILE(GLOB_RECURSE core_source ${CMAKE_CURRENT_SOURCE_DIR}/core/ "*.cc")
+AUX_SOURCE_DIRECTORY(core/device core_source)
+AUX_SOURCE_DIRECTORY(core/memory core_source)
+AUX_SOURCE_DIRECTORY(core/scheduler core_source)
+AUX_SOURCE_DIRECTORY(core/tensor core_source)
+#message(STATUS "CORE ${core_source}")
 ADD_LIBRARY(singa_core SHARED ${core_source})
 TARGET_LINK_LIBRARIES(singa_core ${singa_linker_libs})
-list(APPEND singa_linker_libs singa_core)
+LIST(APPEND singa_linker_libs singa_core)
 #MESSAGE(STATUS "link libs " ${singa_linker_libs})
 
-FILE(GLOB_RECURSE model_source ${CMAKE_CURRENT_SOURCE_DIR}/model/ "*.cc")
+#FILE(GLOB_RECURSE model_source ${CMAKE_CURRENT_SOURCE_DIR}/model/ "*.cc")
+AUX_SOURCE_DIRECTORY(model model_source)
+AUX_SOURCE_DIRECTORY(model/layer model_source)
+#MESSAGE(STATUS "MODEL ${model_source}")
 ADD_LIBRARY(singa_model SHARED ${model_source})
 TARGET_LINK_LIBRARIES(singa_model ${singa_linker_libs})
-list(APPEND singa_linker_libs singa_model)
+LIST(APPEND singa_linker_libs singa_model)
 
 #ADD_LIBRARY(singa_layer SHARED ${LAYER_SOURCE})
 #ADD_LIBRARY(singa_model SHARED ${MODEL_SOURCE})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/611554f8/test/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index de64abd..a8b0e29 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -3,9 +3,8 @@ ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" "gtest/gtest-all.cc")
 
 AUX_SOURCE_DIRECTORY(singa singa_test_source)
 
-ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source})
+ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source}) 
 ADD_DEPENDENCIES(test_singa singa_core singa_utils)
 MESSAGE(STATUS "link libs" ${singa_linker_libs})
-TARGET_LINK_LIBRARIES(test_singa gtest singa_core singa_utils proto protobuf
-    ${SINGA_LINKER_LIBS})
+TARGET_LINK_LIBRARIES(test_singa gtest singa_core singa_utils singa_model proto protobuf ${SINGA_LINKER_LIBS})
 SET_TARGET_PROPERTIES(test_singa PROPERTIES LINK_FLAGS "${LINK_FLAGS} -pthread")


[25/50] [abbrv] incubator-singa git commit: SINGA-188 Add Dense layer

Posted by zh...@apache.org.
SINGA-188 Add Dense layer

Add implementation for dense layer

Add test files for dense layer, covering both the cpp and cuda versions

Pass all tests


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/73d4a34b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/73d4a34b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/73d4a34b

Branch: refs/heads/master
Commit: 73d4a34b6aea80da77b42f234c26244aa98d1d93
Parents: 3e2507b
Author: zhaojing <zh...@comp.nus.edu.sg>
Authored: Tue May 31 22:23:17 2016 +0800
Committer: zhaojing <zh...@comp.nus.edu.sg>
Committed: Thu Jun 2 11:17:24 2016 +0800

----------------------------------------------------------------------
 src/model/layer/dense.cc |  85 +++++++++++++++
 src/model/layer/dense.h  |  69 ++++++++++++
 src/proto/model.proto    |  15 +++
 test/singa/test_dense.cc | 242 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 411 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/73d4a34b/src/model/layer/dense.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
new file mode 100644
index 0000000..ebee62a
--- /dev/null
+++ b/src/model/layer/dense.cc
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./dense.h"
+#include "singa/model/layer.h"
+#include <vector>
+
+namespace singa {
+using std::vector;
+
+Dense::~Dense() {
+  // delete weight_;
+  // delete bias_;
+}
+void Dense::Setup(const LayerConf &conf) {
+  Layer::Setup(conf);
+  DenseConf dense_conf = conf.dense_conf();
+  hdim_ = dense_conf.num_output();
+  vdim_ = dense_conf.num_input();
+  transpose_ = dense_conf.transpose();
+  if (transpose_)
+    weight_.Reshape(Shape{vdim_, hdim_});
+  else
+    weight_.Reshape(Shape{hdim_, vdim_});
+  bias_.Reshape(Shape{hdim_});
+  param_values_.push_back(&weight_);
+  param_values_.push_back(&bias_);
+}
+
+/// \copydoc Layer::Forward(int flag, const Tensor&)
+const Tensor Dense::Forward(int flag, const Tensor &input) {
+  Tensor output;
+  if (transpose_)
+    output = Mult(input, weight_);
+  else
+    output = Mult(input, weight_.T());
+  AddRow(bias_, &output);
+  buf_.push(input);
+  return output;
+}
+
+/// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+const std::pair<Tensor, vector<Tensor>>
+Dense::Backward(int flag, const Tensor &grad) {
+  vector<Tensor> param_grad;
+  Tensor src_data = buf_.top();
+  buf_.pop();
+  Tensor db, dw, dx;
+  db.ResetLike(bias_);
+  dw.ResetLike(weight_);
+  dx.ResetLike(src_data);
+  SumRows(grad, &db);
+  if (transpose_){
+    dx = Mult(grad, weight_.T()); 
+    dw = Mult(src_data.T(), grad);
+  }
+  else{
+    dx = Mult(grad, weight_);
+    dw = Mult(grad.T(), src_data);
+  }
+  param_grad.push_back(dw);
+  param_grad.push_back(db);
+  return std::make_pair(dx, param_grad);
+}
+
+void Dense::ToDevice(Device *device) { 
+  weight_.ToDevice(device);
+  bias_.ToDevice(device); 
+}
+} // namespace singa
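
For reference, a minimal framework-free sketch of the math that Forward/Backward above implement in the non-transposed case. Row-major float arrays; B, V, H stand for batchsize, vdim_ and hdim_; the function names and signatures are inventions of this illustration, not SINGA API:

#include <cstddef>

// Forward: y[b][j] = sum_k x[b][k] * W[j][k] + bias[j]   (y = x * W^T + b)
void dense_forward(const float* x, const float* W, const float* bias,
                   float* y, std::size_t B, std::size_t V, std::size_t H) {
  for (std::size_t b = 0; b < B; ++b)
    for (std::size_t j = 0; j < H; ++j) {
      float s = bias[j];
      for (std::size_t k = 0; k < V; ++k) s += x[b * V + k] * W[j * V + k];
      y[b * H + j] = s;
    }
}

// Backward: dx = dy * W, dW = dy^T * x, db = column sums of dy, matching
// Mult(grad, weight_), Mult(grad.T(), src_data) and SumRows(grad, &db) above.
void dense_backward(const float* x, const float* W, const float* dy,
                    float* dx, float* dW, float* db,
                    std::size_t B, std::size_t V, std::size_t H) {
  for (std::size_t j = 0; j < H; ++j) {
    db[j] = 0.0f;
    for (std::size_t b = 0; b < B; ++b) db[j] += dy[b * H + j];
  }
  for (std::size_t b = 0; b < B; ++b)
    for (std::size_t k = 0; k < V; ++k) {
      float s = 0.0f;
      for (std::size_t j = 0; j < H; ++j) s += dy[b * H + j] * W[j * V + k];
      dx[b * V + k] = s;
    }
  for (std::size_t j = 0; j < H; ++j)
    for (std::size_t k = 0; k < V; ++k) {
      float s = 0.0f;
      for (std::size_t b = 0; b < B; ++b) s += dy[b * H + j] * x[b * V + k];
      dW[j * V + k] = s;
    }
}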

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/73d4a34b/src/model/layer/dense.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.h b/src/model/layer/dense.h
new file mode 100644
index 0000000..d686a01
--- /dev/null
+++ b/src/model/layer/dense.h
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_DENSE_H_
+#define SRC_MODEL_LAYER_DENSE_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include <stack>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Dense : public Layer {
+ public:
+  ~Dense();
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "Dense"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf& conf) override;
+
+  void SetupParam(const Tensor& input);
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  void ToDevice(Device* device) override;
+  
+  size_t num_output() const { return hdim_; }
+  size_t num_input() const { return vdim_; }
+  bool transpose() const { return transpose_; }
+  const Tensor &weight() const { return weight_; }
+  const Tensor &bias() const { return bias_; }
+
+  void set_weight(Tensor w) {
+    weight_.ResetLike(w);
+    weight_.CopyData(w);
+  }
+  void set_bias(Tensor b) {
+    bias_.ResetLike(b);
+    bias_.CopyData(b);
+  }
+
+protected:
+  size_t batchsize_, vdim_, hdim_;
+  bool transpose_;
+  Tensor weight_, bias_;
+  // Tensor data_, grad_;
+  std::stack<Tensor> buf_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_DENSE_H_
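
Taken together with dense.cc above, a minimal CPU-side usage sketch; it mirrors the ForwardCpp test later in this commit, and the helper name and include path are assumptions of the illustration:

#include "../src/model/layer/dense.h"  // path as used in the tests below

// Configure a 2-in/3-out Dense layer and run one forward pass.
singa::Tensor DenseOnce(const singa::Tensor& x, const singa::Tensor& w,
                        const singa::Tensor& b) {
  singa::Dense dense;
  singa::LayerConf conf;
  singa::DenseConf* dc = conf.mutable_dense_conf();
  dc->set_num_input(2);      // vdim
  dc->set_num_output(3);     // hdim
  dc->set_transpose(false);  // weight stored as Shape{hdim, vdim}
  dense.Setup(conf);
  dense.set_weight(w);       // w: Shape{3, 2}
  dense.set_bias(b);         // b: Shape{3}
  return dense.Forward(singa::kTrain, x);  // x: Shape{batchsize, 2}
}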

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/73d4a34b/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 1b18703..75e2be7 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -208,6 +208,7 @@ message LayerConf {
   // optional ImageDataConf image_data_conf = 115;
   optional InfogainLossConf infogain_loss_conf = 116;
   optional InnerProductConf inner_product_conf = 117;
+  optional DenseConf dense_conf = 150;
   optional LogConf log_conf = 134;
   optional LRNConf lrn_conf = 118;
   // Used in SINGA
@@ -573,6 +574,20 @@ message InnerProductConf {
   optional int32 axis = 5 [default = 1];
 }
 
+message DenseConf {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional FillerConf weight_filler = 3; // The filler for the weight
+  optional FillerConf bias_filler = 4; // The filler for the bias
+
+  // The first axis to be lumped into a single inner product computation;
+  // all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 5 [default = 1];
+  optional uint32 num_input = 20; // The number of inputs for the layer
+  optional bool transpose = 21 [default = false]; // whether transpose or not
+}
+
 // Message that stores hyper-parameters used by LogLayer
 message LogConf {
   // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
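
For reference, the configuration that the tests below build through the generated C++ API can also be written in protobuf text format against the DenseConf message above (a sketch; the layer name is hypothetical):

name: "dense1"
type: "Dense"
dense_conf {
  num_input: 2
  num_output: 3
  transpose: false
}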

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/73d4a34b/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
new file mode 100644
index 0000000..6d136af
--- /dev/null
+++ b/test/singa/test_dense.cc
@@ -0,0 +1,242 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/dense.h"
+
+#include "gtest/gtest.h"
+
+using singa::Dense;
+TEST(Dense, Setup) {
+  Dense dense;
+  EXPECT_EQ("Dense", dense.layer_type());
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_input(2);
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(conf);
+
+  EXPECT_EQ(3, dense.num_output());
+  EXPECT_EQ(2, dense.num_input());
+}
+
+TEST(Dense, ForwardCpp) {
+  Dense dense;
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_input(2);
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(conf);
+  
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                      6.0f};
+  singa::Tensor in(singa::Shape{batchsize, vdim});
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+  
+  // set weight
+  const float we[hdim * vdim] = {
+      1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{hdim, vdim});
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+
+  const float bia[hdim] = {
+      1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim});
+  bias.CopyDataFromHostPtr(bia, hdim);
+ 
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+  
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  const float *outptr1 = out1.data<const float *>();
+  EXPECT_EQ(9, out1.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++)
+      EXPECT_FLOAT_EQ((x[i * 2 +  0] * we[j * 2 + 0] + x[i * 2 + 1] * we[j * 2 + 1] + bia[j]), outptr1[i * 3 + j]);
+}
+
+TEST(Dense, ForwardCuda) {
+  Dense dense;
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_input(2);
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(conf);
+  
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                      6.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+  
+  // set weight
+  const float we[hdim * vdim] = {
+      1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+
+  const float bia[hdim] = {
+      1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim}, &cuda);
+  bias.CopyDataFromHostPtr(bia, hdim);
+ 
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+  
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out1.ToDevice(&host);
+  const float *outptr1 = out1.data<const float *>();
+  EXPECT_EQ(9, out1.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++)
+      EXPECT_FLOAT_EQ((x[i * 2 +  0] * we[j * 2 + 0] + x[i * 2 + 1] * we[j * 2 + 1] + bia[j]), outptr1[i * 3 + j]);
+}
+
+TEST(Dense, BackwardCpp) {
+  Dense dense;
+  
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_input(2);
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(conf);
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                      6.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, vdim});
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+
+  // set weight
+  const float we[hdim * vdim] = {
+      1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{hdim, vdim});
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+  
+  const float bia[hdim] = {
+      1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim});
+  bias.CopyDataFromHostPtr(bia, hdim);
+  
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+
+  // grad
+  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 3.0f, 3.0f};
+  singa::Tensor grad(singa::Shape{batchsize, hdim});
+  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
+
+  const auto ret = dense.Backward(singa::kTrain, grad);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  singa::Tensor dweight = ret.second.at(0);
+  singa::Tensor dbias = ret.second.at(1);
+  const float *dx = in_grad.data<const float *>();
+  EXPECT_EQ(6, in_grad.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ((dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 +  j] + dy[i * 3 +  2] * we[2 * 2 + j]), dx[i * 2 + j]);
+  const float *dweightx = dweight.data<const float *>();
+  EXPECT_EQ(6, dweight.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ((dy[0 * 3 + i] * x[0 *2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] + dy[2 * 3 + i] * x[2 * 2 + j]), dweightx[i * 2 + j]);
+  const float *dbiasx = dbias.data<const float *>();
+  EXPECT_EQ(3, dbias.Size());
+  for (int i = 0; i < 3; i++)
+    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
+}
+
+TEST(Dense, BackwardCuda) {
+  Dense dense;
+  
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_input(2);
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(conf);
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                      6.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+
+  // set weight
+  const float we[hdim * vdim] = {
+      1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+  
+  const float bia[hdim] = {
+      1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim}, &cuda);
+  bias.CopyDataFromHostPtr(bia, hdim);
+  
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+
+  // grad
+  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 3.0f, 3.0f};
+  singa::Tensor grad(singa::Shape{batchsize, hdim}, &cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
+
+  const auto ret = dense.Backward(singa::kTrain, grad);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  singa::Tensor dweight = ret.second.at(0);
+  singa::Tensor dbias = ret.second.at(1);
+  in_grad.ToDevice(&host);
+  const float *dx = in_grad.data<const float *>();
+  EXPECT_EQ(6, in_grad.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ((dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 +  j] + dy[i * 3 +  2] * we[2 * 2 + j]), dx[i * 2 + j]);
+  dweight.ToDevice(&host);
+  const float *dweightx = dweight.data<const float *>();
+  EXPECT_EQ(6, dweight.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ((dy[0 * 3 + i] * x[0 *2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] + dy[2 * 3 + i] * x[2 * 2 + j]), dweightx[i * 2 + j]);
+  dbias.ToDevice(&host);
+  const float *dbiasx = dbias.data<const float *>();
+  EXPECT_EQ(3, dbias.Size());
+  for (int i = 0; i < 3; i++)
+    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
+}
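
As a cross-check of the expectation formula in the two Forward tests: with the x, we and bia values above, out[i][j] = x[i][0] * we[j][0] + x[i][1] * we[j][1] + bia[j], so the expected 3x3 output is

   4   6   3
   8  12   5
  12  18   7

e.g., out[0][1] = 1*1 + 2*2 + 1 = 6. Likewise, in the Backward tests each column of dy sums to 1 + 2 + 3, so dbias works out to (6, 6, 6).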


[04/50] [abbrv] incubator-singa git commit: SINGA-172 Add CMake supporting for Cuda and Cudnn libs

Posted by zh...@apache.org.
SINGA-172 Add CMake supporting for Cuda and Cudnn libs

Note: before compilation, set CMAKE_INCLUDE_PATH and CMAKE_LIBRARY_PATH as system environment variables so that CMake can locate the required headers and libraries.
Usage:
	mkdir build;
	cmake ..;
	make;
	./bin/test_singa;
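
For example, if cuDNN is installed under a custom prefix (the path below is a placeholder, not a SINGA default):

	export CMAKE_INCLUDE_PATH=/opt/cudnn5/include:$CMAKE_INCLUDE_PATH
	export CMAKE_LIBRARY_PATH=/opt/cudnn5/lib64:$CMAKE_LIBRARY_PATH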


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/3a872014
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/3a872014
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/3a872014

Branch: refs/heads/master
Commit: 3a872014d73c8883f4c63a59612cfb8ea0a7811f
Parents: c3a0558
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Tue May 17 23:56:04 2016 +0800
Committer: xiezl <xi...@comp.nus.edu.sg>
Committed: Tue May 17 23:56:04 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                   | 19 +++++++++++++------
 cmake/Cuda.cmake                 | 20 ++++++++++++++++++++
 cmake/Thirdparty/FindCUDNN.cmake | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3a872014/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dd92d03..8457bf2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,23 +2,30 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
 PROJECT(singa)
 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++11 -DUSE_CUDA -DUSE_CUDNN")
 
+list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty)
+#message(STATUS "module path: ${CMAKE_MODULE_PATH}")
+
 # Flags
 IF(UNIX OR APPLE)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
 ENDIF()
 
+INCLUDE(cmake/ProtoBuf.cmake)
+INCLUDE(cmake/Cuda.cmake)
 # Includes
-SET(singa_include_dir ${PROJECT_SOURCE_DIR}/include)
-INCLUDE_DIRECTORIES(${singa_include_dir} ${PROJECT_BINARY_DIR})
-INCLUDE_DIRECTORIES("/home/wangwei/local/cudnn5/include" "/usr/local/cuda/include")
+SET(SINGA_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
+include_directories(${SINGA_INCLUDE_DIR})
+#INCLUDE_DIRECTORIES(${singa_include_dir} ${PROJECT_BINARY_DIR})
+#INCLUDE_DIRECTORIES("/home/wangwei/local/cudnn5/include" "/usr/local/cuda/include")
 
 
 SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
-SET(singa_linker_lib cudnn)
-LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH} "/home/wangwei/local/cudnn5/lib64/")
+#SET(singa_linker_lib cudnn)
+#LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH} "/home/wangwei/local/cudnn5/lib64/")
+
+#message(STATUS "include_dir: ${SINGA_INCLUDE_DIR}")
 
-INCLUDE(cmake/ProtoBuf.cmake)
 
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(test)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3a872014/cmake/Cuda.cmake
----------------------------------------------------------------------
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
new file mode 100644
index 0000000..e3338af
--- /dev/null
+++ b/cmake/Cuda.cmake
@@ -0,0 +1,20 @@
+
+find_package(CUDA 5.5 QUIET)
+
+if(NOT CUDA_FOUND)
+    return()
+endif()
+
+set(HAVE_CUDA TRUE)
+message(STATUS "Found cuda_v${CUDA_VERSION}")
+include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
+list(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
+
+#if(USE_CUDNN)
+#include(cmake/Modules/Cudnn.cmake)
+    find_package(CUDNN REQUIRED)
+    include_directories(SYSTEM ${CUDNN_INCLUDE_DIR})
+    list(APPEND SINGA_LINKER_LIBS ${CUDNN_LIBRARIES})
+    add_definitions(-DUSE_CUDNN)
+#endif()
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3a872014/cmake/Thirdparty/FindCUDNN.cmake
----------------------------------------------------------------------
diff --git a/cmake/Thirdparty/FindCUDNN.cmake b/cmake/Thirdparty/FindCUDNN.cmake
new file mode 100644
index 0000000..faf98d9
--- /dev/null
+++ b/cmake/Thirdparty/FindCUDNN.cmake
@@ -0,0 +1,33 @@
+
+find_path(CUDNN_INCLUDE_DIR NAME "cudnn.h" PATHS "$ENV{CMAKE_INCLUDE_PATH}")
+find_library(CUDNN_LIBRARIES NAME "libcudnn.so" PATHS "$ENV{CMAKE_LIBRARY_PATH}")
+
+#message("cudnn include path:${CUDNN_INCLUDE_DIR}  lib path: ${CUDNN_LIBRARIES}")
+#message("env include path:$ENV{CUDNN_DIR} next: $ENV{CMAKE_INCLUDE_PATH}")
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CUDNN DEFAULT_MSG CUDNN_INCLUDE_DIR CUDNN_LIBRARIES)
+
+if(CUDNN_FOUND)
+    file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
+        CUDNN_VERSION_MAJOR "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
+        CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
+    string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
+        CUDNN_VERSION_MINOR "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
+        CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
+    string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
+        CUDNN_VERSION_PATCH "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
+        CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
+
+    if(NOT CUDNN_VERSION_MAJOR)
+        set(CUDNN_VERSION "???")
+    else()
+        set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
+    endif()
+    message(STATUS "Found Cudnn_v${CUDNN_VERSION} at ${CUDNN_INCLUDE_DIR}")
+    mark_as_advanced(CUDNN_INCLUDE_DIR CUDNN_LIBRARIES)
+
+endif()
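
For reference, the regexes above extract the version from macros of this form in cudnn.h (the values shown are illustrative, for a cuDNN v5 install):

#define CUDNN_MAJOR      5
#define CUDNN_MINOR      0
#define CUDNN_PATCHLEVEL 5

which yields CUDNN_VERSION "5.0.5" and the status line "Found Cudnn_v5.0.5 at <include dir>".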


[05/50] [abbrv] incubator-singa git commit: SINGA-170 Add Dropout layer and CudnnDropout layer

Posted by zh...@apache.org.
SINGA-170 Add Dropout layer and CudnnDropout layer

Checked code format via cpplint.py.
Tested compilation and linking against cudnn.
Note: if multiple CUDA versions are installed, please point CUDA_BIN_PATH
to the desired CUDA installation (e.g., /usr/local/cuda-7.5) before running `cmake ..`.
You also need to set CMAKE_INCLUDE_PATH and CMAKE_LIBRARY_PATH for cudnn.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/b4918753
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/b4918753
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/b4918753

Branch: refs/heads/master
Commit: b4918753cfee52a5ef537e453c953b4c384044d2
Parents: 3a87201
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Wed May 18 12:03:36 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Wed May 18 12:03:36 2016 +0800

----------------------------------------------------------------------
 include/singa/core/common.h     |  4 ++--
 include/singa/core/device.h     |  1 -
 include/singa/core/tensor.h     | 10 +++++-----
 src/core/tensor/tensor.cc       | 32 ++++++++++++++++----------------
 src/model/layer/cudnn_dropout.h | 10 ++++++----
 src/model/layer/cudnn_utils.h   |  6 +++---
 src/model/layer/dropout.h       | 10 +++++++---
 src/model/layer/rnn.h           | 13 ++++++++++---
 8 files changed, 49 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4918753/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 4d783fb..2f5b167 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -24,7 +24,7 @@
 
 #ifdef USE_CUDA
 #include <cuda_runtime.h>
-#include "cublas_v2.h"
+#include <cublas_v2.h>
 #ifdef USE_CUDNN
 #include <cudnn.h>
 #endif
@@ -40,7 +40,7 @@ typedef struct _Cuda { } Cuda;
 typedef struct _Cudnn { } Cudnn;
 /// To implement function using opencl libraries
 typedef struct _Opencl { } Opencl;
-}  // namespace lib;
+}  // namespace lib
 
 typedef unsigned char Byte;
 /// Blob represents a chunk of memory (on device or host) managed by VirtualMemory.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4918753/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index b96efca..9022041 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -130,7 +130,6 @@ class CppDevice : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
-
 };
 
 /// a singleton CppDevice as the host for all devices.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4918753/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 6c20c4f..88a895b 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -65,8 +65,8 @@ class Tensor {
  public:
   ~Tensor();
   Tensor();
-  Tensor(Shape&& shape, DataType dtype = kFloat32);
-  Tensor(const Shape& shape, DataType dtype = kFloat32);
+  explicit Tensor(Shape&& shape, DataType dtype = kFloat32);
+  explicit Tensor(const Shape& shape, DataType dtype = kFloat32);
   Tensor(Shape&& shape, Device* dev, DataType dtype = kFloat32);
   Tensor(const Shape& shape, Device* dev, DataType dtype = kFloat32);
 
@@ -278,7 +278,7 @@ Tensor operator/(const Tensor& t, DType x);
 template <typename DType>
 void Div(const Tensor& t, DType x, Tensor* ret);
 
-//================Blas operations============================================
+// ================Blas operations============================================
 // ===== Level 1
 // TODO(wangwei) make amax/amin/asum a member function of tensor
 // void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
@@ -308,7 +308,7 @@ void Mult(DType alpha, const Tensor& lhs, DType beta, const Tensor& rhs,
 
 // template<typename DType> T Dot(const Tensor& lhs, const Tensor& rhs);
 
-//================Random operations==========================================
+// ================Random operations==========================================
 /// For each element x set x = 1 if random() < p; otherwise x = 0.
 void Bernoulli(float p, Tensor* t);
 /// Fill in Tensor 't' following uniform distribution.
@@ -316,7 +316,7 @@ void Uniform(float low, float high, Tensor* t);
 /// Fill in Tensor 't' following Gaussian distribution.
 void Gaussian(float mean, float std, Tensor* t);
 
-//================Neural Net operations======================================
+// ================Neural Net operations======================================
 /* following API of cudnn, e.g., conv, pool, lrn, batchnorm, softmax
 void ConvFwd(const ConvConf& conf, const Tensor& x, const Tensor& w, Tensor* y);
 void ConvBwdBias(const ConvConf& conf, const Tensor& dy, Tensor* db);
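
For intuition, here is what the Dropout layer computes, written out over raw float arrays; a toy sketch, not the SINGA kernel, and whether SINGA rescales by 1/(1-p) at training time (inverted dropout) is an assumption of this illustration:

#include <cstddef>
#include <cstdlib>

// Drop each unit with probability p; rescale kept units so E[y] = x.
void dropout_forward(const float* x, float* y, float* mask,
                     std::size_t n, float p /* dropout ratio */) {
  const float scale = 1.0f / (1.0f - p);
  for (std::size_t i = 0; i < n; ++i) {
    const float r = static_cast<float>(std::rand()) / RAND_MAX;
    mask[i] = (r < p) ? 0.0f : scale;  // Bernoulli(1 - p) mask, rescaled
    y[i] = x[i] * mask[i];             // backward would be dx = dy * mask
  }
}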

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4918753/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index cd62a38..0e5570d 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -381,7 +381,7 @@ GenBinaryTensorFunction(Pow, Pow);
 #define EltwiseTensorScalarFn(fn, t, x, ret)                               \
   do {                                                                     \
     TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, { \
-      static_assert(std::is_same<SType, DType>::value,                             \
+      static_assert(std::is_same<SType, DType>::value,                     \
                     "The Scalar type must match the Tensor data type");    \
       ret->device()->Exec(                                                 \
           [t, x, ret](Context* ctx) {                                      \
@@ -436,8 +436,8 @@ template Tensor Mult<float>(float alpha, const Tensor& lhs, float beta,
     const Tensor& rhs);
 
 template <typename SType>
-void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B, Tensor* C)
-{
+void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B,
+          Tensor* C) {
   CHECK_EQ(A.shape().size(), 2u);
   bool transA = A.transpose();
   size_t m = transA ? A.shape()[1] : A.shape()[0], n = 0;
@@ -445,14 +445,14 @@ void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B, Tensor* C)
     n = C->Size();
     TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
       static_assert(std::is_same<SType, DType>::value,
-        "The scalar type must be the same as the tensor data type");
+                    "The scalar type must be the same as the tensor data type");
       C->device()->Exec(
-        [transA, m, n, alpha, A, beta, B, C](Context* ctx) {
-        GEMV<DType, Lib>(transA, m, n, alpha, A.blob(),
-          B.blob(), beta, C->blob(), ctx);
-        },
-        {A.blob(), B.blob()}, {C->blob()});
-      });
+          [transA, m, n, alpha, A, beta, B, C](Context* ctx) {
+            GEMV<DType, Lib>(transA, m, n, alpha, A.blob(), B.blob(), beta,
+                             C->blob(), ctx);
+          },
+          {A.blob(), B.blob()}, {C->blob()});
+    });
   } else {
     CHECK(!C->transpose());
     bool transB = B.transpose();
@@ -462,15 +462,15 @@ void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B, Tensor* C)
     CHECK_EQ(A.Size(), m * k);
     CHECK_EQ(B.Size(), n * k);
     TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
-        static_assert(std::is_same<SType, DType>::value,
-          "The scalar type must be the same as the tensor data type");
-        C->device()->Exec(
+      static_assert(std::is_same<SType, DType>::value,
+                    "The scalar type must be the same as the tensor data type");
+      C->device()->Exec(
           [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) {
-          GEMM<DType, Lib>(transA, transB, m, n, k, alpha, A.blob(),
-            B.blob(), beta, C->blob(), ctx);
+            GEMM<DType, Lib>(transA, transB, m, n, k, alpha, A.blob(), B.blob(),
+                             beta, C->blob(), ctx);
           },
           {A.blob(), B.blob()}, {C->blob()});
-        });
+    });
   }
 }
 template void Mult<float>(float alpha, const Tensor& lhs, float beta,
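
The unified Mult dispatches on the shape of B: a vector operand takes the GEMV branch, a matrix operand the GEMM branch. A sketch of the two call shapes, assuming the two-argument Mult wrapper declared in tensor.h; the output shapes are inferred from the CHECKs visible in this fragment:

singa::Tensor A(singa::Shape{4, 3});  // m = 4, k = 3
singa::Tensor v(singa::Shape{3});     // vector operand -> GEMV
singa::Tensor B(singa::Shape{3, 5});  // matrix operand -> GEMM
singa::Tensor y = singa::Mult(A, v);  // y: Shape{4}
singa::Tensor C = singa::Mult(A, B);  // C: Shape{4, 5}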

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4918753/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index d2b68b9..db0aa15 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -16,12 +16,14 @@
  * limitations under the License.
  */
 
-#ifndef SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_
-#define SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_
+#ifndef SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
+#define SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
 #if CUDNN_MAJOR_VERSION >= 5
-
+#include <utility>
+#include <string>
+#include <vector>
 #include "./dropout.h"
 #include "singa/core/common.h"
 #include "singa/model/layer.h"
@@ -51,4 +53,4 @@ class CudnnDropout : public Dropout {
 }  // namespace
 #endif  // CUDNN_VERSION_MAJOR>=5
 #endif  // USE_CUDNN
-#endif  // SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_
+#endif  // SRC_MODEL_LAYER_CUDNN_DROPOUT_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4918753/src/model/layer/cudnn_utils.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_utils.h b/src/model/layer/cudnn_utils.h
index 92c8df7..298ee5c 100644
--- a/src/model/layer/cudnn_utils.h
+++ b/src/model/layer/cudnn_utils.h
@@ -15,8 +15,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef SINGA_MODEL_LAYER_CUDNN_BASE_H_
-#define SINGA_MODEL_LAYER_CUDNN_BASE_H_
+#ifndef SRC_MODEL_LAYER_CUDNN_UTILS_H_
+#define SRC_MODEL_LAYER_CUDNN_UTILS_H_
 
 #ifdef USE_CUDNN
 
@@ -82,4 +82,4 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 
 }  // namespace singa
 #endif  // USE_CUDNN
-#endif  // SINGA_MODEL_LAYER_CUDNN_BASE_H_
+#endif  // SRC_MODEL_LAYER_CUDNN_UTILS_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4918753/src/model/layer/dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.h b/src/model/layer/dropout.h
index a6e733a..5efaf6a 100644
--- a/src/model/layer/dropout.h
+++ b/src/model/layer/dropout.h
@@ -15,9 +15,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef SINGA_MODEL_LAYER_DROPOUT_H_
-#define SINGA_MODEL_LAYER_DROPOUT_H_
+#ifndef SRC_MODEL_LAYER_DROPOUT_H_
+#define SRC_MODEL_LAYER_DROPOUT_H_
+#include <utility>
+#include <string>
+#include <vector>
 #include "singa/model/layer.h"
+
 namespace singa {
 class Dropout : public Layer {
  public:
@@ -55,4 +59,4 @@ class Dropout : public Layer {
   Tensor mask_;
 };
 }  // namespace singa
-#endif  // SINGA_MODEL_LAYER_DROPOUT_H_
+#endif  // SRC_MODEL_LAYER_DROPOUT_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4918753/src/model/layer/rnn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/rnn.h b/src/model/layer/rnn.h
index a6ba461..35c86bd 100644
--- a/src/model/layer/rnn.h
+++ b/src/model/layer/rnn.h
@@ -15,9 +15,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef SINGA_MODEL_LAYER_DROPOUT_H_
-#define SINGA_MODEL_LAYER_DROPOUT_H_
+#ifndef SRC_MODEL_LAYER_RNN_H_
+#define SRC_MODEL_LAYER_RNN_H_
+
+#include <utility>
+#include <string>
+#include <vector>
+#include <stack>
+
 #include "singa/model/layer.h"
+
 namespace singa {
 /// To enable using the same layer multiple times in one iteration in RNN,
 /// the Forward() function pushes the 'input' or 'output' that are
@@ -56,4 +63,4 @@ class RNN : public Layer {
   std::stack<Tensor*> states_;
 };
 }  // namespace singa
-#endif  // SINGA_MODEL_LAYER_DROPOUT_H_
+#endif  // SRC_MODEL_LAYER_RNN_H_
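
The stack-based buffering that rnn.h describes (and that Dense::Forward above already uses via buf_), in isolation; a toy sketch with an identity transform standing in for the real cell, not SINGA code:

#include <stack>
#include <vector>

struct ToyCell {
  std::stack<std::vector<float>> buf;  // inputs saved in forward order
  std::vector<float> Forward(const std::vector<float>& x) {
    buf.push(x);   // remember this timestep's input for Backward
    return x;      // identity; a real cell would transform x
  }
  std::vector<float> Backward(const std::vector<float>& dy) {
    std::vector<float> x = buf.top();  // LIFO: gradients arrive in
    buf.pop();                         // reverse timestep order
    (void)x;       // a real cell would use x to form dW and dx
    return dy;     // identity gradient in this toy
  }
};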


[42/50] [abbrv] incubator-singa git commit: SINGA-190 - Add prelu layer and flatten layer

Posted by zh...@apache.org.
SINGA-190 - Add prelu layer and flatten layer

Merge PR#162 into dev


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/26df5ac0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/26df5ac0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/26df5ac0

Branch: refs/heads/master
Commit: 26df5ac03326576cebcca516da3b27ba1fd0dbd8
Parents: 58be3f8 6d69047
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Jun 12 22:33:02 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 22:33:02 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h          | 396 ++++++++---------
 src/core/tensor/math_kernel.cu       | 656 +++++++++++++++-------------
 src/core/tensor/math_kernel.h        |  93 ++--
 src/core/tensor/tensor.cc            | 702 ++++++++++++++++--------------
 src/core/tensor/tensor_math.h        | 393 +++++++++--------
 src/core/tensor/tensor_math_cpp.h    | 585 +++++++++++++++++++------
 src/core/tensor/tensor_math_cuda.h   | 412 ++++++++++++++----
 src/model/layer/cudnn_convolution.cc | 180 ++++----
 test/singa/test_cudnn_convolution.cc | 181 ++++++++
 test/singa/test_tensor_math.cc       | 295 ++++++++++++-
 10 files changed, 2470 insertions(+), 1423 deletions(-)
----------------------------------------------------------------------



[13/50] [abbrv] incubator-singa git commit: SINGA-176 - Add loss and metric base classes

Posted by zh...@apache.org.
SINGA-176 - Add loss and metric base classes

Rename layer.proto to model.proto; the renamed file holds the proto messages for the classes under model/.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/3171459b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/3171459b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/3171459b

Branch: refs/heads/master
Commit: 3171459b0ce39722c42b5eef96ae4892e274cb5c
Parents: a1c3437
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu May 26 14:06:50 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 26 14:11:18 2016 +0800

----------------------------------------------------------------------
 include/singa/model/layer.h     |   2 +-
 include/singa/model/loss.h      |   2 +-
 include/singa/model/metric.h    |   2 +-
 src/model/layer/cudnn_dropout.h |   1 -
 src/proto/layer.proto           | 852 -----------------------------------
 src/proto/model.proto           | 852 +++++++++++++++++++++++++++++++++++
 6 files changed, 855 insertions(+), 856 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3171459b/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 084c42e..5803295 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -24,7 +24,7 @@
 #include <stack>
 #include <utility>
 #include "singa/core/tensor.h"
-#include "singa/proto/layer.pb.h"
+#include "singa/proto/model.pb.h"
 
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3171459b/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 6c79e7b..6a23067 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -18,7 +18,7 @@
 
 #ifndef SINGA_MODEL_LOSS_H_
 #define SINGA_MODEL_LOSS_H_
-#include "singa/proto/layer.pb.h"
+#include "singa/proto/model.pb.h"
 #include "singa/core/tensor.h"
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3171459b/include/singa/model/metric.h
----------------------------------------------------------------------
diff --git a/include/singa/model/metric.h b/include/singa/model/metric.h
index 6519028..b99ff0d 100644
--- a/include/singa/model/metric.h
+++ b/include/singa/model/metric.h
@@ -19,7 +19,7 @@
 #ifndef SINGA_MODEL_METRIC_H_
 #define SINGA_MODEL_METRIC_H_
 #include "singa/core/tensor.h"
-#include "singa/proto/layer.pb.h"
+#include "singa/proto/model.pb.h"
 namespace singa {
 
 /// The base metric class, which declares the APIs for computing the performance

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3171459b/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index d3b3de6..7cb185b 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -30,7 +30,6 @@
 #include <vector>
 
 #include "singa/model/layer.h"
-#include "singa/proto/core.pb.h"
 
 namespace singa {
 class CudnnDropout : public Dropout {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3171459b/src/proto/layer.proto
----------------------------------------------------------------------
diff --git a/src/proto/layer.proto b/src/proto/layer.proto
deleted file mode 100644
index 51225ee..0000000
--- a/src/proto/layer.proto
+++ /dev/null
@@ -1,852 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package singa;
-
-/// \file layer.proto is adapted from [Caffe](https://github.com/BVLC/caffe/)'s
-/// proto file with commit id c419f8517b1e1b3d7a07fe212fc6c90a70b519ea. We
-/// use caffe's protocol for configuring layer hyper-parameters for easy
-/// transporting Caffe model into SINGA. Specifically, we do the following
-/// changes:
-/// 1. we rename LayerParameter to LayerConf to differentiate model parameters
-/// 2. we rename xxxParameter to xxxConf for fields of LayerParameter
-/// 3. we comment out some fields (using /*...*/) not used in SINGA layer but
-///    reserve their tags.
-/// 4. we add new fields (commented like 'singa field..') to support our own
-///   functionalities.
-/// TODO(wangwei) write a proto converter to automatically load caffe models
-/// using Python (or C++/Java).
-
-// Specifies the shape (dimensions) of a Blob.
-message BlobShape {
-  repeated int64 dim = 1 [packed = true];
-}
-
-message BlobProto {
-  optional BlobShape shape = 7;
-  repeated float data = 5 [packed = true];
-  repeated float diff = 6 [packed = true];
-  repeated double double_data = 8 [packed = true];
-  repeated double double_diff = 9 [packed = true];
-
-  // 4D dimensions -- deprecated.  Use "shape" instead.
-  optional int32 num = 1 [default = 0];
-  optional int32 channels = 2 [default = 0];
-  optional int32 height = 3 [default = 0];
-  optional int32 width = 4 [default = 0];
-}
-
-message FillerConf {
-  // The filler type.
-  optional string type = 1 [default = 'constant'];
-  optional float value = 2 [default = 0]; // the value in constant filler
-  optional float min = 3 [default = 0]; // the min value in uniform filler
-  optional float max = 4 [default = 1]; // the max value in uniform filler
-  optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
-  optional float std = 6 [default = 1]; // the std value in Gaussian filler
-  // The expected number of non-zero output weights for a given input in
-  // Gaussian filler -- the default -1 means don't perform sparsification.
-  /* optional int32 sparse = 7 [default = -1]; */
-  // Normalize the filler variance by fan_in, fan_out, or their average.
-  // Applies to 'xavier' and 'msra' fillers.
-  enum VarianceNorm {
-    FAN_IN = 0;
-    FAN_OUT = 1;
-    AVERAGE = 2;
-  }
-  optional VarianceNorm variance_norm = 8 [default = FAN_IN];
-}
-
-// Specifies training parameters (multipliers on global learning constants,
-// and the name and other settings used for weight sharing).
-message ParamSpec {
-  // The names of the parameter blobs -- useful for sharing parameters among
-  // layers, but never required otherwise.  To share a parameter between two
-  // layers, give it a (non-empty) name.
-  optional string name = 1;
-
-  // Whether to require shared weights to have the same shape, or just the same
-  // count -- defaults to STRICT if unspecified.
-  /*
-  optional DimCheckMode share_mode = 2;
-  enum DimCheckMode {
-    // STRICT (default) requires that num, channels, height, width each match.
-    STRICT = 0;
-    // PERMISSIVE requires only the count (num*channels*height*width) to match.
-    PERMISSIVE = 1;
-  }
-  */
-
-  // The multiplier on the global learning rate for this parameter.
-  optional float lr_mult = 3 [default = 1.0];
-
-  // The multiplier on the global weight decay for this parameter.
-  optional float decay_mult = 4 [default = 1.0];
-
-  // SINGA uses this filed internally. Users just configure the fillers in
-  // Layer specific conf message as caffe (style).
-  optional FillerConf filler = 20;
-}
-
-enum Phase {
-  kTrain = 4;
-  kEval = 8;
-}
-// NOTE
-// Update the next available ID when you add a new LayerConf field.
-//
-// LayerConf next available layer-specific ID: 139 (last added: tile_param)
-message LayerConf {
-  optional string name = 1; // the layer name
-  optional string type = 2; // the layer type
-  /* repeated string bottom = 3; // the name of each bottom blob */
-  /* repeated string top = 4; // the name of each top blob */
-
-  // The train / test phase for computation.
-  // optional Phase phase = 10;
-
-  // The amount of weight to assign each top blob in the objective.
-  // Each layer assigns a default value, usually of either 0 or 1,
-  // to each top blob.
-  /* repeated float loss_weight = 5; */
-
-  // Specifies training parameters (multipliers on global learning constants,
-  // and the name and other settings used for weight sharing).
-  repeated ParamSpec param = 6;
-
-  // The blobs containing the numeric parameters of the layer.
-  repeated BlobProto blobs = 7;
-
-  // Specifies on which bottoms the backpropagation should be skipped.
-  // The size must be either 0 or equal to the number of bottoms.
-  /* repeated bool propagate_down = 11; */
-
-  // Rules controlling whether and when a layer is included in the network,
-  // based on the current NetState.  You may specify a non-zero number of rules
-  // to include OR exclude, but not both.  If no include or exclude rules are
-  // specified, the layer is always included.  If the current NetState meets
-  // ANY (i.e., one or more) of the specified rules, the layer is
-  // included/excluded.
-  /* repeated NetStateRule include = 8; */
-  /* repeated NetStateRule exclude = 9; */
-
-  // Confs for data pre-processing.
-  /* optional TransformationConf transform_param = 100; */
-
-  // Confs shared by loss layers.
-  /* optional LossConf loss_param = 101; */
-
-  // Layer type-specific parameters.
-  //
-  // Note: certain layers may have more than one computational engine
-  // for their implementation. These layers include an Engine type and
-  // engine parameter for selecting the implementation.
-  // The default for the engine is set by the ENGINE switch at compile-time.
-  //optional AccuracyConf accuracy_conf = 102;
-  optional ArgMaxConf argmax_conf = 103;
-  optional ConcatConf concat_conf = 104;
-  optional ContrastiveLossConf contrastive_loss_conf = 105;
-  optional ConvolutionConf convolution_conf = 106;
-  // optional DataConf data_conf = 107;
-  optional DropoutConf dropout_conf = 108;
-  // optional DummyDataConf dummy_data_conf = 109;
-  optional EltwiseConf eltwise_conf = 110;
-  optional EmbedConf embed_conf = 137;
-  optional ExpConf exp_conf = 111;
-  optional FlattenConf flatten_conf = 135;
-  // optional HDF5DataConf hdf5_data_conf = 112;
-  // optional HDF5OutputConf hdf5_output_conf = 113;
-  optional HingeLossConf hinge_loss_conf = 114;
-  // optional ImageDataConf image_data_conf = 115;
-  optional InfogainLossConf infogain_loss_conf = 116;
-  optional InnerProductConf inner_product_conf = 117;
-  optional LogConf log_conf = 134;
-  optional LRNConf lrn_conf = 118;
-  // Used in SINGA
-  optional MetricConf metric_conf = 200;
-  // optional MemoryDataConf memory_data_conf = 119;
-  optional MVNConf mvn_conf = 120;
-  optional PoolingConf pooling_conf = 121;
-  optional PowerConf power_conf = 122;
-  optional PReLUConf prelu_conf = 131;
-  // optional PythonConf python_conf = 130;
-  optional ReductionConf reduction_conf = 136;
-  optional ReLUConf relu_conf = 123;
-  optional ReshapeConf reshape_conf = 133;
-  optional SigmoidConf sigmoid_conf = 124;
-  optional SoftmaxConf softmax_conf = 125;
-  optional SPPConf spp_conf = 132;
-  optional SliceConf slice_conf = 126;
-  optional TanHConf tanh_conf = 127;
-  optional ThresholdConf threshold_conf = 128;
-  optional TileConf tile_conf = 138;
-  //optional WindowDataConf window_data_conf = 129;
-}
-
-// Message that stores hyper-parameters used to apply transformation
-// to the data layer's data
-/*
-message TransformationConf {
-  // For data pre-processing, we can do simple scaling and subtracting the
-  // data mean, if provided. Note that the mean subtraction is always carried
-  // out before scaling.
-  optional float scale = 1 [default = 1];
-  // Specify if we want to randomly mirror data.
-  optional bool mirror = 2 [default = false];
-  // Specify if we would like to randomly crop an image.
-  optional uint32 crop_size = 3 [default = 0];
-  // mean_file and mean_value cannot be specified at the same time
-  optional string mean_file = 4;
-  // if specified can be repeated once (would substract it from all the channels)
-  // or can be repeated the same number of times as channels
-  // (would subtract them from the corresponding channel)
-  repeated float mean_value = 5;
-  // Force the decoded image to have 3 color channels.
-  optional bool force_color = 6 [default = false];
-  // Force the decoded image to have 1 color channels.
-  optional bool force_gray = 7 [default = false];
-}
-*/
-
-// Message that stores hyper-parameters shared by loss layers
-message LossConf {
-  // If specified, ignore instances with the given label.
-  optional int32 ignore_label = 1;
-  // If true, normalize each batch across all instances (including spatial
-  // dimesions, but not ignored instances); else, divide by batch size only.
-  optional bool normalize = 2 [default = true];
-}
-
-message MetricConf {
-  // When computing accuracy, count as correct by comparing the true label to
-  // the top k scoring classes.  By default, only compare to the top scoring
-  // class (i.e. argmax).
-  optional uint32 top_k = 1 [default = 1];
-
-  // The "label" axis of the prediction blob, whose argmax corresponds to the
-  // predicted label -- may be negative to index from the end (e.g., -1 for the
-  // last axis).  For example, if axis == 1 and the predictions are
-  // (N x C x H x W), the label blob is expected to contain N*H*W ground truth
-  // labels with integer values in {0, 1, ..., C-1}.
-  optional int32 axis = 2 [default = 1];
-
-  // If specified, ignore instances with the given label.
-  optional int32 ignore_label = 3;
-}
-// Messages that store hyper-parameters used by individual layer types follow, in
-// alphabetical order.
-
-
-
-message ArgMaxConf {
-  // If true produce pairs (argmax, maxval)
-  optional bool out_max_val = 1 [default = false];
-  optional uint32 top_k = 2 [default = 1];
-  // The axis along which to maximise -- may be negative to index from the
-  // end (e.g., -1 for the last axis).
-  // By default ArgMaxLayer maximizes over the flattened trailing dimensions
-  // for each index of the first / num dimension.
-  optional int32 axis = 3;
-}
-
-message ConcatConf {
-  // The axis along which to concatenate -- may be negative to index from the
-  // end (e.g., -1 for the last axis).  Other axes must have the
-  // same dimension for all the bottom blobs.
-  // By default, ConcatLayer concatenates blobs along the "channels" axis (1).
-  optional int32 axis = 2 [default = 1];
-
-  // DEPRECATED: alias for "axis" -- does not support negative indexing.
-  optional uint32 concat_dim = 1 [default = 1];
-}
-
-message ContrastiveLossConf {
-  // margin for dissimilar pair
-  optional float margin = 1 [default = 1.0];
-  // The first implementation of this cost did not exactly match the cost of
-  // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
-  // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
-  // Hadsell paper. New models should probably use this version.
-  // legacy_version = true uses (margin - d^2). This is kept to support /
-  // reproduce existing models and results
-  optional bool legacy_version = 2 [default = false];
-}
-
-message ConvolutionConf {
-  optional uint32 num_output = 1; // The number of outputs for the layer
-  optional bool bias_term = 2 [default = true]; // whether to have bias terms
-
-  // Pad, kernel size, and stride are all given as a single value for equal
-  // dimensions in all spatial dimensions, or once per spatial dimension.
-  repeated uint32 pad = 3; // The padding size; defaults to 0
-  repeated uint32 kernel_size = 4; // The kernel size
-  repeated uint32 stride = 6; // The stride; defaults to 1
-
-  // For 2D convolution only, the *_h and *_w versions may also be used to
-  // specify both spatial dimensions.
-  optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
-  optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
-  optional uint32 kernel_h = 11; // The kernel height (2D only)
-  optional uint32 kernel_w = 12; // The kernel width (2D only)
-  optional uint32 stride_h = 13; // The stride height (2D only)
-  optional uint32 stride_w = 14; // The stride width (2D only)
-
-  optional uint32 group = 5 [default = 1]; // The group size for group conv
-
-  optional FillerConf weight_filler = 7; // The filler for the weight
-  optional FillerConf bias_filler = 8; // The filler for the bias
-  enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 15 [default = DEFAULT];
-
-  // The axis to interpret as "channels" when performing convolution.
-  // Preceding dimensions are treated as independent inputs;
-  // succeeding dimensions are treated as "spatial".
-  // With (N, C, H, W) inputs, and axis == 1 (the default), we perform
-  // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for
-  // groups g>1) filters across the spatial axes (H, W) of the input.
-  // With (N, C, D, H, W) inputs, and axis == 1, we perform
-  // N independent 3D convolutions, sliding (C/g)-channels
-  // filters across the spatial axes (D, H, W) of the input.
-  optional int32 axis = 16 [default = 1];
-
-  // Whether to force use of the general ND convolution, even if a specific
-  // implementation for blobs of the appropriate number of spatial dimensions
-  // is available. (Currently, there is only a 2D-specific convolution
-  // implementation; for input blobs with num_axes != 2, this option is
-  // ignored and the ND implementation will be used.)
-  optional bool force_nd_im2col = 17 [default = false];
-}
-
-/*
-message DataConf {
-  enum DB {
-    LEVELDB = 0;
-    LMDB = 1;
-  }
-  // Specify the data source.
-  optional string source = 1;
-  // Specify the batch size.
-  optional uint32 batch_size = 4;
-  // The rand_skip variable is for the data layer to skip a few data points
-  // to avoid all asynchronous sgd clients to start at the same point. The skip
-  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
-  // be larger than the number of keys in the database.
-  // DEPRECATED. Each solver accesses a different subset of the database.
-  optional uint32 rand_skip = 7 [default = 0];
-  optional DB backend = 8 [default = LEVELDB];
-  // DEPRECATED. See TransformationConf. For data pre-processing, we can do
-  // simple scaling and subtracting the data mean, if provided. Note that the
-  // mean subtraction is always carried out before scaling.
-  optional float scale = 2 [default = 1];
-  optional string mean_file = 3;
-  // DEPRECATED. See TransformationConf. Specify if we would like to randomly
-  // crop an image.
-  optional uint32 crop_size = 5 [default = 0];
-  // DEPRECATED. See TransformationConf. Specify if we want to randomly mirror
-  // data.
-  optional bool mirror = 6 [default = false];
-  // Force the encoded image to have 3 color channels
-  optional bool force_encoded_color = 9 [default = false];
-  // Prefetch queue (Number of batches to prefetch to host memory, increase if
-  // data access bandwidth varies).
-  optional uint32 prefetch = 10 [default = 4];
-}
-*/
-
-message DropoutConf {
-  optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
-}
-
-// DummyDataLayer fills any number of arbitrarily shaped blobs with random
-// (or constant) data generated by "Fillers" (see "message FillerConf").
-message DummyDataConf {
-  // This layer produces N >= 1 top blobs.  DummyDataConf must specify 1 or N
-  // shape fields, and 0, 1 or N data_fillers.
-  //
-  // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used.
-  // If 1 data_filler is specified, it is applied to all top blobs.  If N are
-  // specified, the ith is applied to the ith top blob.
-  repeated FillerConf data_filler = 1;
-  repeated BlobShape shape = 6;
-
-  // 4D dimensions -- deprecated.  Use "shape" instead.
-  repeated uint32 num = 2;
-  repeated uint32 channels = 3;
-  repeated uint32 height = 4;
-  repeated uint32 width = 5;
-}
-
-message EltwiseConf {
-  enum EltwiseOp {
-    PROD = 0;
-    SUM = 1;
-    MAX = 2;
-  }
-  optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
-  repeated float coeff = 2; // blob-wise coefficient for SUM operation
-
-  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
-  // of computing the gradient for the PROD operation. (No effect for SUM op.)
-  optional bool stable_prod_grad = 3 [default = true];
-}
-
-// Message that stores hyper-parameters used by EmbedLayer
-message EmbedConf {
-  optional uint32 num_output = 1; // The number of outputs for the layer
-  // The input is given as integers to be interpreted as one-hot
-  // vector indices with dimension num_input.  Hence num_input should be
-  // 1 greater than the maximum possible input value.
-  optional uint32 input_dim = 2;
-
-  optional bool bias_term = 3 [default = true]; // Whether to use a bias term
-  optional FillerConf weight_filler = 4; // The filler for the weight
-  optional FillerConf bias_filler = 5; // The filler for the bias
-
-}
-
-// Message that stores hyper-parameters used by ExpLayer
-message ExpConf {
-  // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
-  // Or if base is set to the default (-1), base is set to e,
-  // so y = exp(shift + scale * x).
-  optional float base = 1 [default = -1.0];
-  optional float scale = 2 [default = 1.0];
-  optional float shift = 3 [default = 0.0];
-}
-
-/// Message that stores hyper-parameters used by FlattenLayer
-message FlattenConf {
-  // The first axis to flatten: all preceding axes are retained in the output.
-  // May be negative to index from the end (e.g., -1 for the last axis).
-  optional int32 axis = 1 [default = 1];
-
-  // The last axis to flatten: all following axes are retained in the output.
-  // May be negative to index from the end (e.g., the default -1 for the last
-  // axis).
-  optional int32 end_axis = 2 [default = -1];
-}
-
-/*
-// Message that stores hyper-parameters used by HDF5DataLayer
-message HDF5DataConf {
-  // Specify the data source.
-  optional string source = 1;
-  // Specify the batch size.
-  optional uint32 batch_size = 2;
-
-  // Specify whether to shuffle the data.
-  // If shuffle == true, the ordering of the HDF5 files is shuffled,
-  // and the ordering of data within any given HDF5 file is shuffled,
-  // but data between different files are not interleaved; all of a file's
-  // data are output (in a random order) before moving onto another file.
-  optional bool shuffle = 3 [default = false];
-}
-
-message HDF5OutputConf {
-  optional string file_name = 1;
-}
-*/
-
-message HingeLossConf {
-  enum Norm {
-    L1 = 1;
-    L2 = 2;
-  }
-  // Specify the Norm to use L1 or L2
-  optional Norm norm = 1 [default = L1];
-}
-
-/*
-message ImageDataConf {
-  // Specify the data source.
-  optional string source = 1;
-  // Specify the batch size.
-  optional uint32 batch_size = 4 [default = 1];
-  // The rand_skip variable is for the data layer to skip a few data points
-  // to avoid all asynchronous sgd clients to start at the same point. The skip
-  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
-  // be larger than the number of keys in the database.
-  optional uint32 rand_skip = 7 [default = 0];
-  // Whether or not ImageLayer should shuffle the list of files at every epoch.
-  optional bool shuffle = 8 [default = false];
-  // It will also resize images if new_height or new_width are not zero.
-  optional uint32 new_height = 9 [default = 0];
-  optional uint32 new_width = 10 [default = 0];
-  // Specify if the images are color or gray
-  optional bool is_color = 11 [default = true];
-  // DEPRECATED. See TransformationConf. For data pre-processing, we can do
-  // simple scaling and subtracting the data mean, if provided. Note that the
-  // mean subtraction is always carried out before scaling.
-  optional float scale = 2 [default = 1];
-  optional string mean_file = 3;
-  // DEPRECATED. See TransformationConf. Specify if we would like to randomly
-  // crop an image.
-  optional uint32 crop_size = 5 [default = 0];
-  // DEPRECATED. See TransformationConf. Specify if we want to randomly mirror
-  // data.
-  optional bool mirror = 6 [default = false];
-  optional string root_folder = 12 [default = ""];
-}
-*/
-
-message InfogainLossConf {
-  // Specify the infogain matrix source.
-  optional string source = 1;
-}
-
-message InnerProductConf {
-  optional uint32 num_output = 1; // The number of outputs for the layer
-  optional bool bias_term = 2 [default = true]; // whether to have bias terms
-  optional FillerConf weight_filler = 3; // The filler for the weight
-  optional FillerConf bias_filler = 4; // The filler for the bias
-
-  // The first axis to be lumped into a single inner product computation;
-  // all preceding axes are retained in the output.
-  // May be negative to index from the end (e.g., -1 for the last axis).
-  optional int32 axis = 5 [default = 1];
-}
-
-// Message that stores hyper-parameters used by LogLayer
-message LogConf {
-  // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
-  // Or if base is set to the default (-1), base is set to e,
-  // so y = ln(shift + scale * x) = log_e(shift + scale * x)
-  optional float base = 1 [default = -1.0];
-  optional float scale = 2 [default = 1.0];
-  optional float shift = 3 [default = 0.0];
-}
-
-// Message that stores hyper-parameters used by LRNLayer
-message LRNConf {
-  optional uint32 local_size = 1 [default = 5];
-  optional float alpha = 2 [default = 1.];
-  optional float beta = 3 [default = 0.75];
-  enum NormRegion {
-    ACROSS_CHANNELS = 0;
-    WITHIN_CHANNEL = 1;
-  }
-  optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
-  optional float k = 5 [default = 1.];
-}
-
-message MemoryDataConf {
-  optional uint32 batch_size = 1;
-  optional uint32 channels = 2;
-  optional uint32 height = 3;
-  optional uint32 width = 4;
-}
-
-message MVNConf {
-  // This parameter can be set to false to normalize mean only
-  optional bool normalize_variance = 1 [default = true];
-
-  // This parameter can be set to true to perform DNN-like MVN
-  optional bool across_channels = 2 [default = false];
-
-  // Epsilon for not dividing by zero while normalizing variance
-  optional float eps = 3 [default = 1e-9];
-}
-
-message PoolingConf {
-  enum PoolMethod {
-    MAX = 0;
-    AVE = 1;
-    STOCHASTIC = 2;
-  }
-  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
-  // Pad, kernel size, and stride are all given as a single value for equal
-  // dimensions in height and width or as Y, X pairs.
-  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
-  optional uint32 pad_h = 9 [default = 0]; // The padding height
-  optional uint32 pad_w = 10 [default = 0]; // The padding width
-  optional uint32 kernel_size = 2; // The kernel size (square)
-  optional uint32 kernel_h = 5; // The kernel height
-  optional uint32 kernel_w = 6; // The kernel width
-  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
-  optional uint32 stride_h = 7; // The stride height
-  optional uint32 stride_w = 8; // The stride width
-  /*
-  enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 11 [default = DEFAULT];
-  */
-  // If global_pooling then it will pool over the size of the bottom by doing
-  // kernel_h = bottom->height and kernel_w = bottom->width
-  optional bool global_pooling = 12 [default = false];
-}
-
-message PowerConf {
-  // PowerLayer computes outputs y = (shift + scale * x) ^ power.
-  optional float power = 1 [default = 1.0];
-  optional float scale = 2 [default = 1.0];
-  optional float shift = 3 [default = 0.0];
-}
-/*
-message PythonConf {
-  optional string module = 1;
-  optional string layer = 2;
-  // This value is set to the attribute `param_str` of the `PythonLayer` object
-  // in Python before calling the `setup()` method. This could be a number,
-  // string, dictionary in Python dict format, JSON, etc. You may parse this
-  // string in `setup` method and use it in `forward` and `backward`.
-  optional string param_str = 3 [default = ''];
-  // Whether this PythonLayer is shared among worker solvers during data parallelism.
-  // If true, each worker solver sequentially run forward from this layer.
-  // This value should be set true if you are using it as a data layer.
-  optional bool share_in_parallel = 4 [default = false];
-}
-*/
-
-// Message that stores hyper-parameters used by ReductionLayer
-message ReductionConf {
-  enum ReductionOp {
-    SUM = 1;
-    ASUM = 2;
-    SUMSQ = 3;
-    MEAN = 4;
-  }
-
-  optional ReductionOp operation = 1 [default = SUM]; // reduction operation
-
-  // The first axis to reduce to a scalar -- may be negative to index from the
-  // end (e.g., -1 for the last axis).
-  // (Currently, only reduction along ALL "tail" axes is supported; reduction
-  // of axis M through N, where N < num_axes - 1, is unsupported.)
-  // Suppose we have an n-axis bottom Blob with shape:
-  //     (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)).
-  // If axis == m, the output Blob will have shape
-  //     (d0, d1, d2, ..., d(m-1)),
-  // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1))
-  // times, each including (dm * d(m+1) * ... * d(n-1)) individual data.
-  // If axis == 0 (the default), the output Blob always has the empty shape
-  // (count 1), performing reduction across the entire input --
-  // often useful for creating new loss functions.
-  optional int32 axis = 2 [default = 0];
-
-  optional float coeff = 3 [default = 1.0]; // coefficient for output
-}
-
-// Message that stores hyper-parameters used by ReLULayer
-message ReLUConf {
-  // Allow non-zero slope for negative inputs to speed up optimization
-  // Described in:
-  // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
-  // improve neural network acoustic models. In ICML Workshop on Deep Learning
-  // for Audio, Speech, and Language Processing.
-  optional float negative_slope = 1 [default = 0];
-  /*
-  enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 2 [default = DEFAULT];
-  */
-}
-
-message ReshapeConf {
-  // Specify the output dimensions. If some of the dimensions are set to 0,
-  // the corresponding dimension from the bottom layer is used (unchanged).
-  // Exactly one dimension may be set to -1, in which case its value is
-  // inferred from the count of the bottom blob and the remaining dimensions.
-  // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8:
-  //
-  //   layer {
-  //     type: "Reshape" bottom: "input" top: "output"
-  //     reshape_param { ... }
-  //   }
-  //
-  // If "input" is 2D with shape 2 x 8, then the following reshape_param
-  // specifications are all equivalent, producing a 3D blob "output" with shape
-  // 2 x 2 x 4:
-  //
-  //   reshape_param { shape { dim:  2  dim: 2  dim:  4 } }
-  //   reshape_param { shape { dim:  0  dim: 2  dim:  4 } }
-  //   reshape_param { shape { dim:  0  dim: 2  dim: -1 } }
-  //   reshape_param { shape { dim: -1  dim: 0  dim:  2 } }
-  //
-  optional BlobShape shape = 1;
-
-  // axis and num_axes control the portion of the bottom blob's shape that are
-  // replaced by (included in) the reshape. By default (axis == 0 and
-  // num_axes == -1), the entire bottom blob shape is included in the reshape,
-  // and hence the shape field must specify the entire output shape.
-  //
-  // axis may be non-zero to retain some portion of the beginning of the input
-  // shape (and may be negative to index from the end; e.g., -1 to begin the
-  // reshape after the last axis, including nothing in the reshape,
-  // -2 to include only the last axis, etc.).
-  //
-  // For example, suppose "input" is a 2D blob with shape 2 x 8.
-  // Then the following ReshapeLayer specifications are all equivalent,
-  // producing a blob "output" with shape 2 x 2 x 4:
-  //
-  //   reshape_param { shape { dim: 2  dim: 2  dim: 4 } }
-  //   reshape_param { shape { dim: 2  dim: 4 } axis:  1 }
-  //   reshape_param { shape { dim: 2  dim: 4 } axis: -3 }
-  //
-  // num_axes specifies the extent of the reshape.
-  // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on
-  // input axes in the range [axis, axis+num_axes].
-  // num_axes may also be -1, the default, to include all remaining axes
-  // (starting from axis).
-  //
-  // For example, suppose "input" is a 2D blob with shape 2 x 8.
-  // Then the following ReshapeLayer specifications are equivalent,
-  // producing a blob "output" with shape 1 x 2 x 8.
-  //
-  //   reshape_param { shape { dim:  1  dim: 2  dim:  8 } }
-  //   reshape_param { shape { dim:  1  dim: 2  }  num_axes: 1 }
-  //   reshape_param { shape { dim:  1  }  num_axes: 0 }
-  //
-  // On the other hand, these would produce output blob shape 2 x 1 x 8:
-  //
-  //   reshape_param { shape { dim: 2  dim: 1  dim: 8  }  }
-  //   reshape_param { shape { dim: 1 }  axis: 1  num_axes: 0 }
-  //
-  optional int32 axis = 2 [default = 0];
-  optional int32 num_axes = 3 [default = -1];
-}
-
-message SigmoidConf {
-  enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 1 [default = DEFAULT];
-}
-
-message SliceConf {
-  // The axis along which to slice -- may be negative to index from the end
-  // (e.g., -1 for the last axis).
-  // By default, SliceLayer slices blobs along the "channels" axis (1).
-  optional int32 axis = 3 [default = 1];
-  repeated uint32 slice_point = 2;
-
-  // DEPRECATED: alias for "axis" -- does not support negative indexing.
-  optional uint32 slice_dim = 1 [default = 1];
-}
-
-// Message that stores hyper-parameters used by SoftmaxLayer, SoftmaxWithLossLayer
-message SoftmaxConf {
-  enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 1 [default = DEFAULT];
-
-  // The axis along which to perform the softmax -- may be negative to index
-  // from the end (e.g., -1 for the last axis).
-  // Any other axes will be evaluated as independent softmaxes.
-  optional int32 axis = 2 [default = 1];
-}
-
-message TanHConf {
-  enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 1 [default = DEFAULT];
-}
-
-// Message that stores hyper-parameters used by TileLayer
-message TileConf {
-  // The index of the axis to tile.
-  optional int32 axis = 1 [default = 1];
-
-  // The number of copies (tiles) of the blob to output.
-  optional int32 tiles = 2;
-}
-
-// Message that stores hyper-parameters used by ThresholdLayer
-message ThresholdConf {
-  optional float threshold = 1 [default = 0]; // Strictly positive values
-}
-
-/*
-message WindowDataConf {
-  // Specify the data source.
-  optional string source = 1;
-  // For data pre-processing, we can do simple scaling and subtracting the
-  // data mean, if provided. Note that the mean subtraction is always carried
-  // out before scaling.
-  optional float scale = 2 [default = 1];
-  optional string mean_file = 3;
-  // Specify the batch size.
-  optional uint32 batch_size = 4;
-  // Specify if we would like to randomly crop an image.
-  optional uint32 crop_size = 5 [default = 0];
-  // Specify if we want to randomly mirror data.
-  optional bool mirror = 6 [default = false];
-  // Foreground (object) overlap threshold
-  optional float fg_threshold = 7 [default = 0.5];
-  // Background (non-object) overlap threshold
-  optional float bg_threshold = 8 [default = 0.5];
-  // Fraction of batch that should be foreground objects
-  optional float fg_fraction = 9 [default = 0.25];
-  // Amount of contextual padding to add around a window
-  // (used only by the window_data_layer)
-  optional uint32 context_pad = 10 [default = 0];
-  // Mode for cropping out a detection window
-  // warp: cropped window is warped to a fixed size and aspect ratio
-  // square: the tightest square around the window is cropped
-  optional string crop_mode = 11 [default = "warp"];
-  // cache_images: will load all images in memory for faster access
-  optional bool cache_images = 12 [default = false];
-  // append root_folder to locate images
-  optional string root_folder = 13 [default = ""];
-}
-*/
-
-message SPPConf {
-  enum PoolMethod {
-    MAX = 0;
-    AVE = 1;
-    STOCHASTIC = 2;
-  }
-  optional uint32 pyramid_height = 1;
-  optional PoolMethod pool = 2 [default = MAX]; // The pooling method
-  /*
-  enum Engine {
-    DEFAULT = 0;
-    CAFFE = 1;
-    CUDNN = 2;
-  }
-  optional Engine engine = 6 [default = DEFAULT];
-  */
-}
-
-message PReLUConf {
-  // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
-  // Surpassing Human-Level Performance on ImageNet Classification, 2015.
-
-  // Initial value of a_i. Default is a_i=0.25 for all i.
-  optional FillerConf filler = 1;
-  // Whether or not slope parameters are shared across channels.
-  optional bool channel_shared = 2 [default = false];
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3171459b/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
new file mode 100644
index 0000000..51225ee
--- /dev/null
+++ b/src/proto/model.proto
@@ -0,0 +1,852 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package singa;
+
+/// \file model.proto is adapted from [Caffe](https://github.com/BVLC/caffe/)'s
+/// proto file with commit id c419f8517b1e1b3d7a07fe212fc6c90a70b519ea. We
+/// use caffe's protocol for configuring layer hyper-parameters to make it
+/// easy to port Caffe models into SINGA. Specifically, we make the following
+/// changes:
+/// 1. we rename LayerParameter to LayerConf to differentiate it from model
+///    parameters
+/// 2. we rename xxxParameter to xxxConf for fields of LayerParameter
+/// 3. we comment out some fields (using /*...*/) not used in SINGA layers but
+///    reserve their tags.
+/// 4. we add new fields (commented like 'singa field..') to support our own
+///   functionalities.
+/// TODO(wangwei) write a proto converter to automatically load caffe models
+/// using Python (or C++/Java).
+
+// Specifies the shape (dimensions) of a Blob.
+message BlobShape {
+  repeated int64 dim = 1 [packed = true];
+}
+
+message BlobProto {
+  optional BlobShape shape = 7;
+  repeated float data = 5 [packed = true];
+  repeated float diff = 6 [packed = true];
+  repeated double double_data = 8 [packed = true];
+  repeated double double_diff = 9 [packed = true];
+
+  // 4D dimensions -- deprecated.  Use "shape" instead.
+  optional int32 num = 1 [default = 0];
+  optional int32 channels = 2 [default = 0];
+  optional int32 height = 3 [default = 0];
+  optional int32 width = 4 [default = 0];
+}
+
+message FillerConf {
+  // The filler type.
+  optional string type = 1 [default = 'constant'];
+  optional float value = 2 [default = 0]; // the value in constant filler
+  optional float min = 3 [default = 0]; // the min value in uniform filler
+  optional float max = 4 [default = 1]; // the max value in uniform filler
+  optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
+  optional float std = 6 [default = 1]; // the std value in Gaussian filler
+  // The expected number of non-zero output weights for a given input in
+  // Gaussian filler -- the default -1 means don't perform sparsification.
+  /* optional int32 sparse = 7 [default = -1]; */
+  // Normalize the filler variance by fan_in, fan_out, or their average.
+  // Applies to 'xavier' and 'msra' fillers.
+  enum VarianceNorm {
+    FAN_IN = 0;
+    FAN_OUT = 1;
+    AVERAGE = 2;
+  }
+  optional VarianceNorm variance_norm = 8 [default = FAN_IN];
+}
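
(A hedged aside, not part of the commit: once this proto is compiled, a
Gaussian filler can be built through the protobuf-generated C++ API roughly
as below; the generated header name is an assumption based on the proto path.)

  #include "model.pb.h"  // assumed name of the protoc-generated header

  int main() {
    // Sketch: a Gaussian filler with mean 0 and std 0.01.
    singa::FillerConf filler;
    filler.set_type("gaussian");
    filler.set_mean(0.0f);
    filler.set_std(0.01f);
    return 0;
  }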
+
+// Specifies training parameters (multipliers on global learning constants,
+// and the name and other settings used for weight sharing).
+message ParamSpec {
+  // The names of the parameter blobs -- useful for sharing parameters among
+  // layers, but never required otherwise.  To share a parameter between two
+  // layers, give it a (non-empty) name.
+  optional string name = 1;
+
+  // Whether to require shared weights to have the same shape, or just the same
+  // count -- defaults to STRICT if unspecified.
+  /*
+  optional DimCheckMode share_mode = 2;
+  enum DimCheckMode {
+    // STRICT (default) requires that num, channels, height, width each match.
+    STRICT = 0;
+    // PERMISSIVE requires only the count (num*channels*height*width) to match.
+    PERMISSIVE = 1;
+  }
+  */
+
+  // The multiplier on the global learning rate for this parameter.
+  optional float lr_mult = 3 [default = 1.0];
+
+  // The multiplier on the global weight decay for this parameter.
+  optional float decay_mult = 4 [default = 1.0];
+
+  // SINGA uses this field internally. Users just configure the fillers in
+  // the layer-specific conf message, in Caffe style.
+  optional FillerConf filler = 20;
+}
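
(Another hedged aside: a ParamSpec that doubles the global learning rate and
disables weight decay for one parameter; the parameter name is made up.)

  #include "model.pb.h"  // assumed protoc-generated header

  // Sketch: a parameter spec with 2x the global learning rate and no decay.
  singa::ParamSpec MakeSpec() {
    singa::ParamSpec spec;
    spec.set_name("conv1_weight");  // hypothetical parameter name
    spec.set_lr_mult(2.0f);
    spec.set_decay_mult(0.0f);
    return spec;
  }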
+
+enum Phase {
+  kTrain = 4;
+  kEval = 8;
+}
+// NOTE
+// Update the next available ID when you add a new LayerConf field.
+//
+// LayerConf next available layer-specific ID: 139 (last added: tile_conf)
+message LayerConf {
+  optional string name = 1; // the layer name
+  optional string type = 2; // the layer type
+  /* repeated string bottom = 3; // the name of each bottom blob */
+  /* repeated string top = 4; // the name of each top blob */
+
+  // The train / test phase for computation.
+  // optional Phase phase = 10;
+
+  // The amount of weight to assign each top blob in the objective.
+  // Each layer assigns a default value, usually of either 0 or 1,
+  // to each top blob.
+  /* repeated float loss_weight = 5; */
+
+  // Specifies training parameters (multipliers on global learning constants,
+  // and the name and other settings used for weight sharing).
+  repeated ParamSpec param = 6;
+
+  // The blobs containing the numeric parameters of the layer.
+  repeated BlobProto blobs = 7;
+
+  // Specifies on which bottoms the backpropagation should be skipped.
+  // The size must be either 0 or equal to the number of bottoms.
+  /* repeated bool propagate_down = 11; */
+
+  // Rules controlling whether and when a layer is included in the network,
+  // based on the current NetState.  You may specify a non-zero number of rules
+  // to include OR exclude, but not both.  If no include or exclude rules are
+  // specified, the layer is always included.  If the current NetState meets
+  // ANY (i.e., one or more) of the specified rules, the layer is
+  // included/excluded.
+  /* repeated NetStateRule include = 8; */
+  /* repeated NetStateRule exclude = 9; */
+
+  // Confs for data pre-processing.
+  /* optional TransformationConf transform_param = 100; */
+
+  // Confs shared by loss layers.
+  /* optional LossConf loss_param = 101; */
+
+  // Layer type-specific parameters.
+  //
+  // Note: certain layers may have more than one computational engine
+  // for their implementation. These layers include an Engine type and
+  // engine parameter for selecting the implementation.
+  // The default for the engine is set by the ENGINE switch at compile-time.
+  //optional AccuracyConf accuracy_conf = 102;
+  optional ArgMaxConf argmax_conf = 103;
+  optional ConcatConf concat_conf = 104;
+  optional ContrastiveLossConf contrastive_loss_conf = 105;
+  optional ConvolutionConf convolution_conf = 106;
+  // optional DataConf data_conf = 107;
+  optional DropoutConf dropout_conf = 108;
+  // optional DummyDataConf dummy_data_conf = 109;
+  optional EltwiseConf eltwise_conf = 110;
+  optional EmbedConf embed_conf = 137;
+  optional ExpConf exp_conf = 111;
+  optional FlattenConf flatten_conf = 135;
+  // optional HDF5DataConf hdf5_data_conf = 112;
+  // optional HDF5OutputConf hdf5_output_conf = 113;
+  optional HingeLossConf hinge_loss_conf = 114;
+  // optional ImageDataConf image_data_conf = 115;
+  optional InfogainLossConf infogain_loss_conf = 116;
+  optional InnerProductConf inner_product_conf = 117;
+  optional LogConf log_conf = 134;
+  optional LRNConf lrn_conf = 118;
+  // Used in SINGA
+  optional MetricConf metric_conf = 200;
+  // optional MemoryDataConf memory_data_conf = 119;
+  optional MVNConf mvn_conf = 120;
+  optional PoolingConf pooling_conf = 121;
+  optional PowerConf power_conf = 122;
+  optional PReLUConf prelu_conf = 131;
+  // optional PythonConf python_conf = 130;
+  optional ReductionConf reduction_conf = 136;
+  optional ReLUConf relu_conf = 123;
+  optional ReshapeConf reshape_conf = 133;
+  optional SigmoidConf sigmoid_conf = 124;
+  optional SoftmaxConf softmax_conf = 125;
+  optional SPPConf spp_conf = 132;
+  optional SliceConf slice_conf = 126;
+  optional TanHConf tanh_conf = 127;
+  optional ThresholdConf threshold_conf = 128;
+  optional TileConf tile_conf = 138;
+  //optional WindowDataConf window_data_conf = 129;
+}
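
(Hedged aside: filling a LayerConf for a dropout layer via the generated API;
the "Dropout" type string is an assumption, not taken from this commit.)

  #include "model.pb.h"  // assumed protoc-generated header

  // Sketch: a dropout layer that zeroes 30% of its inputs.
  singa::LayerConf MakeDropout() {
    singa::LayerConf conf;
    conf.set_name("drop1");
    conf.set_type("Dropout");  // assumed registry key for the dropout layer
    conf.mutable_dropout_conf()->set_dropout_ratio(0.3f);
    return conf;
  }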
+
+// Message that stores hyper-parameters used to apply transformation
+// to the data layer's data
+/*
+message TransformationConf {
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 1 [default = 1];
+  // Specify if we want to randomly mirror data.
+  optional bool mirror = 2 [default = false];
+  // Specify if we would like to randomly crop an image.
+  optional uint32 crop_size = 3 [default = 0];
+  // mean_file and mean_value cannot be specified at the same time
+  optional string mean_file = 4;
+  // If specified, it can be repeated once (the value is subtracted from all
+  // channels) or repeated as many times as there are channels (each value is
+  // subtracted from the corresponding channel).
+  repeated float mean_value = 5;
+  // Force the decoded image to have 3 color channels.
+  optional bool force_color = 6 [default = false];
+  // Force the decoded image to have 1 color channel.
+  optional bool force_gray = 7 [default = false];
+}
+*/
+
+// Message that stores hyper-parameters shared by loss layers
+message LossConf {
+  // If specified, ignore instances with the given label.
+  optional int32 ignore_label = 1;
+  // If true, normalize each batch across all instances (including spatial
+  // dimensions, but not ignored instances); else, divide by batch size only.
+  optional bool normalize = 2 [default = true];
+}
+
+message MetricConf {
+  // When computing accuracy, count as correct by comparing the true label to
+  // the top k scoring classes.  By default, only compare to the top scoring
+  // class (i.e. argmax).
+  optional uint32 top_k = 1 [default = 1];
+
+  // The "label" axis of the prediction blob, whose argmax corresponds to the
+  // predicted label -- may be negative to index from the end (e.g., -1 for the
+  // last axis).  For example, if axis == 1 and the predictions are
+  // (N x C x H x W), the label blob is expected to contain N*H*W ground truth
+  // labels with integer values in {0, 1, ..., C-1}.
+  optional int32 axis = 2 [default = 1];
+
+  // If specified, ignore instances with the given label.
+  optional int32 ignore_label = 3;
+}
+// Messages that store hyper-parameters used by individual layer types follow, in
+// alphabetical order.
+
+
+
+message ArgMaxConf {
+  // If true produce pairs (argmax, maxval)
+  optional bool out_max_val = 1 [default = false];
+  optional uint32 top_k = 2 [default = 1];
+  // The axis along which to maximize -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // By default ArgMaxLayer maximizes over the flattened trailing dimensions
+  // for each index of the first / num dimension.
+  optional int32 axis = 3;
+}
+
+message ConcatConf {
+  // The axis along which to concatenate -- may be negative to index from the
+  // end (e.g., -1 for the last axis).  Other axes must have the
+  // same dimension for all the bottom blobs.
+  // By default, ConcatLayer concatenates blobs along the "channels" axis (1).
+  optional int32 axis = 2 [default = 1];
+
+  // DEPRECATED: alias for "axis" -- does not support negative indexing.
+  optional uint32 concat_dim = 1 [default = 1];
+}
+
+message ContrastiveLossConf {
+  // margin for dissimilar pair
+  optional float margin = 1 [default = 1.0];
+  // The first implementation of this cost did not exactly match the cost of
+  // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
+  // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
+  // Hadsell paper. New models should probably use this version.
+  // legacy_version = true uses (margin - d^2). This is kept to support /
+  // reproduce existing models and results
+  optional bool legacy_version = 2 [default = false];
+}
+
+message ConvolutionConf {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in all spatial dimensions, or once per spatial dimension.
+  repeated uint32 pad = 3; // The padding size; defaults to 0
+  repeated uint32 kernel_size = 4; // The kernel size
+  repeated uint32 stride = 6; // The stride; defaults to 1
+
+  // For 2D convolution only, the *_h and *_w versions may also be used to
+  // specify both spatial dimensions.
+  optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
+  optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
+  optional uint32 kernel_h = 11; // The kernel height (2D only)
+  optional uint32 kernel_w = 12; // The kernel width (2D only)
+  optional uint32 stride_h = 13; // The stride height (2D only)
+  optional uint32 stride_w = 14; // The stride width (2D only)
+
+  optional uint32 group = 5 [default = 1]; // The group size for group conv
+
+  optional FillerConf weight_filler = 7; // The filler for the weight
+  optional FillerConf bias_filler = 8; // The filler for the bias
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 15 [default = DEFAULT];
+
+  // The axis to interpret as "channels" when performing convolution.
+  // Preceding dimensions are treated as independent inputs;
+  // succeeding dimensions are treated as "spatial".
+  // With (N, C, H, W) inputs, and axis == 1 (the default), we perform
+  // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for
+  // groups g>1) filters across the spatial axes (H, W) of the input.
+  // With (N, C, D, H, W) inputs, and axis == 1, we perform
+  // N independent 3D convolutions, sliding (C/g)-channels
+  // filters across the spatial axes (D, H, W) of the input.
+  optional int32 axis = 16 [default = 1];
+
+  // Whether to force use of the general ND convolution, even if a specific
+  // implementation for blobs of the appropriate number of spatial dimensions
+  // is available. (Currently, there is only a 2D-specific convolution
+  // implementation; for input blobs with num_axes != 2, this option is
+  // ignored and the ND implementation will be used.)
+  optional bool force_nd_im2col = 17 [default = false];
+}
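
(Hedged aside: since pad, kernel_size and stride are repeated fields, the
generated API exposes add_* rather than set_* for them; a sketch follows.)

  #include "model.pb.h"  // assumed protoc-generated header

  // Sketch: 32 filters of 5x5, stride 1, pad 2, Xavier-initialized weights.
  singa::ConvolutionConf MakeConv() {
    singa::ConvolutionConf conv;
    conv.set_num_output(32);
    conv.add_kernel_size(5);
    conv.add_stride(1);
    conv.add_pad(2);
    conv.mutable_weight_filler()->set_type("xavier");
    return conv;
  }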
+
+/*
+message DataConf {
+  enum DB {
+    LEVELDB = 0;
+    LMDB = 1;
+  }
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 4;
+  // The rand_skip variable is for the data layer to skip a few data points
+  // to avoid all asynchronous sgd clients to start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  // DEPRECATED. Each solver accesses a different subset of the database.
+  optional uint32 rand_skip = 7 [default = 0];
+  optional DB backend = 8 [default = LEVELDB];
+  // DEPRECATED. See TransformationConf. For data pre-processing, we can do
+  // simple scaling and subtracting the data mean, if provided. Note that the
+  // mean subtraction is always carried out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3;
+  // DEPRECATED. See TransformationConf. Specify if we would like to randomly
+  // crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // DEPRECATED. See TransformationConf. Specify if we want to randomly mirror
+  // data.
+  optional bool mirror = 6 [default = false];
+  // Force the encoded image to have 3 color channels
+  optional bool force_encoded_color = 9 [default = false];
+  // Prefetch queue (Number of batches to prefetch to host memory, increase if
+  // data access bandwidth varies).
+  optional uint32 prefetch = 10 [default = 4];
+}
+*/
+
+message DropoutConf {
+  optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
+}
+
+// DummyDataLayer fills any number of arbitrarily shaped blobs with random
+// (or constant) data generated by "Fillers" (see "message FillerConf").
+message DummyDataConf {
+  // This layer produces N >= 1 top blobs.  DummyDataConf must specify 1 or N
+  // shape fields, and 0, 1 or N data_fillers.
+  //
+  // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used.
+  // If 1 data_filler is specified, it is applied to all top blobs.  If N are
+  // specified, the ith is applied to the ith top blob.
+  repeated FillerConf data_filler = 1;
+  repeated BlobShape shape = 6;
+
+  // 4D dimensions -- deprecated.  Use "shape" instead.
+  repeated uint32 num = 2;
+  repeated uint32 channels = 3;
+  repeated uint32 height = 4;
+  repeated uint32 width = 5;
+}
+
+message EltwiseConf {
+  enum EltwiseOp {
+    PROD = 0;
+    SUM = 1;
+    MAX = 2;
+  }
+  optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
+  repeated float coeff = 2; // blob-wise coefficient for SUM operation
+
+  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
+  // of computing the gradient for the PROD operation. (No effect for SUM op.)
+  optional bool stable_prod_grad = 3 [default = true];
+}
+
+// Message that stores hyper-parameters used by EmbedLayer
+message EmbedConf {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  // The input is given as integers to be interpreted as one-hot
+  // vector indices with dimension input_dim.  Hence input_dim should be
+  // 1 greater than the maximum possible input value.
+  optional uint32 input_dim = 2;
+
+  optional bool bias_term = 3 [default = true]; // Whether to use a bias term
+  optional FillerConf weight_filler = 4; // The filler for the weight
+  optional FillerConf bias_filler = 5; // The filler for the bias
+
+}
+
+// Message that stores hyper-parameters used by ExpLayer
+message ExpConf {
+  // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
+  // Or if base is set to the default (-1), base is set to e,
+  // so y = exp(shift + scale * x).
+  optional float base = 1 [default = -1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
+}
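
(Hedged aside: the comment above fully determines ExpLayer's forward rule;
a minimal per-element sketch of it.)

  #include <cmath>

  // Sketch: base == -1 selects the natural exponent, as documented above;
  // otherwise y = base^(shift + scale * x).
  float ExpForward(float x, float base = -1.0f, float scale = 1.0f,
                   float shift = 0.0f) {
    const float z = shift + scale * x;
    return base > 0.0f ? std::pow(base, z) : std::exp(z);
  }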
+
+/// Message that stores hyper-parameters used by FlattenLayer
+message FlattenConf {
+  // The first axis to flatten: all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 1 [default = 1];
+
+  // The last axis to flatten: all following axes are retained in the output.
+  // May be negative to index from the end (e.g., the default -1 for the last
+  // axis).
+  optional int32 end_axis = 2 [default = -1];
+}
+
+/*
+// Message that stores hyper-parameters used by HDF5DataLayer
+message HDF5DataConf {
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 2;
+
+  // Specify whether to shuffle the data.
+  // If shuffle == true, the ordering of the HDF5 files is shuffled,
+  // and the ordering of data within any given HDF5 file is shuffled,
+  // but data between different files are not interleaved; all of a file's
+  // data are output (in a random order) before moving onto another file.
+  optional bool shuffle = 3 [default = false];
+}
+
+message HDF5OutputConf {
+  optional string file_name = 1;
+}
+*/
+
+message HingeLossConf {
+  enum Norm {
+    L1 = 1;
+    L2 = 2;
+  }
+  // Specify the Norm to use L1 or L2
+  optional Norm norm = 1 [default = L1];
+}
+
+/*
+message ImageDataConf {
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 4 [default = 1];
+  // The rand_skip variable is for the data layer to skip a few data points
+  // to avoid all asynchronous sgd clients to start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  optional uint32 rand_skip = 7 [default = 0];
+  // Whether or not ImageLayer should shuffle the list of files at every epoch.
+  optional bool shuffle = 8 [default = false];
+  // It will also resize images if new_height or new_width are not zero.
+  optional uint32 new_height = 9 [default = 0];
+  optional uint32 new_width = 10 [default = 0];
+  // Specify if the images are color or gray
+  optional bool is_color = 11 [default = true];
+  // DEPRECATED. See TransformationConf. For data pre-processing, we can do
+  // simple scaling and subtracting the data mean, if provided. Note that the
+  // mean subtraction is always carried out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3;
+  // DEPRECATED. See TransformationConf. Specify if we would like to randomly
+  // crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // DEPRECATED. See TransformationConf. Specify if we want to randomly mirror
+  // data.
+  optional bool mirror = 6 [default = false];
+  optional string root_folder = 12 [default = ""];
+}
+*/
+
+message InfogainLossConf {
+  // Specify the infogain matrix source.
+  optional string source = 1;
+}
+
+message InnerProductConf {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional FillerConf weight_filler = 3; // The filler for the weight
+  optional FillerConf bias_filler = 4; // The filler for the bias
+
+  // The first axis to be lumped into a single inner product computation;
+  // all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 5 [default = 1];
+}
+
+// Message that stores hyper-parameters used by LogLayer
+message LogConf {
+  // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
+  // Or if base is set to the default (-1), base is set to e,
+  // so y = ln(shift + scale * x) = log_e(shift + scale * x)
+  optional float base = 1 [default = -1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
+}
+
+// Message that stores hyper-parameters used by LRNLayer
+message LRNConf {
+  optional uint32 local_size = 1 [default = 5];
+  optional float alpha = 2 [default = 1.];
+  optional float beta = 3 [default = 0.75];
+  enum NormRegion {
+    ACROSS_CHANNELS = 0;
+    WITHIN_CHANNEL = 1;
+  }
+  optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
+  optional float k = 5 [default = 1.];
+}
+
+message MemoryDataConf {
+  optional uint32 batch_size = 1;
+  optional uint32 channels = 2;
+  optional uint32 height = 3;
+  optional uint32 width = 4;
+}
+
+message MVNConf {
+  // This parameter can be set to false to normalize mean only
+  optional bool normalize_variance = 1 [default = true];
+
+  // This parameter can be set to true to perform DNN-like MVN
+  optional bool across_channels = 2 [default = false];
+
+  // Epsilon for not dividing by zero while normalizing variance
+  optional float eps = 3 [default = 1e-9];
+}
+
+message PoolingConf {
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in height and width or as Y, X pairs.
+  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
+  optional uint32 pad_h = 9 [default = 0]; // The padding height
+  optional uint32 pad_w = 10 [default = 0]; // The padding width
+  optional uint32 kernel_size = 2; // The kernel size (square)
+  optional uint32 kernel_h = 5; // The kernel height
+  optional uint32 kernel_w = 6; // The kernel width
+  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
+  optional uint32 stride_h = 7; // The stride height
+  optional uint32 stride_w = 8; // The stride width
+  /*
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 11 [default = DEFAULT];
+  */
+  // If global_pooling then it will pool over the size of the bottom by doing
+  // kernel_h = bottom->height and kernel_w = bottom->width
+  optional bool global_pooling = 12 [default = false];
+}
+
+message PowerConf {
+  // PowerLayer computes outputs y = (shift + scale * x) ^ power.
+  optional float power = 1 [default = 1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
+}
+/*
+message PythonConf {
+  optional string module = 1;
+  optional string layer = 2;
+  // This value is set to the attribute `param_str` of the `PythonLayer` object
+  // in Python before calling the `setup()` method. This could be a number,
+  // string, dictionary in Python dict format, JSON, etc. You may parse this
+  // string in `setup` method and use it in `forward` and `backward`.
+  optional string param_str = 3 [default = ''];
+  // Whether this PythonLayer is shared among worker solvers during data parallelism.
+  // If true, each worker solver sequentially runs forward from this layer.
+  // This value should be set true if you are using it as a data layer.
+  optional bool share_in_parallel = 4 [default = false];
+}
+*/
+
+// Message that stores hyper-parameters used by ReductionLayer
+message ReductionConf {
+  enum ReductionOp {
+    SUM = 1;
+    ASUM = 2;
+    SUMSQ = 3;
+    MEAN = 4;
+  }
+
+  optional ReductionOp operation = 1 [default = SUM]; // reduction operation
+
+  // The first axis to reduce to a scalar -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // (Currently, only reduction along ALL "tail" axes is supported; reduction
+  // of axis M through N, where N < num_axes - 1, is unsupported.)
+  // Suppose we have an n-axis bottom Blob with shape:
+  //     (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)).
+  // If axis == m, the output Blob will have shape
+  //     (d0, d1, d2, ..., d(m-1)),
+  // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1))
+  // times, each including (dm * d(m+1) * ... * d(n-1)) individual data.
+  // If axis == 0 (the default), the output Blob always has the empty shape
+  // (count 1), performing reduction across the entire input --
+  // often useful for creating new loss functions.
+  optional int32 axis = 2 [default = 0];
+
+  optional float coeff = 3 [default = 1.0]; // coefficient for output
+}
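
(Hedged aside: the axis semantics above reduce to simple bookkeeping; for an
n-axis shape and reduction axis m, the output keeps the leading m dims and
each reduction covers the trailing product.)

  #include <cstdint>
  #include <utility>
  #include <vector>

  // Sketch: split a shape at `axis` into (number of reductions, elements per
  // reduction), matching the ReductionConf comment above.
  std::pair<int64_t, int64_t> ReductionCounts(
      const std::vector<int64_t>& shape, int axis) {
    int64_t outer = 1, inner = 1;
    for (int i = 0; i < static_cast<int>(shape.size()); ++i)
      (i < axis ? outer : inner) *= shape[i];
    return {outer, inner};  // `outer` reductions, each over `inner` values
  }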
+
+// Message that stores hyper-parameters used by ReLULayer
+message ReLUConf {
+  // Allow non-zero slope for negative inputs to speed up optimization
+  // Described in:
+  // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
+  // improve neural network acoustic models. In ICML Workshop on Deep Learning
+  // for Audio, Speech, and Language Processing.
+  optional float negative_slope = 1 [default = 0];
+  /*
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 2 [default = DEFAULT];
+  */
+}
+
+message ReshapeConf {
+  // Specify the output dimensions. If some of the dimensions are set to 0,
+  // the corresponding dimension from the bottom layer is used (unchanged).
+  // Exactly one dimension may be set to -1, in which case its value is
+  // inferred from the count of the bottom blob and the remaining dimensions.
+  // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8:
+  //
+  //   layer {
+  //     type: "Reshape" bottom: "input" top: "output"
+  //     reshape_param { ... }
+  //   }
+  //
+  // If "input" is 2D with shape 2 x 8, then the following reshape_param
+  // specifications are all equivalent, producing a 3D blob "output" with shape
+  // 2 x 2 x 4:
+  //
+  //   reshape_param { shape { dim:  2  dim: 2  dim:  4 } }
+  //   reshape_param { shape { dim:  0  dim: 2  dim:  4 } }
+  //   reshape_param { shape { dim:  0  dim: 2  dim: -1 } }
+  //   reshape_param { shape { dim: -1  dim: 0  dim:  2 } }
+  //
+  optional BlobShape shape = 1;
+
+  // axis and num_axes control the portion of the bottom blob's shape that are
+  // replaced by (included in) the reshape. By default (axis == 0 and
+  // num_axes == -1), the entire bottom blob shape is included in the reshape,
+  // and hence the shape field must specify the entire output shape.
+  //
+  // axis may be non-zero to retain some portion of the beginning of the input
+  // shape (and may be negative to index from the end; e.g., -1 to begin the
+  // reshape after the last axis, including nothing in the reshape,
+  // -2 to include only the last axis, etc.).
+  //
+  // For example, suppose "input" is a 2D blob with shape 2 x 8.
+  // Then the following ReshapeLayer specifications are all equivalent,
+  // producing a blob "output" with shape 2 x 2 x 4:
+  //
+  //   reshape_param { shape { dim: 2  dim: 2  dim: 4 } }
+  //   reshape_param { shape { dim: 2  dim: 4 } axis:  1 }
+  //   reshape_param { shape { dim: 2  dim: 4 } axis: -3 }
+  //
+  // num_axes specifies the extent of the reshape.
+  // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on
+  // input axes in the range [axis, axis+num_axes].
+  // num_axes may also be -1, the default, to include all remaining axes
+  // (starting from axis).
+  //
+  // For example, suppose "input" is a 2D blob with shape 2 x 8.
+  // Then the following ReshapeLayer specifications are equivalent,
+  // producing a blob "output" with shape 1 x 2 x 8.
+  //
+  //   reshape_param { shape { dim:  1  dim: 2  dim:  8 } }
+  //   reshape_param { shape { dim:  1  dim: 2  }  num_axes: 1 }
+  //   reshape_param { shape { dim:  1  }  num_axes: 0 }
+  //
+  // On the other hand, these would produce output blob shape 2 x 1 x 8:
+  //
+  //   reshape_param { shape { dim: 2  dim: 1  dim: 8  }  }
+  //   reshape_param { shape { dim: 1 }  axis: 1  num_axes: 0 }
+  //
+  optional int32 axis = 2 [default = 0];
+  optional int32 num_axes = 3 [default = -1];
+}
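
(Hedged aside: the -1 inference described above, sketched in isolation;
handling of 0-valued "copy from bottom" dims is assumed to happen before
this step and is omitted.)

  #include <cstdint>
  #include <vector>

  // Sketch: complete a reshape target containing at most one -1 by inferring
  // it from the total element count; assumes the known dims divide `count`.
  std::vector<int64_t> InferShape(std::vector<int64_t> dims, int64_t count) {
    int64_t known = 1;
    int infer_at = -1;
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] == -1) infer_at = static_cast<int>(i);
      else known *= dims[i];
    }
    if (infer_at >= 0) dims[infer_at] = count / known;
    return dims;
  }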
+
+message SigmoidConf {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
+}
+
+message SliceConf {
+  // The axis along which to slice -- may be negative to index from the end
+  // (e.g., -1 for the last axis).
+  // By default, SliceLayer slices blobs along the "channels" axis (1).
+  optional int32 axis = 3 [default = 1];
+  repeated uint32 slice_point = 2;
+
+  // DEPRECATED: alias for "axis" -- does not support negative indexing.
+  optional uint32 slice_dim = 1 [default = 1];
+}
+
+// Message that stores hyper-parameters used by SoftmaxLayer, SoftmaxWithLossLayer
+message SoftmaxConf {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
+
+  // The axis along which to perform the softmax -- may be negative to index
+  // from the end (e.g., -1 for the last axis).
+  // Any other axes will be evaluated as independent softmaxes.
+  optional int32 axis = 2 [default = 1];
+}
+
+message TanHConf {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
+}
+
+// Message that stores hyper-parameters used by TileLayer
+message TileConf {
+  // The index of the axis to tile.
+  optional int32 axis = 1 [default = 1];
+
+  // The number of copies (tiles) of the blob to output.
+  optional int32 tiles = 2;
+}
+
+// Message that stores hyper-parameters used by ThresholdLayer
+message ThresholdConf {
+  optional float threshold = 1 [default = 0]; // Strictly positive values
+}
+
+/*
+message WindowDataConf {
+  // Specify the data source.
+  optional string source = 1;
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3;
+  // Specify the batch size.
+  optional uint32 batch_size = 4;
+  // Specify if we would like to randomly crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // Specify if we want to randomly mirror data.
+  optional bool mirror = 6 [default = false];
+  // Foreground (object) overlap threshold
+  optional float fg_threshold = 7 [default = 0.5];
+  // Background (non-object) overlap threshold
+  optional float bg_threshold = 8 [default = 0.5];
+  // Fraction of batch that should be foreground objects
+  optional float fg_fraction = 9 [default = 0.25];
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 context_pad = 10 [default = 0];
+  // Mode for cropping out a detection window
+  // warp: cropped window is warped to a fixed size and aspect ratio
+  // square: the tightest square around the window is cropped
+  optional string crop_mode = 11 [default = "warp"];
+  // cache_images: will load all images in memory for faster access
+  optional bool cache_images = 12 [default = false];
+  // append root_folder to locate images
+  optional string root_folder = 13 [default = ""];
+}
+*/
+
+message SPPConf {
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional uint32 pyramid_height = 1;
+  optional PoolMethod pool = 2 [default = MAX]; // The pooling method
+  /*
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 6 [default = DEFAULT];
+  */
+}
+
+message PReLUConf {
+  // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
+  // Surpassing Human-Level Performance on ImageNet Classification, 2015.
+
+  // Initial value of a_i. Default is a_i=0.25 for all i.
+  optional FillerConf filler = 1;
+  // Whether or not slope parameters are shared across channels.
+  optional bool channel_shared = 2 [default = false];
+}


[46/50] [abbrv] incubator-singa git commit: SINGA-195 Channel for sending training statistics

Posted by zh...@apache.org.
SINGA-195 Channel for sending training statistics

Add comments and TODOs.
Reformat some code.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a4fc4ea1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a4fc4ea1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a4fc4ea1

Branch: refs/heads/master
Commit: a4fc4ea1d251242129be2e1a3cd388ca145223ca
Parents: a2a8e34
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Jun 13 17:47:44 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 17:48:36 2016 +0800

----------------------------------------------------------------------
 include/singa/utils/channel.h | 11 ++++++++++-
 src/utils/channel.cc          | 25 +++++++++++++++----------
 test/singa/test_channel.cc    |  4 ++--
 3 files changed, 27 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a4fc4ea1/include/singa/utils/channel.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/channel.h b/include/singa/utils/channel.h
index 7cd7aa3..b640e90 100644
--- a/include/singa/utils/channel.h
+++ b/include/singa/utils/channel.h
@@ -31,16 +31,24 @@
 
 namespace singa {
 
+/// Channel for appending metrics or other information to files or the screen.
 class Channel {
  public:
   explicit Channel(const std::string& name);
   ~Channel();
 
+  /// Return the channel name, which is also used for naming the output file.
   inline const std::string& GetName() { return name_; }
+  /// Disabled by default.
   inline void EnableDestStderr(bool enable) { stderr_ = enable; }
+  /// Enabled by default.
   inline void EnableDestFile(bool enable) { file_ = enable; }
+  /// Reset the output file path.
+  /// The dest file is named as global dir + channel name by default.
   void SetDestFilePath(const std::string& file);
+  /// Append a string message
   void Send(const std::string& message);
+  /// Append a protobuf message
   void Send(const google::protobuf::Message& message);
 
  private:
@@ -64,7 +72,8 @@ class ChannelManager {
   std::map<std::string, Channel*> name2ptr_;
 };
 
-/// Initial function for global usage of channel
+/// Initialization function for global usage of channels.
+/// 'argv' is for future use.
 void InitChannel(const char* argv);
 /// Set the directory name for persisting channel content
 void SetChannelDirectory(const char* path);
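
(Hedged aside: the hunk above covers Channel's public surface; an end-to-end
usage sketch follows, assuming GetChannel is declared in this header as its
definition in channel.cc below suggests. The directory and channel names are
made up.)

  #include "singa/utils/channel.h"

  int main() {
    singa::InitChannel(nullptr);                // argv is reserved for future use
    singa::SetChannelDirectory("/tmp/singa/");  // hypothetical output directory
    singa::Channel* train = singa::GetChannel("train-loss");
    train->EnableDestStderr(true);              // echo to stderr as well as file
    train->Send("step 100, loss 0.53");         // appended to /tmp/singa/train-loss
    return 0;
  }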

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a4fc4ea1/src/utils/channel.cc
----------------------------------------------------------------------
diff --git a/src/utils/channel.cc b/src/utils/channel.cc
index 52909a3..95daed6 100644
--- a/src/utils/channel.cc
+++ b/src/utils/channel.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -28,7 +28,7 @@ namespace singa {
 
 ChannelManager::~ChannelManager() {
   for (auto it : name2ptr_) {
-    if (it.second != nullptr) delete(it.second);
+    if (it.second != nullptr) delete (it.second);
   }
 }
 
@@ -39,7 +39,7 @@ void ChannelManager::Init() {
 void ChannelManager::SetDefaultDir(const char* dir) {
   if (dir != nullptr) {
     dir_ = dir;
-    if (dir[dir_.length()-1] != '/') dir_ += '/';
+    if (dir[dir_.length() - 1] != '/') dir_ += '/';
   }
 }
 
@@ -48,16 +48,14 @@ Channel* ChannelManager::GetInstance(const std::string& channel) {
   if (name2ptr_.find(channel) == name2ptr_.end()) {
     // create new channel
     Channel* chn = new Channel(channel);
-    chn->SetDestFilePath(dir_+channel);
+    chn->SetDestFilePath(dir_ + channel);
     chn->EnableDestFile(true);
     name2ptr_[channel] = chn;
   }
   return name2ptr_[channel];
 }
 
-Channel::Channel(const std::string& name) {
-  name_ = name;
-}
+Channel::Channel(const std::string& name) { name_ = name; }
 
 Channel::~Channel() {
   if (os_.is_open()) os_.close();
@@ -66,6 +64,11 @@ Channel::~Channel() {
 void Channel::SetDestFilePath(const std::string& file) {
   // file is append only
   if (os_.is_open()) os_.close();
+  {
+    ifstream fin(file.c_str());
+    if (fin.good())
+      LOG(WARNING) << "Messages will be appended to an existing file: " << file;
+  }
   os_.open(file.c_str(), std::ios::app);
   if (os_.is_open() == false)
     LOG(WARNING) << "Cannot open channel file (" << file << ")";
@@ -74,11 +77,13 @@ void Channel::SetDestFilePath(const std::string& file) {
 void Channel::Send(const std::string& message) {
   if (stderr_) fprintf(stderr, "%s\n", message.c_str());
   if (file_ && os_.is_open()) os_ << message << "\n";
+  // TODO(wangwei) flush
 }
 
 void Channel::Send(const google::protobuf::Message& message) {
   if (stderr_) fprintf(stderr, "%s\n", message.DebugString().c_str());
   if (file_ && os_.is_open()) message.SerializeToOstream(&os_);
+  // TODO(wangwei) flush
 }
 
 void InitChannel(const char* argv) {
@@ -87,12 +92,12 @@ void InitChannel(const char* argv) {
 }
 
 void SetChannelDirectory(const char* path) {
-  ChannelManager * mng = Singleton<ChannelManager>().Instance();
+  ChannelManager* mng = Singleton<ChannelManager>().Instance();
   mng->SetDefaultDir(path);
 }
 
 Channel* GetChannel(const std::string& channel_name) {
-  ChannelManager * mng = Singleton<ChannelManager>().Instance();
+  ChannelManager* mng = Singleton<ChannelManager>().Instance();
   return mng->GetInstance(channel_name);
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a4fc4ea1/test/singa/test_channel.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_channel.cc b/test/singa/test_channel.cc
index 77d7cbc..68b0017 100644
--- a/test/singa/test_channel.cc
+++ b/test/singa/test_channel.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY


[23/50] [abbrv] incubator-singa git commit: SINGA-180 Add Activation layer and Softmax layer

Posted by zh...@apache.org.
SINGA-180 Add Activation layer and Softmax layer

Add cpu and cudnn implementation for activation and softmax layer.

Note: the activation layer currently supports the sigmoid/tanh functions and relu forward computation.

Remove the tensor softmax function. Instead, use tensor ops (*) and functions (Sum) to implement the softmax function.

Add test files for activation and softmax layer.

Add element-wise implementations for the activation functions (relu/tanh/sigmoid).

Add tensor-scalar comparison functions (<, <=, >, >=), i.e., to compare a tensor with a constant.

Add implementations for the tensor math functions (exp, log, pow).

Add functions for matrix op vector, where op is multiply or div.

Pass all tests.
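
The composed SoftMax (see the tensor.cc hunk below) boils down to three tensor
ops on the 2-d view of the input; a sketch with illustrative names:

  // 'in' viewed as nrow x ncol; softmax is taken along each row
  Tensor out(in.shape(), in.device(), in.data_type());
  Exp(in, &out);                                  // out[r][c] = exp(in[r][c])
  Tensor sum(Shape{nrow}, in.device(), in.data_type());
  SumColumns(out, &sum);                          // sum[r] = sum_c out[r][c]
  DivColumn(sum, &out);                           // out[r][c] /= sum[r]

Note that exp is applied directly, without first subtracting the per-row max,
so rows with large values can overflow in exp.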


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/3e2507b7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/3e2507b7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/3e2507b7

Branch: refs/heads/master
Commit: 3e2507b7af8c4fe3746f3156f29eba99a30e546f
Parents: 2dac380
Author: jixin <ji...@comp.nus.edu.sg>
Authored: Fri May 27 22:03:35 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Tue May 31 22:08:31 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h         | 107 +++++++++++++++++-----
 src/core/tensor/math_kernel.cu      | 132 +++++++++++++++++----------
 src/core/tensor/math_kernel.h       |   6 +-
 src/core/tensor/tensor.cc           | 152 ++++++++++++++++---------------
 src/core/tensor/tensor_math.h       |  47 +++++++++-
 src/core/tensor/tensor_math_cpp.h   | 148 ++++++++++++++++++++++++------
 src/core/tensor/tensor_math_cuda.h  |  54 +++++++----
 src/model/layer/activation.cc       |  67 ++++++++++++++
 src/model/layer/activation.h        |  51 +++++++++++
 src/model/layer/cudnn_activation.cc | 115 +++++++++++++++++++++++
 src/model/layer/cudnn_activation.h  |  58 ++++++++++++
 src/model/layer/cudnn_softmax.cc    |  77 ++++++++++++++++
 src/model/layer/cudnn_softmax.h     |  54 +++++++++++
 src/model/layer/softmax.cc          |  64 +++++++++++++
 src/model/layer/softmax.h           |  45 +++++++++
 test/singa/test_activation.cc       | 133 +++++++++++++++++++++++++++
 test/singa/test_cudnn_activation.cc | 136 +++++++++++++++++++++++++++
 test/singa/test_cudnn_dropout.cc    |   2 +-
 test/singa/test_cudnn_softmax.cc    | 107 ++++++++++++++++++++++
 test/singa/test_softmax.cc          | 110 ++++++++++++++++++++++
 20 files changed, 1468 insertions(+), 197 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 8682bca..bb8d7f8 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -62,7 +62,7 @@ inline size_t SizeOf(DataType t) {
 /// then it must be set up correctly (shape, device). Otherwise, runtime error
 /// like SegmentFault would happen. Simply type/device check would be conducted.
 class Tensor {
-public:
+ public:
   ~Tensor();
   Tensor();
   explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
@@ -83,7 +83,8 @@ public:
   Device *device() const { return device_; }
 
   /// Return immutable Tensor values with given type.
-  template <typename DType> DType data() const {
+  template <typename DType>
+  DType data() const {
     return static_cast<DType>(blob()->data());
   }
 
@@ -130,7 +131,8 @@ public:
   void ToHost();
 
   /// Set each element of the tensor to be x
-  template <typename SType> void SetValue(const SType x);
+  template <typename SType>
+  void SetValue(const SType x);
 
   /// For init the tensor values, copy 'num' elements.
   template <typename DType>
@@ -141,7 +143,7 @@ public:
   void CopyData(const Tensor &other);
 
   /// Return an exactly the same Tensor with data been deep copied.
-  Tensor Clone();
+  Tensor Clone() const;
 
   // Tensor operations
 
@@ -167,23 +169,27 @@ public:
   // Scalar operations.
 
   /// T is a scalar type
-  template <typename DType> Tensor &operator+=(DType x);
+  template <typename DType>
+  Tensor &operator+=(DType x);
 
   /// T is a scalar type
-  template <typename DType> Tensor &operator-=(const DType x);
+  template <typename DType>
+  Tensor &operator-=(const DType x);
 
   /// T is a scalar type
-  template <typename DType> Tensor &operator*=(const DType x);
+  template <typename DType>
+  Tensor &operator*=(const DType x);
 
   /// T is a scalar type
-  template <typename DType> Tensor &operator/=(const DType x);
+  template <typename DType>
+  Tensor &operator/=(const DType x);
 
   /// save Tensor into a proto msg
   // void ToProto(TensorProto* t);
   /// load Tensor from proto msg
   // void FromProto(const TensorProto& t);
 
-protected:
+ protected:
   bool transpose_ = false;
   DataType data_type_ = kFloat32;
   Device *device_ = nullptr;
@@ -220,7 +226,8 @@ Tensor Sqrt(const Tensor &t);
 Tensor Square(const Tensor &t);
 Tensor Tanh(const Tensor &t);
 
-template <typename SType> SType Sum(const Tensor &t);
+template <typename SType>
+SType Sum(const Tensor &t);
 /// Sum elements in the Tensor, currently only support vector and matrix.
 /// if 'axis' is 0, sum all rows into a single row
 /// if 'axis' is 1, sum all columns into a single column
@@ -232,16 +239,48 @@ Tensor Sum(const Tensor &t, int axis);
 /// if 'axis' is 1, average all columns into a single column
 /// TODO(wangwei) support arbitrary Tensor like numpy.average
 Tensor Average(const Tensor &t, int axis);
+/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
+/// and shape_[axis]*...*shape_[nDim()] columns.
+/// and do softmax along each row.
+Tensor SoftMax(const Tensor &t, int axis = 0);
+void SoftMax(const Tensor &t, int axis, Tensor *ret);
+
 /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
 /// and shape_[axis+1]*...*shape_[nDim()] columns.
 /// and do softmax along each row.
-Tensor Softmax(const Tensor &t, int axis = -1);
-void Softmax(const Tensor &t, Tensor *ret, int axis = -1);
+// Tensor Softmax(const Tensor& t, int axis = -1);
+// void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
+
+/// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f
+template <typename DType>
+Tensor operator<(const Tensor &t, const DType x);
+template <typename DType>
+void LT(const Tensor &t, DType x, Tensor *ret);
+
+/// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f
+template <typename DType>
+Tensor operator<=(const Tensor &t, const DType x);
+template <typename DType>
+void LE(const Tensor &t, DType x, Tensor *ret);
+
+/// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f
+template <typename DType>
+Tensor operator>(const Tensor &t, const DType x);
+template <typename DType>
+void GT(const Tensor &t, DType x, Tensor *ret);
+
+/// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f
+template <typename DType>
+Tensor operator>=(const Tensor &t, const DType x);
+template <typename DType>
+void GE(const Tensor &t, DType x, Tensor *ret);
 
 /// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType> Tensor Pow(const Tensor &t, DType x);
+template <typename DType>
+Tensor Pow(const Tensor &t, DType x);
 /// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType> void Pow(const Tensor &t, DType x, Tensor *ret);
+template <typename DType>
+void Pow(const Tensor &t, DType x, Tensor *ret);
 /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
 Tensor Pow(const Tensor &base, Tensor exp);
 /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
@@ -256,18 +295,25 @@ void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
 Tensor operator/(const Tensor &lhs, const Tensor &rhs);
 void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
 
-template <typename DType> Tensor operator+(const Tensor &t, DType x);
-template <typename DType> void Add(const Tensor &t, DType x, Tensor *ret);
+template <typename DType>
+Tensor operator+(const Tensor &t, DType x);
+template <typename DType>
+void Add(const Tensor &t, DType x, Tensor *ret);
 
-template <typename DType> Tensor operator-(const Tensor &t, DType x);
-template <typename DType> void Sub(const Tensor &t, DType x, Tensor *ret);
+template <typename DType>
+Tensor operator-(const Tensor &t, DType x);
+template <typename DType>
+void Sub(const Tensor &t, DType x, Tensor *ret);
 
-template <typename DType> Tensor operator*(const Tensor &t, DType x);
+template <typename DType>
+Tensor operator*(const Tensor &t, DType x);
 template <typename DType>
 void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
 
-template <typename DType> Tensor operator/(const Tensor &t, DType x);
-template <typename DType> void Div(const Tensor &t, DType x, Tensor *ret);
+template <typename DType>
+Tensor operator/(const Tensor &t, DType x);
+template <typename DType>
+void Div(const Tensor &t, DType x, Tensor *ret);
 
 // ================Blas operations============================================
 // We fix the scalar argument type to be float.
@@ -301,6 +347,7 @@ void Uniform(float low, float high, Tensor *t);
 void Gaussian(float mean, float std, Tensor *t);
 
 // follow the consistency guide
+// https://issues.apache.org/jira/browse/SINGA-182
 // ============Matrix vector operations=======================================
 /// Add column 'v' with each column of matrix M
 void AddColumn(const Tensor &v, Tensor *M);
@@ -329,12 +376,28 @@ void SumRows(const Tensor &M, Tensor *out);
 void SumColumns(const Tensor &M, Tensor *out);
 
 /// For each element x of Tensor 'in', compute alpha/x
-template <typename SType> Tensor Div(const SType alpha, const Tensor &in);
+template <typename SType>
+Tensor Div(const SType alpha, const Tensor &in);
 
 /// For each element x of Tensor 'in', compute alpha/x into 'out'
 template <typename SType>
 void Div(const SType alpha, const Tensor &in, Tensor *out);
 
+/*
+/// Multiply each column of the lhs matrix with the rhs column
+Tensor MultColumn(const Tensor &lhs, const Tensor &rhs);
+void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+/// Multiply each row of the lhs matrix with the rhs row
+Tensor MultRow(const Tensor &lhs, const Tensor &rhs);
+void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+/// Div each row of the lhs matrix with the rhs column
+Tensor DivColumn(const Tensor &lhs, const Tensor &rhs);
+void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+/// Divide each row of the lhs matrix by the rhs row
+Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
+void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+*/
+
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_
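
The comparison operators declared above return a 0/1 mask tensor rather than a
bool, so they compose with the element-wise ops; a small sketch (headers and
device setup omitted):

  Tensor t(Shape{2, 2});
  t.SetValue(-0.5f);
  Tensor mask = (t >= 0.f);     // mask[i] = 1.0f where t[i] >= 0, else 0.0f
  Tensor rectified = t * mask;  // element-wise relu built from a mask

This masking pattern is exactly what the new activation layer uses for the
relu gradient (see activation.cc below).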

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index 88041b1..aed6add 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -32,7 +32,7 @@
 #define CU1DBLOCK 1024
 #define CU1DBLOCKF 1024.0
 
-namespace singa{
+namespace singa {
 // Cuda Kernel Functions
 namespace cuda {
 __global__ void kernel_softmax_loss(const float *prob, const int *label,
@@ -147,7 +147,8 @@ __global__ void kernel_add_vec_row(const float *src_vec_data,
     des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
   }
 }
-__global__ void kernel_add(const float *src1, const float *src2, float*out, int n) {
+__global__ void kernel_add(const float *src1, const float *src2, float *out,
+                           int n) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
@@ -155,7 +156,8 @@ __global__ void kernel_add(const float *src1, const float *src2, float*out, int
   }
 }
 
-__global__ void kernel_sub(const float *src1, const float *src2, float*out, int n) {
+__global__ void kernel_sub(const float *src1, const float *src2, float *out,
+                           int n) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
@@ -323,42 +325,28 @@ __global__ void kernel_threshold(const float *src_data, float *des_data,
     des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
   }
 }
-
-/*
-void softmaxloss_forward(int n, int dim, const float *prob,
-    const int *label, float *loss) {
-  kernel_softmax_loss<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(prob, label, loss, n,
-      dim);
-}
-
-void softmaxloss_backward(int n, int dim, float scale,
-    const int *label, float *grad) {
-  kernel_softmax_gradient<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(grad, label, n,
-      dim, scale);
-}
-*/
 void sum(int n, const float *in, float *out) {
   int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
   //  here, we only need one block
   int num_blocks = 1;
 
-  kernel_sum_vec<<<num_blocks, threads_per_block>>>(in, out, n);
+  kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
 }
 
 void sum_row(int rows, int cols, int stride, const float *in, float *out) {
   int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
   int num_blocks = cols;
 
-  kernel_sum_row<<<num_blocks, threads_per_block>>>(in, out, rows, cols,
-                                                    stride);
+  kernel_sum_row << <num_blocks, threads_per_block>>>
+      (in, out, rows, cols, stride);
 }
 
 void sum_col(int rows, int cols, int stride, const float *in, float *out) {
   int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
   int num_blocks = rows;
 
-  kernel_sum_col<<<num_blocks, threads_per_block>>>(in, out,
-                                                    rows, cols, stride);
+  kernel_sum_col << <num_blocks, threads_per_block>>>
+      (in, out, rows, cols, stride);
 }
 void add_row(int rows, int cols, int stride, const float *in_row,
              const float *in_mat, float *out) {
@@ -366,92 +354,91 @@ void add_row(int rows, int cols, int stride, const float *in_row,
   dim3 num_blocks(
       cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
       rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
-  kernel_add_vec_row<<<num_blocks, threads_per_block>>>(in_row, in_mat, out,
-                                                        rows, cols, stride);
+  kernel_add_vec_row << <num_blocks, threads_per_block>>>
+      (in_row, in_mat, out, rows, cols, stride);
 }
 void add(int n, const float *a, const float *b, float *out) {
-  kernel_add<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+  kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 }
 void sub(int n, const float *a, const float *b, float *out) {
-  kernel_sub<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+  kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 }
 void exp(int n, const float *in, float *out) {
-  kernel_exp<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void log(int n, const float *in, float *out) {
-  kernel_log<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void sigmoid(int n, const float *in, float *out) {
-  kernel_sigmoid<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void sigmoid_grad(int n, const float *in, float *out) {
-  kernel_sigmoid_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void relu(int n, const float *in, float *out) {
-  kernel_relu<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void relu_grad(int n, const float *in, float *out) {
-  kernel_relu_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void tanh(int n, const float *in, float *out) {
-  kernel_tanh<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void tanh_grad(int n, const float *in, float *out) {
-  kernel_tanh_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void softplus(int n, const float *in, float *out) {
-  kernel_softplus<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void softplus_grad(int n, const float *in, float *out) {
-  kernel_softplus_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void square(int n, const float *in, float *out) {
-  kernel_square<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void square_grad(int n, const float *in, float *out) {
-  kernel_square_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void sqrt(int n, const float *in, float *out) {
-  kernel_sqrt<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+  kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
 void pow(int n, const float *a, const float *b, float *out) {
-  kernel_pow<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+  kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 }
 
 void mult(int n, const float *a, const float *b, float *out) {
-  kernel_mult<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 }
 
 void mult(int n, const float *a, const float x, float *out) {
-  kernel_mult<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, x, out, n);
+  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
 }
 
 void div(int n, const float *a, const float *b, float *out) {
-  kernel_div<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+  kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
 }
 
 void set_value(int n, float v, float *out) {
-  kernel_set_value<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(out, v, n);
+  kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
 }
 
 void threshold(int n, float alpha, const float *in, float *out) {
-  kernel_threshold<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, alpha, n);
+  kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
 }
 
-
 // follow the consistency guide for math API
 __global__ void KernelDiv(const size_t num, const float alpha, const float *in,
                           float *out) {
@@ -461,6 +448,36 @@ __global__ void KernelDiv(const size_t num, const float alpha, const float *in,
   }
 }
 
+__global__ void KernelGE(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
+  }
+}
+__global__ void KernelGT(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] > x ? 1.0f : 0.0f;
+  }
+}
+__global__ void KernelLE(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] <= x ? 1.0f : 0.0f;
+  }
+}
+
+__global__ void KernelLT(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] < x ? 1.0f : 0.0f;
+  }
+}
+
 __global__ void KernelSet(const size_t num, const float x, float *out) {
   for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
        idx += blockDim.x * gridDim.x) {
@@ -468,14 +485,31 @@ __global__ void KernelSet(const size_t num, const float x, float *out) {
   }
 }
 
+void Set(const size_t num, const float x, float *out, cudaStream_t s) {
+  KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
+}
 void Div(const size_t num, float alpha, const float *in, float *out,
          cudaStream_t s) {
-  KernelDiv<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, alpha, in, out);
+  KernelDiv << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, alpha, in, out);
 }
 
-void Set(const size_t num, const float x, float *out, cudaStream_t s) {
-  KernelSet<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, x, out);
+void GT(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void GE(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 }
+void LT(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void LE(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+
 }  // namespace cuda
 }  // namespace singa
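
The new KernelGE/GT/LE/LT kernels above all follow the grid-stride loop idiom;
a generic sketch of the pattern (not code from this commit):

  __global__ void KernelOp(const size_t num, const float* in, float* out) {
    // each thread starts at its global index and advances by the total
    // number of threads in the grid until all 'num' elements are covered
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
         idx += blockDim.x * gridDim.x) {
      out[idx] = in[idx];  // placeholder for the element-wise op
    }
  }

With the <<<ceil(num / CU1DBLOCKF), CU1DBLOCK>>> launches used in this file
the grid already covers every element, so each thread executes the body at
most once; the stride loop keeps the kernel correct if the grid is ever capped
below that.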
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 925346e..5c906a9 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -86,7 +86,11 @@ void threshold(int n, float alpha, const float *in, float *out);
 void Div(const size_t num, const float x, const float *in, float *out,
          cudaStream_t s);
 void Set(const size_t num, const float x, float *out, cudaStream_t s);
-} // cuda
+void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
+void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
+void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
+void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
+}  // cuda
 
 }  // namespace singa
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index fcf42c2..5ae375c 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -142,7 +142,7 @@ void Tensor::CopyData(const Tensor &src) {
   }
 }
 
-Tensor Tensor::Clone() {
+Tensor Tensor::Clone() const {
   Tensor t(shape_, device_, data_type_);
   t.transpose_ = transpose_;
   t.CopyData(*this);
@@ -200,28 +200,28 @@ Tensor Reshape(const Tensor &in, Shape &&s) {
   return out;
 }
 
-#define GenUnaryTensorArgMemberFunction(op, fn)                                \
+#define GenUnaryTensorArgMemberFn(op, fn)                                \
   Tensor &Tensor::op(const Tensor &t) {                                        \
     fn(*this, t, this);                                                        \
     return *this;                                                              \
   }
 
-GenUnaryTensorArgMemberFunction(operator+=, Add);
-GenUnaryTensorArgMemberFunction(operator-=, Sub);
-GenUnaryTensorArgMemberFunction(operator*=, EltwiseMult);
-GenUnaryTensorArgMemberFunction(operator/=, Div);
+GenUnaryTensorArgMemberFn(operator+=, Add);
+GenUnaryTensorArgMemberFn(operator-=, Sub);
+GenUnaryTensorArgMemberFn(operator*=, EltwiseMult);
+GenUnaryTensorArgMemberFn(operator/=, Div);
 
-#define GenUnaryScalarArgMemberFunction(op, fn)                                \
+#define GenUnaryScalarArgMemberFn(op, fn)                                \
   template <typename DType> Tensor &Tensor::op(DType x) {                      \
     fn(*this, x, this);                                                        \
     return *this;                                                              \
   }                                                                            \
   template Tensor &Tensor::op<float>(float x)
 
-GenUnaryScalarArgMemberFunction(operator-=, Sub);
-GenUnaryScalarArgMemberFunction(operator+=, Add);
-GenUnaryScalarArgMemberFunction(operator*=, EltwiseMult);
-GenUnaryScalarArgMemberFunction(operator/=, Div);
+GenUnaryScalarArgMemberFn(operator-=, Sub);
+GenUnaryScalarArgMemberFn(operator+=, Add);
+GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
+GenUnaryScalarArgMemberFn(operator/=, Div);
 
 // ====================Tensor Operations=======================================
 void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
@@ -325,34 +325,35 @@ template <typename SType> void Tensor::SetValue(const SType x) {
 }
 template void Tensor::SetValue<float>(const float x);
 
-#define EltwiseUnaryTensorFn(fn, t, ret)                                       \
-  do {                                                                         \
-    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {         \
-      ret->device()->Exec(                                                     \
-          [t, ret](Context *ctx) {                                             \
-            fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);             \
-          },                                                                   \
-          {t.blob()}, {ret->blob()});                                          \
-    });                                                                        \
+#define EltwiseUnaryTensorFn(fn, t, ret)                               \
+  do {                                                                 \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
+      ret->device()->Exec(                                             \
+          [t, ret](Context* ctx) {                                     \
+            fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);     \
+          },                                                           \
+          {t.blob()}, {ret->blob()});                                  \
+    });                                                                \
   } while (0)
 
-#define GenUnaryTensorFunction(fn)                                             \
-  Tensor fn(const Tensor &t) {                                                 \
-    Tensor ret(t.shape(), t.device(), t.data_type());                          \
-    auto *retptr = &ret;                                                       \
-    EltwiseUnaryTensorFn(fn, t, retptr);                                       \
-    return ret;                                                                \
-  }
-
-GenUnaryTensorFunction(Abs);
-GenUnaryTensorFunction(Exp);
-GenUnaryTensorFunction(Log);
-GenUnaryTensorFunction(ReLU);
-GenUnaryTensorFunction(Sigmoid);
-GenUnaryTensorFunction(Sign);
-GenUnaryTensorFunction(Sqrt);
-GenUnaryTensorFunction(Square);
-GenUnaryTensorFunction(Tanh);
+#define GenUnaryTensorFn(fn)                          \
+  Tensor fn(const Tensor &t) {                        \
+    Tensor ret(t.shape(), t.device(), t.data_type()); \
+    auto *retptr = &ret;                              \
+    EltwiseUnaryTensorFn(fn, t, retptr);              \
+    return ret;                                       \
+  }                                                   \
+  void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); }
+
+GenUnaryTensorFn(Abs);
+GenUnaryTensorFn(Exp);
+GenUnaryTensorFn(Log);
+GenUnaryTensorFn(ReLU);
+GenUnaryTensorFn(Sigmoid);
+GenUnaryTensorFn(Sign);
+GenUnaryTensorFn(Sqrt);
+GenUnaryTensorFn(Square);
+GenUnaryTensorFn(Tanh);
 
 // TODO(wangwei) conside async exec
 template <> float Sum<float>(const Tensor &t) {
@@ -402,28 +403,25 @@ Tensor Average(const Tensor &t, int axis) {
   }
 }
 
-Tensor Softmax(const Tensor &t, int axis) {
-  Tensor ret(t.shape(), t.device(), t.data_type());
-  Softmax(t, &ret, axis);
-  return ret;
+Tensor SoftMax(const Tensor &in, int axis) {
+  Tensor out(in.shape(), in.device(), in.data_type());
+  SoftMax(in, axis, &out);
+  return out;
 }
 
-void Softmax(const Tensor &t, Tensor *ret, int axis) {
-  int nrow = 1, ncol = t.Size(), size = ncol;
-  CHECK_GE(axis, -1);
-  CHECK_GT(t.shape().size(), 0u);
-  if (axis > -1) {
-    nrow = Product(t.shape(), 0, axis + 1);
-    CHECK_EQ(size % nrow, 0) << "Size = " << size << " nrow = " << nrow;
+void SoftMax(const Tensor &in, int axis, Tensor *out) {
+  size_t nrow = 1, ncol = in.Size(), size = ncol;
+  CHECK_GE(axis, 0);
+  if (axis > 0) {
+    nrow = Product(in.shape(), 0, axis);
+    CHECK_EQ(size % nrow, 0u) << "Size = " << size << " nrow = " << nrow;
     ncol = size / nrow;
   }
-  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
-    ret->device()->Exec(
-        [nrow, ncol, t, ret](Context *ctx) {
-          Softmax<DType, Lang>(nrow, ncol, t.blob(), ret->blob(), ctx);
-        },
-        {t.blob()}, {ret->blob()});
-  });
+  Exp(in, out);
+  out->Reshape(Shape{nrow, ncol});
+  Tensor sum(Shape{nrow}, in.device(), in.data_type());
+  SumColumns(*out, &sum);
+  DivColumn(sum, out);
 }
 
 #define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
@@ -439,7 +437,7 @@ void Softmax(const Tensor &t, Tensor *ret, int axis) {
     });                                                                        \
   } while (0)
 
-#define GenBinaryTensorFunction(op, fn)                                        \
+#define GenBinaryTensorFn(op, fn)                                        \
   Tensor op(const Tensor &lhs, const Tensor &rhs) {                            \
     Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());                    \
     fn(lhs, rhs, &ret);                                                        \
@@ -449,11 +447,11 @@ void Softmax(const Tensor &t, Tensor *ret, int axis) {
     EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                                  \
   }
 
-GenBinaryTensorFunction(operator+, Add);
-GenBinaryTensorFunction(operator-, Sub);
-GenBinaryTensorFunction(operator*, EltwiseMult);
-GenBinaryTensorFunction(operator/, Div);
-GenBinaryTensorFunction(Pow, Pow);
+GenBinaryTensorFn(operator+, Add);
+GenBinaryTensorFn(operator-, Sub);
+GenBinaryTensorFn(operator*, EltwiseMult);
+GenBinaryTensorFn(operator/, Div);
+GenBinaryTensorFn(Pow, Pow);
 
 #define EltwiseTensorScalarFn(fn, t, x, ret)                                   \
   do {                                                                         \
@@ -468,7 +466,7 @@ GenBinaryTensorFunction(Pow, Pow);
     });                                                                        \
   } while (0)
 
-#define GenTensorScalarFunction(op, fn)                                        \
+#define GenTensorScalarFn(op, fn)                                        \
   template <typename SType> Tensor op(const Tensor &t, SType x) {              \
     Tensor ret(t.shape(), t.device(), t.data_type());                          \
     fn(t, x, &ret);                                                            \
@@ -480,11 +478,15 @@ GenBinaryTensorFunction(Pow, Pow);
   template Tensor op<float>(const Tensor &t, float x);                         \
   template void fn<float>(const Tensor &t, const float x, Tensor *ret)
 
-GenTensorScalarFunction(operator+, Add);
-GenTensorScalarFunction(operator-, Sub);
-GenTensorScalarFunction(operator*, EltwiseMult);
-GenTensorScalarFunction(operator/, Div);
-GenTensorScalarFunction(Pow, Pow);
+GenTensorScalarFn(operator+, Add);
+GenTensorScalarFn(operator-, Sub);
+GenTensorScalarFn(operator*, EltwiseMult);
+GenTensorScalarFn(operator/, Div);
+GenTensorScalarFn(Pow, Pow);
+GenTensorScalarFn(operator<, LT);
+GenTensorScalarFn(operator<=, LE);
+GenTensorScalarFn(operator>, GT);
+GenTensorScalarFn(operator>=, GE);
 
 // ================Blas operations============================================
 Tensor Mult(const Tensor &lhs, const Tensor &rhs) {
@@ -633,8 +635,8 @@ void DivRow(const Tensor &v, Tensor *M) {
 /// Multiply column 'v' and each column of matrix M; write results into 'out'
 void MultColumn(const Tensor &v, Tensor *M) {
   CHECK(!M->transpose()) << "Not supported yet";
-  CHECK_EQ(M->nDim(), 2);
-  CHECK_EQ(v.nDim(), 1);
+  CHECK_EQ(M->nDim(), 2u);
+  CHECK_EQ(v.nDim(), 1u);
   CHECK_EQ(v.Size(), M->shape(0));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
@@ -650,8 +652,8 @@ void MultColumn(const Tensor &v, Tensor *M) {
 /// Multiply row 'v' with each row of matrix M; write results into 'out'
 void MultRow(const Tensor &v, Tensor *M) {
   CHECK(!M->transpose()) << "Not supported yet";
-  CHECK_EQ(M->nDim(), 2);
-  CHECK_EQ(v.nDim(), 1);
+  CHECK_EQ(M->nDim(), 2u);
+  CHECK_EQ(v.nDim(), 1u);
   CHECK_EQ(v.Size(), M->shape(1));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
@@ -673,8 +675,8 @@ void SumColumns(const Tensor &M, Tensor *v) {
     Tensor X = M.T();
     SumRows(X, v);
   } else {
-    CHECK_EQ(M.nDim(), 2);
-    CHECK_EQ(v->nDim(), 1);
+    CHECK_EQ(M.nDim(), 2u);
+    CHECK_EQ(v->nDim(), 1u);
     size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
     CHECK_EQ(nb_row, v->Size());
 
@@ -688,8 +690,8 @@ void SumRows(const Tensor &M, Tensor *v) {
     Tensor X = M.T();
     SumColumns(X, v);
   } else {
-    CHECK_EQ(M.nDim(), 2);
-    CHECK_EQ(v->nDim(), 1);
+    CHECK_EQ(M.nDim(), 2u);
+    CHECK_EQ(v->nDim(), 1u);
     size_t nb_row = M.shape(0), nb_col = M.shape(1);
     CHECK_EQ(nb_col, v->Size());
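
Each GenTensorScalarFn line above stamps out an operator plus its named
helper; GenTensorScalarFn(operator>, GT), for instance, expands roughly (after
the float instantiation) to:

  Tensor operator>(const Tensor &t, float x) {
    Tensor ret(t.shape(), t.device(), t.data_type());
    GT(t, x, &ret);  // ret[i] = (t[i] > x) ? 1.f : 0.f
    return ret;
  }

GT itself goes through EltwiseTensorScalarFn, whose TYPE_LANG_SWITCH resolves
to the <DType, Lang> backend specialization.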
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 98d91bf..ff865e0 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -220,6 +220,27 @@ void Outer(int m, int n, const Blob *lhs, const Blob *rhs, Blob *ret,
   LOG(FATAL) << "Not Implemented";
 }
 
+/// ret[i]=(input[i]<x)?1.f:0.f
+template <typename DType, typename Lang>
+void LT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret[i]=(input[i]<=x)?1.f:0.f
+template <typename DType, typename Lang>
+void LE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret[i]=(input[i]>x)?1.f:0.f
+template <typename DType, typename Lang>
+void GT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret[i]=(input[i]>=x)?1.f:0.f
+template <typename DType, typename Lang>
+void GE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // ===== Level 1
 /// return the index of the element with the max value.
@@ -319,6 +340,30 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
           Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
-} // namespace singa
+/// ret[i]=(input[i]<x)?1.f:0.f
+template <typename DType, typename Lang>
+void LT(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret[i]=(input[i]<=x)?1.f:0.f
+template <typename DType, typename Lang>
+void LE(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret[i]=(input[i]>x)?1.f:0.f
+template <typename DType, typename Lang>
+void GT(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret[i]=(input[i]>=x)?1.f:0.f
+template <typename DType, typename Lang>
+void GE(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
 
+}  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
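
The dispatch pattern this header sets up: every primitive is a fully generic
template whose body fails loudly, and each backend header supplies the real
work as a full specialization. A condensed sketch with a made-up Op:

  // tensor_math.h: fallback that aborts at runtime
  template <typename DType, typename Lang>
  void Op(const size_t num, const Blob *in, Blob *out, Context *ctx) {
    LOG(FATAL) << "Not Implemented";
  }

  // tensor_math_cpp.h: the CPU float specialization does the work
  template <>
  void Op<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
                            Context *ctx) {
    const float *inPtr = static_cast<const float *>(in->data());
    float *outPtr = static_cast<float *>(out->mutable_data());
    for (size_t i = 0; i < num; i++) outPtr[i] = inPtr[i];
  }

A call routed through TYPE_LANG_SWITCH (see tensor.cc) lands on the matching
specialization, or hits the fatal fallback if a backend lacks one.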

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 97da896..693f09c 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -19,6 +19,7 @@
 #define SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
 #include "./tensor_math.h"
 #include "singa/core/common.h"
+#include <math.h>
 
 #ifdef USE_CBLAS
 #include <cblas.h>
@@ -51,6 +52,16 @@ void Add<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
 }
 
 template <>
+void Add<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
+                           Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = lptr[i] + x;
+  }
+}
+
+template <>
 void Sub<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
                            Blob *ret, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
@@ -61,6 +72,7 @@ void Sub<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
     dptr[i] = lptr[i] - rptr[i];
   }
 }
+
 // sum all elements of input into ret
 // TODO(wangwei) optimize using omp
 template <>
@@ -74,53 +86,96 @@ void Sum<float, lang::Cpp>(int count, const Blob *input, float *ret,
   *ret = s;
 }
 
-// TODO(wangwei) optimize using omp
 template <>
-void SumRows<float, lang::Cpp>(int nrow, int ncol, const Blob *input, Blob *ret,
-                               Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(int count, const Blob *input, float x,
+                                   Blob *ret, Context *ctx) {
   float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *in = static_cast<const float *>(input->data());
-  memset(dptr, 0, ncol * sizeof(float));
-  for (int r = 0; r < nrow; r++) {
-    for (int c = 0; c < ncol; c++) {
-      dptr[c] += in[r * ncol + c];
-    }
+  const float *lptr = static_cast<const float *>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = lptr[i] * x;
   }
 }
 
-// Sum the rows of the input matrix into a vector
-// TODO(wangwei) optimize using omp
 template <>
-void SumColumns<float, lang::Cpp>(int nrow, int ncol, const Blob *input,
-                                  Blob *ret, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
+                                   Blob *ret, Context *ctx) {
   float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *in = static_cast<const float *>(input->data());
-  memset(dptr, 0, ncol * sizeof(float));
-  for (int r = 0; r < nrow; r++) {
-    for (int c = 0; c < ncol; c++) {
-      dptr[r] += in[r * ncol + c];
-    }
+  const float *lptr = static_cast<const float *>(lhs->data());
+  const float *rptr = static_cast<const float *>(rhs->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = lptr[i] * rptr[i];
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *input, float x,
-                                   Blob *ret, Context *ctx) {
+void Exp<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
+                           Context *ctx) {
   float *dptr = static_cast<float *>(ret->mutable_data());
   const float *lptr = static_cast<const float *>(input->data());
   for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] * x;
+    dptr[i] = exp(lptr[i]);
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                                   Blob *ret, Context *ctx) {
+void Log<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
+                           Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(input->data());
+  for (int i = 0; i < count; i++) {
+    CHECK_GT(lptr[i], 0.f);
+    dptr[i] = log(lptr[i]);
+  }
+}
+
+template <>
+void Tanh<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
+                            Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = tanh(lptr[i]);
+  }
+}
+
+template <>
+void ReLU<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
+                            Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = (lptr[i] >= 0.f) ? lptr[i] : 0.f;
+  }
+}
+
+template <>
+void Sigmoid<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
+                               Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = 1.f / (1.f + exp(-lptr[i]));
+  }
+}
+
+template <>
+void Pow<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
+                           Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = pow(lptr[i], x);
+  }
+}
+
+template <>
+void Pow<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
+                           Blob *ret, Context *ctx) {
   float *dptr = static_cast<float *>(ret->mutable_data());
   const float *lptr = static_cast<const float *>(lhs->data());
   const float *rptr = static_cast<const float *>(rhs->data());
   for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] * rptr[i];
+    dptr[i] = pow(lptr[i], rptr[i]);
   }
 }
 
@@ -159,8 +214,15 @@ void Div<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
                            Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = alpha / inPtr[i];
+}
+template <>
+void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = alpha / inPtr[i];
+    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
   }
 }
 
@@ -192,9 +254,38 @@ template <>
 void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
                            Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++)
-    outPtr[i] = x;
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+template <>
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+  }
 }
+
 #ifdef USE_CBLAS
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -224,7 +315,6 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
 
 #endif  // USE_CBLAS
 
-
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 26299ba..4a2ba66 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -73,25 +73,6 @@ void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret,
   cuda::sum(count, in, ret);
 }
 
-// TODO(wangwei) optimize using stream
-template <>
-void SumRows<float, lang::Cuda>(int nrow, int ncol, const Blob *input,
-                                Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *in = static_cast<const float *>(input->data());
-  cuda::sum_row(nrow, ncol, ncol, in, dptr);
-}
-
-// Sum the rows of the input matrix into a vector
-// TODO(wangwei) optimize using stream
-template <>
-void SumColumns<float, lang::Cuda>(int nrow, int ncol, const Blob *input,
-                                   Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *in = static_cast<const float *>(input->data());
-  cuda::sum_col(nrow, ncol, ncol, in, dptr);
-}
-
 // follow the consistency guide of math API
 template <>
 void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
@@ -144,7 +125,42 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
 }
+
+template <>
+void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                                   Blob* out, Context *ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                                   Blob* out,  Context *ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                                   Blob* out, Context *ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                                   Blob* out,  Context *ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
+}
+
+
+
+
+
 }  // namespace singa
 
 #endif  // USE_CUDA
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/model/layer/activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/activation.cc b/src/model/layer/activation.cc
new file mode 100644
index 0000000..464e24d
--- /dev/null
+++ b/src/model/layer/activation.cc
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./activation.h"
+namespace singa {
+
+void Activation::Setup(const LayerConf& conf) {
+  Layer::Setup(conf);
+  mode_ = conf.type();
+  if (mode_ == "RELU") {
+    neg_slope_ = conf.relu_conf().negative_slope();
+  }
+}
+
+const Tensor Activation::Forward(int flag, const Tensor& input) {
+  Tensor output;
+  if (mode_ == "SIGMOID") {
+    output = Sigmoid(input);
+    buf_.push(output);
+  } else if (mode_ == "TANH") {
+    output = Tanh(input);
+    buf_.push(output);
+  } else if (mode_ == "RELU") {
+    output = ReLU(input);
+    buf_.push(input);
+  } else {
+    LOG(FATAL) << "Unkown activation: " << mode_;
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> Activation::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  // inout means either input or output, but only one is valid for an
+  // activation.
+  Tensor input_grad, inout = buf_.top();
+  buf_.pop();
+  if (mode_ == "SIGMOID") {
+    input_grad = grad * inout * (inout * (-1.f) + 1.f);
+  } else if (mode_ == "TANH") {
+    input_grad = grad * (inout * inout * (-1.f) + 1.f);
+  } else if (mode_ == "RELU") {
+    input_grad = grad * (inout > 0.f) + (inout <= 0.f) * neg_slope_;
+  } else {
+    LOG(FATAL) << "Unkown activation: " << mode_;
+  }
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa
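
The Backward branches above follow from the textbook derivatives, evaluated on
the value cached in buf_ during Forward:

  sigmoid: y = 1/(1 + exp(-x)),  dy/dx = y * (1 - y)      -> cache the output y
  tanh:    y = tanh(x),          dy/dx = 1 - y^2          -> cache the output y
  relu:    y = max(x, 0),        dy/dx = 1 if x > 0,
                                 neg_slope_ if x <= 0     -> cache the input x

That is why Forward pushes the output for sigmoid/tanh but the input for relu.
For the leaky branch, the full gradient is grad * neg_slope_ where x <= 0.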

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/model/layer/activation.h
----------------------------------------------------------------------
diff --git a/src/model/layer/activation.h b/src/model/layer/activation.h
new file mode 100644
index 0000000..1747577
--- /dev/null
+++ b/src/model/layer/activation.h
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_ACTIVATION_H_
+#define SINGA_MODEL_LAYER_ACTIVATION_H_
+#include <utility>
+#include <string>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Activation : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "Activation"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf& conf) override;
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  const std::string Mode() const { return mode_; }
+
+  const float Negative_slope() const { return neg_slope_; }
+
+ protected:
+  std::string mode_;
+  std::stack<Tensor> buf_;
+  float neg_slope_;
+};
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_ACTIVATION_H_
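
A minimal end-to-end sketch of driving this layer; the accessor names assume
LayerConf is the protobuf message from src/proto/layer.proto with the standard
generated setters:

  singa::LayerConf conf;
  conf.set_type("RELU");  // Setup() copies conf.type() into mode_
  conf.mutable_relu_conf()->set_negative_slope(0.01f);

  singa::Activation act;
  act.Setup(conf);
  singa::Tensor y = act.Forward(0 /*flag*/, x);  // x: input tensor
  auto ret = act.Backward(0, dy);                // {input grad, param grads}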

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.cc b/src/model/layer/cudnn_activation.cc
new file mode 100644
index 0000000..73c70d7
--- /dev/null
+++ b/src/model/layer/cudnn_activation.cc
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa_config.h"
+#ifdef USE_CUDNN
+#include "./cudnn_activation.h"
+#include <cudnn.h>
+
+#include "./cudnn_utils.h"
+#include "singa/core/common.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+CudnnActivation::~CudnnActivation() {
+  if (acti_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyActivationDescriptor(acti_desc_));
+  if (desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
+}
+
+void CudnnActivation::InitCudnn(size_t size, DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
+  CUDNN_CHECK(cudnnCreateActivationDescriptor(&acti_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
+
+  if (mode_ == "SIGMOID")
+    cudnn_mode_ = CUDNN_ACTIVATION_SIGMOID;
+  else if (mode_ == "TANH")
+    cudnn_mode_ = CUDNN_ACTIVATION_TANH;
+  else if (mode_ == "RELU")
+    cudnn_mode_ = CUDNN_ACTIVATION_RELU;
+  else
+    LOG(FATAL) << "Unkown activation: " << mode_;
+
+  nan_opt_ = CUDNN_PROPAGATE_NAN;
+  CUDNN_CHECK(
+      cudnnSetActivationDescriptor(acti_desc_, cudnn_mode_, nan_opt_, 0.0f));
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnActivation::Forward(int flag, const Tensor& input) {
+  auto size = input.Size();
+  DataType dtype = input.data_type();
+  if (!has_init_cudnn_) {
+    InitCudnn(size, dtype);
+  }
+  Tensor output;
+  output.ResetLike(input);
+  output.device()->Exec([input, output, this](Context* ctx) {
+    Blob* inblob = input.blob(), * outblob = output.blob();
+    float alpha = 1.0f, beta = 0.0f;
+#if CUDNN_VERSION_MAJOR == 5
+    CUDNN_CHECK(cudnnActivationForward(
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_,
+        inblob->data(), &beta, this->desc_, outblob->mutable_data()));
+#elif CUDNN_VERSION_MAJOR == 4
+    CUDNN_CHECK(cudnnActivationForward_v4(
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_,
+        inblob->data(), &beta, this->desc_, outblob->mutable_data()));
+#endif
+  }, {input.blob()}, {output.blob()});
+  if (cudnn_mode_ == CUDNN_ACTIVATION_SIGMOID ||
+      cudnn_mode_ == CUDNN_ACTIVATION_TANH) {
+    buf_.push(output);
+  } else if (cudnn_mode_ == CUDNN_ACTIVATION_RELU) {
+    buf_.push(input);
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnActivation::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor dx;
+  // 'inout' is the tensor buffered by Forward(): the output y for SIGMOID
+  // and TANH (whose gradients depend only on y), or the input x for RELU.
+  Tensor inout = buf_.top();
+  buf_.pop();
+  dx.ResetLike(grad);
+  dx.device()->Exec([dx, grad, inout, this](Context* ctx) {
+    Blob* dyblob = grad.blob(), * dxblob = dx.blob(), * yblob = inout.blob(),
+          * xblob = inout.blob();
+    float alpha = 1.0f, beta = 0.0f;
+#if CUDNN_VERSION_MAJOR == 5
+    CUDNN_CHECK(cudnnActivationBackward(
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_, yblob->data(),
+        this->desc_, dyblob->data(), this->desc_, xblob->data(), &beta,
+        this->desc_, dxblob->mutable_data()));
+#elif CUDNN_VERSION_MAJOR == 4
+    CUDNN_CHECK(cudnnActivationBackward_v4(
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_, yblob->data(),
+        this->desc_, dyblob->data(), this->desc_, xblob->data(), &beta,
+        this->desc_, dxblob->mutable_data()));
+#endif
+  }, {grad.blob(), inout.blob()}, {dx.blob()});
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+#endif  // USE_CUDNN
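
A note on the buffering in Forward() above: the SIGMOID and TANH gradients
can be written purely in terms of the output y, while the RELU gradient needs
the input x, so the layer pushes whichever tensor Backward() will read. A
minimal host-side sketch of those derivatives (hypothetical helper functions
for illustration only; cudnnActivationBackward computes the same quantities
on the GPU):

    // dy is the incoming gradient; y the buffered output, x the buffered input.
    inline float SigmoidGrad(float dy, float y) { return dy * y * (1.0f - y); }
    inline float TanhGrad(float dy, float y)    { return dy * (1.0f - y * y); }
    inline float ReluGrad(float dy, float x)    { return x > 0.0f ? dy : 0.0f; }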

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/model/layer/cudnn_activation.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.h b/src/model/layer/cudnn_activation.h
new file mode 100644
index 0000000..b572db7
--- /dev/null
+++ b/src/model/layer/cudnn_activation.h
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
+#define SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
+#include "singa_config.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <utility>
+#include <string>
+#include <vector>
+
+#include "./activation.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnActivation : public Activation {
+ public:
+  ~CudnnActivation();
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "CudnnActivation"; }
+
+  const Tensor Forward(int flag, const Tensor& input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  /// Init cudnn related data structures.
+  void InitCudnn(size_t size, DataType dtype);
+
+  const cudnnActivationMode_t CudnnMode() const { return cudnn_mode_; }
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnActivationDescriptor_t acti_desc_;
+  cudnnTensorDescriptor_t desc_;
+  cudnnNanPropagation_t nan_opt_;
+  cudnnActivationMode_t cudnn_mode_;
+};
+}  // namespace singa
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.cc b/src/model/layer/cudnn_softmax.cc
new file mode 100644
index 0000000..bc7fe78
--- /dev/null
+++ b/src/model/layer/cudnn_softmax.cc
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa_config.h"
+#include "./cudnn_softmax.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+namespace singa {
+CudnnSoftmax::~CudnnSoftmax() {
+  if (desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
+}
+
+void CudnnSoftmax::InitCudnn(size_t size, DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
+
+  algorithm_ = CUDNN_SOFTMAX_ACCURATE;
+  mode_ = CUDNN_SOFTMAX_MODE_INSTANCE;
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnSoftmax::Forward(int flag, const Tensor& input) {
+  auto size = input.Size();
+  DataType dtype = input.data_type();
+  if (!has_init_cudnn_) {
+    InitCudnn(size, dtype);
+  }
+  Tensor output;
+  output.ResetLike(input);
+  output.device()->Exec([input, output, this](Context* ctx) {
+    Blob* inblob = input.blob(), * outblob = output.blob();
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnSoftmaxForward(ctx->cudnn_handle, this->algorithm_, this->mode_,
+                        &alpha, this->desc_, inblob->data(), &beta, this->desc_,
+                        outblob->mutable_data());
+  }, {input.blob()}, {output.blob()});
+  buf_.push(output);
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnSoftmax::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor dx, output = buf_.top();
+  buf_.pop();
+  dx.ResetLike(grad);
+  dx.device()->Exec([dx, grad, output, this](Context* ctx) {
+    Blob* dyblob = grad.blob(), * dxblob = dx.blob(), * yblob = output.blob();
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnSoftmaxBackward(ctx->cudnn_handle, this->algorithm_, this->mode_,
+                         &alpha, this->desc_, yblob->data(), this->desc_,
+                         dyblob->data(), &beta, this->desc_,
+                         dxblob->mutable_data());
+  }, {grad.blob(), output.blob()}, {dx.blob()});
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+#endif  // USE_CUDNN
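
Since InitCudnn() above describes the data as a single 1 x 1 x 1 x size NCHW
tensor and CUDNN_SOFTMAX_MODE_INSTANCE normalizes over the C, H and W
dimensions of each instance, the whole input is treated as one softmax group,
i.e. (in LaTeX notation)

    y_i = \frac{e^{x_i}}{\sum_j e^{x_j}}

For example, x = (1, 2) gives y \approx (0.269, 0.731); the same reference
computation appears in test_cudnn_softmax.cc below.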

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/model/layer/cudnn_softmax.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.h b/src/model/layer/cudnn_softmax.h
new file mode 100644
index 0000000..ee92d6f
--- /dev/null
+++ b/src/model/layer/cudnn_softmax.h
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LAYER_CUDNN_SOFTMAX_H_
+#define SINGA_MODEL_LAYER_CUDNN_SOFTMAX_H_
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <utility>
+#include <string>
+#include <vector>
+
+#include "./softmax.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnSoftmax : public Softmax {
+ public:
+  ~CudnnSoftmax();
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "CudnnSoftmax"; }
+
+  const Tensor Forward(int flag, const Tensor& input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  /// Init cudnn related data structures.
+  void InitCudnn(size_t size, DataType dtype);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnTensorDescriptor_t desc_;
+  cudnnSoftmaxAlgorithm_t algorithm_;
+  cudnnSoftmaxMode_t mode_;
+};
+}  // namespace singa
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_SOFTMAX_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/softmax.cc b/src/model/layer/softmax.cc
new file mode 100644
index 0000000..813ebf0
--- /dev/null
+++ b/src/model/layer/softmax.cc
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./softmax.h"
+namespace singa {
+
+void Softmax::Setup(const LayerConf& conf) {
+  Layer::Setup(conf);
+  axis_ = conf.softmax_conf().axis();  // default is 1
+}
+
+const Tensor Softmax::Forward(int flag, const Tensor& input) {
+  if (input.nDim() == 1) {
+    Tensor tmp = Reshape(input, Shape{1, input.Size()});
+    buf_.push(SoftMax(tmp, 0));
+  } else {
+    buf_.push(SoftMax(input, axis_));
+  }
+  return buf_.top();
+}
+
+const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,
+                                                          const Tensor& grad) {
+  size_t nrow = 1, ncol = grad.Size();
+  if (grad.nDim() > 1 && axis_ > 0) {
+    nrow = Product(grad.shape(), 0, axis_);
+    ncol = Product(grad.shape(), axis_, grad.nDim());
+  }
+  Tensor input_grad = grad.Clone();
+  input_grad.Reshape(Shape{nrow, ncol});
+  Tensor y = buf_.top();
+  buf_.pop();
+  CHECK(y.shape() == input_grad.shape());
+  Tensor sigma = input_grad * y;
+  Tensor sum(Shape{nrow}, grad.device(), grad.data_type());
+  SumColumns(sigma, &sum);
+  // dL / dy_i = grad_i
+  // dy_i / dx_i = y_i - y_i^2, if i == j
+  // dy_i / dx_j = - y_i * y_j, if i != j
+  // dL / dx_i = sum_j((dL / dy_j) * (dy_j / dx_i))
+  // dL / dx_i = y_i * (grad_i - sum), where sum = sum_i(grad_i * y_i);
+  SubColumn(sum, &input_grad);
+  input_grad = input_grad * y;
+  // Mult(input_grad, y, &input_grad);
+  vector<Tensor> param_grad;
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa
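
Written out, the Jacobian argument compressed into the comment block of
Backward() is, with g_j = dL/dy_j the incoming gradient (LaTeX notation):

    \frac{\partial y_j}{\partial x_i} = y_j(\delta_{ij} - y_i), \qquad
    \frac{\partial L}{\partial x_i}
      = \sum_j g_j\, y_j(\delta_{ij} - y_i)
      = y_i\Big(g_i - \sum_j g_j y_j\Big)

which maps one-to-one onto the code: SumColumns() forms \sum_j g_j y_j per
row, SubColumn() computes g_i minus that sum, and the final elementwise
product multiplies by y_i.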

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/src/model/layer/softmax.h
----------------------------------------------------------------------
diff --git a/src/model/layer/softmax.h b/src/model/layer/softmax.h
new file mode 100644
index 0000000..ea3a70a
--- /dev/null
+++ b/src/model/layer/softmax.h
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_SOFTMAX_H_
+#define SINGA_MODEL_LAYER_SOFTMAX_H_
+#include "singa/model/layer.h"
+#include <stack>
+namespace singa {
+class Softmax : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "Softmax"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf& conf) override;
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int flag, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  const int Axis() const { return axis_; }
+
+ protected:
+  int axis_;
+  std::stack<Tensor> buf_;
+};
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_SOFTMAX_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/test/singa/test_activation.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_activation.cc b/test/singa/test_activation.cc
new file mode 100644
index 0000000..9e34282
--- /dev/null
+++ b/test/singa/test_activation.cc
@@ -0,0 +1,133 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/activation.h"
+#include "gtest/gtest.h"
+#include <math.h> // exp, tanh
+
+using singa::Activation;
+TEST(Activation, Setup) {
+  Activation acti;
+  EXPECT_EQ("Activation", acti.layer_type());
+
+  singa::LayerConf conf;
+  conf.set_type("RELU");
+  singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+  reluconf->set_negative_slope(0.5);
+
+  acti.Setup(conf);
+  EXPECT_EQ("RELU", acti.Mode());
+  EXPECT_EQ(0.5f, acti.Negative_slope());
+}
+
+TEST(Activation, Forward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Tensor in(singa::Shape{n});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  float neg_slope = 0.5f;
+  std::string types[] = {"SIGMOID", "TANH", "RELU"};
+  for (int j = 0; j < 3; j++) {
+    Activation acti;
+    singa::LayerConf conf;
+    std::string layertype = types[j];
+    conf.set_type(layertype);
+    if (layertype == "RELU") {
+      singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+      reluconf->set_negative_slope(neg_slope);
+    }
+    acti.Setup(conf);
+
+    singa::Tensor out = acti.Forward(0, in);
+
+    const float* yptr = out.data<const float*>();
+    EXPECT_EQ(n, out.Size());
+
+    float* y = new float[n];
+    if (acti.Mode() == "SIGMOID") {
+      for (size_t i = 0; i < n; i++)
+        y[i] = 1.f / (1.f + exp(-x[i]));
+    }
+    else if (acti.Mode() == "TANH") {
+      for (size_t i = 0; i < n; i++)
+        y[i] = tanh(x[i]);
+    }
+    else if (acti.Mode() == "RELU") {
+      for (size_t i = 0; i < n; i++)
+        y[i] = (x[i] >= 0.f) ? x[i] : 0.f;
+    }
+    else
+      LOG(FATAL) << "Unknown activation: " << acti.Mode();
+    EXPECT_FLOAT_EQ(y[0], yptr[0]);
+    EXPECT_FLOAT_EQ(y[4], yptr[4]);
+    EXPECT_FLOAT_EQ(y[5], yptr[5]);
+  }
+}
+
+TEST(Activation, Backward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Tensor in(singa::Shape{n});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  float neg_slope = 0.5f;
+  std::string types[] = {"SIGMOID", "TANH", "RELU"};
+  for (int j = 0; j < 3; j++) {
+    Activation acti;
+    singa::LayerConf conf;
+    std::string layertype = types[j];
+    conf.set_type(layertype);
+    if (layertype == "RELU") {
+      singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+      reluconf->set_negative_slope(neg_slope);
+    }
+    acti.Setup(conf);
+
+    singa::Tensor out = acti.Forward(0, in);
+    const float* yptr = out.data<const float*>();
+
+    const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
+    singa::Tensor out_diff(singa::Shape{n});
+    out_diff.CopyDataFromHostPtr<float>(grad, n);
+    const auto in_diff = acti.Backward(0, out_diff);
+    const float* xptr = in_diff.first.data<const float*>();
+
+    float* dx = new float[n];
+    if (acti.Mode() == "SIGMOID") {
+      for (size_t i = 0; i < n; i++)
+        dx[i] = grad[i] * yptr[i] * (1. - yptr[i]);
+    }
+    else if (acti.Mode() == "TANH") {
+      for (size_t i = 0; i < n; i++)
+        dx[i] = grad[i] * (1 - yptr[i] * yptr[i]);
+    }
+    else if (acti.Mode() == "RELU") {
+      for (size_t i = 0; i < n; i++)
+        dx[i] = grad[i] * (x[i] > 0.f) + acti.Negative_slope() * (x[i] <= 0.f);
+    }
+    else
+      LOG(FATAL) << "Unknown activation: " << acti.Mode();
+    EXPECT_FLOAT_EQ(dx[0], xptr[0]);
+    EXPECT_FLOAT_EQ(dx[4], xptr[4]);
+    EXPECT_FLOAT_EQ(dx[5], xptr[5]);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/test/singa/test_cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_activation.cc b/test/singa/test_cudnn_activation.cc
new file mode 100644
index 0000000..ee9f9b5
--- /dev/null
+++ b/test/singa/test_cudnn_activation.cc
@@ -0,0 +1,136 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa_config.h"
+#ifdef USE_CUDNN
+
+#include "singa/proto/core.pb.h"
+#include "../src/model/layer/cudnn_activation.h"
+#include "gtest/gtest.h"
+#include <math.h>  // exp tanh
+#include <cudnn.h>
+
+using singa::CudnnActivation;
+TEST(TCudnnActivation, Setup) {
+  CudnnActivation acti;
+  EXPECT_EQ("CudnnActivation", acti.layer_type());
+
+  singa::LayerConf conf;
+  conf.set_type("RELU");
+  singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+  reluconf->set_negative_slope(0.5f);
+
+  acti.Setup(conf);
+  acti.InitCudnn(1, singa::kFloat32);
+  EXPECT_EQ(CUDNN_ACTIVATION_RELU, acti.CudnnMode());
+  EXPECT_EQ(0.5f, acti.Negative_slope());
+}
+
+TEST(TCudnnActivation, Forward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{n}, &cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  float neg_slope = 0.5f;
+  std::string types[] = {"SIGMOID", "TANH", "RELU"};
+  for (int j = 0; j < 3; j++) {
+    CudnnActivation acti;
+    singa::LayerConf conf;
+    std::string layertype = types[j];
+    conf.set_type(layertype);
+    if (layertype == "RELU") {
+      singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+      reluconf->set_negative_slope(neg_slope);
+    }
+    acti.Setup(conf);
+    // acti.InitCudnn(n, singa::kFloat32);
+
+    singa::Tensor out = acti.Forward(0, in);
+    EXPECT_EQ(n, out.Size());
+    singa::CppCPU host(0, 1);
+    out.ToDevice(&host);
+    const float* yptr = out.data<const float*>();
+    float* y = new float[n];
+    if (acti.Mode() == "SIGMOID") {
+      for (size_t i = 0; i < n; i++) y[i] = 1.f / (1.f + exp(-x[i]));
+    } else if (acti.Mode() == "TANH") {
+      for (size_t i = 0; i < n; i++) y[i] = tanh(x[i]);
+    } else if (acti.Mode() == "RELU") {
+      for (size_t i = 0; i < n; i++) y[i] = (x[i] >= 0.f) ? x[i] : 0.f;
+    } else
+      LOG(FATAL) << "Unknown activation: " << acti.Mode();
+    EXPECT_FLOAT_EQ(y[0], yptr[0]);
+    EXPECT_FLOAT_EQ(y[4], yptr[4]);
+    EXPECT_FLOAT_EQ(y[5], yptr[5]);
+  }
+}
+
+TEST(TCudnnActivation, Backward) {
+  const float x[] = {2.0f, 3.0f, 3.0f, 7.f, 0.0f, 5.0, 1.5, 2.5, -2.5, 1.5};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{n}, &cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+  float neg_slope = 0.5f;
+  std::string types[] = {"SIGMOID", "TANH", "RELU"};
+  for (int j = 0; j < 3; j++) {
+    CudnnActivation acti;
+    singa::LayerConf conf;
+    std::string layertype = types[j];
+    conf.set_type(layertype);
+    if (layertype == "RELU") {
+      singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+      reluconf->set_negative_slope(neg_slope);
+    }
+    acti.Setup(conf);
+    acti.InitCudnn(n, singa::kFloat32);
+    singa::Tensor out = acti.Forward(0, in);
+    EXPECT_EQ(n, out.Size());
+    singa::CppCPU host(0, 1);
+    out.ToDevice(&host);
+    const float* yptr = out.data<const float*>();
+
+    const float grad[] = {2.0f, 1.0f, 2.0f, 0.0f, -2.0f,
+                          -1.0, 1.5,  2.5,  -1.5, -2.5};
+    singa::Tensor out_diff(singa::Shape{n}, &cuda);
+    out_diff.CopyDataFromHostPtr<float>(grad, n);
+    const auto ret = acti.Backward(0, out_diff);
+    singa::Tensor in_diff = ret.first;
+    in_diff.ToDevice(&host);
+    const float* xptr = in_diff.data<const float*>();
+    float* dx = new float[n];
+    if (acti.Mode() == "SIGMOID") {
+      for (size_t i = 0; i < n; i++) dx[i] = grad[i] * yptr[i] * (1. - yptr[i]);
+    } else if (acti.Mode() == "TANH") {
+      for (size_t i = 0; i < n; i++) dx[i] = grad[i] * (1. - yptr[i] * yptr[i]);
+    } else if (acti.Mode() == "RELU") {
+      for (size_t i = 0; i < n; i++)
+        dx[i] =
+            grad[i] * (x[i] > 0.f);  //+ acti.Negative_slope() * (x[i] <= 0.f);
+    } else
+      LOG(FATAL) << "Unknown activation: " << acti.Mode();
+    for (size_t i = 0; i < n; i++) {
+      EXPECT_NEAR(dx[i], xptr[i], 1e-7);
+    }
+  }
+}
+#endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_dropout.cc b/test/singa/test_cudnn_dropout.cc
index e1a6333..32572d0 100644
--- a/test/singa/test_cudnn_dropout.cc
+++ b/test/singa/test_cudnn_dropout.cc
@@ -21,7 +21,7 @@
 #include "../src/model/layer/cudnn_dropout.h"
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
-#if CUDNN_MAJOR_VERSION >= 5
+#if CUDNN_VERSION_MAJOR >= 5
 
 #include "gtest/gtest.h"
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/test/singa/test_cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_softmax.cc b/test/singa/test_cudnn_softmax.cc
new file mode 100644
index 0000000..dcbf1ed
--- /dev/null
+++ b/test/singa/test_cudnn_softmax.cc
@@ -0,0 +1,107 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa_config.h"
+#ifdef USE_CUDNN
+
+#include "../src/model/layer/cudnn_softmax.h"
+#include "gtest/gtest.h"
+#include <math.h>  // exp
+#include <cudnn.h>
+
+using singa::CudnnSoftmax;
+TEST(CudnnSoftmax, Setup) {
+  CudnnSoftmax sft;
+  EXPECT_EQ("CudnnSoftmax", sft.layer_type());
+
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_axis(2);
+
+  sft.Setup(conf);
+  sft.InitCudnn(1, singa::kFloat32);
+  EXPECT_EQ(2, sft.Axis());
+}
+
+TEST(CudnnSoftmax, Forward) {
+  const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{n}, &cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  int axis = 1;
+  CudnnSoftmax sft;
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_axis(axis);
+  sft.Setup(conf);
+  sft.InitCudnn(n, singa::kFloat32);
+
+  singa::Tensor out = sft.Forward(0, in);
+  singa::CppCPU host(0, 1);
+  out.ToDevice(&host);
+  const float* yptr = out.data<const float*>();
+  EXPECT_EQ(n, out.Size());
+
+  float* y = new float[n];
+  float sigma = 0.f;
+  for (size_t i = 0; i < n; i++) sigma += exp(x[i]);
+  for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma;
+  EXPECT_FLOAT_EQ(y[0], yptr[0]);
+  EXPECT_FLOAT_EQ(y[4], yptr[4]);
+  EXPECT_FLOAT_EQ(y[5], yptr[5]);
+}
+
+TEST(CudnnSoftmax, Backward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -1.0};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{n}, &cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  int axis = 1;
+  CudnnSoftmax sft;
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_axis(axis);
+  sft.Setup(conf);
+  singa::Tensor out = sft.Forward(0, in);
+  singa::CppCPU host(0, 1);
+  out.ToDevice(&host);
+  const float* yptr = out.data<const float*>();
+
+  const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
+  singa::Tensor out_diff(singa::Shape{n}, &cuda);
+  out_diff.CopyDataFromHostPtr<float>(grad, n);
+  const auto ret = sft.Backward(0, out_diff);
+  singa::Tensor in_diff = ret.first;
+  in_diff.ToDevice(&host);
+  const float* xptr = in_diff.data<const float*>();
+
+  float* dx = new float[n];
+  float sigma = 0.f;
+  for (size_t i = 0; i < n; i++) sigma += grad[i] * yptr[i];
+  for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma) * yptr[i];
+  EXPECT_FLOAT_EQ(dx[0], xptr[0]);
+  EXPECT_FLOAT_EQ(dx[4], xptr[4]);
+  EXPECT_FLOAT_EQ(dx[5], xptr[5]);
+}
+#endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2507b7/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_softmax.cc b/test/singa/test_softmax.cc
new file mode 100644
index 0000000..da2a6ef
--- /dev/null
+++ b/test/singa/test_softmax.cc
@@ -0,0 +1,110 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/softmax.h"
+#include "gtest/gtest.h"
+#include <math.h> // exp
+
+using singa::Softmax;
+TEST(Softmax, Setup) {
+  Softmax sft;
+  EXPECT_EQ("Softmax", sft.layer_type());
+
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_axis(2);
+
+  sft.Setup(conf);
+  EXPECT_EQ(2, sft.Axis());
+}
+
+TEST(Softmax, Forward) {
+  const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t row = 2;
+  size_t col = 3;
+  singa::Tensor in(singa::Shape{row, col});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  int axis = 1;
+  Softmax sft;
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_axis(axis);
+  sft.Setup(conf);
+
+  singa::Tensor out = sft.Forward(0, in);
+  const float* yptr = out.data<const float*>();
+  EXPECT_EQ(n, out.Size());
+
+  float* y = new float[n];
+  float* sigma = new float[row];
+  for (size_t i = 0; i < row; i++)
+    sigma[i] = 0.f;
+  for (size_t i = 0; i < n; i++)
+    sigma[i / col] += exp(x[i]);
+  //EXPECT_EQ(0, sigma[1]);
+  for (size_t i = 0; i < row; i++)
+    for (size_t j = 0; j < col; j++)
+      y[i * col + j] = exp(x[i * col + j]) / sigma[i];
+  EXPECT_FLOAT_EQ(y[0], yptr[0]);
+  EXPECT_FLOAT_EQ(y[4], yptr[4]);
+  EXPECT_FLOAT_EQ(y[5], yptr[5]);
+}
+
+TEST(Softmax, Backward) {
+  const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t row = 2;
+  size_t col = 3;
+  singa::Tensor in(singa::Shape{row, col});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  int axis = 1;
+  Softmax sft;
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_axis(axis);
+  sft.Setup(conf);
+  singa::Tensor out = sft.Forward(0, in);
+  const float* yptr = out.data<const float*>();
+
+  const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
+  singa::Tensor out_diff(singa::Shape{row, col});
+  out_diff.CopyDataFromHostPtr<float>(grad, n);
+  const auto in_diff = sft.Backward(0, out_diff);
+  const float* xptr = in_diff.first.data<const float*>();
+
+  float* dx = new float[n];
+  float* sigma = new float[row];
+  for (size_t i = 0; i < row; i++)
+    sigma[i] = 0.f;
+  for (size_t i = 0; i < n; i++)
+    sigma[i / col] += grad[i] * yptr[i];
+  // EXPECT_EQ(0, sigma[0]);
+  // EXPECT_EQ(0, sigma[1]);
+  for (size_t i = 0; i < row; i++)
+    for (size_t j = 0; j < col; j++)
+      dx[i * col + j] = (grad[i * col + j] - sigma[i]) * yptr[i * col + j];
+  EXPECT_FLOAT_EQ(dx[0], xptr[0]);
+  EXPECT_FLOAT_EQ(dx[4], xptr[4]);
+  EXPECT_FLOAT_EQ(dx[5], xptr[5]);
+}



[03/50] [abbrv] incubator-singa git commit: SINGA-170 Add Dropout layer and CudnnDropout layer

Posted by zh...@apache.org.
SINGA-170 Add Dropout layer and CudnnDropout layer

Add test_dropout.cc for Dropout class.
Add RNN base layer draft.
Add math functions to support Dropout.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c3a0558c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c3a0558c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c3a0558c

Branch: refs/heads/master
Commit: c3a0558cf5896a9313e9e5c2636e742ec8649fad
Parents: 99e0d24
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Tue May 17 15:42:43 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Tue May 17 15:42:43 2016 +0800

----------------------------------------------------------------------
 include/singa/core/device.h       |  2 +
 include/singa/core/tensor.h       | 20 ++++-----
 include/singa/model/layer.h       | 71 ++++++++++----------------------
 include/singa/model/rnn.h         | 29 -------------
 src/core/device/device.cc         |  4 +-
 src/core/tensor/tensor.cc         | 15 ++++---
 src/core/tensor/tensor_math_cpp.h | 24 ++++++++++-
 src/model/layer/cudnn_dropout.cc  | 71 ++++++++++++++++----------------
 src/model/layer/cudnn_dropout.h   | 14 +++----
 src/model/layer/cudnn_utils.h     | 14 ++++---
 src/model/layer/dropout.cc        |  6 +--
 src/model/layer/dropout.h         | 11 ++++-
 src/model/layer/rnn.h             | 59 ++++++++++++++++++++++++++
 test/singa/test_dropout.cc        | 75 +++++++++++++++++++++++++++++++++-
 test/singa/test_tensor.cc         |  3 +-
 15 files changed, 266 insertions(+), 152 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index f3bb5a2..b96efca 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -114,6 +114,7 @@ class Device {
   // SafeQueue<Operation> op_queue_;
   // SafeQueue<Operation> op_log_;
   /// The host device
+  Context ctx_;
   Device* host_;
 };
 // Implement Device using Cpp libs.
@@ -129,6 +130,7 @@ class CppDevice : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
+
 };
 
 /// a singleton CppDevice as the host for all devices.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 4807123..6c20c4f 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -155,38 +155,38 @@ class Tensor {
   Tensor T() const;
 
   /// Copy the meta info with data blob shared.
-  void operator=(const Tensor& t);
+  Tensor& operator=(const Tensor& t);
 
   /// Copy the meta info with data blob shared.
-  void operator=(Tensor&& t);
+  Tensor& operator=(Tensor&& t);
 
 
-  void operator+=(const Tensor& t);
+  Tensor& operator+=(const Tensor& t);
   // void operator+=(Tensor&& t);
-  void operator-=(const Tensor& t);
+  Tensor& operator-=(const Tensor& t);
   // void operator-=(Tensor&& t);
-  void operator*=(const Tensor& t);
+  Tensor& operator*=(const Tensor& t);
   // void operator*=(Tensor&& t);
-  void operator/=(const Tensor& t);
+  Tensor& operator/=(const Tensor& t);
   // void operator/=(Tensor&& t);
 
   // Scalar operations.
 
   /// T is a scalar type
   template<typename DType>
-  void operator+=(DType x);
+  Tensor& operator+=(DType x);
 
   /// T is a scalar type
   template <typename DType>
-  void operator-=(const DType x);
+  Tensor& operator-=(const DType x);
 
   /// T is a scalar type
   template <typename DType>
-  void operator*=(const DType x);
+  Tensor& operator*=(const DType x);
 
   /// T is a scalar type
   template <typename DType>
-  void operator/=(const DType x);
+  Tensor& operator/=(const DType x);
 
   /// save Tensor into a proto msg
   // void ToProto(TensorProto* t);
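
Returning Tensor& (rather than void) from the assignment and
compound-assignment operators follows the usual C++ convention and makes the
operators chainable. An illustrative sketch of what this change enables (not
code from the commit):

    singa::Tensor a, b, c;   // assume c has been initialized elsewhere
    a = b = c;               // chained copy-assignment now compiles
    (a += b) *= 2.0f;        // compound assignments return *this, so they chain too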

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 48fc58f..a4c4630 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -42,9 +42,9 @@ class Layer {
   }
 
   // ============= Following Functions could be override =====================
-  /// Destruct the objecst created by this layer.
+  /// Destruct objects created by this layer.
   virtual ~Layer() {
-    for (Tensor * t : param_values_) {
+    for (Tensor* t : param_values_) {
       delete t;
     }
   }
@@ -56,19 +56,18 @@ class Layer {
   /// Set meta data fields configured in 'conf' (a proto message).
   virtual void Setup(const LayerConf& conf) {
     name_ = conf.name();
-    for (const auto& spec : conf.param())
-      param_specs_.push_back(spec);
+    for (const auto& spec : conf.param()) param_specs_.push_back(spec);
     // TODO(wangwei) load param values from checkpoint blobs.
   }
 
   /// Do feature transformation for the given 'input' tensor (denoted as x).
-  /// 'flag' is either kPhaseTrain or kPhaseTest for feed-forward nets, and
+  /// 'flag' is either kTrain or kEval for feed-forward nets, and
   /// would be used for other phases of training other nets. For example, when
   /// training RBM, we may create an alias of this function as ComputeFeature
-  /// where flag could be kPositivePhase and kNegativePhase.
+  /// where flag could be kPositive or kNegative.
   /// It will return a Tensor (denoted as y).
   /// If the 'input' or 'output' is required for computing the gradients in
-  /// Backward(), then push them into the states_ stack.
+  /// Backward(), then buffer them as internal data.
   virtual const Tensor Forward(int flag, const Tensor& input) {
     LOG(FATAL) << "Not implemented";
     Tensor t;
@@ -77,10 +76,12 @@ class Layer {
 
   /// \copydoc Forward(int flag, const Tensor& input)
   /// Accept multiple input tensors and generate multiple output tensors.
+  /// If there is only one input tensor, it will call Forward(int, const
+  /// Tensor&) by default. Users can override this function for layers that
+  /// generate more than one output.
   virtual const vector<Tensor> Forward(int flag, const vector<Tensor>& inputs) {
     vector<Tensor> ret;
-    if (inputs.size() == 1)
-      ret.push_back(Forward(flag, inputs.at(0)));
+    if (inputs.size() == 1) ret.push_back(Forward(flag, inputs.at(0)));
 
     LOG(FATAL) << "Not implemented";
     return ret;
@@ -88,19 +89,14 @@ class Layer {
 
   /// Compute gradients of this layer.
   /// Specifically, there are two types of gradients:
-  /// 1. gradients of preceding layers, i.e., dx.
-  /// 2. gradients of parameters of this layer.
-  /// 1 and 2 are returned as a pair of vector<Tensor>
+  /// 1. gradient of the preceding layer, i.e., dx.
+  /// 2. gradients of parameters of this layer, e.g., dw for weight matrix.
   /// 1 is an empty tensor if there is no preceding layer or there is no need to
-  /// compute dx (e.g., x is from a data layer); 2 is empty if this layer has no
-  /// parameters.
-  /// 'flag' is either kTrainPhase or kTestPhase for feed-forward nets, and
+  /// compute dx (e.g., x is from a data layer); 2 is an empty vector if this
+  /// layer has no parameters.
+  /// 'flag' is either kTrain or kEval for feed-forward nets, and
   /// would be used for other phases when training other nets.
   /// 'grad' is a Tensor for gradient (dy) from the upper layer.
-  /// Some layer would use 'input' or 'output' from Forward to compute the
-  /// gradients of parameters. Backward() pop out the state data.
-  /// It is useful for RNN layers, where the same layer is used multiple
-  /// times just like unrolling the layer.
   virtual const std::pair<Tensor, vector<Tensor>> Backward(int flag,
                                                            const Tensor& grad) {
     LOG(FATAL) << "Not implemented!";
@@ -117,7 +113,7 @@ class Layer {
       auto ret = Backward(flag, grads.at(0));
       input_grad.push_back(ret.first);
       param_grad = ret.second;
-    } else  {
+    } else {
       LOG(FATAL) << "Not implemented";
     }
     return std::make_pair(input_grad, param_grad);
@@ -137,7 +133,7 @@ class Layer {
   /// Serialize the layer info (including params) into a LayerConf proto message
   virtual void ToProto(LayerConf* conf) const {
     conf->set_name(name_);
-    for (const auto& spec: param_specs_) {
+    for (const auto& spec : param_specs_) {
       ParamSpec* p = conf->add_param();
       p->CopyFrom(spec);
     }
@@ -157,19 +153,13 @@ class Layer {
   }
   /// Return specs/configuration of all parameter instances of this layer.
   /// \ref ParamSpec.
-  const vector<ParamSpec> param_specs() {
-    return param_specs_;
-  }
+  const vector<ParamSpec> param_specs() { return param_specs_; }
 
   /// Return the i-th ParamSpec.
-  const ParamSpec& param_specs(int i) {
-    return param_specs_.at(i);
-  }
+  const ParamSpec& param_specs(int i) { return param_specs_.at(i); }
 
   /// Return pointers to parameter Tensor s.
-  const vector<Tensor*> param_values() {
-    return param_values_;
-  }
+  const vector<Tensor*> param_values() { return param_values_; }
 
   /// Return a pointer to the 'i'-th parameter Tensor.
   Tensor* param_value(size_t i) {
@@ -180,8 +170,7 @@ class Layer {
   /// Return names of all parmaeters.
   const vector<string> param_names() {
     vector<string> pname;
-    for (const auto& spec: param_specs_)
-      pname.push_back(spec.name());
+    for (const auto& spec : param_specs_) pname.push_back(spec.name());
     return pname;
   }
 
@@ -195,29 +184,11 @@ class Layer {
   /// Used for debugging and logging.
   const std::string name() const { return name_; }
 
-  /*
-  std::stack<Tensor> states() const {
-    return states_;
-  }
-  */
-
  protected:
   std::string name_;
   vector<Tensor*> param_values_;
   vector<ParamSpec> param_specs_;
-  /// Used to store input or output of Forward(), which would be used in
-  /// Backward.  Rules:
-  /// 1. push the 'input' or 'output' into states_ if the flag of Forward() is
-  ///    for training.
-  /// 2. pop data out in Backward().
-  /// TODO(wangwei) enable this feature for rnn layers.
-  // std::stack<Tensor*> states_;
 };
 
-// ===========================================================================
-// Order layer sub-classes based on alphabetical order of the first letter.
-// ===========================================================================
-
-
 }  // namespace singa
 #endif  // SINGA_LAYER_H_
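
As a usage sketch of the Layer contract documented above, a minimal identity
layer could look as follows (a hypothetical subclass for illustration only;
it is not part of this commit):

    class Identity : public singa::Layer {
     public:
      const std::string layer_type() const override { return "Identity"; }
      // y = x; nothing needs to be buffered for Backward().
      const singa::Tensor Forward(int flag, const singa::Tensor& input) override {
        return input;
      }
      // dx = dy; the layer has no parameters, so the gradient vector is empty.
      const std::pair<singa::Tensor, std::vector<singa::Tensor>> Backward(
          int flag, const singa::Tensor& grad) override {
        return std::make_pair(grad, std::vector<singa::Tensor>{});
      }
    };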

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/include/singa/model/rnn.h
----------------------------------------------------------------------
diff --git a/include/singa/model/rnn.h b/include/singa/model/rnn.h
deleted file mode 100644
index 7d2c20c..0000000
--- a/include/singa/model/rnn.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-namespace singa {
-
-class RNN {
-
-
-
-
-};
-
-}  /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index b2a8705..33f5bd8 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -23,11 +23,13 @@ Device::Device(int id, int num_executors, string scheduler, string vm)
     : id_(id) {
   scheduler_ = nullptr;
   vm_ = nullptr;
+  ctx_.seed = 0;
+  ctx_.random_generator = std::mt19937(ctx_.seed);
 }
 
 void Device::Exec(function<void(Context*)> fn, const vector<Blob*> read_blobs,
                     const vector<Blob*> write_blobs, bool use_rand_generator) {
-  fn(nullptr);
+  fn(&ctx_);
 }
 
 Blob* Device::NewBlob(int size) {
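
Storing a seeded std::mt19937 in ctx_ makes the host-side random kernels
(e.g. Bernoulli and Gaussian in tensor_math_cpp.h) reproducible across runs
for a fixed seed. A small standalone sketch of that property (illustrative,
outside the Device class):

    std::mt19937 gen(0);                   // same seed => same random stream
    std::bernoulli_distribution drop(0.5);
    bool b0 = drop(gen), b1 = drop(gen);   // identical sequence on every run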

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 8352b48..cd62a38 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -71,7 +71,7 @@ Tensor::Tensor(Tensor&& t)
 }
 
 void Tensor::ResetLike(const Tensor& t) {
-  if (blob_->size() != t.MemSize()) {
+  if (blob_ == nullptr || blob_->size() != t.MemSize()) {
     if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     shape_ = t.shape_;
     device_ = t.device_;
@@ -152,7 +152,7 @@ Tensor Tensor::T() const {
   return t;
 }
 
-void Tensor::operator=(const Tensor& t) {
+Tensor& Tensor::operator=(const Tensor& t) {
   if (blob_ != nullptr && blob_->DecRefCount() == 0)
     device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
@@ -161,9 +161,10 @@ void Tensor::operator=(const Tensor& t) {
   device_ = t.device_;
   blob_ = t.blob();
   blob_->IncRefCount();
+  return *this;
 }
 
-void Tensor::operator=(Tensor&& t) {
+Tensor& Tensor::operator=(Tensor&& t) {
   if (blob_ != nullptr && blob_->DecRefCount() == 0)
     device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
@@ -171,10 +172,11 @@ void Tensor::operator=(Tensor&& t) {
   device_ = t.device_;
   blob_ = t.blob_;
   t.blob_ = nullptr;
+  return *this;
 }
 
 #define GenUnaryTensorArgMemberFunction(op, fn) \
-  void Tensor::op(const Tensor& t) { fn(*this, t, this); }
+  Tensor& Tensor::op(const Tensor& t) { fn(*this, t, this); return *this; }
 
 GenUnaryTensorArgMemberFunction(operator+=, Add);
 GenUnaryTensorArgMemberFunction(operator-=, Sub);
@@ -183,10 +185,11 @@ GenUnaryTensorArgMemberFunction(operator/=, Div);
 
 #define GenUnaryScalarArgMemberFunction(op, fn) \
   template <typename DType>                     \
-  void Tensor::op(DType x) {                    \
+  Tensor& Tensor::op(DType x) {                 \
     fn(*this, x, this);                         \
+    return *this;                               \
   }                                             \
-  template void Tensor::op<float>(float x)
+  template Tensor& Tensor::op<float>(float x)
 
 GenUnaryScalarArgMemberFunction(operator-=, Sub);
 GenUnaryScalarArgMemberFunction(operator+=, Add);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 9e7ed30..2cbc225 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -39,6 +39,26 @@ void Add<float, lib::Cpp>(int count,
     dptr[i] = lptr[i] + rptr[i];
   }
 }
+template <>
+void EltwiseMult<float, lib::Cpp>(int count, const Blob* input, float x, Blob* ret, Context* ctx)
+{
+  float *dptr = static_cast<float*>(ret->mutable_data());
+  const float *lptr = static_cast<const float*>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = lptr[i] * x;
+  }
+}
+
+template <>
+void EltwiseMult<float, lib::Cpp>(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx)
+{
+  float *dptr = static_cast<float*>(ret->mutable_data());
+  const float *lptr = static_cast<const float*>(lhs->data());
+  const float *rptr = static_cast<const float*>(rhs->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = lptr[i] * rptr[i];
+  }
+}
 
 template <>
 void Bernoulli<float, lib::Cpp>(int count, float p, Blob* ret,
@@ -46,7 +66,7 @@ void Bernoulli<float, lib::Cpp>(int count, float p, Blob* ret,
   std::bernoulli_distribution distribution(p);
   float* ptr = static_cast<float*>(ret->mutable_data());
   for (int i = 0; i < count; i ++) {
-    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+    ptr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
   }
 }
 
@@ -69,6 +89,8 @@ void Gaussian<float, lib::Cpp>(int count, float mean, float std, Blob* ret,
     ptr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
+
+
 #ifdef USE_CBLAS
 template<>
 void Dot<float, lib::Cpp>(int count,
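
Together, the new Bernoulli() (now writing exact 1.0f/0.0f values) and the
EltwiseMult() overloads are the building blocks of the CPU Dropout path (see
dropout.cc later in this commit). A host-only sketch of the combined
mask-and-scale, under the same inverted-dropout convention:

    #include <random>
    // Inverted dropout over a raw float array; p is the drop probability.
    void DropoutForward(const float* x, float* y, float* mask, int n,
                        float p, std::mt19937& gen) {
      std::bernoulli_distribution keep(1.0 - p);
      const float scale = 1.0f / (1.0f - p);
      for (int i = 0; i < n; i++) {
        mask[i] = keep(gen) ? scale : 0.0f;  // Bernoulli mask, pre-scaled
        y[i] = x[i] * mask[i];               // EltwiseMult equivalent
      }
    }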

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
index 926ccb9..4d5f5d5 100644
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@ -17,18 +17,16 @@
  */
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
-//#if CUDNN_MAJOR_VERSION >= 5
-#include "./cudnn_utils.h"
+#if CUDNN_VERSION_MAJOR >= 5
 #include "./cudnn_dropout.h"
+#include "./cudnn_utils.h"
 #include "singa/utils/logging.h"
 namespace singa {
 CudnnDropout::~CudnnDropout() {
   if (drop_desc_ != nullptr)
     CUDNN_CHECK(cudnnDestroyDropoutDescriptor(drop_desc_));
-  if (x_desc_ != nullptr)
-    CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
-  if (y_desc_ != nullptr)
-    CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
 }
 
 void CudnnDropout::InitCudnn(int size, DataType dtype, Context* ctx) {
@@ -37,18 +35,16 @@ void CudnnDropout::InitCudnn(int size, DataType dtype, Context* ctx) {
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
   CUDNN_CHECK(cudnnCreateDropoutDescriptor(&drop_desc_));
 
-  int dim[] = {size};
-  int stride[] = {1};
-  CUDNN_CHECK(cudnnSetTensorNdDescriptor(x_desc_, GetCudnnDataType(dtype), 1,
-      dim, stride));
-  CUDNN_CHECK(cudnnSetTensorNdDescriptor(y_desc_, GetCudnnDataType(dtype), 1,
-      dim, stride));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      x_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
 
   cudnnDropoutGetStatesSize(ctx->cudnn_handle, &state_size_);
   cudnnDropoutGetReserveSpaceSize(x_desc_, &reserve_size_);
-  cudnnSetDropoutDescriptor(drop_desc_, ctx->cudnn_handle, dropout_ratio_,
-    state_.blob()->mutable_data(),
-    state_size_, ctx->seed);
+  cudnnSetDropoutDescriptor(drop_desc_, ctx->cudnn_handle, 1 - dropout_ratio_,
+                            state_.blob()->mutable_data(), state_size_,
+                            ctx->seed);
   has_init_cudnn_ = true;
 }
 
@@ -59,23 +55,27 @@ const Tensor CudnnDropout::Forward(int flag, const Tensor& input) {
     if (!has_init_cudnn_) {
       input.device()->Exec(
           [size, dtype, this](Context* ctx) {
-          this->InitCudnn(size, dtype, ctx);
+            this->InitCudnn(size, dtype, ctx);
           },
-          {}, {state_.blob()});
+          {}, {this->state_.blob()});
       mask_.ResetLike(input);
+      // TODO(wangwei) update for async running,
+      // where reserve_size_ may not be available
       CHECK_EQ(reserve_size_, mask_.MemSize());
     }
-    Tensor out;
-    out.ResetLike(input);
-    Blob *inblob = input.blob(), *outblob = out.blob(), *mblob = mask_.blob();
-    out.device()->Exec(
-        [inblob, outblob, mblob, this](Context* ctx) {
-        cudnnDropoutForward(
-            ctx->cudnn_handle, this->drop_desc_, this->x_desc_, inblob->data(),
-            this->y_desc_, outblob->mutable_data(), mblob, this->reserve_size_);
+    Tensor output;
+    output.ResetLike(input);
+    output.device()->Exec(
+        [input, output, this](Context* ctx) {
+          Blob *inblob = input.blob(), *outblob = output.blob(),
+               *mblob = mask_.blob();
+          cudnnDropoutForward(ctx->cudnn_handle, this->drop_desc_,
+                              this->x_desc_, inblob->data(), this->y_desc_,
+                              outblob->mutable_data(), mblob,
+                              this->reserve_size_);
         },
-        {inblob}, {mblob, outblob});
-    return out;
+        {input.blob()}, {output.blob(), mask_.blob()});
+    return output;
   } else {
     return input;
   }
@@ -87,20 +87,21 @@ const std::pair<Tensor, vector<Tensor>> CudnnDropout::Backward(
   Tensor dx;
   if (flag & kTrain) {
     dx.ResetLike(grad);
-    Blob *dyblob = grad.blob(), *dxblob = dx.blob(), *mblob = mask_.blob();
     dx.device()->Exec(
-        [dyblob, dxblob, mblob, this](Context* ctx) {
-        cudnnDropoutBackward(ctx->cudnn_handle, this->drop_desc_,
-            this->y_desc_, dyblob->data(), this->x_desc_,
-            dxblob->mutable_data(), mblob,
-            this->reserve_size_);
+        [dx, grad, this](Context* ctx) {
+          Blob *dyblob = grad.blob(), *dxblob = dx.blob(),
+               *mblob = this->mask_.blob();
+          cudnnDropoutBackward(ctx->cudnn_handle, this->drop_desc_,
+                               this->y_desc_, dyblob->data(), this->x_desc_,
+                               dxblob->mutable_data(), mblob->mutable_data(),
+                               this->reserve_size_);
         },
-        {dyblob, mblob}, {dxblob});
+        {grad.blob(), mask_.blob()}, {dx.blob()});
   } else {
     LOG(ERROR) << "Do not call backward for evaluation phase";
   }
   return std::make_pair(dx, param_grad);
 }
 }  // namespace singa
-//#endif  // CUDNN_VERSION_MAJOR>=5
+#endif  // CUDNN_VERSION_MAJOR>=5
 #endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index 0a19214..d2b68b9 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -20,12 +20,12 @@
 #define SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
-//#if CUDNN_MAJOR_VERSION >= 5
+#if CUDNN_VERSION_MAJOR >= 5
 
-#include "singa/model/layer.h"
+#include "./dropout.h"
 #include "singa/core/common.h"
+#include "singa/model/layer.h"
 #include "singa/proto/core.pb.h"
-#include "./dropout.h"
 
 namespace singa {
 class CudnnDropout : public Dropout {
@@ -35,8 +35,8 @@ class CudnnDropout : public Dropout {
   const std::string layer_type() const override { return "CudnnDropout"; }
 
   const Tensor Forward(int flag, const Tensor& input) override;
-  const std::pair<Tensor, vector<Tensor>> Backward(
-      int flag, const Tensor& grad) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
 
   /// Init cudnn related data structures.
   void InitCudnn(int size, DataType dtype, Context* ctx);
@@ -49,6 +49,6 @@ class CudnnDropout : public Dropout {
   Tensor state_;
 };
 }  // namespace
-//#endif  // CUDNN_VERSION_MAJOR>=5
+#endif  // CUDNN_MAJOR_VERSION >= 5
 #endif  // USE_CUDNN
-#endif // SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_
+#endif  // SINGA_MODEL_LAYER_CUDNN_DROPOUT_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/model/layer/cudnn_utils.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_utils.h b/src/model/layer/cudnn_utils.h
index 735ec13..92c8df7 100644
--- a/src/model/layer/cudnn_utils.h
+++ b/src/model/layer/cudnn_utils.h
@@ -17,10 +17,12 @@
  */
 #ifndef SINGA_MODEL_LAYER_CUDNN_BASE_H_
 #define SINGA_MODEL_LAYER_CUDNN_BASE_H_
+
 #ifdef USE_CUDNN
+
+#include <cudnn.h>
 #include "singa/proto/core.pb.h"
 #include "singa/utils/logging.h"
-#include <cudnn.h>
 namespace singa {
 inline cudnnDataType_t GetCudnnDataType(DataType dtype) {
   cudnnDataType_t ret;
@@ -41,11 +43,11 @@ inline cudnnDataType_t GetCudnnDataType(DataType dtype) {
   return ret;
 }
 
-#define CUDNN_CHECK(condition) \
-  do { \
-    cudnnStatus_t status = condition; \
-    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " "\
-      << cudnnGetErrorString(status); \
+#define CUDNN_CHECK(condition)                                             \
+  do {                                                                     \
+    cudnnStatus_t status = condition;                                      \
+    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " "                          \
+                                           << cudnnGetErrorString(status); \
   } while (0)
 
 /*
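
The reformatted CUDNN_CHECK keeps the do { ... } while (0) idiom, which
makes a multi-statement macro behave as a single statement, e.g. inside an
unbraced if/else. A self-contained analogue with a toy status type (the
names below are illustrative, not cuDNN's or SINGA's):

    #include <cstdio>
    #include <cstdlib>

    enum Status { STATUS_SUCCESS = 0, STATUS_BAD_PARAM = 1 };

    const char* StatusString(Status s) {
      return s == STATUS_SUCCESS ? "SUCCESS" : "BAD_PARAM";
    }

    #define MY_CHECK(condition)                       \
      do {                                            \
        Status status = (condition);                  \
        if (status != STATUS_SUCCESS) {               \
          std::fprintf(stderr, "Check failed: %s\n",  \
                       StatusString(status));         \
          std::abort();                               \
        }                                             \
      } while (0)

    Status DoWork(bool ok) { return ok ? STATUS_SUCCESS : STATUS_BAD_PARAM; }

    int main() {
      if (true)
        MY_CHECK(DoWork(true));  // expands safely as one statement
      else
        std::puts("never reached");
      return 0;
    }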

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/model/layer/dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.cc b/src/model/layer/dropout.cc
index f0fe25b..c2c97be 100644
--- a/src/model/layer/dropout.cc
+++ b/src/model/layer/dropout.cc
@@ -30,7 +30,7 @@ const Tensor Dropout::Forward(int flag, const Tensor& input) {
   if (flag & kTrain) {
     mask_.ResetLike(input);
     // set mask_[i] = 1 with prob 1-dropout_ratio_
-    Bernoulli(1 - dropout_ratio_, &mask_);
+    Bernoulli(1.0f - dropout_ratio_, &mask_);
     mask_ *= 1.0f / (1.0f - dropout_ratio_);
     out = input * mask_;
   } else {
@@ -39,8 +39,8 @@ const Tensor Dropout::Forward(int flag, const Tensor& input) {
   return out;
 }
 
-const std::pair<Tensor, vector<Tensor>> Dropout::Backward(
-    int flag, const Tensor& grad) {
+const std::pair<Tensor, vector<Tensor>> Dropout::Backward(int flag,
+                                                          const Tensor& grad) {
   vector<Tensor> param_grad;
   Tensor input_grad;
   if (flag & kTrain) {
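
The Bernoulli(1 - p) mask combined with the 1/(1 - p) scaling above is the
"inverted dropout" trick: E[mask[i] * x[i]] = x[i], so the evaluation phase
can return the input unchanged. A standalone sketch of the same computation
in plain C++ (not the SINGA kernels):

    #include <iostream>
    #include <random>
    #include <vector>

    int main() {
      const float p = 0.5f;  // dropout ratio
      const float scale = 1.0f / (1.0f - p);
      std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};

      std::mt19937 gen(42);
      std::bernoulli_distribution keep(1.0 - p);  // mask[i]=1 w.p. 1-p

      std::vector<float> mask(x.size()), y(x.size());
      for (size_t i = 0; i < x.size(); ++i) {
        mask[i] = keep(gen) ? scale : 0.0f;  // each mask value: 0 or scale
        y[i] = x[i] * mask[i];               // so E[y[i]] == x[i]
      }
      for (float v : y) std::cout << v << ' ';
      std::cout << '\n';
      return 0;
    }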

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/model/layer/dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.h b/src/model/layer/dropout.h
index de349a5..a6e733a 100644
--- a/src/model/layer/dropout.h
+++ b/src/model/layer/dropout.h
@@ -31,7 +31,8 @@ class Dropout : public Layer {
   /// if flag is kTrain, then do dropout with given dropout_ratio;
   /// otherwise if it is kEval, copy input directly to the output
   /// TODO(wangwei) There are different implementations, Caffe vs
-  /// <a href="https://github.com/nitishsrivastava/deepnet/blob/master/deepnet/fastdropoutnet.py">
+  /// <a
+  /// href="https://github.com/nitishsrivastava/deepnet/blob/master/deepnet/fastdropoutnet.py">
   const Tensor Forward(int flag, const Tensor& input) override;
 
   /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
@@ -40,6 +41,14 @@ class Dropout : public Layer {
 
   void ToDevice(Device* device) override;
 
+  float dropout_ratio() const {
+    return dropout_ratio_;
+  }
+
+  const Tensor& mask() const {
+    return mask_;
+  }
+
  protected:
   /// the probability to set each element to 0.
   float dropout_ratio_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/src/model/layer/rnn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/rnn.h b/src/model/layer/rnn.h
new file mode 100644
index 0000000..a6ba461
--- /dev/null
+++ b/src/model/layer/rnn.h
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_RNN_H_
+#define SINGA_MODEL_LAYER_RNN_H_
+#include <stack>
+#include "singa/model/layer.h"
+namespace singa {
+/// To enable using the same layer multiple times in one iteration, e.g.,
+/// in an RNN, the Forward() function pushes the 'input' or 'output' that
+/// is necessary for Backward() onto a stack (states_). If neither 'input'
+/// nor 'output' is used by Backward(), then do not store them. Backward()
+/// pops data from the states_ stack to compute gradients. Users are
+/// responsible for accumulating the gradients for the same parameters.
+class RNN : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "RNN"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf& conf) override;
+
+  /// \copydoc Layer::Forward(int flag, const vector<Tensor>&)
+  const vector<Tensor> Forward(int flag, const vector<Tensor>& input) override;
+
+  /// \copydoc Layer::Backward(int, const vector<Tensor>&);
+  const std::pair<vector<Tensor>, vector<Tensor>> Backward(
+      int flag, const vector<Tensor>& grad) override;
+
+  void ToDevice(Device* device) override;
+
+  /// Return the internal state stack, which should be empty at the
+  /// beginning of one iteration.
+  std::stack<Tensor> states() const { return states_; }
+
+ protected:
+  /// Storing input or output from Forward(), which are used in Backward().
+  /// Rules:
+  /// 1. push the 'input' or 'output' into states_ if the flag of Forward() is
+  ///    for kTrain and 'input' or 'output' is necessary for Backward().
+  /// 2. pop data out in Backward().
+  std::stack<Tensor> states_;
+};
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_RNN_H_
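
The push-in-Forward / pop-in-Backward contract documented above is what
lets one layer object be applied at several time steps: Backward() runs in
reverse order, so a LIFO stack hands each call the matching saved state. A
minimal sketch of that contract with a hypothetical SquareLayer (not part
of SINGA):

    #include <cassert>
    #include <stack>
    #include <vector>

    // Hypothetical layer y = x^2, so dx = dy * 2x; Forward saves x.
    struct SquareLayer {
      std::stack<std::vector<float>> states_;

      std::vector<float> Forward(const std::vector<float>& x) {
        states_.push(x);  // rule 1: save what Backward will need
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * x[i];
        return y;
      }

      std::vector<float> Backward(const std::vector<float>& dy) {
        assert(!states_.empty());
        std::vector<float> x = states_.top();  // rule 2: pop in reverse
        states_.pop();
        std::vector<float> dx(dy.size());
        for (size_t i = 0; i < dy.size(); ++i) dx[i] = dy[i] * 2.0f * x[i];
        return dx;
      }
    };

    int main() {
      SquareLayer layer;
      layer.Forward({1, 2});          // time step 1
      layer.Forward({3, 4});          // step 2 reuses the same layer
      layer.Backward({1, 1});         // consumes step 2's saved input
      layer.Backward({1, 1});         // consumes step 1's saved input
      assert(layer.states_.empty());  // empty again at iteration end
      return 0;
    }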

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/test/singa/test_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dropout.cc b/test/singa/test_dropout.cc
index cfe9d73..3190ecd 100644
--- a/test/singa/test_dropout.cc
+++ b/test/singa/test_dropout.cc
@@ -19,11 +19,82 @@
 *
 *************************************************************/
 
-#include "gtest/gtest.h"
 #include "../src/model/layer/dropout.h"
+#include "gtest/gtest.h"
+
+using singa::Dropout;
+TEST(DropoutLayer, Setup) {
+  Dropout drop;
+  EXPECT_EQ("Dropout", drop.layer_type());
+
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(0.8);
+
+  drop.Setup(conf);
+  EXPECT_EQ(0.8f, drop.dropout_ratio());
+}
+
+TEST(DropoutLayer, Forward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Tensor in(singa::Shape{n});
+  in.CopyDataFromHostPtr(x, n);
+
+  float pdrop = 0.5;
+  Dropout drop;
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(pdrop);
+  drop.Setup(conf);
+  float scale = 1.0f / (1.0f - pdrop);
+
+  singa::Tensor out1 = drop.Forward(singa::kTrain, in);
+
+  const float* mptr = static_cast<const float*>(drop.mask().blob()->data());
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(0, mptr[i] * (mptr[i] - scale));
+
+  const float* outptr1 = static_cast<const float*>(out1.blob()->data());
+  EXPECT_EQ(n, out1.Size());
+  // the output value should be 0 or the same as the input
+  EXPECT_EQ(0.f, outptr1[0] * (outptr1[0] - scale * x[0]));
+  EXPECT_EQ(0.f, outptr1[1] * (outptr1[1] - scale * x[1]));
+  EXPECT_EQ(0.f, outptr1[7] * (outptr1[7] - scale * x[7]));
+
+  singa::Tensor out2 = drop.Forward(singa::kEval, in);
+  EXPECT_EQ(n, out2.Size());
+  const float* outptr2 = static_cast<const float*>(out2.blob()->data());
+  // the output value should be the same as the input
+  EXPECT_EQ(x[0], outptr2[0]);
+  EXPECT_EQ(x[1], outptr2[1]);
+  EXPECT_EQ(x[7], outptr2[7]);
+}
+
+TEST(DropoutLayer, Backward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Tensor in(singa::Shape{n});
+  in.CopyDataFromHostPtr(x, n);
 
+  float pdrop = 0.5;
+  float scale = 1.0f / (1.0f - pdrop);
 
-TEST(TestDropoutLayer, Setup) {
+  Dropout drop;
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(pdrop);
+  drop.Setup(conf);
+  singa::Tensor out1 = drop.Forward(singa::kTrain, in);
 
+  const float dy[] = {4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f};
+  singa::Tensor grad(singa::Shape{n});
+  grad.CopyDataFromHostPtr(dy, n);
 
+  const float* mptr = static_cast<const float*>(drop.mask().blob()->data());
+  const auto ret = drop.Backward(singa::kTrain, grad);
+  const float* dx = static_cast<const float*>(ret.first.blob()->data());
+  EXPECT_FLOAT_EQ(dx[0], dy[0] * (mptr[0] > 0 ? 1.0f : 0.0f) * scale);
+  EXPECT_FLOAT_EQ(dx[1], dy[1] * (mptr[1] > 0) * scale);
+  EXPECT_FLOAT_EQ(dx[7], dy[7] * (mptr[7] > 0) * scale);
 }
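
The assertions above rely on a small identity: a * (a - c) == 0 exactly
when a is 0 or c, so a single EXPECT covers both admissible outputs of the
random mask without knowing which one was drawn. The same check in
isolation (illustrative only):

    #include <cassert>

    int main() {
      const float scale = 2.0f;        // 1/(1-p) for p = 0.5
      for (float m : {0.0f, scale}) {  // the only legal mask values
        assert(m * (m - scale) == 0.0f);
      }
      return 0;
    }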

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c3a0558c/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index ae20823..8c3c901 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -107,7 +107,8 @@ TEST(TensorClass, T) {
   EXPECT_EQ(true, o.transpose());
   EXPECT_EQ(t.blob(), o.blob());
   EXPECT_EQ(t.data_type(), o.data_type());
-  EXPECT_TRUE((t.shape() ==  o.shape()));
+  EXPECT_EQ(t.shape()[0],  o.shape()[1]);
+  EXPECT_EQ(t.shape()[1],  o.shape()[0]);
 }
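
The updated assertions pin down what T() means here: a zero-copy view that
shares the underlying blob, toggles the transpose flag, and reports the two
dimensions swapped. A sketch of that view semantics with a hypothetical
Matrix type (assumed behavior for illustration, not SINGA's Tensor):

    #include <cassert>
    #include <memory>
    #include <utility>
    #include <vector>

    // Hypothetical 2-D tensor: T() shares storage, swaps reported shape.
    struct Matrix {
      std::shared_ptr<std::vector<float>> data;
      size_t rows, cols;
      bool transposed;

      Matrix T() const {
        Matrix view = *this;  // shallow copy: 'data' is shared
        std::swap(view.rows, view.cols);
        view.transposed = !transposed;
        return view;
      }
    };

    int main() {
      Matrix t{std::make_shared<std::vector<float>>(6), 2, 3, false};
      Matrix o = t.T();
      assert(o.transposed);
      assert(o.data == t.data);  // same blob, no copy made
      assert(o.rows == t.cols && o.cols == t.rows);
      return 0;
    }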
 
 


[49/50] [abbrv] incubator-singa git commit: SINGA-196 Rename class Blob to Block

Posted by zh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 9a8839e..3488b55 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -32,7 +32,7 @@ namespace singa {
 
 /// out[i] = |in[i]|
 template <>
-void Abs<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void Abs<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                             Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -40,16 +40,16 @@ void Abs<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
 }
 /// out = in + x
 template <>
-void Add<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                            Blob* out, Context* ctx) {
+void Add<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                            Block* out, Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
   cuda::add(num, inPtr, x, outPtr, ctx->stream);
 }
 /// out = in1 + in2
 template <>
-void Add<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
-                            Blob* out, Context* ctx) {
+void Add<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, Block* out, Context* ctx) {
   const float* inPtr1 = static_cast<const float*>(in1->data());
   const float* inPtr2 = static_cast<const float*>(in2->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -59,7 +59,7 @@ void Add<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
 /// if x>high, then x=high; if x<low, then x=low.
 template <>
 void Clamp<float, lang::Cuda>(const size_t num, const float low,
-                              const float high, const Blob* in, Blob* out,
+                              const float high, const Block* in, Block* out,
                               Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -67,8 +67,8 @@ void Clamp<float, lang::Cuda>(const size_t num, const float low,
 }
 /// out = in1 / in2
 template <>
-void Div<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
-                            Blob* out, Context* ctx) {
+void Div<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, Block* out, Context* ctx) {
   const float* inPtr1 = static_cast<const float*>(in1->data());
   const float* inPtr2 = static_cast<const float*>(in2->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -76,8 +76,8 @@ void Div<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
 }
 
 template <>
-void Div<float, lang::Cuda>(const size_t num, const float x, const Blob* in,
-                            Blob* out, Context* ctx) {
+void Div<float, lang::Cuda>(const size_t num, const float x, const Block* in,
+                            Block* out, Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
   cuda::div(num, x, inPtr, outPtr, ctx->stream);
@@ -85,16 +85,17 @@ void Div<float, lang::Cuda>(const size_t num, const float x, const Blob* in,
 
 /// out = in * x
 template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in,
-                                    const float x, Blob* out, Context* ctx) {
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Block* in,
+                                    const float x, Block* out, Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
   cuda::mult(num, inPtr, x, outPtr, ctx->stream);
 }
 /// out = in1 * in2
 template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in1,
-                                    const Blob* in2, Blob* out, Context* ctx) {
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Block* in1,
+                                    const Block* in2, Block* out,
+                                    Context* ctx) {
   const float* inPtr1 = static_cast<const float*>(in1->data());
   const float* inPtr2 = static_cast<const float*>(in2->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -102,7 +103,7 @@ void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in1,
 }
 /// Base is e. out[i]=e^in[i]
 template <>
-void Exp<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void Exp<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                             Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -110,24 +111,24 @@ void Exp<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
 }
 
 template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                           Blob* out, Context* ctx) {
+void GE<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                           Block* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->mutable_data());
   const float* inPtr = static_cast<const float*>(in->data());
   cuda::ge(num, inPtr, x, outPtr, ctx->stream);
 }
 
 template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                           Blob* out, Context* ctx) {
+void GT<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                           Block* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->mutable_data());
   const float* inPtr = static_cast<const float*>(in->data());
   cuda::gt(num, inPtr, x, outPtr, ctx->stream);
 }
 
 template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                           Blob* out, Context* ctx) {
+void LE<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                           Block* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->mutable_data());
   const float* inPtr = static_cast<const float*>(in->data());
   cuda::le(num, inPtr, x, outPtr, ctx->stream);
@@ -135,15 +136,15 @@ void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
 
 /// Natural logarithm, base e (Euler's number): out[i]=ln(in[i]).
 template <>
-void Log<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void Log<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                             Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
   cuda::log(num, inPtr, outPtr, ctx->stream);
 }
 template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                           Blob* out, Context* ctx) {
+void LT<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                           Block* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->mutable_data());
   const float* inPtr = static_cast<const float*>(in->data());
   cuda::lt(num, inPtr, x, outPtr, ctx->stream);
@@ -151,16 +152,16 @@ void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
 
 /// Element-wise operation, out[i] = in[i]^x
 template <>
-void Pow<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                            Blob* out, Context* ctx) {
+void Pow<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                            Block* out, Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
   cuda::pow(num, inPtr, x, outPtr, ctx->stream);
 }
 /// Element-wise operation, out[i] = in1[i]^in2[i]
 template <>
-void Pow<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
-                            Blob* out, Context* ctx) {
+void Pow<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, Block* out, Context* ctx) {
   const float* inPtr1 = static_cast<const float*>(in1->data());
   const float* inPtr2 = static_cast<const float*>(in2->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -169,7 +170,7 @@ void Pow<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
 
 /// Element-wise operation, out[i]=max(0, in[i])
 template <>
-void ReLU<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void ReLU<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -178,14 +179,14 @@ void ReLU<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
 
 /// out[i] = x
 template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+void Set<float, lang::Cuda>(const size_t num, const float x, Block* out,
                             Context* ctx) {
   float* outPtr = static_cast<float*>(out->mutable_data());
   cuda::set(num, x, outPtr, ctx->stream);
 }
 /// Element-wise operation, out[i]=sigmoid(in[i])
 template <>
-void Sigmoid<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void Sigmoid<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                                 Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -193,7 +194,7 @@ void Sigmoid<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
 }
 // out[i] = sign(in[i])
 template <>
-void Sign<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void Sign<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -202,7 +203,7 @@ void Sign<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
 
 /// Element-wise operation, out[i]=sqrt(in[i])
 template <>
-void Sqrt<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void Sqrt<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -211,7 +212,7 @@ void Sqrt<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
 
 /// Element-wise operation, out[i]=in[i]^2
 template <>
-void Square<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void Square<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                                Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -219,8 +220,8 @@ void Square<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
 }
 /// out = in1 - in2
 template <>
-void Sub<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
-                            Blob* out, Context* ctx) {
+void Sub<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, Block* out, Context* ctx) {
   const float* inPtr1 = static_cast<const float*>(in1->data());
   const float* inPtr2 = static_cast<const float*>(in2->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -229,7 +230,7 @@ void Sub<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
 
 /// sum all elements of input into out
 template <>
-void Sum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
                             Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   cuda::sum(num, inPtr, out, ctx->stream);
@@ -237,7 +238,7 @@ void Sum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
 
 /// Element-wise operation, out[i]=tanh(in[i])
 template <>
-void Tanh<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+void Tanh<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -249,7 +250,7 @@ void Tanh<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
 template <>
-void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Blob* out,
+void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Block* out,
                                   Context* ctx) {
   auto rgen = ctx->curand_generator;
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -261,7 +262,7 @@ void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Blob* out,
 // If DType is not float, then convert the low and high to DType
 template <>
 void Uniform<float, lang::Cuda>(const size_t num, const float low,
-                                const float high, Blob* out, Context* ctx) {
+                                const float high, Block* out, Context* ctx) {
   auto rgen = ctx->curand_generator;
   float* outPtr = static_cast<float*>(out->mutable_data());
   CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
@@ -273,7 +274,7 @@ void Uniform<float, lang::Cuda>(const size_t num, const float low,
 // If DType is not float, then convert the mean and delta to DType
 template <>
 void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
-                                 const float std, Blob* out, Context* ctx) {
+                                 const float std, Block* out, Context* ctx) {
   auto rgen = ctx->curand_generator;
   float* outPtr = static_cast<float*>(out->mutable_data());
   CURAND_CHECK(curandGenerateNormal(rgen, outPtr, num, mean, std));
@@ -282,7 +283,7 @@ void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
 // =========================Blas operations==================================
 // ref to http://docs.nvidia.com/cuda/cublas
 template <>
-void Amax<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+void Amax<float, lang::Cuda>(const size_t num, const Block* in, size_t* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
@@ -293,7 +294,7 @@ void Amax<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
 
 /// return the index of the element with the min value.
 template <>
-void Amin<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+void Amin<float, lang::Cuda>(const size_t num, const Block* in, size_t* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
@@ -304,7 +305,7 @@ void Amin<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
 
 /// out = sum |x| for all x in in
 template <>
-void Asum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+void Asum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
@@ -314,7 +315,7 @@ void Asum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
 /// out = alpha * in + out
 template <>
 void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
-                             const Blob* in, Blob* out, Context* ctx) {
+                             const Block* in, Block* out, Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
@@ -323,22 +324,22 @@ void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
 
 /// out = \sum_i in1[i] * in2[i]
 template <>
-void Dot<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
-                            float* out, Context* ctx) {
+void Dot<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, float* out, Context* ctx) {
   const float* inPtr1 = static_cast<const float*>(in1->data());
   const float* inPtr2 = static_cast<const float*>(in2->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
 }
 template <>
-void Nrm2<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+void Nrm2<float, lang::Cuda>(const size_t num, const Block* in, float* out,
                              Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   const float* inPtr = static_cast<const float*>(in->data());
   cublasSnrm2(handle, num, inPtr, 1, out);
 }
 template <>
-void Scale<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+void Scale<float, lang::Cuda>(const size_t num, const float x, Block* out,
                               Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -348,8 +349,8 @@ void Scale<float, lang::Cuda>(const size_t num, const float x, Blob* out,
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
-                             const size_t ncol, const Blob* M, const Blob* v,
-                             Blob* out, Context* ctx) {
+                             const size_t ncol, const Block* M, const Block* v,
+                             Block* out, Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   const float* MPtr = static_cast<const float*>(M->data());
   const float* vPtr = static_cast<const float*>(v->data());
@@ -364,8 +365,8 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
 }
 template <>
 void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
-                             const float alpha, const Blob* A, const Blob* v,
-                             const float beta, Blob* out, Context* ctx) {
+                             const float alpha, const Block* A, const Block* v,
+                             const float beta, Block* out, Context* ctx) {
   const float* APtr = static_cast<const float*>(A->data());
   const float* vPtr = static_cast<const float*>(v->data());
   float* outPtr = static_cast<float*>(out->mutable_data());
@@ -383,8 +384,8 @@ template <>
 void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
                              const size_t nrowA, const size_t ncolB,
                              const size_t ncolA, const float alpha,
-                             const Blob* A, const Blob* B, const float beta,
-                             Blob* C, Context* ctx) {
+                             const Block* A, const Block* B, const float beta,
+                             Block* C, Context* ctx) {
   auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
   auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
   int lda = transA ? nrowA : ncolA;
@@ -400,23 +401,23 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
 
 template <>
 void ComputeCrossEntropy<float, lang::Cuda>(const size_t batchsize,
-                                            const size_t dim, const Blob *p,
-                                            const Blob *t, Blob *loss,
-                                            Context *ctx) {
-  const float *pPtr = static_cast<const float *>(p->data());
-  const int *tPtr = static_cast<const int *>(t->data());
-  float *lossPtr = static_cast<float *>(loss->mutable_data());
+                                            const size_t dim, const Block* p,
+                                            const Block* t, Block* loss,
+                                            Context* ctx) {
+  const float* pPtr = static_cast<const float*>(p->data());
+  const int* tPtr = static_cast<const int*>(t->data());
+  float* lossPtr = static_cast<float*>(loss->mutable_data());
   cuda::ComputeCrossEntropy(batchsize, dim, pPtr, tPtr, lossPtr, ctx->stream);
 }
 template <>
 void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
-                                               const size_t dim, const Blob *p,
-                                               const Blob *t, Blob *grad,
-                                               Context *ctx) {
+                                               const size_t dim, const Block* p,
+                                               const Block* t, Block* grad,
+                                               Context* ctx) {
   CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
-  const float *pPtr = static_cast<const float *>(p->data());
-  const int *tPtr = static_cast<const int *>(t->data());
-  float *gradPtr = static_cast<float *>(grad->mutable_data());
+  const float* pPtr = static_cast<const float*>(p->data());
+  const int* tPtr = static_cast<const int*>(t->data());
+  float* gradPtr = static_cast<float*>(grad->mutable_data());
   cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
                                ctx->stream);
 }
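
The GEMM wrapper above stores matrices row-major while cuBLAS expects
column-major; as the linked note explains, a row-major C = A * B can be
computed without explicit transposes by asking cuBLAS for B * A with the
dimensions swapped, since the column-major view of a row-major matrix is
its transpose. A compilable sketch of that trick (error handling and
cudaFree calls elided; requires the CUDA toolkit and a GPU):

    #include <cstdio>
    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    // Row-major C(M x N) = A(M x K) * B(K x N): cuBLAS sees the row-major
    // buffers as B^T and A^T, so computing B*A column-major yields (A*B)^T
    // column-major, i.e. A*B row-major.
    void RowMajorGemm(cublasHandle_t h, int M, int N, int K,
                      const float* A, const float* B, float* C) {
      const float alpha = 1.0f, beta = 0.0f;
      cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K,
                  &alpha, B, N, A, K, &beta, C, N);
    }

    int main() {
      cublasHandle_t h;
      cublasCreate(&h);
      float hA[2 * 3] = {1, 2, 3, 4, 5, 6};     // 2x3 row-major
      float hB[3 * 2] = {7, 8, 9, 10, 11, 12};  // 3x2 row-major
      float *dA, *dB, *dC;
      cudaMalloc(&dA, sizeof(hA));
      cudaMalloc(&dB, sizeof(hB));
      cudaMalloc(&dC, 4 * sizeof(float));
      cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
      cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);
      RowMajorGemm(h, 2, 2, 3, dA, dB, dC);
      float hC[4];
      cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
      for (float v : hC) std::printf("%g ", v);  // expect: 58 64 139 154
      std::printf("\n");
      cublasDestroy(h);
      return 0;
    }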

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.cc b/src/model/layer/cudnn_activation.cc
index 8ecbbc7..98a5758 100644
--- a/src/model/layer/cudnn_activation.cc
+++ b/src/model/layer/cudnn_activation.cc
@@ -63,18 +63,18 @@ const Tensor CudnnActivation::Forward(int flag, const Tensor& input) {
   Tensor output;
   output.ResetLike(input);
   output.device()->Exec([input, output, this](Context* ctx) {
-    Blob* inblob = input.blob(), * outblob = output.blob();
+    Block* inblock = input.block(), * outblock = output.block();
     float alpha = 1.0f, beta = 0.0f;
 #if CUDNN_VERSION_MAJOR == 5
     CUDNN_CHECK(cudnnActivationForward(
         ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_,
-        inblob->data(), &beta, this->desc_, outblob->mutable_data()));
+        inblock->data(), &beta, this->desc_, outblock->mutable_data()));
 #elif CUDNN_VERSION_MAJOR == 4
     CUDNN_CHECK(cudnnActivationForward_v4(
         ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_,
-        inblob->data(), &beta, this->desc_, outblob->mutable_data()));
+        inblock->data(), &beta, this->desc_, outblock->mutable_data()));
 #endif
-  }, {input.blob()}, {output.blob()});
+  }, {input.block()}, {output.block()});
   if (flag & kTrain) {
     if (cudnn_mode_ == CUDNN_ACTIVATION_SIGMOID ||
         cudnn_mode_ == CUDNN_ACTIVATION_TANH) {
@@ -97,21 +97,21 @@ const std::pair<Tensor, vector<Tensor>> CudnnActivation::Backward(
   buf_.pop();
   dx.ResetLike(grad);
   dx.device()->Exec([dx, grad, inout, this](Context* ctx) {
-    Blob* dyblob = grad.blob(), * dxblob = dx.blob(), * yblob = inout.blob(),
-          * xblob = inout.blob();
+    Block* dyblock = grad.block(), * dxblock = dx.block(),
+           * yblock = inout.block(), * xblock = inout.block();
     float alpha = 1.0f, beta = 0.0f;
 #if CUDNN_VERSION_MAJOR == 5
     CUDNN_CHECK(cudnnActivationBackward(
-        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_, yblob->data(),
-        this->desc_, dyblob->data(), this->desc_, xblob->data(), &beta,
-        this->desc_, dxblob->mutable_data()));
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_,
+        yblock->data(), this->desc_, dyblock->data(), this->desc_,
+        xblock->data(), &beta, this->desc_, dxblock->mutable_data()));
 #elif CUDNN_VERSION_MAJOR == 4
     CUDNN_CHECK(cudnnActivationBackward_v4(
-        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_, yblob->data(),
-        this->desc_, dyblob->data(), this->desc_, xblob->data(), &beta,
-        this->desc_, dxblob->mutable_data()));
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_, yblock->data(),
+        this->desc_, dyblock->data(), this->desc_, xblock->data(), &beta,
+        this->desc_, dxblock->mutable_data()));
 #endif
-  }, {grad.blob(), inout.blob()}, {dx.blob()});
+  }, {grad.block(), inout.block()}, {dx.block()});
   return std::make_pair(dx, param_grad);
 }
 }  // namespace singa
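
The Backward pass above feeds the saved output y back into cuDNN because
for sigmoid and tanh the derivative can be written in terms of the output
alone: sigmoid'(x) = y * (1 - y) and tanh'(x) = 1 - y^2, which is why only
the forward output needs to be buffered. The same gradients in plain C++
(illustrative, not the cuDNN kernels):

    #include <cmath>
    #include <cstdio>

    // dx from dy using only the saved forward output y.
    float SigmoidBackward(float dy, float y) { return dy * y * (1.0f - y); }
    float TanhBackward(float dy, float y) { return dy * (1.0f - y * y); }

    int main() {
      float x = 0.3f;
      float ys = 1.0f / (1.0f + std::exp(-x));  // sigmoid forward
      float yt = std::tanh(x);                  // tanh forward
      std::printf("d_sigmoid = %f, d_tanh = %f\n",
                  SigmoidBackward(1.0f, ys), TanhBackward(1.0f, yt));
      return 0;
    }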

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/model/layer/cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_batchnorm.cc b/src/model/layer/cudnn_batchnorm.cc
index 8288a41..1393916 100644
--- a/src/model/layer/cudnn_batchnorm.cc
+++ b/src/model/layer/cudnn_batchnorm.cc
@@ -81,13 +81,13 @@ const Tensor CudnnBatchNorm::Forward(int flag, const Tensor& input) {
   if ((flag & kTrain) == kTrain) {
     output.device()->Exec(
         [=](Context* ctx) {
-          Blob *inBlob = input.blob(), *outBlob = output.blob(),
-            *saveMeanBlob = resultSaveMean_.blob(),
-            *saveVarBlob = resultSaveVariance_.blob(),
-            *runningMeanBlob = runningMean_.blob(),
-            *runningVarBlob = runningVariance_.blob(),
-            *bnScaleBlob = bnScale_.blob(),
-            *bnBiasBlob = bnBias_.blob();
+          Block *inBlock = input.block(), *outBlock = output.block(),
+            *saveMeanBlock = resultSaveMean_.block(),
+            *saveVarBlock = resultSaveVariance_.block(),
+            *runningMeanBlock = runningMean_.block(),
+            *runningVarBlock = runningVariance_.block(),
+            *bnScaleBlock = bnScale_.block(),
+            *bnBiasBlock = bnBias_.block();
           const float alpha = 1.0f, beta = 0.0f;
           double epsilon = CUDNN_BN_MIN_EPSILON;
           CUDNN_CHECK(cudnnBatchNormalizationForwardTraining(
@@ -96,36 +96,36 @@ const Tensor CudnnBatchNorm::Forward(int flag, const Tensor& input) {
               &alpha,
               &beta,
               shape_desc_,
-              inBlob->data(),
+              inBlock->data(),
               shape_desc_,
-              outBlob->mutable_data(),
+              outBlock->mutable_data(),
               param_desc_,
-              bnScaleBlob->data(),
-              bnBiasBlob->data(),
+              bnScaleBlock->data(),
+              bnBiasBlock->data(),
               factor_,
-              runningMeanBlob->mutable_data(),
-              runningVarBlob->mutable_data(),
+              runningMeanBlock->mutable_data(),
+              runningVarBlock->mutable_data(),
               epsilon,
-              saveMeanBlob->mutable_data(),
-              saveVarBlob->mutable_data()));
+              saveMeanBlock->mutable_data(),
+              saveVarBlock->mutable_data()));
         },
-        {input.blob(),
-         bnScale_.blob(),
-         bnBias_.blob()},
-        {output.blob(),
-         runningMean_.blob(),
-         runningVariance_.blob(),
-         resultSaveMean_.blob(),
-         resultSaveVariance_.blob()});
+        {input.block(),
+         bnScale_.block(),
+         bnBias_.block()},
+        {output.block(),
+         runningMean_.block(),
+         runningVariance_.block(),
+         resultSaveMean_.block(),
+         resultSaveVariance_.block()});
     buf_.push(input);
   } else {
     output.device()->Exec(
         [=](Context* ctx) {
-          Blob *inBlob = input.blob(), *outBlob = output.blob(),
-            *runningMeanBlob = runningMean_.blob(),
-            *runningVarBlob = runningVariance_.blob(),
-            *bnScaleBlob = bnScale_.blob(),
-            *bnBiasBlob = bnBias_.blob();
+          Block *inBlock = input.block(), *outBlock = output.block(),
+            *runningMeanBlock = runningMean_.block(),
+            *runningVarBlock = runningVariance_.block(),
+            *bnScaleBlock = bnScale_.block(),
+            *bnBiasBlock = bnBias_.block();
           const float alpha = 1.0f, beta = 0.0f;
           double epsilon = CUDNN_BN_MIN_EPSILON;
           CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
@@ -134,22 +134,22 @@ const Tensor CudnnBatchNorm::Forward(int flag, const Tensor& input) {
               &alpha,
               &beta,
               shape_desc_,
-              inBlob->data(),
+              inBlock->data(),
               shape_desc_,
-              outBlob->mutable_data(),
+              outBlock->mutable_data(),
               param_desc_,
-              bnScaleBlob->data(),
-              bnBiasBlob->data(),
-              runningMeanBlob->data(),
-              runningVarBlob->data(),
+              bnScaleBlock->data(),
+              bnBiasBlock->data(),
+              runningMeanBlock->data(),
+              runningVarBlock->data(),
               epsilon));
         },
-        {input.blob(),
-         bnScale_.blob(),
-         bnBias_.blob(),
-         runningMean_.blob(),
-         runningVariance_.blob()},
-        {output.blob()});
+        {input.block(),
+         bnScale_.block(),
+         bnBias_.block(),
+         runningMean_.block(),
+         runningVariance_.block()},
+        {output.block()});
   }
   return output;
 }
@@ -164,13 +164,13 @@ const std::pair<Tensor, vector<Tensor>> CudnnBatchNorm::Backward(
     dx.ResetLike(grad);
     dx.device()->Exec(
         [=](Context* ctx) {
-          Blob *dyblob = grad.blob(), *dxblob = dx.blob(),
-            *xblob = input.blob(),
-            *bnScaleBlob = bnScale_.blob(),
-            *dbnScaleBlob = dbnScale_.blob(),
-            *dbnBiasBlob = dbnBias_.blob(),
-            *saveMeanBlob = resultSaveMean_.blob(),
-            *saveVarBlob = resultSaveVariance_.blob();
+          Block *dyblock = grad.block(), *dxblock = dx.block(),
+            *xblock = input.block(),
+            *bnScaleBlock = bnScale_.block(),
+            *dbnScaleBlock = dbnScale_.block(),
+            *dbnBiasBlock = dbnBias_.block(),
+            *saveMeanBlock = resultSaveMean_.block(),
+            *saveVarBlock = resultSaveVariance_.block();
           const float alpha = 1.0f, beta = .0f;
           double epsilon = CUDNN_BN_MIN_EPSILON;
           CUDNN_CHECK(cudnnBatchNormalizationBackward(ctx->cudnn_handle,
@@ -180,28 +180,28 @@ const std::pair<Tensor, vector<Tensor>> CudnnBatchNorm::Backward(
               &alpha,
               &beta,
               shape_desc_,
-              xblob->data(),
+              xblock->data(),
               shape_desc_,
-              dyblob->data(),
+              dyblock->data(),
               shape_desc_,
-              dxblob->mutable_data(),
+              dxblock->mutable_data(),
               param_desc_,
-              bnScaleBlob->data(),
-              dbnScaleBlob->mutable_data(),
-              dbnBiasBlob->mutable_data(),
+              bnScaleBlock->data(),
+              dbnScaleBlock->mutable_data(),
+              dbnBiasBlock->mutable_data(),
               epsilon,
-              saveMeanBlob->data(),
-              saveVarBlob->data()));
+              saveMeanBlock->data(),
+              saveVarBlock->data()));
 
         },
-        {dx.blob(),
-         grad.blob(),
-         bnScale_.blob(),
-         resultSaveMean_.blob(),
-         resultSaveVariance_.blob()},
-        {dx.blob(),
-         dbnScale_.blob(),
-         dbnBias_.blob()});
+        {dx.block(),
+         grad.block(),
+         bnScale_.block(),
+         resultSaveMean_.block(),
+         resultSaveVariance_.block()},
+        {dx.block(),
+         dbnScale_.block(),
+         dbnBias_.block()});
   } else {
     LOG(ERROR) << "Do not call backward for evaluation phase";
   }
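
Two details above are easy to miss: factor_ is passed as cuDNN's
exponentialAverageFactor, so training folds each batch's statistics into
the running ones as running = (1 - factor) * running + factor * batch, and
the inference path then normalizes with those frozen statistics. A scalar
sketch of both steps (one feature only; the per-channel vectorization is
left aside):

    #include <cmath>
    #include <cstdio>

    int main() {
      const float factor = 0.1f, epsilon = 1e-5f;
      float running_mean = 0.0f, running_var = 1.0f;

      // Training: fold the current batch statistics into the running ones.
      float batch_mean = 0.5f, batch_var = 2.0f;
      running_mean = (1.0f - factor) * running_mean + factor * batch_mean;
      running_var = (1.0f - factor) * running_var + factor * batch_var;

      // Inference: normalize with the frozen running statistics.
      float gamma = 1.0f, beta = 0.0f, x = 0.7f;
      float y =
          gamma * (x - running_mean) / std::sqrt(running_var + epsilon) +
          beta;
      std::printf("y = %f\n", y);
      return 0;
    }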

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index b80c3bd..efc7f88 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -167,30 +167,26 @@ const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
 
   Shape shape{batchsize, num_filters_, conv_height_, conv_width_};
   Tensor output(shape, dev, dtype);
-  output.device()->Exec(
-      [input, output, this](Context *ctx) {
-        Blob *inblob = input.blob(), *outblob = output.blob(),
-             *wblob = this->weight_.blob();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
-                                inblob->data(), this->filter_desc_,
-                                wblob->data(), this->conv_desc_, this->fp_alg_,
-                                this->workspace_.blob()->mutable_data(),
-                                this->workspace_count_ * sizeof(float), &beta,
-                                this->y_desc_, outblob->mutable_data());
-      },
-      {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
+  output.device()->Exec([input, output, this](Context *ctx) {
+    Block *inblock = input.block(), *outblock = output.block(),
+          *wblock = this->weight_.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
+                            inblock->data(), this->filter_desc_, wblock->data(),
+                            this->conv_desc_, this->fp_alg_,
+                            this->workspace_.block()->mutable_data(),
+                            this->workspace_count_ * sizeof(float), &beta,
+                            this->y_desc_, outblock->mutable_data());
+  }, {input.block(), weight_.block()}, {output.block()}, workspace_.block());
 
   if (bias_term_) {
-    output.device()->Exec(
-        [output, this](Context *ctx) {
-          float beta = 1.f, alpha = 1.0f;
-          Blob *outblob = output.blob(), *bblob = this->bias_.blob();
-          cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
-                         bblob->data(), &beta, this->y_desc_,
-                         outblob->mutable_data());
-        },
-        {output.blob(), bias_.blob()}, {output.blob()});
+    output.device()->Exec([output, this](Context *ctx) {
+      float beta = 1.f, alpha = 1.0f;
+      Block *outblock = output.block(), *bblock = this->bias_.block();
+      cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
+                     bblock->data(), &beta, this->y_desc_,
+                     outblock->mutable_data());
+    }, {output.block(), bias_.block()}, {output.block()});
   }
   return output;
 }
@@ -212,45 +208,39 @@ const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
 
   // LOG(ERROR) << "backward bias";
   if (bias_term_) {
-    dx.device()->Exec(
-        [grad, db, this](Context *ctx) {
-          Blob *dyblob = grad.blob(), *dbblob = db.blob();
-          float alpha = 1.f, beta = 0.f;
-          cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
-                                       dyblob->data(), &beta, this->bias_desc_,
-                                       dbblob->mutable_data());
-        },
-        {grad.blob()}, {db.blob()});
+    dx.device()->Exec([grad, db, this](Context *ctx) {
+      Block *dyblock = grad.block(), *dbblock = db.block();
+      float alpha = 1.f, beta = 0.f;
+      cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
+                                   dyblock->data(), &beta, this->bias_desc_,
+                                   dbblock->mutable_data());
+    }, {grad.block()}, {db.block()});
   }
   // LOG(ERROR) << "backward w";
-  dx.device()->Exec(
-      [grad, dw, src_data, this](Context *ctx) {
-        Blob *inblob = src_data.blob(), *dyblob = grad.blob(),
-             *dwblob = dw.blob();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(),
-            this->y_desc_, dyblob->data(), this->conv_desc_,
-            this->bp_filter_alg_, this->workspace_.blob()->mutable_data(),
-            this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
-            dwblob->mutable_data());
-      },
-      {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
+  dx.device()->Exec([grad, dw, src_data, this](Context *ctx) {
+    Block *inblock = src_data.block(), *dyblock = grad.block(),
+          *dwblock = dw.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardFilter(
+        ctx->cudnn_handle, &alpha, this->x_desc_, inblock->data(),
+        this->y_desc_, dyblock->data(), this->conv_desc_, this->bp_filter_alg_,
+        this->workspace_.block()->mutable_data(),
+        this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
+        dwblock->mutable_data());
+  }, {grad.block(), src_data.block()}, {dw.block(), workspace_.block()});
 
   // LOG(ERROR) << "backward src";
-  dx.device()->Exec(
-      [dx, grad, this](Context *ctx) {
-        Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
-             *dxblob = dx.blob();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardData(
-            ctx->cudnn_handle, &alpha, this->filter_desc_, wblob->data(),
-            this->y_desc_, dyblob->data(), this->conv_desc_, this->bp_data_alg_,
-            this->workspace_.blob()->mutable_data(),
-            this->workspace_count_ * sizeof(float), &beta, this->x_desc_,
-            dxblob->mutable_data());
-      },
-      {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
+  dx.device()->Exec([dx, grad, this](Context *ctx) {
+    Block *wblock = this->weight_.block(), *dyblock = grad.block(),
+          *dxblock = dx.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, this->filter_desc_,
+                                 wblock->data(), this->y_desc_, dyblock->data(),
+                                 this->conv_desc_, this->bp_data_alg_,
+                                 this->workspace_.block()->mutable_data(),
+                                 this->workspace_count_ * sizeof(float), &beta,
+                                 this->x_desc_, dxblock->mutable_data());
+  }, {grad.block(), weight_.block()}, {dx.block(), workspace_.block()});
   param_grad.push_back(dw);
   param_grad.push_back(db);
   return std::make_pair(dx, param_grad);
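
Of the three backward calls above, the bias gradient is the simplest to
state: db[c] is the sum of dy over every batch sample and spatial position
of channel c, which is what cudnnConvolutionBackwardBias computes on the
GPU. A naive CPU version for NCHW layout (illustrative only):

    #include <cstdio>
    #include <vector>

    // db[c] = sum over n, h, w of dy[n][c][h][w], NCHW layout.
    std::vector<float> ConvBackwardBias(const std::vector<float>& dy,
                                        int N, int C, int H, int W) {
      std::vector<float> db(C, 0.0f);
      for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
          for (int h = 0; h < H; ++h)
            for (int w = 0; w < W; ++w)
              db[c] += dy[((n * C + c) * H + h) * W + w];
      return db;
    }

    int main() {
      std::vector<float> dy(2 * 3 * 2 * 2, 1.0f);  // N=2, C=3, 2x2 maps
      for (float g : ConvBackwardBias(dy, 2, 3, 2, 2))
        std::printf("%g ", g);  // each channel sums to 2*2*2 = 8
      std::printf("\n");
      return 0;
    }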

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
index 64a581b..877dd12 100644
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@ -57,7 +57,7 @@ void CudnnDropout::InitCudnn(int size, DataType dtype, Device* dev,
   // TODO(wangwei) get seed from ctx or user config?
   auto seed = std::chrono::system_clock::now().time_since_epoch().count();
   cudnnSetDropoutDescriptor(drop_desc_, ctx->cudnn_handle, 1 - dropout_ratio_,
-                            state_.blob()->mutable_data(), state_size_, seed);
+                            state_.block()->mutable_data(), state_size_, seed);
   has_init_cudnn_ = true;
 }
 
@@ -67,24 +67,20 @@ const Tensor CudnnDropout::Forward(int flag, const Tensor& input) {
     DataType dtype = input.data_type();
     Device* dev = input.device();
     if (!has_init_cudnn_) {
-      input.device()->Exec(
-          [size, dtype, this, dev](Context* ctx) {
-            this->InitCudnn(size, dtype, dev, ctx);
-          },
-          {}, {this->state_.blob()});
+      input.device()->Exec([size, dtype, this, dev](Context* ctx) {
+        this->InitCudnn(size, dtype, dev, ctx);
+      }, {}, {this->state_.block()});
     }
     Tensor output;
     output.ResetLike(input);
-    output.device()->Exec(
-        [input, output, this](Context* ctx) {
-          Blob *inblob = input.blob(), *outblob = output.blob(),
-               *mblob = mask_.blob();
-          cudnnDropoutForward(ctx->cudnn_handle, this->drop_desc_,
-                              this->x_desc_, inblob->data(), this->y_desc_,
-                              outblob->mutable_data(), mblob->mutable_data(),
-                              this->reserve_size_);
-        },
-        {input.blob()}, {output.blob(), mask_.blob()});
+    output.device()->Exec([input, output, this](Context* ctx) {
+      Block* inblock = input.block(), * outblock = output.block(),
+             * mblock = mask_.block();
+      cudnnDropoutForward(ctx->cudnn_handle, this->drop_desc_, this->x_desc_,
+                          inblock->data(), this->y_desc_,
+                          outblock->mutable_data(), mblock->mutable_data(),
+                          this->reserve_size_);
+    }, {input.block()}, {output.block(), mask_.block()});
     return output;
   } else {
     return input;
@@ -97,16 +93,14 @@ const std::pair<Tensor, vector<Tensor>> CudnnDropout::Backward(
   Tensor dx;
   if (flag & kTrain) {
     dx.ResetLike(grad);
-    dx.device()->Exec(
-        [dx, grad, this](Context* ctx) {
-          Blob *dyblob = grad.blob(), *dxblob = dx.blob(),
-               *mblob = this->mask_.blob();
-          cudnnDropoutBackward(ctx->cudnn_handle, this->drop_desc_,
-                               this->y_desc_, dyblob->data(), this->x_desc_,
-                               dxblob->mutable_data(), mblob->mutable_data(),
-                               this->reserve_size_);
-        },
-        {grad.blob(), mask_.blob()}, {dx.blob()});
+    dx.device()->Exec([dx, grad, this](Context* ctx) {
+      Block* dyblock = grad.block(), * dxblock = dx.block(),
+             * mblock = this->mask_.block();
+      cudnnDropoutBackward(ctx->cudnn_handle, this->drop_desc_, this->y_desc_,
+                           dyblock->data(), this->x_desc_,
+                           dxblock->mutable_data(), mblock->mutable_data(),
+                           this->reserve_size_);
+    }, {grad.block(), mask_.block()}, {dx.block()});
   } else {
     LOG(ERROR) << "Do not call backward for evaluation phase";
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/model/layer/cudnn_lrn.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_lrn.cc b/src/model/layer/cudnn_lrn.cc
index 1012813..4dbf426 100644
--- a/src/model/layer/cudnn_lrn.cc
+++ b/src/model/layer/cudnn_lrn.cc
@@ -29,47 +29,32 @@ CudnnLRN::~CudnnLRN() {
     CUDNN_CHECK(cudnnDestroyTensorDescriptor(shape_desc_));
   }
 }
-void CudnnLRN::InitCudnn(const Shape& shape , DataType dtype) {
+void CudnnLRN::InitCudnn(const Shape& shape, DataType dtype) {
   CHECK(!has_init_cudnn_);
   mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
   CUDNN_CHECK(cudnnCreateTensorDescriptor(&shape_desc_));
   CHECK_EQ(shape.size(), 4u);
-  CUDNN_CHECK(cudnnSetTensor4dDescriptor(shape_desc_,
-      CUDNN_TENSOR_NCHW,
-      GetCudnnDataType(dtype),
-      shape[0],
-      shape[1],
-      shape[2],
-      shape[3]));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(shape_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), shape[0],
+                                         shape[1], shape[2], shape[3]));
   CUDNN_CHECK(cudnnCreateLRNDescriptor(&lrn_desc_));
-  CUDNN_CHECK(cudnnSetLRNDescriptor(lrn_desc_,
-        local_size_,
-        alpha_,
-        beta_,
-        k_));
+  CUDNN_CHECK(cudnnSetLRNDescriptor(lrn_desc_, local_size_, alpha_, beta_, k_));
   has_init_cudnn_ = true;
 }
 const Tensor CudnnLRN::Forward(int flag, const Tensor& input) {
   auto shape = input.shape();
   auto dtype = input.data_type();
-  if (!has_init_cudnn_)
-    InitCudnn(shape, dtype);
+  if (!has_init_cudnn_) InitCudnn(shape, dtype);
   Tensor output;
   output.ResetLike(input);
-  output.device()->Exec(
-      [=](Context* ctx) {
-      Blob *inblob = input.blob(), *outblob = output.blob();
-      const float alpha = 1.0f, beta = 0.0f;
-      CUDNN_CHECK(cudnnLRNCrossChannelForward(ctx->cudnn_handle,
-            this->lrn_desc_,
-            this->mode_,
-            &alpha,
-            this->shape_desc_,
-            inblob->data(),
-            &beta,
-            this->shape_desc_,
-            outblob->mutable_data()));
-      }, {input.blob()}, {output.blob()});
+  output.device()->Exec([=](Context* ctx) {
+    Block* inblock = input.block(), * outblock = output.block();
+    const float alpha = 1.0f, beta = 0.0f;
+    CUDNN_CHECK(cudnnLRNCrossChannelForward(
+        ctx->cudnn_handle, this->lrn_desc_, this->mode_, &alpha,
+        this->shape_desc_, inblock->data(), &beta, this->shape_desc_,
+        outblock->mutable_data()));
+  }, {input.block()}, {output.block()});
 
   if (flag & kTrain) {
     buf_.push(input);
@@ -78,9 +63,9 @@ const Tensor CudnnLRN::Forward(int flag, const Tensor& input) {
   return output;
 }
 
-const std::pair<Tensor, vector<Tensor>> CudnnLRN::Backward(
-    int flag, const Tensor& grad) {
-  vector <Tensor> param_grad;
+const std::pair<Tensor, vector<Tensor>> CudnnLRN::Backward(int flag,
+                                                           const Tensor& grad) {
+  vector<Tensor> param_grad;
   Tensor dx;
   CHECK(!buf_.empty());
   Tensor output = buf_.top();
@@ -89,25 +74,16 @@ const std::pair<Tensor, vector<Tensor>> CudnnLRN::Backward(
   buf_.pop();
   if ((flag & kTrain) == kTrain) {
     dx.ResetLike(grad);
-    dx.device()->Exec(
-        [=](Context *ctx) {
-          Blob *dyblob = grad.blob(), *dxblob = dx.blob();
-          Blob *yblob = output.blob(), *xblob = input.blob();
-          float alpha = 1.0f, beta = 0.0f;
-          CUDNN_CHECK(cudnnLRNCrossChannelBackward(ctx->cudnn_handle,
-              this->lrn_desc_,
-              this->mode_,
-              &alpha,
-              this->shape_desc_,
-              yblob->data(),
-              this->shape_desc_,
-              dyblob->data(),
-              this->shape_desc_,
-              xblob->data(),
-              &beta,
-              this->shape_desc_,
-              dxblob->mutable_data()));
-        }, {output.blob(), grad.blob(), input.blob()}, {dx.blob()});
+    dx.device()->Exec([=](Context* ctx) {
+      Block* dyblock = grad.block(), * dxblock = dx.block();
+      Block* yblock = output.block(), * xblock = input.block();
+      float alpha = 1.0f, beta = 0.0f;
+      CUDNN_CHECK(cudnnLRNCrossChannelBackward(
+          ctx->cudnn_handle, this->lrn_desc_, this->mode_, &alpha,
+          this->shape_desc_, yblock->data(), this->shape_desc_, dyblock->data(),
+          this->shape_desc_, xblock->data(), &beta, this->shape_desc_,
+          dxblock->mutable_data()));
+    }, {output.block(), grad.block(), input.block()}, {dx.block()});
   } else {
     LOG(ERROR) << "Do not call backward for evaluation phase";
   }

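The (local_size_, alpha_, beta_, k_) quadruple passed to cudnnSetLRNDescriptor configures the normalization window and strength; it is unrelated to the alpha/beta blending scalars handed to the forward and backward calls, which follow the usual cuDNN convention dst = alpha * op(src) + beta * dst. A hedged sketch of the descriptor setup with typical AlexNet-style values (the numbers are illustrative, not SINGA defaults):

    cudnnLRNDescriptor_t desc;
    CUDNN_CHECK(cudnnCreateLRNDescriptor(&desc));
    // Cross-channel LRN computes, per channel c, roughly
    //   y[c] = x[c] / (k + (alpha / n) * sum_{c' in window(c)} x[c']^2)^beta
    // with window size n = 5, scale alpha = 1e-4, exponent beta = 0.75, k = 2.
    CUDNN_CHECK(cudnnSetLRNDescriptor(desc, 5, 1e-4, 0.75, 2.0));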
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.cc b/src/model/layer/cudnn_pooling.cc
index 842685d..fb8256a 100644
--- a/src/model/layer/cudnn_pooling.cc
+++ b/src/model/layer/cudnn_pooling.cc
@@ -41,7 +41,7 @@ void CudnnPooling::Setup(const LayerConf &conf) {
     nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
 }
 
-void CudnnPooling::InitCudnn(const Tensor& input) {
+void CudnnPooling::InitCudnn(const Tensor &input) {
   CHECK(!has_init_cudnn_);
   DataType dtype = input.data_type();
   size_t batchsize = input.shape(0);
@@ -53,8 +53,8 @@ void CudnnPooling::InitCudnn(const Tensor& input) {
                                          GetCudnnDataType(dtype), batchsize,
                                          channels_, height_, width_));
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
-      channels_, pooled_height_, pooled_width_));
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize, channels_,
+      pooled_height_, pooled_width_));
   auto pool_method = CUDNN_POOLING_MAX;
   if (pool_ == PoolingConf_PoolMethod_MAX)
     pool_method = CUDNN_POOLING_MAX;
@@ -87,15 +87,13 @@ const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
 
   Shape shape{batchsize, channels_, pooled_height_, pooled_width_};
   Tensor output = Tensor(shape, dev, dtype);
-  output.device()->Exec(
-      [input, output, this](Context *ctx) {
-        Blob *inblob = input.blob(), *outblob = output.blob();
-        float alpha = 1.0f, beta = 0.0f;
-        cudnnPoolingForward(ctx->cudnn_handle, this->pool_desc_, &alpha,
-                            this->x_desc_, inblob->data(), &beta, this->y_desc_,
-                            outblob->mutable_data());
-      },
-      {input.blob()}, {output.blob()});
+  output.device()->Exec([input, output, this](Context *ctx) {
+    Block *inblock = input.block(), *outblock = output.block();
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnPoolingForward(ctx->cudnn_handle, this->pool_desc_, &alpha,
+                        this->x_desc_, inblock->data(), &beta, this->y_desc_,
+                        outblock->mutable_data());
+  }, {input.block()}, {output.block()});
   if (flag & kTrain) {
     buf_.push(input);
     buf_.push(output);
@@ -116,17 +114,15 @@ const std::pair<Tensor, vector<Tensor>> CudnnPooling::Backward(
   Tensor dx;
   dx.ResetLike(x);
 
-  dx.device()->Exec(
-      [dx, grad, x, y, this](Context *ctx) {
-        Blob *dyblob = grad.blob(), *dxblob = dx.blob(), *yblob = y.blob(),
-             *xblob = x.blob();
-        float alpha = 1.0f, beta = 0.0f;
-        cudnnPoolingBackward(ctx->cudnn_handle, this->pool_desc_, &alpha,
-                             this->y_desc_, yblob->data(), this->y_desc_,
-                             dyblob->data(), this->x_desc_, xblob->data(),
-                             &beta, this->x_desc_, dxblob->mutable_data());
-      },
-      {grad.blob(), y.blob(), x.blob()}, {dx.blob()});
+  dx.device()->Exec([dx, grad, x, y, this](Context *ctx) {
+    Block *dyblock = grad.block(), *dxblock = dx.block(), *yblock = y.block(),
+          *xblock = x.block();
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnPoolingBackward(ctx->cudnn_handle, this->pool_desc_, &alpha,
+                         this->y_desc_, yblock->data(), this->y_desc_,
+                         dyblock->data(), this->x_desc_, xblock->data(), &beta,
+                         this->x_desc_, dxblock->mutable_data());
+  }, {grad.block(), y.block(), x.block()}, {dx.block()});
 
   return std::make_pair(dx, param_grad);
 }

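The pooled_height_/pooled_width_ used to size y_desc_ come from the usual pooling shape arithmetic. A small self-contained sketch (one common convention; implementations differ on floor versus ceil rounding at the border):

    #include <cstddef>

    // Output extent of pooling along one dimension:
    // floor((in + 2 * pad - kernel) / stride) + 1.
    size_t PooledDim(size_t in, size_t kernel, size_t pad, size_t stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }
    // e.g. a 32x32 map with 2x2 max pooling, stride 2, no padding
    // gives PooledDim(32, 2, 0, 2) == 16 in each dimension.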
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.cc b/src/model/layer/cudnn_softmax.cc
index 85b0c3d..16d4022 100644
--- a/src/model/layer/cudnn_softmax.cc
+++ b/src/model/layer/cudnn_softmax.cc
@@ -47,14 +47,13 @@ const Tensor CudnnSoftmax::Forward(int flag, const Tensor& input) {
   Tensor output;
   output.ResetLike(input);
   output.device()->Exec([input, output, this](Context* ctx) {
-    Blob* inblob = input.blob(), * outblob = output.blob();
+    Block* inblock = input.block(), * outblock = output.block();
     float alpha = 1.0f, beta = 0.0f;
     cudnnSoftmaxForward(ctx->cudnn_handle, this->algorithm_, this->mode_,
-                        &alpha, this->desc_, inblob->data(), &beta, this->desc_,
-                        outblob->mutable_data());
-  }, {input.blob()}, {output.blob()});
-  if (flag & kTrain)
-    buf_.push(output);
+                        &alpha, this->desc_, inblock->data(), &beta,
+                        this->desc_, outblock->mutable_data());
+  }, {input.block()}, {output.block()});
+  if (flag & kTrain) buf_.push(output);
   return output;
 }
 
@@ -66,13 +65,14 @@ const std::pair<Tensor, vector<Tensor>> CudnnSoftmax::Backward(
   buf_.pop();
   dx.ResetLike(grad);
   dx.device()->Exec([dx, grad, output, this](Context* ctx) {
-    Blob* dyblob = grad.blob(), * dxblob = dx.blob(), * yblob = output.blob();
+    Block* dyblock = grad.block(), * dxblock = dx.block(),
+           * yblock = output.block();
     float alpha = 1.0f, beta = 0.0f;
     cudnnSoftmaxBackward(ctx->cudnn_handle, this->algorithm_, this->mode_,
-                         &alpha, this->desc_, yblob->data(), this->desc_,
-                         dyblob->data(), &beta, this->desc_,
-                         dxblob->mutable_data());
-  }, {grad.blob(), output.blob()}, {dx.blob()});
+                         &alpha, this->desc_, yblock->data(), this->desc_,
+                         dyblock->data(), &beta, this->desc_,
+                         dxblock->mutable_data());
+  }, {grad.block(), output.block()}, {dx.block()});
   return std::make_pair(dx, param_grad);
 }
 }  // namespace singa

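For reference, the cuDNN call above computes a row-wise softmax over a batchsize x dim matrix. A plain C++ sketch of the same computation (row-major data assumed; subtracting the row max is the standard numerical-stability trick):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    void SoftmaxRef(size_t batch, size_t dim, const float* x, float* y) {
      for (size_t b = 0; b < batch; b++) {
        const float* xr = x + b * dim;
        float* yr = y + b * dim;
        float maxv = *std::max_element(xr, xr + dim);
        float sum = 0.f;
        for (size_t i = 0; i < dim; i++) {
          yr[i] = std::exp(xr[i] - maxv);  // shift by max to avoid overflow
          sum += yr[i];
        }
        for (size_t i = 0; i < dim; i++) yr[i] /= sum;
      }
    }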
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/test/singa/test_cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_cpu.cc b/test/singa/test_cpp_cpu.cc
index 86654e1..ec5c7e1 100644
--- a/test/singa/test_cpp_cpu.cc
+++ b/test/singa/test_cpp_cpu.cc
@@ -24,7 +24,7 @@
 #include "singa/proto/core.pb.h"
 
 using singa::CppCPU;
-using singa::Blob;
+using singa::Block;
 TEST(CppCPU, Constructor) {
   CppCPU dev(0, 1);
   EXPECT_EQ(0, dev.id());
@@ -32,15 +32,15 @@ TEST(CppCPU, Constructor) {
 
 TEST(CppCPU, MemoryMallocFree) {
   CppCPU dev(0, 1);
-  Blob* b = dev.NewBlob(4);
+  Block* b = dev.NewBlock(4);
   EXPECT_NE(nullptr, b);
   EXPECT_EQ(4u, b->size());
-  dev.FreeBlob(b);
+  dev.FreeBlock(b);
 }
 
 TEST(CppCPU, Exec) {
   CppCPU dev(0, 1);
-  Blob* b = dev.NewBlob(4);
+  Block* b = dev.NewBlock(4);
   int x = 1, y = 3, z = 0;
   dev.Exec([x, y, &z](singa::Context *ctx) {
       z = x + y;
@@ -50,7 +50,7 @@ TEST(CppCPU, Exec) {
 
 TEST(CppCPU, CopyData) {
   CppCPU dev(0, 1);
-  Blob* b = dev.NewBlob(4);
+  Block* b = dev.NewBlock(4);
   char s[] = {'a', 'b', 'c', 'x'};
   dev.CopyDataFromHostPtr(b, s, 4);
   const char* bstr = static_cast<const char*>(b->data());
@@ -58,14 +58,14 @@ TEST(CppCPU, CopyData) {
   EXPECT_EQ('b', bstr[1]);
   EXPECT_EQ('x', bstr[3]);
 
-  Blob* c = dev.NewBlob(4);
+  Block* c = dev.NewBlock(4);
   dev.CopyDataToFrom(c, b, 4, singa::kHostToHost, 0, 0);
   const char* cstr = static_cast<const char*>(c->data());
 
   EXPECT_EQ('a', cstr[0]);
   EXPECT_EQ('b', cstr[1]);
   EXPECT_EQ('x', cstr[3]);
-  dev.FreeBlob(b);
-  dev.FreeBlob(c);
+  dev.FreeBlock(b);
+  dev.FreeBlock(c);
 }
 


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index bd039ad..2cce336 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -18,17 +18,17 @@ TEST(TensorTest, TestConstructor) {
   singa::Tensor float16_t(Shape{2,3}, singa::kFloat16);
   EXPECT_EQ(singa::kFloat16, float16_t.data_type());
   EXPECT_EQ(6u, float16_t.Size());
-  EXPECT_EQ(12u, float16_t.blob()->size());
+  EXPECT_EQ(12u, float16_t.block()->size());
 
   singa::Tensor x(float16_t);
   EXPECT_EQ(float16_t.Size(), x.Size());
-  EXPECT_EQ(float16_t.blob(), x.blob());
+  EXPECT_EQ(float16_t.block(), x.block());
   EXPECT_EQ(float16_t.data_type(), x.data_type());
   EXPECT_EQ(float16_t.device(), x.device());
 
   singa::Tensor y = float16_t;
   EXPECT_EQ(float16_t.Size(), y.Size());
-  EXPECT_EQ(float16_t.blob(), y.blob());
+  EXPECT_EQ(float16_t.block(), y.block());
   EXPECT_EQ(float16_t.data_type(), y.data_type());
   EXPECT_EQ(float16_t.device(), y.device());
 }
@@ -69,7 +69,7 @@ TEST(TensorClass, CopyDataFromHostPtr) {
   float data[] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
   t.CopyDataFromHostPtr(data, 3);
-  const float* dptr = static_cast<const float*>(t.blob()->data());
+  const float* dptr = static_cast<const float*>(t.block()->data());
   EXPECT_FLOAT_EQ(1.0f, dptr[0]);
   EXPECT_FLOAT_EQ(2.0f, dptr[1]);
   EXPECT_FLOAT_EQ(3.0f, dptr[2]);
@@ -82,7 +82,7 @@ TEST(TensorClass, CopyData) {
 
   Tensor o(Shape{3});
   o.CopyData(t);
-  const float* dptr = static_cast<const float*>(o.blob()->data());
+  const float* dptr = static_cast<const float*>(o.block()->data());
   EXPECT_FLOAT_EQ(1.0f, dptr[0]);
   EXPECT_FLOAT_EQ(2.0f, dptr[1]);
   EXPECT_FLOAT_EQ(3.0f, dptr[2]);
@@ -94,7 +94,7 @@ TEST(TensorClass, Clone) {
   t.CopyDataFromHostPtr(data, 3);
 
   Tensor o = t.Clone();
-  const float* dptr = static_cast<const float*>(o.blob()->data());
+  const float* dptr = static_cast<const float*>(o.block()->data());
   EXPECT_FLOAT_EQ(1.0f, dptr[0]);
   EXPECT_FLOAT_EQ(2.0f, dptr[1]);
   EXPECT_FLOAT_EQ(3.0f, dptr[2]);
@@ -105,7 +105,7 @@ TEST(TensorClass, T) {
   EXPECT_FALSE(t.transpose());
   Tensor o = t.T();
   EXPECT_EQ(true, o.transpose());
-  EXPECT_EQ(t.blob(), o.blob());
+  EXPECT_EQ(t.block(), o.block());
   EXPECT_EQ(t.data_type(), o.data_type());
   EXPECT_EQ(t.shape()[0],  o.shape()[1]);
   EXPECT_EQ(t.shape()[1],  o.shape()[0]);


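The T() test relies on transpose being pure metadata: the returned tensor shares the same underlying block and only swaps the two shape entries. A short usage sketch:

    singa::Tensor t(singa::Shape{2, 3});
    singa::Tensor o = t.T();  // no data copy, block is shared
    // o.shape() == {3, 2}, o.block() == t.block(), o.transpose() == true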
[50/50] [abbrv] incubator-singa git commit: SINGA-196 Rename class Blob to Block

Posted by zh...@apache.org.
SINGA-196 Rename class Blob to Block

Rename Blob (blob) to Block (block) throughout the code base.
A Block represents a chunk of memory on the host or a device.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/9c2869b9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/9c2869b9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/9c2869b9

Branch: refs/heads/master
Commit: 9c2869b9ab5da4affa294b4b23c88aec0b226984
Parents: 272100a
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Jun 13 19:15:32 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 19:15:32 2016 +0800

----------------------------------------------------------------------
 include/singa/core/common.h          |   6 +-
 include/singa/core/device.h          |  18 ++--
 include/singa/core/tensor.h          |  32 +++---
 include/singa/model/layer.h          |   2 +-
 src/core/device/device.cc            |  24 ++---
 src/core/tensor/tensor.cc            | 172 ++++++++++++++++--------------
 src/core/tensor/tensor_math.h        | 118 ++++++++++----------
 src/core/tensor/tensor_math_cpp.h    | 151 +++++++++++++-------------
 src/core/tensor/tensor_math_cuda.h   | 135 +++++++++++------------
 src/model/layer/cudnn_activation.cc  |  26 ++---
 src/model/layer/cudnn_batchnorm.cc   | 126 +++++++++++-----------
 src/model/layer/cudnn_convolution.cc | 104 ++++++++----------
 src/model/layer/cudnn_dropout.cc     |  46 ++++----
 src/model/layer/cudnn_lrn.cc         |  78 +++++---------
 src/model/layer/cudnn_pooling.cc     |  42 ++++----
 src/model/layer/cudnn_softmax.cc     |  22 ++--
 test/singa/test_cpp_cpu.cc           |  16 +--
 test/singa/test_tensor.cc            |  14 +--
 18 files changed, 548 insertions(+), 584 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index e6f4c90..b556750 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -42,10 +42,10 @@ typedef struct _Cuda { } Cuda;
 typedef struct _Opencl { } Opencl;
 }  // namespace lang
 
-/// Blob represent a chunk of memory (on device or host) managed by VirtualMemory.
-class Blob {
+/// Block represents a chunk of memory (on device or host).
+class Block {
  public:
-  Blob(void* ptr, size_t size) : data_(ptr), size_(size), ref_count_(1) {}
+  Block(void* ptr, size_t size) : data_(ptr), size_(size), ref_count_(1) {}
   void* mutable_data() const { return data_; }
   const void* data() const { return data_; }
   size_t size() const { return size_; }

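The ref_count_ field lets several tensors share one Block; memory is released only when the last owner drops its reference. A hedged sketch of the discipline that Tensor's copy constructor and destructor follow (dev stands for any Device):

    Block* b = dev.NewBlock(256);  // ref_count_ starts at 1
    b->IncRefCount();              // a second Tensor now shares b
    // ... both owners read b->data() or write b->mutable_data() ...
    if (b->DecRefCount() == 0) dev.FreeBlock(b);  // not yet: one owner left
    if (b->DecRefCount() == 0) dev.FreeBlock(b);  // last owner frees b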
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 56eda70..f69e4c6 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -47,21 +47,21 @@ class Device {
   virtual void SetRandSeed(unsigned seed) = 0;
 
   /// Called by Tensor.
-  Blob* NewBlob(int size);
+  Block* NewBlock(int size);
 
   /// Called by Tensor.
-  void FreeBlob(Blob* blob);
+  void FreeBlock(Block* block);
 
   /// Copy data within or across devices.
-  void CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
+  void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                       CopyDirection direction, int dst_offset, int src_offset);
 
-  void CopyDataFromHostPtr(Blob* dst, const void* src, size_t nBytes,
+  void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes,
                            size_t dst_offset = 0);
   /// Submit the operation to the device, which may execute it right now or
   /// delay it depending on the scheduler.
-  void Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
-                    const vector<Blob*> write_blobs,
+  void Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
+                    const vector<Block*> write_blocks,
                     bool use_rand_generator = false);
 
   // Wait for one event.
@@ -205,11 +205,11 @@ class CallbackArg {
 /// Type of callback functions for executing tensor ops.
 typedef function<void(CallbackArg*)> CallbackFn;
 public:
-  /// Operation has a function, and read/write blobs.
+  /// Operation has a function, and read/write blocks.
   typedef struct _Operation {
     function<void(Context*)> fn;
-    const vector<Blob*> read_blobs;
-    const vector<Blob*> write_blobs;
+    const vector<Block*> read_blocks;
+    const vector<Block*> write_blocks;
   } Operation;
 
 */

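A short sketch of the copy APIs declared above, staging host bytes into a Block at an offset and then duplicating it (sizes and values arbitrary):

    singa::CppCPU dev(0, 1);
    singa::Block* b = dev.NewBlock(8);
    char head[] = {'a', 'b', 'c', 'd'};
    char tail[] = {'e', 'f', 'g', 'h'};
    dev.CopyDataFromHostPtr(b, head, 4);     // fills bytes 0..3
    dev.CopyDataFromHostPtr(b, tail, 4, 4);  // dst_offset places bytes 4..7
    singa::Block* c = dev.NewBlock(8);
    dev.CopyDataToFrom(c, b, 8, singa::kHostToHost, 0, 0);
    dev.FreeBlock(b);
    dev.FreeBlock(c);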
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 8cfa705..48a8c8f 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -64,17 +64,17 @@ class Tensor {
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(Tensor &&from);
 
-  /// For functions in xx_math.cc to access the blob.
-  /// Users should not operate against Blob directly.
-  /// blob_ is allocated in constructors.
-  Blob *blob() const { return blob_; }
+  /// For functions in xx_math.cc to access the block.
+  /// Users should not operate against Block directly.
+  /// block_ is allocated in constructors.
+  Block *block() const { return block_; }
 
   Device *device() const { return device_; }
 
   /// return immutable Tensor values with given type.
   template <typename SType>
   SType data() const {
-    return static_cast<SType>(blob()->data());
+    return static_cast<SType>(block()->data());
   }
 
   /// data type, including kFloat16, kFloat32, kInt
@@ -93,23 +93,23 @@ class Tensor {
 
   /// return number of total elements
   size_t Size() const {
-    CHECK_EQ(blob_->size() % SizeOf(data_type_), 0u);
-    return blob_->size() / SizeOf(data_type_);
+    CHECK_EQ(block_->size() % SizeOf(data_type_), 0u);
+    return block_->size() / SizeOf(data_type_);
   }
 
   /// return memory size (i.e., Bytes)
-  size_t MemSize() const { return blob_->size(); }
+  size_t MemSize() const { return block_->size(); }
 
-  /// Reset the tensor shape, it may reallocate blob, if MemSize() changes.
+  /// Reset the tensor shape, it may reallocate block, if MemSize() changes.
   void Reshape(const Shape &shape);
   void Reshape(Shape &&shape);
 
   /// Reset the shape, device, and data type as given tensor.
-  /// If blob size changes, then reallocate a new blob. The previous blob would
+  /// If block size changes, then reallocate a new block. The previous block would
   /// be deleted.
   void ResetLike(const Tensor &t);
 
-  /// Reset the data type, it would reallocate blob if type changes.
+  /// Reset the data type, it would reallocate block if type changes.
   void AsType(const DataType type);
 
   /// Reset the device.
@@ -140,10 +140,10 @@ class Tensor {
   /// No data copy, just set the transpose_ filed of the returned tensor.
   Tensor T() const;
 
-  /// Copy the meta info with data blob shared.
+  /// Copy the meta info with data block shared.
   Tensor &operator=(const Tensor &in);
 
-  /// Copy the meta info with data blob shared.
+  /// Copy the meta info with data block shared.
   Tensor &operator=(Tensor &&in);
 
   Tensor &operator+=(const Tensor &in);
@@ -179,9 +179,9 @@ class Tensor {
   bool transpose_ = false;
   DataType data_type_ = kFloat32;
   Device *device_ = nullptr;
-  /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
-  /// If you want to get an allocated Blob, use blob() instead of blob_.
-  Blob *blob_ = nullptr;
+  /// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
+  /// If you want to get an allocated Block, use block() instead of block_.
+  Block *block_ = nullptr;
   Shape shape_ = {};
 };
 

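Putting the accessors above together, a minimal host-side round trip (a sketch; the tensor defaults to kFloat32 on the default device):

    float src[] = {1.0f, 2.0f, 3.0f};
    singa::Tensor t(singa::Shape{3});
    t.CopyDataFromHostPtr(src, 3);
    // data<SType>() is a typed view over block()->data().
    const float* dptr = t.data<const float*>();
    // t.Size() == 3 elements, t.MemSize() == 3 * sizeof(float) bytes.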
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 82c8edc..2addc98 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -61,7 +61,7 @@ class Layer {
   virtual void Setup(const LayerConf& conf) {
     name_ = conf.name();
     // for (const auto& spec : conf.param()) param_specs_.push_back(spec);
-    // TODO(wangwei) load param values from checkpoint blobs.
+    // TODO(wangwei) load param values from checkpoint files.
   }
 
   /// Do feature transformation for the given 'input' tensor (denoted as x).

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 1d3c446..36381e4 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -25,31 +25,31 @@ Device::Device(int id, int num_executors, string scheduler, string vm)
   host_ = &defaultDevice;
 }
 
-void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
-                    const vector<Blob*> write_blobs, bool use_rand_generator) {
+void Device::Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
+                    const vector<Block*> write_blocks, bool use_rand_generator) {
   // TODO(wangwei) execute operations scheduled by the scheduler.
   DoExec(std::move(fn), 0);
 }
 
-// TODO(wangwei) get Blob from the memory manager
-Blob* Device::NewBlob(int size) {
+// TODO(wangwei) get Block from the memory manager
+Block* Device::NewBlock(int size) {
   if (size > 0) {
     void* ptr = Malloc(size);
-    return new Blob(ptr, size);
+    return new Block(ptr, size);
   } else {
     return nullptr;
   }
 }
 
-// TODO(wangwei) return Blob to the memory manager
-void Device::FreeBlob(Blob* blob) {
-  if (blob != nullptr) {
-    Free(blob->mutable_data());
-    delete blob;
+// TODO(wangwei) return Block to the memory manager
+void Device::FreeBlock(Block* block) {
+  if (block != nullptr) {
+    Free(block->mutable_data());
+    delete block;
   }
 }
 
-void Device::CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
+void Device::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                             CopyDirection direct, int dst_offset,
                             int src_offset) {
   this->Exec(
@@ -62,7 +62,7 @@ void Device::CopyDataToFrom(Blob* dst, Blob* src, size_t nBytes,
       {src}, {dst});
 }
 
-void Device::CopyDataFromHostPtr(Blob* dst, const void* src, size_t nBytes,
+void Device::CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes,
                                  size_t dst_offset) {
   auto direct = lang_ == kCpp ? kHostToHost : kHostToDevice;
   void* dstptr = reinterpret_cast<char*>(dst->mutable_data()) + dst_offset;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 4e0d94b..8afc17c 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -26,8 +26,9 @@ namespace singa {
 
 Tensor::~Tensor() {
   // LOG(ERROR) << "~";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
-  blob_ = nullptr;
+  if (block_ != nullptr && block_->DecRefCount() == 0)
+    device_->FreeBlock(block_);
+  block_ = nullptr;
 }
 
 Tensor::Tensor() { device_ = &defaultDevice; }
@@ -35,28 +36,28 @@ Tensor::Tensor() { device_ = &defaultDevice; }
 Tensor::Tensor(const Shape &shape, const DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
-  blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+  block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
 }
 Tensor::Tensor(Shape &&shape, const DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
-  blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+  block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
 }
 Tensor::Tensor(const Shape &shape, Device *device, const DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
-  blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+  block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
 }
 Tensor::Tensor(Shape &&shape, Device *device, const DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
-  blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+  block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
 }
 Tensor::Tensor(const Tensor &in)
     : transpose_(in.transpose_),
       data_type_(in.data_type_),
       device_(in.device_),
-      blob_(in.blob()),
+      block_(in.block()),
       shape_(in.shape_) {
-  blob_->IncRefCount();
+  block_->IncRefCount();
 }
 
 Tensor::Tensor(Tensor &&in)
@@ -64,40 +65,44 @@ Tensor::Tensor(Tensor &&in)
       data_type_(in.data_type_),
       device_(in.device_),
       shape_(std::move(in.shape_)) {
-  blob_ = in.blob_;
-  in.blob_ = nullptr;
+  block_ = in.block_;
+  in.block_ = nullptr;
 }
 
 void Tensor::ResetLike(const Tensor &in) {
-  if (blob_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
     shape_ = in.shape_;
     device_ = in.device_;
     data_type_ = in.data_type_;
-    blob_ = device_->NewBlob(in.MemSize());
+    block_ = device_->NewBlock(in.MemSize());
   }
 }
 
 void Tensor::Reshape(const Shape &shape) {
   if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
-    blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = device_->NewBlock(Product(shape) * SizeOf(data_type_));
   }
   shape_ = shape;
 }
 
 void Tensor::Reshape(Shape &&shape) {
   if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
-    blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = device_->NewBlock(Product(shape) * SizeOf(data_type_));
   }
   shape_ = std::move(shape);
 }
 
 void Tensor::AsType(const DataType type) {
   if (data_type_ != type) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
-    blob_ = device_->NewBlob(Product(shape_) * SizeOf(type));
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = device_->NewBlock(Product(shape_) * SizeOf(type));
     data_type_ = type;
   }
 }
@@ -107,9 +112,10 @@ void Tensor::ToDevice(Device *dst) {
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
     tmp.CopyData(*this);
-    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
-    blob_ = tmp.blob_;
-    tmp.blob_ = nullptr;
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = tmp.block_;
+    tmp.block_ = nullptr;
     device_ = dst;
   }
 }
@@ -122,7 +128,7 @@ void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num) {
       << "data_type is " << DataType_Name(data_type_)
       << " user given type is of size " << sizeof(DType);
   if (src != nullptr) {
-    device_->CopyDataFromHostPtr(blob(), src, sizeof(DType) * num, 0);
+    device_->CopyDataFromHostPtr(block(), src, sizeof(DType) * num, 0);
   } else {
     LOG(WARNING) << "Copy data from null host ptr";
   }
@@ -132,9 +138,9 @@ template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num);
 
 void Tensor::CopyData(const Tensor &src) {
   CHECK_EQ(Size(), src.Size());
-  CHECK(blob_ != nullptr);
-  // Do copy only if the src's blob is already initialized.
-  if (src.blob_ != nullptr) {
+  CHECK(block_ != nullptr);
+  // Do copy only if the src's block is already initialized.
+  if (src.block_ != nullptr) {
     singa::CopyDataToFrom(this, src, Size(), 0, 0);
   }
 }
@@ -154,32 +160,34 @@ Tensor Tensor::T() const {
   t.transpose_ = !transpose_;
   t.shape_.push_back(shape_[1]);
   t.shape_.push_back(shape_[0]);
-  t.blob_ = blob_;
-  blob_->IncRefCount();
+  t.block_ = block_;
+  block_->IncRefCount();
   return t;
 }
 
 Tensor &Tensor::operator=(const Tensor &in) {
   // LOG(ERROR) << "= const &";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  if (block_ != nullptr && block_->DecRefCount() == 0)
+    device_->FreeBlock(block_);
   transpose_ = in.transpose_;
   data_type_ = in.data_type_;
   shape_ = in.shape_;
   device_ = in.device_;
-  blob_ = in.blob();
-  blob_->IncRefCount();
+  block_ = in.block();
+  block_->IncRefCount();
   return *this;
 }
 
 Tensor &Tensor::operator=(Tensor &&in) {
   // LOG(ERROR) << "= &&";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  if (block_ != nullptr && block_->DecRefCount() == 0)
+    device_->FreeBlock(block_);
   transpose_ = in.transpose_;
   data_type_ = in.data_type_;
   shape_ = std::move(in.shape_);
   device_ = in.device_;
-  blob_ = in.blob_;
-  in.blob_ = nullptr;
+  block_ = in.block_;
+  in.block_ = nullptr;
   return *this;
 }
 
@@ -233,7 +241,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
   CHECK_GE(dst->MemSize(), d_offset + nBytes);
 
   Device *src_dev = src.device(), *dst_dev = dst->device();
-  Blob *from = src.blob(), *to = dst->blob();
+  Block *from = src.block(), *to = dst->block();
   if (dst_dev->lang() != src_dev->lang()) {
     // let the none cpp device conduct copy op
     if (dst_dev->lang() == kCpp) {
@@ -317,9 +325,9 @@ float Tensor::L2() const {
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     device_->Exec([&nrm, this](Context *ctx) {
       DType ret;
-      Nrm2<DType, Lang>(this->Size(), this->blob(), &ret, ctx);
+      Nrm2<DType, Lang>(this->Size(), this->block(), &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
-    }, {this->blob()}, {});
+    }, {this->block()}, {});
   });
   return nrm;
 }
@@ -327,7 +335,7 @@ template <typename SType>
 void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
   auto size = Size();
-  auto ptr = blob_;
+  auto ptr = block_;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     // cast x to DType
     device_->Exec([size, x, ptr](Context *ctx) {
@@ -341,8 +349,8 @@ template void Tensor::SetValue<float>(const float x);
   do {                                                                 \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
       ret->device()->Exec([t, ret](Context * ctx) {                    \
-        fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);         \
-      }, {t.blob()}, {ret->blob()});                                   \
+        fn<DType, Lang>(t.Size(), t.block(), ret->block(), ctx);       \
+      }, {t.block()}, {ret->block()});                                 \
     });                                                                \
   } while (0)
 
@@ -365,14 +373,15 @@ GenUnaryTensorFn(Sqrt);
 GenUnaryTensorFn(Square);
 GenUnaryTensorFn(Tanh);
 
-#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
-  do {                                                                         \
-    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {     \
-      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
-      ret->device()->Exec([lhs, rhs, ret](Context * ctx) {                     \
-        fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), ctx); \
-      }, {lhs.blob(), rhs.blob()}, {ret->blob()});                             \
-    });                                                                        \
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                            \
+  do {                                                                      \
+    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {  \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                     \
+      ret->device()->Exec([lhs, rhs, ret](Context * ctx) {                  \
+        fn<DType, Lang>(lhs.Size(), lhs.block(), rhs.block(), ret->block(), \
+                        ctx);                                               \
+      }, {lhs.block(), rhs.block()}, {ret->block()});                       \
+    });                                                                     \
   } while (0)
 
 #define GenBinaryTensorFn(op, fn)                              \
@@ -397,8 +406,8 @@ GenBinaryTensorFn(Pow, Pow);
       static_assert(std::is_same<SType, DType>::value,                  \
                     "The Scalar type must match the Tensor data type"); \
       ret->device()->Exec([t, x, ret](Context * ctx) {                  \
-        fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);       \
-      }, {t.blob()}, {ret->blob()});                                    \
+        fn<DType, Lang>(t.Size(), t.block(), x, ret->block(), ctx);     \
+      }, {t.block()}, {ret->block()});                                  \
     });                                                                 \
   } while (0)
 
@@ -440,8 +449,8 @@ void Div(const SType alpha, const Tensor &in, Tensor *out) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     // TODO(wangwei) type cast SType to DType;
     in.device()->Exec([alpha, in, out](Context *ctx) {
-      Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
-    }, {in.blob()}, {out->blob()});
+      Div<DType, Lang>(in.Size(), alpha, in.block(), out->block(), ctx);
+    }, {in.block()}, {out->block()});
   });
 }
 template void Div<float>(const float, const Tensor &, Tensor *);
@@ -474,8 +483,8 @@ float Sum<float>(const Tensor &in) {
   float s = 0.0f;
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     in.device()->Exec([in, &s](Context *ctx) {
-      Sum<DType, Lang>(in.Size(), in.blob(), &s, ctx);
-    }, {in.blob()}, {});
+      Sum<DType, Lang>(in.Size(), in.block(), &s, ctx);
+    }, {in.block()}, {});
   });
   return s;
 }
@@ -582,9 +591,9 @@ void MultColumn(const Tensor &v, Tensor *M) {
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
     v.device()->Exec([M, v](Context *ctx) {
-      DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(), v.blob(),
-                        M->blob(), ctx);
-    }, {M->blob(), v.blob()}, {M->blob()});
+      DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->block(), v.block(),
+                        M->block(), ctx);
+    }, {M->block(), v.block()}, {M->block()});
   });
 }
 
@@ -597,9 +606,9 @@ void MultRow(const Tensor &v, Tensor *M) {
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
     v.device()->Exec([M, v](Context *ctx) {
-      DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
-                        M->blob(), ctx);
-    }, {M->blob(), v.blob()}, {M->blob()});
+      DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->block(), v.block(),
+                        M->block(), ctx);
+    }, {M->block(), v.block()}, {M->block()});
   });
 }
 
@@ -644,8 +653,8 @@ void Bernoulli(const SType p, Tensor *out) {
   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
     auto prob = TypeCast<SType, DType>(p);
     out->device()->Exec([prob, out](Context *ctx) {
-      Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
-    }, {}, {out->blob()}, true);
+      Bernoulli<DType, Lang>(out->Size(), prob, out->block(), ctx);
+    }, {}, {out->block()}, true);
   });
 }
 template void Bernoulli<float>(const float p, Tensor *out);
@@ -656,8 +665,8 @@ void Uniform(const SType low, const SType high, Tensor *out) {
     auto l = TypeCast<SType, DType>(low);
     auto h = TypeCast<SType, DType>(high);
     out->device()->Exec([l, h, out](Context *ctx) {
-      Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
-    }, {}, {out->blob()}, true);
+      Uniform<DType, Lang>(out->Size(), l, h, out->block(), ctx);
+    }, {}, {out->block()}, true);
   });
 }
 template void Uniform<float>(const float low, const float high, Tensor *out);
@@ -668,8 +677,8 @@ void Gaussian(const SType mean, const SType std, Tensor *out) {
     auto m = TypeCast<SType, DType>(mean);
     auto s = TypeCast<SType, DType>(std);
     out->device()->Exec([m, s, out](Context *ctx) {
-      Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
-    }, {}, {out->blob()}, true);
+      Gaussian<DType, Lang>(out->Size(), m, s, out->block(), ctx);
+    }, {}, {out->block()}, true);
   });
 }
 template void Gaussian<float>(const float mean, const float std, Tensor *out);
@@ -680,8 +689,8 @@ void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     auto a = TypeCast<SType, DType>(alpha);
     out->device()->Exec([a, in, out](Context *ctx) {
-      Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
-    }, {in.blob(), out->blob()}, {out->blob()});
+      Axpy<DType, Lang>(in.Size(), a, in.block(), out->block(), ctx);
+    }, {in.block(), out->block()}, {out->block()});
   });
 }
 template void Axpy(const float alpha, const Tensor &in, Tensor *out);
@@ -708,9 +717,9 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto a = TypeCast<SType, DType>(alpha);
       auto b = TypeCast<SType, DType>(beta);
       C->device()->Exec([a, A, b, B, C](Context *ctx) {
-        GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
-                          B.blob(), b, C->blob(), ctx);
-      }, {A.blob(), B.blob()}, {C->blob()});
+        GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.block(),
+                          B.block(), b, C->block(), ctx);
+      }, {A.block(), B.block()}, {C->block()});
     });
   } else {
     CHECK(!C->transpose());
@@ -719,13 +728,13 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto b = TypeCast<SType, DType>(beta);
       C->device()->Exec([a, A, b, B, C](Context *ctx) {
         GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
-                          A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
-      }, {A.blob(), B.blob()}, {C->blob()});
+                          A.shape(1), a, A.block(), B.block(), b, C->block(),
+                          ctx);
+      }, {A.block(), B.block()}, {C->block()});
     });
   }
 }
 
-
 // ************************
 // Misc.
 // ***********************
@@ -737,23 +746,22 @@ void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
   size_t dim = p.Size() / batchsize;
   TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
     p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) {
-      ComputeCrossEntropy<DType, Lang>(batchsize, dim, p.blob(), t.blob(),
-                                       loss->blob(), ctx);
-    }, {p.blob(), t.blob()}, {loss->blob()});
+      ComputeCrossEntropy<DType, Lang>(batchsize, dim, p.block(), t.block(),
+                                       loss->block(), ctx);
+    }, {p.block(), t.block()}, {loss->block()});
   });
 }
 void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
   CHECK_LE(p->nDim(), 2u);
   CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
   size_t batchsize = 1;
-  if (p->nDim() == 2u)
-    batchsize = p->shape(0);
+  if (p->nDim() == 2u) batchsize = p->shape(0);
   size_t dim = p->Size() / batchsize;
   TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
     p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
-      SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
-                                          p->blob(), ctx);
-    }, {p->blob(), t.blob()}, {p->blob()});
+      SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->block(), t.block(),
+                                          p->block(), ctx);
+    }, {p->block(), t.block()}, {p->block()});
   });
 }
 }  // namespace singa

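Mult above hides the GEMV/GEMM distinction behind one name: both branches compute C = alpha * A * B + beta * C, with the vector case routed to GEMV (the vector/matrix test itself sits just above the hunks shown). A usage sketch:

    singa::Tensor A(singa::Shape{2, 3}), x(singa::Shape{3}), y(singa::Shape{2});
    singa::Tensor B(singa::Shape{3, 4}), C(singa::Shape{2, 4});
    singa::Gaussian(0.0f, 1.0f, &A);  // fill operands with random values
    singa::Gaussian(0.0f, 1.0f, &x);
    singa::Gaussian(0.0f, 1.0f, &B);
    singa::Mult(1.0f, A, x, 0.0f, &y);  // matrix-vector: y = A * x
    singa::Mult(1.0f, A, B, 0.0f, &C);  // matrix-matrix: C = A * B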
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 12490d1..57ccb88 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -33,20 +33,20 @@ namespace singa {
 /// first
 ///    letter.
 /// 2. Order functions based on function name in alphabetical order.
-/// 3. Function arguments order is [const basic type] [const Blob] [mutable
-/// Blob].
+/// 3. Function arguments order is [const basic type] [const Block] [mutable
+/// Block].
 /// 4. Function argument names, use 'num' for total number of elements in
-///    elementwise operations; use 'in1' 'in2' for in blobs; use 'out' for
-///    output blob or value. With exceptions for some functions, e.g.,
-///      Scale(const float alpha, const Blob* in, Blob* out);
+///    elementwise operations; use 'in1' 'in2' for in blocks; use 'out' for
+///    output block or value. With exceptions for some functions, e.g.,
+///      Scale(const float alpha, const Block* in, Block* out);
 ///    For such cases, use x, v, alpha, etc for scalar types.
 ///    For blas functions, follow the blas style for argument names.
 ///    Use 'M' and 'v' for matrix and vector tensors in functions involving both
 ///    matrix and vectors.
-/// 5. For Blob argument xxx, name its raw pointer as xxxPtr.
+/// 5. For Block argument xxx, name its raw pointer as xxxPtr.
 /// 6. Pass the 'cudaStream_t s' to every function in math_kernel.h
 /// 7. Use size_t for the number of elements, rows or columns.
-/// 8. Use the same name for the Tensor and Blob level math functions.
+/// 8. Use the same name for the Tensor and Block level math functions.
 
 // **************************************
 // Element-wise functions
@@ -54,41 +54,41 @@ namespace singa {
 
 /// out[i] = |in[i]|
 template <typename DType, typename Lang>
-void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Abs(const size_t num, const Block *in, Block *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
 /// out[i] = in[i] + x
 template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in, const DType x, Blob *out,
+void Add(const size_t num, const Block *in, const DType x, Block *out,
          Context *ctx) {
   LOG(FATAL) << "Add Not Implemented";
 }
 
 /// out[i] = in1[i] + in2[i]
 template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+void Add(const size_t num, const Block *in1, const Block *in2, Block *out,
          Context *ctx) {
   LOG(FATAL) << "Add-Pair Not Implemented";
 }
 /// Clamp every element into [low, high]
 /// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
 template <typename DType, typename Lang>
-void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
-           Blob *out, Context *ctx) {
+void Clamp(const size_t num, const DType low, const DType high, const Block *in,
+           Block *out, Context *ctx) {
   LOG(FATAL) << "Clamp Not Implemented";
 }
 
 /// out[i] = x / in[i]
 template <typename DType, typename Lang>
-void Div(const size_t num, const DType x, const Blob *in, Blob *out,
+void Div(const size_t num, const DType x, const Block *in, Block *out,
          Context *ctx) {
   LOG(FATAL) << "Div Not Implemented";
 }
 
 /// out[i] = in[i] / x
 template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in, const DType x, Blob *out,
+void Div(const size_t num, const Block *in, const DType x, Block *out,
          Context *ctx) {
   CHECK_NE(x, 0.f);
   EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
@@ -96,131 +96,131 @@ void Div(const size_t num, const Blob *in, const DType x, Blob *out,
 
 /// out[i] = in1[i] / in2[i]
 template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+void Div(const size_t num, const Block *in1, const Block *in2, Block *out,
          Context *ctx) {
   LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
 /// out[i] = in[i] * x
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+void EltwiseMult(const size_t num, const Block *in, const DType x, Block *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
 /// out[i] = in1[i] * in2[i]
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+void EltwiseMult(const size_t num, const Block *in1, const Block *in2, Block *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
 
 /// Base is e (Euler's number): out[i]=exp(in[i])
 template <typename DType, typename Lang>
-void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Exp(const size_t num, const Block *in, Block *out, Context *ctx) {
   LOG(FATAL) << "Exp Not Implemented";
 }
 
 /// out[i]=(in[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out,
+void LE(const size_t num, const Block *in, const DType x, Block *out,
         Context *ctx) {
   LOG(FATAL) << "LE Not Implemented";
 }
 /// Natural logarithm, base e (Euler's number): out[i]=log(in[i]).
 template <typename DType, typename Lang>
-void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Log(const size_t num, const Block *in, Block *out, Context *ctx) {
   LOG(FATAL) << "Log Not Implemented";
 }
 /// out[i]=(in[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out,
+void LT(const size_t num, const Block *in, const DType x, Block *out,
         Context *ctx) {
   LOG(FATAL) << "LT Not Implemented";
 }
 /// out[i]=(in[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out,
+void GE(const size_t num, const Block *in, const DType x, Block *out,
         Context *ctx) {
   LOG(FATAL) << "GE Not Implemented";
 }
 /// out[i]=(in[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out,
+void GT(const size_t num, const Block *in, const DType x, Block *out,
         Context *ctx) {
   LOG(FATAL) << "GT Not Implemented";
 }
 /// out[i] = pow(in[i], x)
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
+void Pow(const size_t num, const Block *in, const DType x, Block *out,
          Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// out[i]=pow(in1[i], in2[i])
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+void Pow(const size_t num, const Block *in1, const Block *in2, Block *out,
          Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
 /// out[i]=max(0, in[i])
 template <typename DType, typename Lang>
-void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void ReLU(const size_t num, const Block *in, Block *out, Context *ctx) {
   LOG(FATAL) << "ReLU Not Implemented";
 }
 
 /// out[i] = x
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+void Set(const size_t num, const DType x, Block *out, Context *ctx) {
   LOG(FATAL) << "Set Not Implemented";
 }
 /// out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
-void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Sigmoid(const size_t num, const Block *in, Block *out, Context *ctx) {
   LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
 /// out[i] = sign(in[i])
 template <typename DType, typename Lang>
-void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Sign(const size_t num, const Block *in, Block *out, Context *ctx) {
   LOG(FATAL) << "Sign Not Implemented";
 }
 /// out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
-void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Sqrt(const size_t num, const Block *in, Block *out, Context *ctx) {
   LOG(FATAL) << "Sqrt Not Implemented";
 }
 
 /// out[i]=square(in[i])
 template <typename DType, typename Lang>
-void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Square(const size_t num, const Block *in, Block *out, Context *ctx) {
   EltwiseMult<DType, Lang>(num, in, in, out, ctx);
 }
 
 /// out[i] =  in[i] - x
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
+void Sub(const size_t num, const Block *in, const DType x, Block *out,
          Context *ctx) {
   Add<DType, Lang>(num, in, -x, out, ctx);
 }
 
 /// out[i] = in1[i] - in2[i]
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+void Sub(const size_t num, const Block *in1, const Block *in2, Block *out,
          Context *ctx) {
   LOG(FATAL) << "Sub-Pair Not Implemented";
 }
 
 /// sum all elements of in into out
 template <typename DType, typename Lang>
-void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+void Sum(const size_t num, const Block *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Sum Not Implemented";
 }
 
 /// out[i]=tanh(in[i])
 template <typename DType, typename Lang>
-void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Tanh(const size_t num, const Block *in, Block *out, Context *ctx) {
   LOG(FATAL) << "Tanh Not Implemented";
 }
 
@@ -231,20 +231,20 @@ void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
-void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+void Bernoulli(const size_t num, const float p, Block *out, Context *ctx) {
   LOG(FATAL) << "Bernoulli Not Implemented";
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void Gaussian(const size_t num, const float mean, const float std, Blob *out,
+void Gaussian(const size_t num, const float mean, const float std, Block *out,
               Context *ctx) {
   LOG(FATAL) << "Gaussian Not Implemented";
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
-void Uniform(const size_t num, const float low, const float high, Blob *out,
+void Uniform(const size_t num, const float low, const float high, Block *out,
              Context *ctx) {
   LOG(FATAL) << "Uniform Not Implemented";
 }
@@ -255,43 +255,43 @@ void Uniform(const size_t num, const float low, const float high, Blob *out,
 
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+void Amax(const size_t num, const Block *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amax Not Implemented";
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+void Amin(const size_t num, const Block *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amin Not Implemented";
 }
 /// out = sum of |x| over all elements x of 'in'
 template <typename DType, typename Lang>
-void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+void Asum(const size_t num, const Block *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Asum Not Implemented";
 }
 
 /// out = alpha * in + out
 template <typename DType, typename Lang>
-void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
+void Axpy(const size_t num, const DType alpha, const Block *in, Block *out,
           Context *ctx) {
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
 /// out = ||in||_2, i.e., the L2 norm.
 template <typename DType, typename Lang>
-void Nrm2(const size_t num, const Blob *in, float *out, Context *ctx) {
+void Nrm2(const size_t num, const Block *in, float *out, Context *ctx) {
   LOG(FATAL) << "Nrm2 Not Implemented";
 }
 
 /// out *= x
 template <typename DType, typename Lang>
-void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
+void Scale(const size_t num, const DType x, Block *out, Context *ctx) {
   LOG(FATAL) << "Scale Not Implemented";
 }
 
 /// inner product of array in1 and in2
 template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+void Dot(const size_t num, const Block *in1, const Block *in2, DType *out,
          Context *ctx) {
   LOG(FATAL) << "Dot Not Implemented";
 }
@@ -300,7 +300,7 @@ void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
 /// 'trans' indicates whether A's internal data layout is transposed
 template <typename DType, typename Lang>
 void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
-          const Blob *A, const Blob *v, const DType beta, Blob *out,
+          const Block *A, const Block *v, const DType beta, Block *out,
           Context *ctx) {
   LOG(FATAL) << "GEMV Not Implemented";
 }
@@ -309,7 +309,7 @@ void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
 /// if side_right is true, compute M*diag(v); otherwise compute diag(v)*M
 template <typename DType, typename Lang>
 void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
-          const Blob *M, const Blob *v, Blob *out, Context *ctx) {
+          const Block *M, const Block *v, Block *out, Context *ctx) {
   LOG(FATAL) << "DGMM Not Implemented";
 }
 
@@ -318,7 +318,7 @@ void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
 template <typename DType, typename Lang>
 void GEMM(const bool transA, const bool transB, const size_t nrowA,
           const size_t ncolB, const size_t ncolA, const DType alpha,
-          const Blob *A, const Blob *B, const DType beta, Blob *C,
+          const Block *A, const Block *B, const DType beta, Block *C,
           Context *ctx) {
   LOG(FATAL) << "GEMM Not Implemented";
 }
@@ -327,14 +327,14 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
 // following the consistency guide.
 template <typename DType, typename Lang>
 void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
-                         const Blob *p, const Blob *t, Blob *loss,
+                         const Block *p, const Block *t, Block *loss,
                          Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 template <typename DType, typename Lang>
 void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
-                            const Blob *p, const Blob *t, Blob *grad,
+                            const Block *p, const Block *t, Block *grad,
                             Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
@@ -345,40 +345,40 @@ void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
 /*
 /// Add the vector v to every column of A as the column of out
 template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
+void AddCol(const size_t nrow, const size_t ncol, const Block *A, const Block *v,
+            Block *out, Context *ctx) {
   LOG(FATAL) << "AddCol Not Implemented";
 }
 // TODO(wangwei) unify AddRow and AddCol.
 /// Add the vector v to every row of A as the row of out
 template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
+void AddRow(const size_t nrow, const size_t ncol, const Block *A, const Block *v,
+            Block *out, Context *ctx) {
   LOG(FATAL) << "AddRow Not Implemented";
 }
 /// outer-product.
 /// in1 and in2 are vectors of length m and n; out is a matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
-           Blob *out, Context *ctx) {
+void Outer(const size_t m, const size_t n, const Block *in1, const Block *in2,
+           Block *out, Context *ctx) {
   LOG(FATAL) << "Outer Not Implemented";
 }
 
 /// Sum the columns of the in matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+void SumColumns(const size_t nrow, const size_t ncol, const Block *in, Block *out,
                 Context *ctx) {
   LOG(FATAL) << "SumColumns Not Implemented";
 }
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+void Set(const size_t num, const DType x, Block *out, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the in matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+void SumRows(const size_t nrow, const size_t ncol, const Block *in, Block *out,
              Context *ctx) {
   LOG(FATAL) << "SumRows Not Implemented";
 }
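
The generic templates above all abort with LOG(FATAL); each backend header (tensor_math_cpp.h, tensor_math_cuda.h, tensor_math_opencl.h) overrides them with full specializations keyed on a language tag, and tensor.cc selects the tag based on the device. A minimal standalone sketch of that dispatch pattern (demo names only, not the actual singa::Block/Context API):

    #include <cstddef>
    #include <cstdio>

    namespace demo {
    namespace lang { struct Cpp {}; }  // backend tag, as in singa::lang

    // Generic fallback: every math op starts as a template that just aborts.
    template <typename DType, typename Lang>
    void Scale(const size_t num, const DType x, float *out) {
      std::printf("Scale Not Implemented\n");  // stands in for LOG(FATAL)
    }

    // Each backend then provides a full specialization that does the work.
    template <>
    void Scale<float, lang::Cpp>(const size_t num, const float x, float *out) {
      for (size_t i = 0; i < num; i++) out[i] *= x;
    }
    }  // namespace demo

    int main() {
      float v[3] = {1, 2, 3};
      demo::Scale<float, demo::lang::Cpp>(3, 2.0f, v);  // picks the Cpp version
      std::printf("%g %g %g\n", v[0], v[1], v[2]);      // prints: 2 4 6
      return 0;
    }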

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9c2869b9/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index c5d092b..4717b5f 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -30,7 +30,7 @@
 namespace singa {
 
 template <>
-void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void Abs<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                            Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -40,8 +40,8 @@ void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
 }
 
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
+void Add<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                           Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -50,8 +50,8 @@ void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
 }
 
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
+void Add<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           Block *out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
@@ -63,7 +63,7 @@ void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
 
 template <>
 void Clamp<float, lang::Cpp>(const size_t num, const float low,
-                             const float high, const Blob *in, Blob *out,
+                             const float high, const Block *in, Block *out,
                              Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -79,8 +79,8 @@ void Clamp<float, lang::Cpp>(const size_t num, const float low,
 }
 
 template <>
-void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
+void Div<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
@@ -91,8 +91,8 @@ void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
 }
 
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
-                           Blob *out, Context *ctx) {
+void Div<float, lang::Cpp>(const size_t num, const float x, const Block *in,
+                           Block *out, Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
@@ -102,8 +102,8 @@ void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in,
-                                   const float x, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in,
+                                   const float x, Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -112,8 +112,8 @@ void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in,
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1,
-                                   const Blob *in2, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in1,
+                                   const Block *in2, Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
@@ -122,7 +122,7 @@ void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1,
   }
 }
 template <>
-void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void Exp<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                            Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -132,8 +132,8 @@ void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
 }
 
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void GE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                          Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -142,8 +142,8 @@ void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
 }
 
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void GT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                          Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -151,8 +151,8 @@ void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
   }
 }
 template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void LE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                          Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -160,7 +160,7 @@ void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
   }
 }
 template <>
-void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void Log<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                            Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -170,8 +170,8 @@ void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
   }
 }
 template <>
-void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void LT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                          Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -179,8 +179,8 @@ void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
   }
 }
 template <>
-void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
+void Pow<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                           Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -189,8 +189,8 @@ void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
 }
 
 template <>
-void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
+void Pow<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
@@ -199,7 +199,7 @@ void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
   }
 }
 template <>
-void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void ReLU<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -208,13 +208,13 @@ void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
   }
 }
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+void Set<float, lang::Cpp>(const size_t num, const float x, Block *out,
                            Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) outPtr[i] = x;
 }
 template <>
-void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void Sigmoid<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                                Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -224,7 +224,7 @@ void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
 }
 
 template <>
-void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void Sign<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -234,7 +234,7 @@ void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
 }
 
 template <>
-void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void Sqrt<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -245,7 +245,7 @@ void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
 }
 /*
 template <>
-void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void Square<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -256,8 +256,8 @@ void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
 */
 
 template <>
-void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
+void Sub<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           Block *out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
@@ -270,7 +270,7 @@ void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
 // sum all elements of input into out
 // TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+void Sum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
                            Context *ctx) {
   float s = 0.f;
   const float *inPtr = static_cast<const float *>(in->data());
@@ -281,7 +281,7 @@ void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
 }
 
 template <>
-void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+void Tanh<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
@@ -292,7 +292,7 @@ void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
 
 // ===============Random operations==========================================
 template <>
-void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Block *out,
                                  Context *ctx) {
   std::bernoulli_distribution distribution(p);
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -303,7 +303,7 @@ void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
 
 template <>
 void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
-                                const float std, Blob *out, Context *ctx) {
+                                const float std, Block *out, Context *ctx) {
   std::normal_distribution<float> distribution(mean, std);
   float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
@@ -312,7 +312,7 @@ void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
 }
 template <>
 void Uniform<float, lang::Cpp>(const size_t num, const float low,
-                               const float high, Blob *out, Context *ctx) {
+                               const float high, Block *out, Context *ctx) {
   std::uniform_real_distribution<float> distribution(low, high);
   float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
@@ -324,8 +324,8 @@ void Uniform<float, lang::Cpp>(const size_t num, const float low,
 
 template <>
 void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
-                            const size_t ncol, const Blob *M, const Blob *v,
-                            Blob *out, Context *ctx) {
+                            const size_t ncol, const Block *M, const Block *v,
+                            Block *out, Context *ctx) {
   const float *MPtr = static_cast<const float *>(M->data());
   const float *vPtr = static_cast<const float *>(v->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -348,42 +348,42 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
 
 #ifdef USE_CBLAS
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
                             Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
   *out = cblas_isamax(num, inPtr, 1);
 }
 
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
                             Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
   *out = cblas_sasum(num, inPtr, 1);
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
-                            Blob *out, Context *ctx) {
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
+                            const Block *in, Block *out, Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
   cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
 }
 
 template <>
-void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
                            float *out, Context *ctx) {
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
   *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
 }
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
                              Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   cblas_sscal(num, x, outPtr, 1);
 }
 template <>
-void Nrm2<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+void Nrm2<float, lang::Cpp>(const size_t num, const Block *in, float *out,
                             Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
   *out = cblas_snrm2(num, inPtr, 1);
@@ -391,8 +391,8 @@ void Nrm2<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
 
 template <>
 void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-                            const float alpha, const Blob *A, const Blob *v,
-                            const float beta, Blob *out, Context *ctx) {
+                            const float alpha, const Block *A, const Block *v,
+                            const float beta, Block *out, Context *ctx) {
   const float *APtr = static_cast<const float *>(A->data());
   const float *vPtr = static_cast<const float *>(v->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -409,8 +409,8 @@ template <>
 void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
                             const size_t nrowA, const size_t ncolB,
                             const size_t ncolA, const float alpha,
-                            const Blob *A, const Blob *B, const float beta,
-                            Blob *C, Context *ctx) {
+                            const Block *A, const Block *B, const float beta,
+                            Block *C, Context *ctx) {
   auto transa = transA ? CblasTrans : CblasNoTrans;
   auto transb = transB ? CblasTrans : CblasNoTrans;
   auto lda = transA ? nrowA : ncolA;
@@ -426,7 +426,7 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
 #else
 
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
                             Context *ctx) {
   size_t maxPos = 0;
   float maxVal = 0;
@@ -442,7 +442,7 @@ void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
   *out = maxPos;
 }
 template <>
-void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+void Amin<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
                             Context *ctx) {
   size_t minPos = 0;
   float minVal = 0;
@@ -459,7 +459,7 @@ void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
 }
 
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
                             Context *ctx) {
   float sum = 0;
   const float *inPtr = static_cast<const float *>(in->data());
@@ -469,8 +469,8 @@ void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
-                            Blob *out, Context *ctx) {
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
+                            const Block *in, Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -479,7 +479,7 @@ void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
 }
 
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
                              Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
@@ -488,7 +488,7 @@ void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
 }
 
 template <>
-void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
                            float *out, Context *ctx) {
   float sum = 0;
   const float *in1Ptr = static_cast<const float *>(in1->data());
@@ -500,8 +500,8 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
 
 template <>
 void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-                            const float alpha, const Blob *A, const Blob *v,
-                            const float beta, Blob *out, Context *ctx) {
+                            const float alpha, const Block *A, const Block *v,
+                            const float beta, Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *APtr = static_cast<const float *>(A->data());
   const float *vPtr = static_cast<const float *>(v->data());
@@ -518,8 +518,8 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
 #endif  // USE_CBLAS
 template <>
 void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize,
-                                           const size_t dim, const Blob *p,
-                                           const Blob *t, Blob *loss,
+                                           const size_t dim, const Block *p,
+                                           const Block *t, Block *loss,
                                            Context *ctx) {
   const float *pPtr = static_cast<const float *>(p->data());
   const int *tPtr = static_cast<const int *>(t->data());
@@ -534,9 +534,9 @@ void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize,
 
 template <>
 void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
-                                              const size_t dim, const Blob *p,
-                                              const Blob *t,
-                                              Blob *grad, Context *ctx) {
+                                              const size_t dim, const Block *p,
+                                              const Block *t, Block *grad,
+                                              Context *ctx) {
   CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
   // const float* pPtr = static_cast<const float*>(p->data());
   const int *tPtr = static_cast<const int *>(t->data());
@@ -549,12 +549,11 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
   }
 }
 
-
 // =========Matrix operations ================================================
 /*
 template <>
 void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Blob *A, const Blob *v, Blob *out,
+                              const Block *A, const Block *v, Block *out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *APtr = static_cast<const float *>(A->data());
@@ -569,7 +568,7 @@ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Blob *A, const Blob *v, Blob *out,
+                              const Block *A, const Block *v, Block *out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *APtr = static_cast<const float *>(A->data());
@@ -582,8 +581,8 @@ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
   }
 }
 template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
-                             const Blob *in2, Blob *out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Block *in1,
+                             const Block *in2, Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
@@ -596,7 +595,7 @@ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
 }
 template <>
 void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Blob *in, Blob *out, Context *ctx) {
+                               const Block *in, Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   float *bPtr = new float[ncol];
@@ -617,7 +616,7 @@ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Blob *in, Blob *out, Context *ctx) {
+                                  const Block *in, Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t c = 0; c < ncol; c++) {
@@ -633,7 +632,7 @@ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Blob *in, Blob *out, Context *ctx) {
+                               const Block *in, Block *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t r = 0; r < nrow; r++) {

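In the CBLAS branch above, GEMM's lda/ldb/ldc arguments are the row widths of the physical row-major buffers, which is why lda flips to nrowA (and ldb to ncolA) when an operand is stored transposed. A minimal standalone check of the no-transpose case (illustrative values; assumes a CBLAS installation, e.g. compile with -lcblas):

    #include <cblas.h>
    #include <cstdio>

    int main() {
      // Row-major A (2x3) and B (3x2); C = A*B is 2x2.
      const float A[6] = {1, 2, 3, 4, 5, 6};
      const float B[6] = {7, 8, 9, 10, 11, 12};
      float C[4] = {0};
      // nrowA=2, ncolB=2, ncolA=3; no transposes, so lda=ncolA=3, ldb=ldc=ncolB=2.
      cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 2, 2, 3,
                  1.0f, A, 3, B, 2, 0.0f, C, 2);
      std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 139 154
      return 0;
    }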

[48/50] [abbrv] incubator-singa git commit: SINGA-192 Implement optimization algorithms for v1

Posted by zh...@apache.org.
SINGA-192 Implement optimization algorithms for v1

Merge PR#164.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/272100a3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/272100a3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/272100a3

Branch: refs/heads/master
Commit: 272100a3ff81bb39aef0e9c5b483c0d5ed023e04
Parents: b167dfa 5784bff
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Jun 13 17:54:48 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 17:54:48 2016 +0800

----------------------------------------------------------------------
 include/singa/model/optimizer.h |  43 ++++++++++++++
 src/core/tensor/math_kernel.cu  |  14 ++---
 src/core/tensor/math_kernel.h   |   2 +-
 src/model/optimizer/adagrad.cc  |  36 ++++++++++++
 src/model/optimizer/nesterov.cc |  43 ++++++++++++++
 src/model/optimizer/rmsprop.cc  |  41 ++++++++++++++
 test/singa/test_adagrad.cc      |  96 +++++++++++++++++++++++++++++++
 test/singa/test_nesterov.cc     | 101 +++++++++++++++++++++++++++++++++
 test/singa/test_rmsprop.cc      | 106 +++++++++++++++++++++++++++++++++++
 9 files changed, 474 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/272100a3/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/272100a3/src/core/tensor/math_kernel.h
----------------------------------------------------------------------

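The diffs for the new optimizer files are omitted from this digest. For reference, the standard AdaGrad update rule behind src/model/optimizer/adagrad.cc is, in plain-array form (names here are illustrative; the real code goes through singa's Tensor ops):

    #include <cmath>
    #include <cstdio>

    // AdaGrad step: accumulate squared gradients, then scale the update so
    // that frequently-updated coordinates take smaller steps.
    void AdagradUpdate(int n, float lr, float eps,
                       const float *grad, float *hist, float *param) {
      for (int i = 0; i < n; i++) {
        hist[i] += grad[i] * grad[i];                           // h += g^2
        param[i] -= lr * grad[i] / (std::sqrt(hist[i]) + eps);  // w -= lr*g/sqrt(h)
      }
    }

    int main() {
      float grad[2] = {0.5f, -1.0f}, hist[2] = {0, 0}, param[2] = {1, 1};
      AdagradUpdate(2, 0.1f, 1e-8f, grad, hist, param);
      std::printf("%f %f\n", param[0], param[1]);  // ~0.9 and ~1.1
      return 0;
    }

nesterov.cc and rmsprop.cc follow the same structure with their respective update rules.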

[17/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 5ce33ad..97da896 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -29,34 +29,34 @@
 /// For Blob argument xxx, name its pointer as xxxPtr.
 namespace singa {
 template <>
-void Square<float, lang::Cpp>(int count, const Blob* input,
-                           Blob* ret, Context* ctx) {
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* in = static_cast<const float*>(input->data());
+void Square<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
+                              Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *in = static_cast<const float *>(input->data());
   for (int i = 0; i < count; i++) {
     dptr[i] = in[i] * in[i];
   }
 }
 
 template <>
-void Add<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
-                           Blob* ret, Context* ctx) {
+void Add<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
+                           Blob *ret, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* lptr = static_cast<const float*>(lhs->data());
-  const float* rptr = static_cast<const float*>(rhs->data());
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(lhs->data());
+  const float *rptr = static_cast<const float *>(rhs->data());
   for (int i = 0; i < count; i++) {
     dptr[i] = lptr[i] + rptr[i];
   }
 }
 
 template <>
-void Sub<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
-                           Blob* ret, Context* ctx) {
+void Sub<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
+                           Blob *ret, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* lptr = static_cast<const float*>(lhs->data());
-  const float* rptr = static_cast<const float*>(rhs->data());
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(lhs->data());
+  const float *rptr = static_cast<const float *>(rhs->data());
   for (int i = 0; i < count; i++) {
     dptr[i] = lptr[i] - rptr[i];
   }
@@ -64,10 +64,10 @@ void Sub<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
 // sum all elements of input into ret
 // TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(int count, const Blob* input, float* ret,
-    Context* ctx) {
+void Sum<float, lang::Cpp>(int count, const Blob *input, float *ret,
+                           Context *ctx) {
   float s = 0.f;
-  const float* in = static_cast<const float*>(input->data());
+  const float *in = static_cast<const float *>(input->data());
   for (int i = 0; i < count; i++) {
     s += in[i];
   }
@@ -76,10 +76,10 @@ void Sum<float, lang::Cpp>(int count, const Blob* input, float* ret,
 
 // TODO(wangwei) optimize using omp
 template <>
-void SumRows<float, lang::Cpp>(int nrow, int ncol, const Blob* input, Blob* ret,
-    Context* ctx) {
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* in = static_cast<const float*>(input->data());
+void SumRows<float, lang::Cpp>(int nrow, int ncol, const Blob *input, Blob *ret,
+                               Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *in = static_cast<const float *>(input->data());
   memset(dptr, 0, ncol * sizeof(float));
   for (int r = 0; r < nrow; r++) {
     for (int c = 0; c < ncol; c++) {
@@ -91,10 +91,10 @@ void SumRows<float, lang::Cpp>(int nrow, int ncol, const Blob* input, Blob* ret,
 // Sum the rows of the input matrix into a vector
 // TODO(wangwei) optimize using omp
 template <>
-void SumColumns<float, lang::Cpp>(int nrow, int ncol, const Blob* input, Blob* ret,
-    Context* ctx) {
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* in = static_cast<const float*>(input->data());
+void SumColumns<float, lang::Cpp>(int nrow, int ncol, const Blob *input,
+                                  Blob *ret, Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *in = static_cast<const float *>(input->data());
   memset(dptr, 0, ncol * sizeof(float));
   for (int r = 0; r < nrow; r++) {
     for (int c = 0; c < ncol; c++) {
@@ -104,64 +104,127 @@ void SumColumns<float, lang::Cpp>(int nrow, int ncol, const Blob* input, Blob* r
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob* input, float x,
-                                   Blob* ret, Context* ctx) {
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* lptr = static_cast<const float*>(input->data());
+void EltwiseMult<float, lang::Cpp>(int count, const Blob *input, float x,
+                                   Blob *ret, Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(input->data());
   for (int i = 0; i < count; i++) {
     dptr[i] = lptr[i] * x;
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
-                                   Blob* ret, Context* ctx) {
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* lptr = static_cast<const float*>(lhs->data());
-  const float* rptr = static_cast<const float*>(rhs->data());
+void EltwiseMult<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
+                                   Blob *ret, Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(lhs->data());
+  const float *rptr = static_cast<const float *>(rhs->data());
   for (int i = 0; i < count; i++) {
     dptr[i] = lptr[i] * rptr[i];
   }
 }
 
 template <>
-void Bernoulli<float, lang::Cpp>(int count, float p, Blob* ret, Context* ctx) {
+void Bernoulli<float, lang::Cpp>(int count, float p, Blob *ret, Context *ctx) {
   std::bernoulli_distribution distribution(p);
-  float* ptr = static_cast<float*>(ret->mutable_data());
+  float *ptr = static_cast<float *>(ret->mutable_data());
   for (int i = 0; i < count; i++) {
     ptr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
   }
 }
 
 template <>
-void Uniform<float, lang::Cpp>(int count, float low, float high, Blob* ret,
-                               Context* ctx) {
+void Uniform<float, lang::Cpp>(int count, float low, float high, Blob *ret,
+                               Context *ctx) {
   std::uniform_real_distribution<float> distribution(low, high);
-  float* ptr = static_cast<float*>(ret->mutable_data());
+  float *ptr = static_cast<float *>(ret->mutable_data());
   for (int i = 0; i < count; i++) {
     ptr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
 template <>
-void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob* ret,
-                                Context* ctx) {
+void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob *ret,
+                                Context *ctx) {
   std::normal_distribution<float> distribution(mean, std);
-  float* ptr = static_cast<float*>(ret->mutable_data());
+  float *ptr = static_cast<float *>(ret->mutable_data());
   for (int i = 0; i < count; i++) {
     ptr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
+// following the consistency guide of the math API
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = alpha / inPtr[i];
+  }
+}
+
+template <>
+void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
+                            const size_t ncol, const Blob *M, const Blob *v,
+                            Blob *out, Context *ctx) {
+  const float *MPtr = static_cast<const float *>(M->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  if (side_right) {
+    for (size_t r = 0; r < nrow; r++) {
+      size_t offset = r * ncol;
+      for (size_t c = 0; c < ncol; c++) {
+        outPtr[offset + c] = MPtr[offset + c] * vPtr[c];
+      }
+    }
+  } else {
+    for (size_t r = 0; r < nrow; r++) {
+      size_t offset = r * ncol;
+      for (size_t c = 0; c < ncol; c++) {
+        outPtr[offset + c] = MPtr[offset + c] * vPtr[r];
+      }
+    }
+  }
+}
+
+template <>
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++)
+    outPtr[i] = x;
+}
 #ifdef USE_CBLAS
 template <>
-void Dot<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
-                           float* ret, Context* ctx) {
-  float dptr = ret->mutable_data(), lptr = lhs->data(), rptr = rhs->data();
-  *ret = cblas_sdot(count, lptr, 1, rptr, 1);
+void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           float *out, Context *ctx) {
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
 }
 
-#endif
+template <>
+void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
+                            const size_t nrowA, const size_t ncolB,
+                            const size_t ncolA, const float alpha,
+                            const Blob *A, const Blob *B, const float beta,
+                            Blob *C, Context *ctx) {
+  auto transa = transA ? CblasTrans : CblasNoTrans;
+  auto transb = transB ? CblasTrans : CblasNoTrans;
+  auto lda = transA ? nrowA : ncolA;
+  auto ldb = transB ? ncolA : ncolB;
+  auto ldc = ncolB;
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *BPtr = static_cast<const float *>(B->data());
+  float *CPtr = static_cast<float *>(C->mutable_data());
+  cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
+              lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
+#endif  // USE_CBLAS
+
+
+}  // namespace singa
+
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index f26b5a3..26299ba 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -22,76 +22,129 @@
 #ifdef USE_CUDA
 #include "./tensor_math.h"
 #include "./math_kernel.h"
+#include "singa/utils/cuda_utils.h"
 #include "singa/core/common.h"
 
 namespace singa {
 
 // TODO(wangwei) Clean implementations following comments in tensor_math_cpp.h.
 // TODO(wangwei) optimize using stream
-template<>
-void Add<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
-                        Blob* ret, Context* ctx) {
-  const float* a = static_cast<const float*> (lhs->data());
-  const float* b = static_cast<const float*> (rhs->data());
-  float* c = static_cast<float*> (ret->mutable_data());
+template <>
+void Add<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
+                            Blob *ret, Context *ctx) {
+  const float *a = static_cast<const float *>(lhs->data());
+  const float *b = static_cast<const float *>(rhs->data());
+  float *c = static_cast<float *>(ret->mutable_data());
   cuda::add(count, a, b, c);
 }
 
 // TODO(wangwei) optimize using stream
-template<>
-void Sub<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
-                        Blob* ret, Context* ctx) {
-  const float* a = static_cast<const float*> (lhs->data());
-  const float* b = static_cast<const float*> (rhs->data());
-  float* c = static_cast<float*> (ret->mutable_data());
+template <>
+void Sub<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
+                            Blob *ret, Context *ctx) {
+  const float *a = static_cast<const float *>(lhs->data());
+  const float *b = static_cast<const float *>(rhs->data());
+  float *c = static_cast<float *>(ret->mutable_data());
   cuda::sub(count, a, b, c);
 }
 
 template <>
-void EltwiseMult<float, lang::Cuda>(int count, const Blob* input, float x,
-    Blob* ret, Context* ctx)
-{
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* lptr = static_cast<const float*>(input->data());
+void EltwiseMult<float, lang::Cuda>(int count, const Blob *input, float x,
+                                    Blob *ret, Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *lptr = static_cast<const float *>(input->data());
   cuda::mult(count, lptr, x, dptr);
 }
 // TODO(wangwei) optimize using stream
 template <>
-void Square<float, lang::Cuda>(int count, const Blob* input, Blob* ret,
-                            Context* ctx) {
-  const float* in = static_cast<const float*>(input->data());
-  float* out = static_cast<float*>(ret->mutable_data());
+void Square<float, lang::Cuda>(int count, const Blob *input, Blob *ret,
+                               Context *ctx) {
+  const float *in = static_cast<const float *>(input->data());
+  float *out = static_cast<float *>(ret->mutable_data());
   cuda::square(count, in, out);
 }
+
 // sum all elements of input into ret
 // TODO(wangwei) optimize using stream
 template <>
-void Sum<float, lang::Cuda>(int count, const Blob* input, float* ret,
-                            Context* ctx) {
-  const float* in = static_cast<const float*>(input->data());
+void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret,
+                            Context *ctx) {
+  const float *in = static_cast<const float *>(input->data());
   cuda::sum(count, in, ret);
 }
 
 // TODO(wangwei) optimize using stream
 template <>
-void SumRows<float, lang::Cuda>(int nrow, int ncol, const Blob* input,
-                                Blob* ret, Context* ctx) {
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* in = static_cast<const float*>(input->data());
+void SumRows<float, lang::Cuda>(int nrow, int ncol, const Blob *input,
+                                Blob *ret, Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *in = static_cast<const float *>(input->data());
   cuda::sum_row(nrow, ncol, ncol, in, dptr);
 }
 
 // Sum the rows of the input matrix into a vector
 // TODO(wangwei) optimize using stream
 template <>
-void SumColumns<float, lang::Cuda>(int nrow, int ncol, const Blob* input,
-                                   Blob* ret, Context* ctx) {
-  float* dptr = static_cast<float*>(ret->mutable_data());
-  const float* in = static_cast<const float*>(input->data());
+void SumColumns<float, lang::Cuda>(int nrow, int ncol, const Blob *input,
+                                   Blob *ret, Context *ctx) {
+  float *dptr = static_cast<float *>(ret->mutable_data());
+  const float *in = static_cast<const float *>(input->data());
   cuda::sum_col(nrow, ncol, ncol, in, dptr);
 }
+
+// following the consistency guide of the math API
+template <>
+void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::Div(num, alpha, inPtr, outPtr, ctx->stream);
 }
 
+template <>
+void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cuda::Set(num, x, outPtr, ctx->stream);
+}
+// NOTE: cublas uses column major order.
+// http://peterwittek.com/cublas-matrix-c-style.html
+template <>
+void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
+                             const size_t ncol, const Blob *M, const Blob *v,
+                             Blob *out, Context *ctx) {
+  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  const float *MPtr = static_cast<const float *>(M->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  if (side_right) {
+    CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
+                             vPtr, 1, outPtr, ncol));
+  } else {
+    CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_RIGHT, ncol, nrow, MPtr, ncol,
+                             vPtr, 1, outPtr, ncol));
+  }
+}
+// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
+template <>
+void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
+                             const size_t nrowA, const size_t ncolB,
+                             const size_t ncolA, const float alpha,
+                             const Blob *A, const Blob *B, const float beta,
+                             Blob *C, Context *ctx) {
+  auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+  int lda = transA ? nrowA : ncolA;
+  int ldb = transB ? ncolA : ncolB;
+  int ldc = ncolB;
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *BPtr = static_cast<const float *>(B->data());
+  float *CPtr = static_cast<float *>(C->mutable_data());
+  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
+                           BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
+}
+}  // namespace singa
 
 #endif  // USE_CUDA
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_

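The operand swap in the cublasSgemm call above (B passed before A, with ncolB and nrowA exchanged) follows from the identity (A·B)^T = B^T·A^T: a row-major buffer read column-major is the transpose, so the column-major product of the swapped operands writes C^T in column-major order, which is byte-for-byte the row-major C. A small CPU sketch of the same identity (illustrative, not from the commit):

    #include <cstdio>

    // Plain column-major GEMM: C(m x n) = A(m x k) * B(k x n).
    static void ColMajorGemm(int m, int n, int k, const float *A,
                             const float *B, float *C) {
      for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
          float s = 0;
          for (int p = 0; p < k; p++) s += A[p * m + i] * B[j * k + p];
          C[j * m + i] = s;
        }
    }

    int main() {
      // Row-major A = [[1,2],[3,4]], B = [[5,6],[7,8]]; want C = A*B row-major.
      const float A[4] = {1, 2, 3, 4};
      const float B[4] = {5, 6, 7, 8};
      float C[4];
      // Swap the operands, as the cublasSgemm call does: this computes
      // C^T = B^T * A^T column-major, leaving C row-major in the buffer.
      ColMajorGemm(2, 2, 2, B, A, C);
      std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // 19 22 43 50
      return 0;
    }
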
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/test/singa/test_cpp_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_math.cc b/test/singa/test_cpp_math.cc
deleted file mode 100644
index 78c713f..0000000
--- a/test/singa/test_cpp_math.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "gtest/gtest.h"
-#include "../src/core/tensor/tensor_math_cpp.h"
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 3ee6bf8..67f583c 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -23,7 +23,7 @@
 #include "singa/core/tensor.h"
 #include "singa/core/device.h"
 #include "../src/model/loss/mse.h"
-
+#include "singa_config.h"
 using singa::Tensor;
 class TestMSE : public ::testing::Test {
  protected:
@@ -39,6 +39,7 @@ class TestMSE : public ::testing::Test {
   singa::Tensor p, t;
 };
 
+#ifdef USE_CBLAS
 TEST_F(TestMSE, CppForward) {
   singa::MSE mse;
   const Tensor& loss = mse.Forward(p, t);
@@ -54,6 +55,17 @@ TEST_F(TestMSE, CppForward) {
   }
 }
 
+TEST_F(TestMSE, CppBackward) {
+  singa::MSE mse;
+  mse.Forward(p, t);
+  const Tensor& grad = mse.Backward();
+
+  auto gdat = grad.data<const float*>();
+
+  for (size_t i = 0; i < grad.Size(); i++)
+    EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
+}
+#endif
 TEST_F(TestMSE, CudaForward) {
   singa::MSE mse;
   singa::CudaGPU dev;
@@ -73,18 +85,6 @@ TEST_F(TestMSE, CudaForward) {
     EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
   }
 }
-
-TEST_F(TestMSE, CppBackward) {
-  singa::MSE mse;
-  mse.Forward(p, t);
-  const Tensor& grad = mse.Backward();
-
-  auto gdat = grad.data<const float*>();
-
-  for (size_t i = 0; i < grad.Size(); i++)
-    EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
-}
-
 TEST_F(TestMSE, CudaBackward) {
   singa::MSE mse;
   singa::CudaGPU dev;

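For reference, the gradient checked in CppBackward and CudaBackward follows from differentiating the batch-averaged MSE loss with the 0.5 factor used in CudaForward (a standard derivation):

    L = \frac{1}{n} \sum_{i=1}^{n} \frac{1}{2} \lVert p_i - t_i \rVert_2^2
    \qquad \Longrightarrow \qquad
    \frac{\partial L}{\partial p_i} = \frac{1}{n} (p_i - t_i)

where n = p.shape().at(0) is the batch size, matching the (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]) expectation in the tests.
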
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index f9acdb0..bd039ad 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -111,5 +111,3 @@ TEST(TensorClass, T) {
   EXPECT_EQ(t.shape()[1],  o.shape()[0]);
 }
 
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index fb7e3e8..8368c55 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -5,10 +5,8 @@ using singa::Shape;
 using singa::Device;
 
 class TestTensorMath : public ::testing::Test {
- protected:
+protected:
   virtual void SetUp() {
-    const float dat1[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-    const float dat2[] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
     a.Reshape(singa::Shape{6});
     b.Reshape(singa::Shape{6});
     c.Reshape(singa::Shape{6, 1});
@@ -18,12 +16,14 @@ class TestTensorMath : public ::testing::Test {
     b.CopyDataFromHostPtr<float>(dat2, 6);
   }
   Tensor a, b, c, d;
+  const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
 };
 
 TEST_F(TestTensorMath, MemberAddTensor) {
   Tensor aa = a.Clone();
   aa += a;
-  const float* dptr = aa.data<const float*>();
+  const float *dptr = aa.data<const float *>();
   EXPECT_FLOAT_EQ(2.0f, dptr[0]);
   EXPECT_FLOAT_EQ(4.0f, dptr[1]);
   EXPECT_FLOAT_EQ(6.0f, dptr[2]);
@@ -31,40 +31,467 @@ TEST_F(TestTensorMath, MemberAddTensor) {
   // check p is initialized to 0
   Tensor p(Shape{6});
   p += aa;
-  const float* dptr1 = p.data<const float*>();
+  const float *dptr1 = p.data<const float *>();
   EXPECT_FLOAT_EQ(2.0f, dptr1[0]);
   EXPECT_FLOAT_EQ(4.0f, dptr1[1]);
   EXPECT_FLOAT_EQ(6.0f, dptr1[2]);
 
   a += b;
-  const float* dptr2 = a.data<const float*>();
+  const float *dptr2 = a.data<const float *>();
   EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
   EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
   EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
   EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
 }
 
-
 TEST_F(TestTensorMath, AddTensors) {
   Tensor ret(a.shape(), a.device(), a.data_type());
   Add(a, b, &ret);
-  const float* dptr = ret.data<const float*>();
+  const float *dptr = ret.data<const float *>();
   EXPECT_FLOAT_EQ(2.1f, dptr[0]);
   EXPECT_FLOAT_EQ(4.1f, dptr[1]);
   EXPECT_FLOAT_EQ(6.1f, dptr[2]);
   EXPECT_FLOAT_EQ(12.1f, dptr[5]);
 
   const Tensor d = a + b;
-  const float* dptr2 = d.data<const float*>();
+  const float *dptr2 = d.data<const float *>();
   EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
   EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
   EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
   EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
 
   Add(a, b, &a);
-  const float* dptr1 = a.data<const float*>();
+  const float *dptr1 = a.data<const float *>();
   EXPECT_FLOAT_EQ(2.1f, dptr1[0]);
   EXPECT_FLOAT_EQ(4.1f, dptr1[1]);
   EXPECT_FLOAT_EQ(6.1f, dptr1[2]);
   EXPECT_FLOAT_EQ(12.1f, dptr1[5]);
 }
+
+TEST_F(TestTensorMath, SetValue) {
+  Tensor t(Shape{4});
+  t.SetValue(0.3f);
+  const float *ptr = t.data<const float *>();
+  for (int i = 0; i < 4; i++)
+    EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+}
+
+TEST_F(TestTensorMath, Reshape) {
+  Tensor t(Shape{4});
+  t.SetValue(0.3f);
+  Tensor p = Reshape(t, Shape{4, 1});
+  const float *ptr = t.data<const float *>();
+  EXPECT_EQ(p.shape(0), 4u);
+  EXPECT_EQ(p.shape(1), 1u);
+  for (int i = 0; i < 4; i++)
+    EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+}
+#ifdef USE_CBLAS
+TEST_F(TestTensorMath, MultCpp) {
+  const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+  Tensor t(Shape{2, 2});
+  t.CopyDataFromHostPtr(x, 4);
+  d.CopyDataFromHostPtr(dat1, 6);
+  Tensor C = Mult(d, t);
+  const float *xptr = C.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      float tmp = 0;
+      for (int k = 0; k < 2; k++) {
+        tmp += dat1[i * 2 + k] * x[k * 2 + j];
+      }
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], tmp);
+    }
+  }
+  const float y[8] = {1.0f, 2.0f, 3.0f, 4.0f, 1.1f, 2.1f, 3.1f, 4.1f};
+  Tensor s(Shape{4, 2});
+  s.CopyDataFromHostPtr(y, 8);
+  const float *sPtr = s.data<const float *>();
+  for (int i = 0; i < 8; i++)
+    EXPECT_FLOAT_EQ(sPtr[i], y[i]);
+  Tensor D = Mult(d, s.T());
+  const float *DPtr = D.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 4; j++) {
+      float tmp = 0;
+      for (int k = 0; k < 2; k++) {
+        tmp += dat1[i * 2 + k] * y[j * 2 + k];
+      }
+      EXPECT_FLOAT_EQ(DPtr[i * 4 + j], tmp);
+    }
+  }
+  Tensor p(Shape{4, 1});
+  p.CopyDataFromHostPtr(x, 4);
+  Tensor q(Shape{1, 4});
+  q.SetValue(1.0f);
+  Tensor o(Shape{4, 4});
+
+  Mult(p, q, &o);
+  const float *oPtr = o.data<const float *>();
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, AddColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  AddColumn(t, &d);
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[i]);
+    }
+  }
+}
+TEST_F(TestTensorMath, SubColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  SubColumn(t, &d);
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[i]);
+    }
+  }
+}
+
+
+TEST_F(TestTensorMath, DivColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  DivColumn(t, &d);
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[i]);
+    }
+  }
+}
+
+
+TEST_F(TestTensorMath, AddRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  AddRow(t, &d);
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[j]);
+    }
+  }
+}
+
+
+TEST_F(TestTensorMath, SubRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  SubRow(t, &d);
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[j]);
+    }
+  }
+}
+
+
+TEST_F(TestTensorMath, MultRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  MultRow(t, &d);
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[j]);
+    }
+  }
+}
+
+
+TEST_F(TestTensorMath, SumRowsCpp) {
+  Tensor t(Shape{2});
+  d.CopyDataFromHostPtr(dat1, 6);
+  SumRows(d, &t);
+  const float *tptr = t.data<const float *>();
+  for (int i = 0; i < 2; i++) {
+    float tmp = 0;
+    for (int j = 0; j < 3; j++) {
+      tmp += dat1[j * 2 + i];
+    }
+    EXPECT_FLOAT_EQ(tptr[i], tmp);
+  }
+}
+
+
+TEST_F(TestTensorMath, SumColumnsCpp) {
+  Tensor t(Shape{3});
+  d.CopyDataFromHostPtr(dat1, 6);
+  SumColumns(d, &t);
+  const float *tptr = t.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    float tmp = 0;
+    for (int j = 0; j < 2; j++) {
+      tmp += dat1[i * 2 + j];
+    }
+    EXPECT_FLOAT_EQ(tptr[i], tmp);
+  }
+}
+#endif
+TEST_F(TestTensorMath, MultCuda) {
+  const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{2, 2}, &dev);
+  t.CopyDataFromHostPtr(x, 4);
+  d.ToDevice(&dev);
+  d.CopyDataFromHostPtr(dat1, 6);
+  Tensor C = Mult(d, t);
+  C.ToHost();
+  const float *xptr = C.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      float tmp = 0;
+      for (int k = 0; k < 2; k++) {
+        tmp += dat1[i * 2 + k] * x[k * 2 + j];
+      }
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], tmp);
+    }
+  }
+
+  const float y[8] = {1.0f, 2.0f, 3.0f, 4.0f, 1.1f, 2.1f, 3.1f, 4.1f};
+  Tensor s(Shape{4, 2}, &dev);
+  s.CopyDataFromHostPtr(y, 8);
+  Tensor D = Mult(d, s.T());
+  D.ToHost();
+  const float *DPtr = D.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 4; j++) {
+      float tmp = 0;
+      for (int k = 0; k < 2; k++) {
+        tmp += dat1[i * 2 + k] * y[j * 2 + k];
+      }
+      EXPECT_FLOAT_EQ(DPtr[i * 4 + j], tmp);
+    }
+  }
+  Tensor p(Shape{4, 1}, &dev);
+  p.CopyDataFromHostPtr(x, 4);
+  Tensor q(Shape{1, 4}, &dev);
+  q.SetValue(1.0f);
+  Tensor o(Shape{4, 4}, &dev);
+
+  Mult(p, q, &o);
+  o.ToHost();
+  const float *oPtr = o.data<const float *>();
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, AddColumnCuda) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{3}, &dev);
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  AddColumn(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[i]);
+    }
+  }
+}
+
+
+TEST_F(TestTensorMath, SubColumnCuda) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{3}, &dev);
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  SubColumn(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[i]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, MultColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  MultColumn(t, &d);
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, MultColumnCuda) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{3}, &dev);
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  MultColumn(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
+    }
+  }
+}
+TEST_F(TestTensorMath, DivColumnCuda) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{3}, &dev);
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  DivColumn(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[i]);
+    }
+  }
+}
+TEST_F(TestTensorMath, AddRowCuda) {
+  const float x[2] = {1.1f, 2.1f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{2}, &dev);
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  AddRow(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[j]);
+    }
+  }
+}
+TEST_F(TestTensorMath, SubRowCuda) {
+  const float x[2] = {1.1f, 2.1f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{2}, &dev);
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  SubRow(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[j]);
+    }
+  }
+}
+TEST_F(TestTensorMath, MultRowCuda) {
+  const float x[2] = {1.1f, 2.1f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{2}, &dev);
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  MultRow(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[j]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, DivRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  DivRow(t, &d);
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, DivRowCuda) {
+  const float x[2] = {1.1f, 2.1f};
+  singa::CudaGPU dev;
+  Tensor t(Shape{2}, &dev);
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  DivRow(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
+    }
+  }
+}
+TEST_F(TestTensorMath, SumRowsCuda) {
+  singa::CudaGPU dev;
+  Tensor t(Shape{2}, &dev);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  SumRows(d, &t);
+  t.ToHost();
+  const float *tptr = t.data<const float *>();
+  for (int i = 0; i < 2; i++) {
+    float tmp = 0;
+    for (int j = 0; j < 3; j++) {
+      tmp += dat1[j * 2 + i];
+    }
+    EXPECT_FLOAT_EQ(tptr[i], tmp);
+  }
+}
+TEST_F(TestTensorMath, SumColumnCuda) {
+  singa::CudaGPU dev;
+  Tensor t(Shape{3}, &dev);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(&dev);
+  SumColumns(d, &t);
+  t.ToHost();
+  const float *tptr = t.data<const float *>();
+  for (int i = 0; i < 3; i++) {
+    float tmp = 0;
+    for (int j = 0; j < 2; j++) {
+      tmp += dat1[i * 2 + j];
+    }
+    EXPECT_FLOAT_EQ(tptr[i], tmp);
+  }
+}
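
The tests above exercise three shapes of Mult: a plain matrix-matrix product, a product with a
transposed right operand via T(), and Mult(p, q, &o) with p of shape {4, 1} and q of shape {1, 4},
which yields an outer product. A minimal host-side sketch of the same calls, assuming only the
Tensor API exercised in these tests (Shape, CopyDataFromHostPtr, Mult, T, data) and their header:

  #include "singa/core/tensor.h"

  using singa::Shape;
  using singa::Tensor;

  // Sketch only: mirrors the MultCpp test above on the default (CPU) device.
  void MultSketch() {
    const float a[6] = {1, 2, 3, 4, 5, 6};  // 3x2 matrix, row-major
    const float b[4] = {1, 2, 3, 4};        // 2x2 matrix
    Tensor A(Shape{3, 2}), B(Shape{2, 2});
    A.CopyDataFromHostPtr(a, 6);
    B.CopyDataFromHostPtr(b, 4);
    Tensor C = Mult(A, B);      // 3x2 result
    Tensor D = Mult(A, B.T());  // right operand transposed, also 3x2 here
    const float *cptr = C.data<const float *>();
    (void)cptr;  // compare element-wise, as the EXPECT_FLOAT_EQ loops do
    (void)D;
  }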


[47/50] [abbrv] incubator-singa git commit: SINGA-195 Channel for sending training statistics

Posted by zh...@apache.org.
SINGA-195 Channel for sending training statistics

Merge branch PR#166


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/b167dfa5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/b167dfa5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/b167dfa5

Branch: refs/heads/master
Commit: b167dfa5bbf6a84af87a4db0f3946659fb62c4ca
Parents: 21e4b2d a4fc4ea
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Jun 13 17:50:20 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 17:53:29 2016 +0800

----------------------------------------------------------------------
 include/singa/utils/channel.h |  85 ++++++++++++++++++++++++++++++
 src/utils/channel.cc          | 104 +++++++++++++++++++++++++++++++++++++
 test/singa/test_channel.cc    |  39 ++++++++++++++
 3 files changed, 228 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b167dfa5/src/utils/channel.cc
----------------------------------------------------------------------
diff --cc src/utils/channel.cc
index 0000000,95daed6..588a11a
mode 000000,100644..100644
--- a/src/utils/channel.cc
+++ b/src/utils/channel.cc
@@@ -1,0 -1,104 +1,104 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "singa/utils/channel.h"
+ 
+ #include "singa/utils/logging.h"
+ #include "singa/utils/singleton.h"
+ 
+ namespace singa {
+ 
+ ChannelManager::~ChannelManager() {
+   for (auto it : name2ptr_) {
+     if (it.second != nullptr) delete (it.second);
+   }
+ }
+ 
+ void ChannelManager::Init() {
+   // do nothing here
+ }
+ 
+ void ChannelManager::SetDefaultDir(const char* dir) {
+   if (dir != nullptr) {
+     dir_ = dir;
+     if (dir[dir_.length() - 1] != '/') dir_ += '/';
+   }
+ }
+ 
+ Channel* ChannelManager::GetInstance(const std::string& channel) {
+   // find the channel
+   if (name2ptr_.find(channel) == name2ptr_.end()) {
+     // create new channel
+     Channel* chn = new Channel(channel);
+     chn->SetDestFilePath(dir_ + channel);
+     chn->EnableDestFile(true);
+     name2ptr_[channel] = chn;
+   }
+   return name2ptr_[channel];
+ }
+ 
+ Channel::Channel(const std::string& name) { name_ = name; }
+ 
+ Channel::~Channel() {
+   if (os_.is_open()) os_.close();
+ }
+ 
+ void Channel::SetDestFilePath(const std::string& file) {
+   // file is append only
+   if (os_.is_open()) os_.close();
+   {
 -    ifstream fin(file.c_str());
++    std::ifstream fin(file.c_str());
+     if (fin.good())
+       LOG(WARNING) << "Messages will be appended to an existing file: " << file;
+   }
+   os_.open(file.c_str(), std::ios::app);
+   if (os_.is_open() == false)
+     LOG(WARNING) << "Cannot open channel file (" << file << ")";
+ }
+ 
+ void Channel::Send(const std::string& message) {
+   if (stderr_) fprintf(stderr, "%s\n", message.c_str());
+   if (file_ && os_.is_open()) os_ << message << "\n";
+   // TODO(wangwei) flush
+ }
+ 
+ void Channel::Send(const google::protobuf::Message& message) {
+   if (stderr_) fprintf(stderr, "%s\n", message.DebugString().c_str());
+   if (file_ && os_.is_open()) message.SerializeToOstream(&os_);
+   // TODO(wangwei) flush
+ }
+ 
+ void InitChannel(const char* argv) {
+   ChannelManager* mng = Singleton<ChannelManager>().Instance();
+   mng->Init();
+ }
+ 
+ void SetChannelDirectory(const char* path) {
+   ChannelManager* mng = Singleton<ChannelManager>().Instance();
+   mng->SetDefaultDir(path);
+ }
+ 
+ Channel* GetChannel(const std::string& channel_name) {
+   ChannelManager* mng = Singleton<ChannelManager>().Instance();
+   return mng->GetInstance(channel_name);
+ }
+ 
+ }  // namespace singa
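
As a usage orientation (a sketch, not part of this commit), the free functions defined above
compose as follows; the directory and message strings are made-up examples:

  #include "singa/utils/channel.h"

  int main(int argc, char **argv) {
    (void)argc;
    singa::InitChannel(argv[0]);           // one-time initialization
    singa::SetChannelDirectory("/tmp/");   // per-channel files go under /tmp/
    singa::Channel *chn = singa::GetChannel("train_perf");
    chn->Send("step 0, loss 2.302");       // appended to /tmp/train_perf
    return 0;
  }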


[14/50] [abbrv] incubator-singa git commit: SINGA-176 - Add loss and metric base classes

Posted by zh...@apache.org.
SINGA-176 - Add loss and metric base classes

Pass tests for MSE and Accuracy


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a1c3437c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a1c3437c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a1c3437c

Branch: refs/heads/master
Commit: a1c3437c34b6f613911d8b7ef9f11f483099fc63
Parents: 668ae16
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu May 26 14:03:05 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu May 26 14:11:18 2016 +0800

----------------------------------------------------------------------
 include/singa/core/device.h        |  7 +++---
 src/core/device/cpp_cpu.cc         |  4 +++-
 src/core/device/cuda_gpu.cc        |  2 +-
 src/core/device/device.cc          |  1 +
 src/core/tensor/math_kernel.cu     | 34 +++++++++++++++++++++++++++-
 src/core/tensor/math_kernel.h      |  6 +++++
 src/core/tensor/tensor_math_cpp.h  | 12 +++++++++-
 src/core/tensor/tensor_math_cuda.h | 40 ++++++++++++++++++++++++++-------
 src/model/loss/mse.h               |  6 ++---
 test/singa/test_mse.cc             | 24 +++++++++++++++-----
 10 files changed, 111 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 23c2431..a4b3f6d 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -33,7 +33,6 @@ using std::vector;
 using std::string;
 using std::function;
 namespace singa {
-
 /// Allocate memory and execute Tensor operations.
 /// There are three types of devices distinguished by their programming
 /// languages, namely cpp, cuda and opencl.
@@ -76,8 +75,7 @@ class Device {
     return lang_;
   }
 
-  /// TODO(wangwei) remove it?
-  Device* host() const { return host_; }
+  Device* host() const { return host_;}
 
   int id() const { return id_; }
 
@@ -135,6 +133,7 @@ class CppCPU : public Device {
 /// a singleton CppDevice as the host for all devices.
 extern CppCPU defaultDevice;
 
+
 // Implement Device using OpenCL libs.
 // class OpenclDevice : public Device { };
 
@@ -143,7 +142,7 @@ extern CppCPU defaultDevice;
 class CudaGPU : public Device {
  public:
   ~CudaGPU();
-  CudaGPU(int id = -1, int num_executors = 1, string scheduler = "sync",
+  CudaGPU(int id = 0, int num_executors = 1, string scheduler = "sync",
          string vm = "gc-only");
 
   void SetRandSeed(unsigned seed) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc
index 3287911..28b0da4 100644
--- a/src/core/device/cpp_cpu.cc
+++ b/src/core/device/cpp_cpu.cc
@@ -33,7 +33,9 @@ void CppCPU::DoExec(function<void(Context*)>&& fn, int executor) {
 }
 
 void* CppCPU::Malloc(int size) {
-  return malloc(size);
+  void *ptr = malloc(size);
+  memset(ptr, 0, size);
+  return ptr;
 }
 
 void CppCPU::Free(void* ptr) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 59a5f45..0ba05fb 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -50,7 +50,6 @@ CudaGPU::CudaGPU(int id, int num_executors,
   if (id == -1)
     id = FindDevice(0);
   lang_ = kCuda;
-  host_ = nullptr;  // TODO(wangwei) add host device
   ctx_.stream = NULL;  // use the default sync stream
   // TODO(wangwei) create one handle for each steam?
   CUDA_CHECK(cudaSetDevice(FindDevice(0)));
@@ -91,6 +90,7 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
 void* CudaGPU::Malloc(int size) {
   void* ptr = nullptr;
   CUDA_CHECK(cudaMalloc(&ptr, size));
+  CUDA_CHECK(cudaMemset(ptr, 0, size));
   return ptr;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index cd860db..ede3fda 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -22,6 +22,7 @@ namespace singa {
 Device::Device(int id, int num_executors, string scheduler, string vm)
     : id_(id), num_executors_(num_executors) {
       // TODO(wangwei) create scheduler and vm.
+  host_ = &defaultDevice;
 }
 
 void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index 30863a1..e67ea7b 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -147,7 +147,21 @@ __global__ void kernel_add_vec_row(const float *src_vec_data,
     des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
   }
 }
+__global__ void kernel_add(const float *src1, const float *src2, float*out, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    out[index] = src1[index] + src2[index];
+  }
+}
 
+__global__ void kernel_sub(const float *src1, const float *src2, float*out, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    out[index] = src1[index] - src2[index];
+  }
+}
 __global__ void kernel_exp(const float *src_data, float *des_data, int n) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
@@ -275,6 +289,15 @@ __global__ void kernel_mult(const float *src_data_a, const float *src_data_b,
   }
 }
 
+__global__ void kernel_mult(const float *src_data_a, const float x,
+                            float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data_a[index] * x;
+  }
+}
+
 __global__ void kernel_div(const float *src_data_a, const float *src_data_b,
                            float *des_data, int n) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -346,7 +369,12 @@ void add_row(int rows, int cols, int stride, const float *in_row,
   kernel_add_vec_row<<<num_blocks, threads_per_block>>>(in_row, in_mat, out,
                                                         rows, cols, stride);
 }
-
+void add(int n, const float *a, const float *b, float *out) {
+  kernel_add<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+}
+void sub(int n, const float *a, const float *b, float *out) {
+  kernel_sub<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+}
 void exp(int n, const float *in, float *out) {
   kernel_exp<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
 }
@@ -407,6 +435,10 @@ void mult(int n, const float *a, const float *b, float *out) {
   kernel_mult<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
 }
 
+void mult(int n, const float *a, const float x, float *out) {
+  kernel_mult<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, x, out, n);
+}
+
 void div(int n, const float *a, const float *b, float *out) {
   kernel_div<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
 }
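
A note on the launch configuration in these wrappers: CU1DBLOCKF is the block size as a float, so
ceil(n / CU1DBLOCKF) is real division rounded up, giving enough blocks to cover all n elements,
while the grid-stride loop inside each kernel lets any surplus threads fall through safely. An
integer-only formulation that gives the same block count for typical sizes, as a sketch (NumBlocks
is a hypothetical helper name, not part of this commit):

  #include <cstddef>

  // Ceiling division: blocks needed to cover n items at block_size threads per block.
  inline size_t NumBlocks(size_t n, size_t block_size) {
    return (n + block_size - 1) / block_size;
  }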

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index f5da772..5367f4a 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -44,6 +44,10 @@ void sum_col(int rows, int cols, int stride, const float *in, float *out);
 void add_row(int rows, int cols, int stride, const float *in_row,
   const float *in_mat, float *out);
 
+void add(int n, const float *a, const float *b, float *out);
+
+void sub(int n, const float *a, const float *b, float *out);
+
 void exp(int n, const float *in, float *out);
 
 void log(int n, const float *in, float *out);
@@ -74,6 +78,8 @@ void pow(int n, const float *a, const float *b, float *out);
 
 void mult(int n, const float *a, const float *b, float *out);
 
+void mult(int n, const float *a, const float x, float *out);
+
 void div(int n, const float *a, const float *b, float *out);
 
 void set_value(int n, float v, float *out);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index c584b69..7dc35c9 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -47,7 +47,17 @@ void Add<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
   }
 }
 
-
+template <>
+void Sub<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
+                           Blob* ret, Context* ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* lptr = static_cast<const float*>(lhs->data());
+  const float* rptr = static_cast<const float*>(rhs->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = lptr[i] - rptr[i];
+  }
+}
 // sum all elements of input into ret
 // TODO(wangwei) optimize using omp
 template <>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 2e497d2..12fc58e 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -26,17 +26,41 @@
 
 namespace singa {
 
+// TODO(wangwei) optimize using stream
 template<>
 void Add<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
                         Blob* ret, Context* ctx) {
-  /*
-  cublasSetStream(ctx->cublas_handle, ctx->stream);
-  const float* lptr = static_cast<const float*>(lhs->data());
-  const float* rptr = static_cast<const float*>(rhs->data());
-  float* ptr = static_cast<float*>(ret->mutable_data());
-  cublasScopy(ctx->cublas_handle, count, lptr, 1, ptr, 1);
-  cublasSaxpy(ctx->cublas_handle, 1.0f, rptr, 1, ptr, 1);
-  */
+  const float* a = static_cast<const float*> (lhs->data());
+  const float* b = static_cast<const float*> (rhs->data());
+  float* c = static_cast<float*> (ret->mutable_data());
+  cuda::add(count, a, b, c);
+}
+
+// TODO(wangwei) optimize using stream
+template<>
+void Sub<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
+                        Blob* ret, Context* ctx) {
+  const float* a = static_cast<const float*> (lhs->data());
+  const float* b = static_cast<const float*> (rhs->data());
+  float* c = static_cast<float*> (ret->mutable_data());
+  cuda::sub(count, a, b, c);
+}
+
+template <>
+void EltwiseMult<float, lang::Cuda>(int count, const Blob* input, float x,
+    Blob* ret, Context* ctx)
+{
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* lptr = static_cast<const float*>(input->data());
+  cuda::mult(count, lptr, x, dptr);
+}
+// TODO(wangwei) optimize using stream
+template <>
+void Square<float, lang::Cuda>(int count, const Blob* input, Blob* ret,
+                            Context* ctx) {
+  const float* in = static_cast<const float*>(input->data());
+  float* out = static_cast<float*>(ret->mutable_data());
+  cuda::square(count, in, out);
 }
 // sum all elements of input into ret
 // TODO(wangwei) optimize using stream

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/src/model/loss/mse.h
----------------------------------------------------------------------
diff --git a/src/model/loss/mse.h b/src/model/loss/mse.h
index 5799f13..1a022f9 100644
--- a/src/model/loss/mse.h
+++ b/src/model/loss/mse.h
@@ -51,13 +51,13 @@ Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
   t.Reshape(Shape{batchsize, dim});
   buf_.push(t);
   // TODO(wangwei) use CastType for operator/
-  return Sum(Square(t), 1);
+  return Sum(Square(t), 1) * 0.5f;
 }
 
 Tensor MSE::Backward() {
-  const Tensor& ret = buf_.top();
+  Tensor ret = buf_.top();
   buf_.pop();
-  return ret / (1.0f * ret.shape().at(0));
+  return ret * (1.0f / ret.shape().at(0));
 }
 }  // namespace singa
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a1c3437c/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 9056176..3ee6bf8 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -44,8 +44,14 @@ TEST_F(TestMSE, CppForward) {
   const Tensor& loss = mse.Forward(p, t);
   auto ldat = loss.data<const float*>();
 
-  EXPECT_FLOAT_EQ(ldat[0], 0.005);
-  EXPECT_FLOAT_EQ(ldat[1], 0);
+  for (size_t i = 0, k = 0; i < loss.Size(); i++) {
+    float l = 0.f;
+    for (size_t j = 0; j < p.Size() / loss.Size(); j++) {
+      l += (pdat[k] - tdat[k]) * (pdat[k] - tdat[k]);
+      k++;
+    }
+    EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
+  }
 }
 
 TEST_F(TestMSE, CudaForward) {
@@ -58,8 +64,14 @@ TEST_F(TestMSE, CudaForward) {
   loss.ToHost();
   auto ldat = loss.data<const float*>();
 
-  for (size_t i = 0; i < loss.Size(); i++)
-    EXPECT_FLOAT_EQ(ldat[i], 0.5 * (pdat[i] - tdat[i]) * (pdat[i] - tdat[i]));
+  for (size_t i = 0, k = 0; i < loss.Size(); i++) {
+    float l = 0.f;
+    for (size_t j = 0; j < p.Size() / loss.Size(); j++) {
+      l += (pdat[k] - tdat[k]) * (pdat[k] - tdat[k]);
+      k++;
+    }
+    EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
+  }
 }
 
 TEST_F(TestMSE, CppBackward) {
@@ -70,7 +82,7 @@ TEST_F(TestMSE, CppBackward) {
   auto gdat = grad.data<const float*>();
 
   for (size_t i = 0; i < grad.Size(); i++)
-    EXPECT_FLOAT_EQ(gdat[i], pdat[i] - tdat[i]);
+    EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
 }
 
 TEST_F(TestMSE, CudaBackward) {
@@ -84,5 +96,5 @@ TEST_F(TestMSE, CudaBackward) {
   auto gdat = grad.data<const float*>();
 
   for (size_t i = 0; i < grad.Size(); i++)
-    EXPECT_FLOAT_EQ(gdat[i], pdat[i] - tdat[i]);
+    EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
 }
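
Restating the convention these tests now check: with prediction p, target t, row index i over the
batch of size n, and column index j over the feature dimension, the forward pass computes

  L_i = \frac{1}{2} \sum_j (p_{ij} - t_{ij})^2

and the backward pass returns the gradient of the batch average \frac{1}{n} \sum_i L_i, namely

  \frac{\partial}{\partial p_{ij}} = \frac{1}{n} (p_{ij} - t_{ij}),

matching the 0.5f factor added in MSE::Forward and the 1.0f / ret.shape().at(0) scaling added in
MSE::Backward.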


[35/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index ec7a892..2c5c272 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -25,12 +25,11 @@
 #include <cblas.h>
 #endif
 
-/// TODO(wangwei) Clean the implementations following the comments in
-/// tensor_math.h.
 namespace singa {
 
-template<>
-void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+template <>
+void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -39,180 +38,150 @@ void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context
 }
 
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] + x;
+  }
 }
 
-// sum all elements of input into out
-// TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
-  float s = 0.f;
-  const float *inPtr = static_cast<const float *>(in->data());
+void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
-    s += inPtr[i];
+    outPtr[i] = in1Ptr[i] + in2Ptr[i];
   }
-  *out = s;
 }
 
 template <>
-void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Clamp<float, lang::Cpp>(const size_t num, const float low,
+                             const float high, const Blob *in, Blob *out,
+                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float*>(in->data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; 
+    if (inPtr[i] > high) {
+      outPtr[i] = high;
+    } else if (inPtr[i] < low) {
+      outPtr[i] = low;
+    } else {
+      outPtr[i] = inPtr[i];
+    }
   }
 }
 
 template <>
-void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(in2Ptr[i], 0.f);
+    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+  }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
+                           Blob *out, Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = exp(inPtr[i]);
+    CHECK_NE(inPtr[i], 0.f);
+    outPtr[i] = x / inPtr[i];
   }
 }
 
 template <>
-void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in,
+                                   const float x, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
-    outPtr[i] = log(inPtr[i]);
+    outPtr[i] = inPtr[i] * x;
   }
 }
 
 template <>
-void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1,
+                                   const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] * in2Ptr[i];
+  }
+}
+template <>
+void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
-    outPtr[i] = sqrt(inPtr[i]);
+    outPtr[i] = exp(inPtr[i]);
   }
 }
 
 template <>
-void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] * inPtr[i];
+    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
   }
 }
 
 template <>
-void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = tanh(inPtr[i]);
+    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
   }
 }
-
 template <>
-void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
   }
 }
-
 template <>
-void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = log(inPtr[i]);
   }
 }
-
 template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
-             Blob *out, Context *ctx) {
+void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-	float *bPtr = new float[ncol];
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-		float denom = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-			bPtr[c] = exp(inPtr[offset + c]);
-			denom += bPtr[c];
-    }
-		for (size_t c = 0; c < ncol; c++) {
-			size_t idx = offset + c;
-			outPtr[idx] = bPtr[c] / denom;
-		}
-  }
-	delete bPtr;
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
-             Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		outPtr[r] = 0.f;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[r] += inPtr[offset + c];
-		}
-	}
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t c = 0; c < ncol; c++) {
-		outPtr[c] = 0.f;
-	}
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-				outPtr[c] += inPtr[offset + c];
-		}
-	}
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());                              
-  const float *vPtr = static_cast<const float *>(v->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[offset + c] = APtr[offset + c] + vPtr[c];
-		}
-	}
-}
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());                              
-  const float *vPtr = static_cast<const float *>(v->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[offset + c] = APtr[offset + c] + vPtr[r];
-		}
-	}
-}
-
-template <>
-void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t i = 0; i < num; i++) {
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+  }
+}
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
     outPtr[i] = pow(inPtr[i], x);
   }
 }
@@ -220,252 +189,230 @@ void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob
 template <>
 void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
                            Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr= static_cast<const float *>(in1->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
     outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
   }
 }
-
 template <>
-void Clamp<float, lang::Cpp>(const size_t num, const float low, const float high, const Blob *in,
-														 Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t i = 0; i < num; i++) {
-		if (inPtr[i] > high) {
-			outPtr[i] = high;
-		}
-		else if (inPtr[i] < low) {
-			outPtr[i] = low;
-		}
-		else {
-			outPtr[i] = inPtr[i];			
-		}
-	}
+void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+  }
 }
-
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x, 
-													 Blob *out, Context *ctx) {
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+template <>
+void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] + x;
+    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
   }
 }
 
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] + in2Ptr[i];
+    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f;
   }
 }
 
 template <>
-void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] - in2Ptr[i];
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = sqrt(inPtr[i]);
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
+void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] * x;
+    outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
+void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] * in2Ptr[i];
+    outPtr[i] = in1Ptr[i] - in2Ptr[i];
   }
 }
 
+// sum all elements of input into out
+// TODO(wangwei) optimize using omp
 template <>
-void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                           Context *ctx) {
+  float s = 0.f;
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-		CHECK_NE(in2Ptr[i],0.f);
-    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+    s += inPtr[i];
   }
+  *out = s;
 }
 
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in, 
-         								  Blob *out, Context *ctx) {
-	float *outPtr= static_cast<float *>(out->mutable_data());
+void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-		CHECK_NE(inPtr[i],0.f);
-    outPtr[i] = x / inPtr[i];
+    outPtr[i] = tanh(inPtr[i]);
   }
 }
 
+// =========Matrix operations ================================================
+
 template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
-           Blob *out, Context *ctx) {
-	float *outPtr= static_cast<float *>(out->mutable_data());
-	const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-	for (size_t r = 0; r < m ; r++) {
-		size_t offset = r * n;
-		for (size_t c = 0; c < n; c++) {
-			outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
-		}
-	}
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
 }
 
 template <>
-void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
   }
 }
-
 template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                             const Blob *in2, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
   }
 }
-
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
   }
+  delete[] bPtr;  // bPtr was allocated with new[], so array delete is required
 }
 
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Blob *in, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
   }
 }
 
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
-	size_t maxPos = 0;
-	float maxVal = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		if (i == 0) {
-			maxVal = inPtr[i]; 
-		}
-		else if (inPtr[i] > maxVal) {
-			maxVal = inPtr[i];
-			maxPos = i;
-		}
-	}
-	*out = maxPos;
-}
-
-template <>
-void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
-	size_t minPos = 0;
-	float minVal = 0;
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		if (i == 0) {
-			minVal = inPtr[i]; 
-		}
-		else if (inPtr[i] > minVal) {
-			minVal = inPtr[i];
-			minPos = i;
-		}
-	}
-	*out = minPos;
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
 }
 
+// ===============Random operations==========================================
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
-	float sum = 0;
-	const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		sum += fabs(inPtr[i]);
-	}
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
+                                 Context *ctx) {
+  std::bernoulli_distribution distribution(p);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+  }
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
-          							 	  Blob *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
+                                const float std, Blob *out, Context *ctx) {
+  std::normal_distribution<float> distribution(mean, std);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {	
-		outPtr[i] += alpha * inPtr[i];
-	}
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
 }
-
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-	for (size_t i = 0; i < num; i++) {
-		outPtr[i] *= x;
-	}
+void Uniform<float, lang::Cpp>(const size_t num, const float low,
+                               const float high, Blob *out, Context *ctx) {
+  std::uniform_real_distribution<float> distribution(low, high);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
 }
 
-//template <>
-//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-//         									 float *out, Context *ctx) {
-//	float sum = 0;
-//	const float *in1Ptr = static_cast<const float *>(in1->data());
-//	const float *in2Ptr = static_cast<const float *>(in2->data());
-//	for (size_t i = 0; i < num; i++) {
-//		sum += in1Ptr[i] * in2Ptr[i];
-//	}
-//}
-
-template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, const float alpha,
-          const Blob *A, const Blob *v, const float beta, 
-					Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-	const float* APtr = static_cast<const float *>(A->data());
-	const float* vPtr = static_cast<const float *>(v->data());
-	for (size_t r = 0; r < m; r++) {
-		float sum = 0; 
-		for (size_t c = 0; c < n; c++) {
-			size_t idx = trans ? c * m + r : r * n + c;	
-			sum += APtr[idx] * vPtr[c];
-		}
-		outPtr[r] = alpha * sum + beta * outPtr[r];
-	}
-}
+// ====================Blas operations======================================
 
 template <>
 void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
@@ -491,37 +438,21 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
   }
 }
 
+#ifdef USE_CBLAS
 template <>
-void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, Context *ctx) {
-  std::bernoulli_distribution distribution(p);
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                            Blob *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
-  }
+  cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
 }
-
-template <>
-void Uniform<float, lang::Cpp>(const size_t num, const float low, const float high, Blob *out,
-                               Context *ctx) {
-  std::uniform_real_distribution<float> distribution(low, high);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-  }
-}
-
 template <>
-void Gaussian<float, lang::Cpp>(const size_t num, const float mean, const float std, Blob *out,
-                                Context *ctx) {
-  std::normal_distribution<float> distribution(mean, std);
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-  }
+  cblas_sscal(num, x, outPtr, 1);
 }
 
-
-#ifdef USE_CBLAS
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
                            float *out, Context *ctx) {
@@ -529,6 +460,21 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
   const float *in2Ptr = static_cast<const float *>(in2->data());
   *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
 }
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  if (!trans) {
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+                beta, outPtr, 1);
+  } else {
+    cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
+                outPtr, 1);
+  }
+}
 
 template <>
 void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
@@ -548,6 +494,98 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
               lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
+#else
+
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t maxPos = 0;
+  float maxVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      maxVal = inPtr[i];
+    } else if (inPtr[i] > maxVal) {
+      maxVal = inPtr[i];
+      maxPos = i;
+    }
+  }
+  *out = maxPos;
+}
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t minPos = 0;
+  float minVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      minVal = inPtr[i];
+    } else if (inPtr[i] < minVal) {
+      minVal = inPtr[i];
+      minPos = i;
+    }
+  }
+  *out = minPos;
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  float sum = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += fabs(inPtr[i]);
+  }
+  *out = sum;
+}
+
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] += alpha * inPtr[i];
+  }
+}
+
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] *= x;
+  }
+}
+
+template <>
+void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           float *out, Context *ctx) {
+  float sum = 0;
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += in1Ptr[i] * in2Ptr[i];
+  }
+  *out = sum;
+}
+
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < m; r++) {
+    float sum = 0;
+    for (size_t c = 0; c < n; c++) {
+      size_t idx = trans ? c * m + r : r * n + c;
+      sum += APtr[idx] * vPtr[c];
+    }
+    outPtr[r] = alpha * sum + beta * outPtr[r];
+  }
+}
+
 #endif  // USE_CBLAS
 }  // namespace singa
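For reference, the row-major indexing used by the non-CBLAS GEMV fallback above can be exercised in isolation. The snippet below is a minimal, self-contained sketch of the same math (plain C++, no SINGA headers; sizes and values are illustrative):

#include <cstddef>
#include <cstdio>

// y = alpha * op(A) * x + beta * y, with A stored row-major.
// When trans is true, the buffer holds an n x m matrix and op(A) = A^T,
// matching idx = trans ? c * m + r : r * n + c in the fallback above.
void gemv_rowmajor(bool trans, size_t m, size_t n, float alpha,
                   const float* A, const float* x, float beta, float* y) {
  for (size_t r = 0; r < m; r++) {
    float sum = 0.0f;
    for (size_t c = 0; c < n; c++) {
      size_t idx = trans ? c * m + r : r * n + c;
      sum += A[idx] * x[c];
    }
    y[r] = alpha * sum + beta * y[r];
  }
}

int main() {
  const float A[6] = {1, 2, 3, 4, 5, 6};  // 2 x 3, row-major
  const float x[3] = {1, 1, 1};
  float y[2] = {0, 0};
  gemv_rowmajor(false, 2, 3, 1.0f, A, x, 0.0f, y);
  std::printf("%.1f %.1f\n", y[0], y[1]);  // expect 6.0 15.0
  return 0;
}

With trans = true the same buffer is read as an n x m matrix, so out = alpha * A^T * v + beta * out is computed without materializing the transpose.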
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 4a2ba66..f9841a3 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -26,75 +26,100 @@
 #include "singa/core/common.h"
 
 namespace singa {
-
-// TODO(wangwei) Clean implementations following comments in tensor_math_cpp.h.
-// TODO(wangwei) optimize using stream
+// =================Elementwise operations===================================
 template <>
-void Add<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
-                            Blob *ret, Context *ctx) {
-  const float *a = static_cast<const float *>(lhs->data());
-  const float *b = static_cast<const float *>(rhs->data());
-  float *c = static_cast<float *>(ret->mutable_data());
-  cuda::add(count, a, b, c);
+void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
+                            Blob *out, Context *ctx) {
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cuda::add(num, in1Ptr, in2Ptr, outPtr);
 }
 
-// TODO(wangwei) optimize using stream
+// follows the consistency guide of the math API
 template <>
-void Sub<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
-                            Blob *ret, Context *ctx) {
-  const float *a = static_cast<const float *>(lhs->data());
-  const float *b = static_cast<const float *>(rhs->data());
-  float *c = static_cast<float *>(ret->mutable_data());
-  cuda::sub(count, a, b, c);
+void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::Div(num, x, inPtr, outPtr, ctx->stream);
 }
 
 template <>
-void EltwiseMult<float, lang::Cuda>(int count, const Blob *input, float x,
-                                    Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  cuda::mult(count, lptr, x, dptr);
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in,
+                                    const float x, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::mult(num, inPtr, x, outPtr);
 }
-// TODO(wangwei) optimize using stream
 template <>
-void Square<float, lang::Cuda>(int count, const Blob *input, Blob *ret,
-                               Context *ctx) {
-  const float *in = static_cast<const float *>(input->data());
-  float *out = static_cast<float *>(ret->mutable_data());
-  cuda::square(count, in, out);
+void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
 }
-
-// sum all elements of input into ret
-// TODO(wangwei) optimize using stream
 template <>
-void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret,
-                            Context *ctx) {
-  const float *in = static_cast<const float *>(input->data());
-  cuda::sum(count, in, ret);
+void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
 }
-
-// follow the consistency guide of math API
 template <>
-void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
-                            Blob *out, Context *ctx) {
+void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  cuda::Div(num, alpha, inPtr, outPtr, ctx->stream);
+  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
 }
-
 template <>
 void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   cuda::Set(num, x, outPtr, ctx->stream);
 }
+// TODO(wangwei) optimize using stream
+template <>
+void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out,
+                               Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::square(num, inPtr, outPtr);
+}
+// TODO(wangwei) optimize using stream
+template <>
+void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  cuda::sub(num, in1Ptr, in2Ptr, outPtr);
+}
+// sum all elements of in into out
+// TODO(wangwei) optimize using stream
+template <>
+void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::sum(num, inPtr, out);
+}
+
+// =========================Blas operations==================================
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
                              const size_t ncol, const Blob *M, const Blob *v,
                              Blob *out, Context *ctx) {
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   const float *MPtr = static_cast<const float *>(M->data());
   const float *vPtr = static_cast<const float *>(v->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -106,6 +131,22 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
                              vPtr, 1, outPtr, ncol));
   }
 }
+template <>
+void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
+                             const float alpha, const Blob *A, const Blob *v,
+                             const float beta, Blob *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  if (!trans)
+    CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
+                             1, &beta, outPtr, 1));
+  else
+    CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, APtr, m, vPtr,
+                             1, &beta, outPtr, 1));
+}
+
 // http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
 template <>
 void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
@@ -121,44 +162,11 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
   const float *APtr = static_cast<const float *>(A->data());
   const float *BPtr = static_cast<const float *>(B->data());
   float *CPtr = static_cast<float *>(C->mutable_data());
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
 }
 
-template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out, Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out,  Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out, Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out,  Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
-}
-
-
-
-
-
 }  // namespace singa
 
 #endif  // USE_CUDA
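The operand swap in the GEMM wrapper above exploits the identity (A*B)^T = B^T * A^T: a row-major buffer handed to cuBLAS is implicitly read as its transpose, so requesting B*A in column-major terms yields A*B laid out row-major. A minimal CPU-only sketch of that identity (plain C++; the matrices and sizes are illustrative):

#include <cstddef>
#include <cstdio>

// Plain column-major GEMM: C(m x n) = A(m x k) * B(k x n), leading dims = rows.
void gemm_colmajor(size_t m, size_t n, size_t k,
                   const float* A, const float* B, float* C) {
  for (size_t i = 0; i < m; i++)
    for (size_t j = 0; j < n; j++) {
      float s = 0.0f;
      for (size_t p = 0; p < k; p++) s += A[p * m + i] * B[j * k + p];
      C[j * m + i] = s;
    }
}

int main() {
  // Row-major A (2x3) and B (3x2); expected C = A*B = {22 28; 49 64}.
  const float A[6] = {1, 2, 3, 4, 5, 6};
  const float B[6] = {1, 2, 3, 4, 5, 6};
  float C[4];
  // Swap the operands: in column-major terms this computes B^T * A^T = (A*B)^T,
  // reinterpreting each row-major buffer as its column-major transpose, so the
  // result buffer C already holds A*B in row-major order.
  gemm_colmajor(2, 2, 3, B, A, C);
  std::printf("%.0f %.0f %.0f %.0f\n", C[0], C[1], C[2], C[3]);  // 22 28 49 64
  return 0;
}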

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 823445f..94ca283 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -117,12 +117,11 @@ TEST_F(TestTensorMath, MemberTanh) {
 }
 
 TEST_F(TestTensorMath, Sum) {
-	Tensor p1(Shape{1,2});
-	p1 = Sum(e, 0);
+	Tensor p1 = Sum(e, 0);
   const float *dptr1 = p1.data<const float *>();
 	EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
 	EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
-	
+
 	Tensor p2(Shape{3,1});
 	p2 = Sum(e, 1);
   const float *dptr2 = p2.data<const float *>();
@@ -143,9 +142,9 @@ TEST_F(TestTensorMath, SoftMax) {
 	EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
 	EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
 	EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
-	
+
 	Tensor p2(Shape{3,2});
-	p2 = SoftMax(e,1); 
+	p2 = SoftMax(e,1);
   const float *dptr2 = p2.data<const float *>();
 	EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
 	EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
@@ -237,12 +236,12 @@ TEST_F(TestTensorMath, MemberDiv) {
 
 TEST_F(TestTensorMath, MemberBernoulli) {
 	Tensor p1(Shape{10000});
-	Bernoulli(0.3,&p1);
+	Bernoulli(0.3f, &p1);
 	const float* dptr1 = p1.data<const float*>();
 	float sum = 0;
 	for(int i = 0; i < 10000; i++) sum += dptr1[i];
 	float mean = sum/10000;
-	EXPECT_NEAR(mean, 0.3, 1e-2);
+	EXPECT_NEAR(mean, 0.3f, 1e-2);
 
 	sum = 0;
 	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
@@ -267,7 +266,7 @@ TEST_F(TestTensorMath, MemberUniform) {
 
 TEST_F(TestTensorMath, MemberGaussian) {
 	Tensor p1(Shape{50000});
-	Gaussian(0.0,1.0,&p1);
+	Gaussian(0.0f,1.0f,&p1);
 	const float* dptr1 = p1.data<const float*>();
 	float sum = 0;
 	for(int i = 0; i < 50000; i++) sum += dptr1[i];



[22/50] [abbrv] incubator-singa git commit: SINGA-183 Add the base classes for optimizer, constraint and regularizer

Posted by zh...@apache.org.
SINGA-183 Add the base classes for optimizer, constraint and regularizer

Draft the base Optimizer, Constraint and Regularizer classes. The API for local all-reduce is also added (in comments).
Test SGD with and without momentum using cpp and cuda.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/2dac3808
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/2dac3808
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/2dac3808

Branch: refs/heads/master
Commit: 2dac380872402e72b4250981cd99c6c59d66184d
Parents: 7d149ec
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Tue May 24 22:09:24 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon May 30 20:48:32 2016 +0800

----------------------------------------------------------------------
 include/singa/model/optimizer.h         | 222 +++++++++++++++++++++++++++
 src/CMakeLists.txt                      |   1 +
 src/model/optimizer/local_all_reduce.cc |  25 +++
 src/model/optimizer/optimizer.cc        |  93 +++++++++++
 src/model/optimizer/sgd.cc              |  49 ++++++
 src/proto/model.proto                   |  35 ++++-
 test/singa/test_sgd.cc                  | 150 ++++++++++++++++++
 7 files changed, 574 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
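Before the diffs, a minimal usage sketch of the new SGD class: one update step without momentum, mirroring test_sgd.cc below (the parameter name "w" and all values are illustrative):

#include "singa/model/optimizer.h"

// One plain SGD step: value <- value - lr * grad, as in SGD::Apply below.
int main() {
  singa::SGD sgd;
  const float v[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  const float g[4] = {0.1f, 0.1f, 0.1f, 0.1f};
  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
  value.CopyDataFromHostPtr(v, 4);
  grad.CopyDataFromHostPtr(g, 4);
  sgd.Apply(0 /*step*/, 0.1f /*lr*/, "w", &grad, &value);
  // value now holds {0.09, 0.19, 0.29, 0.39}.
  return 0;
}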


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2dac3808/include/singa/model/optimizer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/optimizer.h b/include/singa/model/optimizer.h
new file mode 100644
index 0000000..7ca9f53
--- /dev/null
+++ b/include/singa/model/optimizer.h
@@ -0,0 +1,222 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_OPTIMIZER_H_
+#define SINGA_MODEL_OPTIMIZER_H_
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "singa/core/tensor.h"
+#include "singa/proto/model.pb.h"
+
+using std::string;
+using std::vector;
+using std::unordered_map;
+namespace singa {
+class Constraint;
+class Regularizer;
+/// The base class for gradient descent algorithms used to update the model
+/// parameters in order to optimize the objective (loss) function.
+/// It updates parameters based on the gradients of the loss w.r.t each
+/// parameter. Most sub-classes use first-order gradients.
+/// An overview of gradient descent algorithms,
+/// http://sebastianruder.com/optimizing-gradient-descent/
+class Optimizer {
+ public:
+  Optimizer() = default;
+
+  /// Setup the optimizer using configurations from a serialized string (for
+  /// binding languages).
+  void Setup(const string& str) {
+    OptimizerConf conf;
+    conf.ParseFromString(str);
+    this->Setup(conf);
+  }
+
+  /// Setup the meta fields of the optimizer
+  virtual void Setup(const OptimizerConf& conf) {}
+  /// Register the parameter, e.g., create Constraint and Regularizers.
+  /// If there is no constraint or regularizer, there is no need to register
+  /// the parameter.
+  virtual void Register(const string& name, const ParamSpec& specs);
+
+  /// Apply the updating algorithm.
+  /// No learning rate scaling or gradient constraint/regularization is
+  /// conducted here. It assumes those operations are done either by users or
+  /// by Apply(int, const string&, Tensor*, Tensor*).
+  /// All sub-classes should override this function.
+  virtual void Apply(int step, float lr, const string& name, Tensor* grad,
+                     Tensor* value) = 0;
+
+  /// Apply the updating algorithm.
+  /// It will apply regularization and constraint to the parameters if
+  /// configured during Register(). It will also scale the learning rate if
+  /// configured in the ParamSpec (see Register).
+  void Apply(int step, const string& name, Tensor* grad, Tensor* value);
+
+  /// The argument is a function that returns the learning rate given the
+  /// current step (i.e., the current running iteration).
+  void SetLearningRateGenerator(std::function<float(int)> func) {
+    learning_rate_generator_ = func;
+  }
+
+ protected:
+  std::function<float(int)> learning_rate_generator_;
+  std::unordered_map<std::string, float> learning_rate_multplier_;
+  std::unordered_map<std::string, Constraint*> constraints_;
+  std::unordered_map<std::string, Regularizer*> regularizers_;
+};
+
+/// Apply constraints for parameters (gradient).
+/// E.g., restrict the norm of parameter gradients to be within a threshold.
+/// \ref http://keras.io/constraints/
+/// TODO(wangwei) implement a sub-class for each type of constraint
+class Constraint {
+ public:
+  Constraint() = default;
+  explicit Constraint(const ConstraintConf& conf) { Setup(conf); }
+  Constraint(const string& type, float threshold)
+      : type_(type), threshold_(threshold) {}
+  void Setup(const ConstraintConf& conf);
+  void Setup(const string& conf_str) {
+    ConstraintConf conf;
+    conf.ParseFromString(conf_str);
+    Setup(conf);
+  }
+  /// Apply the constraint to a single parameter object, e.g., W or b;
+  /// e.g., clip each gradient if it is too large w.r.t. the threshold,
+  /// \ref
+  /// https://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/
+  void Apply(int step, Tensor* value, Tensor* grad);
+  /// Apply the constraint for multiple parameter objects together.
+  /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
+  void Apply(int step, const vector<Tensor*>& grads,
+             const vector<Tensor*>& values);
+ private:
+  /// Currently only supports the "L2" norm constraint, i.e., the norm should
+  /// be less than the configured threshold_; otherwise, the parameters are
+  /// clipped to bring the norm within that threshold.
+  /// TODO(wangwei) consider other constraint, e.g., hard clip and unitnorm.
+  string type_ = "Unknown";
+  float threshold_;
+};
+
+/// Apply regularization for parameters (gradient), e.g., L1 norm and L2 norm.
+/// TODO(wangwei) implement a sub-class for each type of regularizer
+class Regularizer {
+ public:
+  Regularizer() = default;
+  explicit Regularizer(const RegularizerConf& conf) { Setup(conf); }
+  Regularizer(const string& type, float coefficient)
+      : type_(type), coefficient_(coefficient) {}
+  void Setup(const RegularizerConf& conf);
+  void Setup(const string& conf_str) {
+    RegularizerConf conf;
+    conf.ParseFromString(conf_str);
+    Setup(conf);
+  }
+
+  /// Apply the regularizer to a single parameter object, e.g., W or b;
+  /// e.g., clip each gradient if it is too large w.r.t. the threshold,
+  /// \ref
+  /// https://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/
+  void Apply(int step, Tensor* value, Tensor* grad);
+  /// Apply the regularizer for multiple parameter objects together.
+  /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
+  void Apply(int step, const vector<Tensor*>& grads,
+             const vector<Tensor*>& values);
+ private:
+  /// Currently only supports the "L2" regularizer. type_ is case insensitive.
+  /// TODO(wangwei) add more regularizer, e.g., L1.
+  string type_ = "NotSet";
+  float coefficient_;
+};
+
+// =============Vanilla SGD with Momentum=====================================
+class SGD : public Optimizer {
+ public:
+  void Setup(const OptimizerConf& conf);
+  /// Apply the updating algorithm.
+  void Apply(int step, float lr, const string& name, Tensor* grad,
+             Tensor* value) override;
+
+  /// The argument function returns the momentum value given the current running
+  /// step (i.e., iterations/mini-batches).
+  void SetMomentumGenerator(std::function<float(int)> func) {
+    momentum_generator_ = func;
+  }
+
+ private:
+  std::unordered_map<string, Tensor> history_gradient_;
+  std::function<float(int)> momentum_generator_;
+};
+
+// ============LocalAllReduce for single node multiple workers ==============
+/// Updater for training models on a single node with multiple devices (workers)
+/// All model parameters are partitioned such that each parameter is updated on
+/// one device. Specifically, each worker has a model replica. All workers share
+/// the same LocalAllReduce instance. Parameters are registered at first, and
+/// then after every iteration, the gradients are aggregated by one worker (or
+/// device) for parameter updating.
+/*
+class LocalAllReduce : public Optimizer{
+ public:
+  LocalAllReduce(Optimizer* opt);
+  void Setup(const string& str) {
+    AllReduce conf;
+    conf.ParseFromString(str);
+    this->Setup(conf);
+  }
+  void Setup(const AllReduce& conf) {}
+
+  /// Register all model parameters.
+  /// Instructions include:
+  /// 1. Copy parameters from the master worker (who initialized the parameters)
+  /// to others.
+  /// 2. Partition parameters onto worker devices. For example, model parameter
+  /// set is {A, B, C}, nb_workers = 3, then worker 0/1/2 would be in charge of
+  /// updating A/B/C respectively. A gradient Tensor for A/B/C would be created
+  /// on device 0/1/2, denoted as GA/GB/GC. Worker 0/1/2 would call the
+  /// internal opt to register the specs for A/B/C.
+  void Register(const vector<string>& names,
+                const vector<Tensor>& values,
+                const vector<ParamSpecs>& specs) override;
+
+  /// Aggregate parameter gradients and call internal opt to do the update.
+  /// Continue with the example for Register(), worker 0 would copy B's gradient
+  /// to device 1 and add it with GB.  A callback func is added to
+  /// 1. check UpdateNow() and call opt to do the real update.
+  /// 2. broadcast the new parameters back to worker 0 and 2.
+  void Update(int step, float lr, const string& name, const Tensor& grad,
+              Tensor* param) override;
+
+  /// Decide when to call the internal Optimizer for real update.
+  /// One simple implementation would return true until all workers has
+  /// aggregated their gradients. We can also add a user configuration field
+  /// to control this, e.g., if do it when 80% workers has aggregated.
+  bool UpdateNow();
+
+ private:
+  int nb_workers_;
+  vector<Tensor> aggregated_gradients_;
+};
+*/
+}  // namespace singa
+#endif  // SINGA_MODEL_OPTIMIZER_H_
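A sketch of the intended flow through the base-class Apply(step, name, grad, value) overload above: register a ParamSpec carrying an L2 regularizer, set a learning-rate generator, then update. The accessor names (mutable_regularizer, set_coefficient, ...) follow standard protobuf codegen for the fields added in src/proto/model.proto of this commit; all values are illustrative:

#include "singa/model/optimizer.h"

int main() {
  singa::SGD sgd;
  sgd.SetLearningRateGenerator([](int step) { return 0.1f; });

  singa::ParamSpec spec;
  spec.mutable_regularizer()->set_type("L2");
  spec.mutable_regularizer()->set_coefficient(0.01f);
  sgd.Register("w", spec);  // creates a Regularizer bound to "w"

  const float v[2] = {1.0f, 1.0f}, g[2] = {0.5f, 0.5f};
  singa::Tensor value(singa::Shape{2}), grad(singa::Shape{2});
  value.CopyDataFromHostPtr(v, 2);
  grad.CopyDataFromHostPtr(g, 2);

  // Applies the regularizer, scales the learning rate, then calls SGD::Apply.
  sgd.Apply(0, "w", &grad, &value);
  return 0;
}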

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2dac3808/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index df8b22b..28066de 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -35,6 +35,7 @@ LIST(APPEND SINGA_LINKER_LIBS singa_core)
 #FILE(GLOB_RECURSE model_source ${CMAKE_CURRENT_SOURCE_DIR}/model/ "*.cc")
 AUX_SOURCE_DIRECTORY(model model_source)
 AUX_SOURCE_DIRECTORY(model/layer model_source)
+AUX_SOURCE_DIRECTORY(model/optimizer model_source)
 #MESSAGE(STATUS "MODEL ${model_source}")
 ADD_LIBRARY(singa_model SHARED ${model_source})
 TARGET_LINK_LIBRARIES(singa_model ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2dac3808/src/model/optimizer/local_all_reduce.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/local_all_reduce.cc b/src/model/optimizer/local_all_reduce.cc
new file mode 100644
index 0000000..ea03e39
--- /dev/null
+++ b/src/model/optimizer/local_all_reduce.cc
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_LOCAL_ALL_REDUCE_H_
+#define SRC_MODEL_OPTIMIZER_LOCAL_ALL_REDUCE_H_
+#include "singa/model/optimizer.h"
+
+namespace singa {
+}
+
+#endif  // SRC_MODEL_OPTIMIZER_LOCAL_ALL_REDUCE_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2dac3808/src/model/optimizer/optimizer.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/optimizer.cc b/src/model/optimizer/optimizer.cc
new file mode 100644
index 0000000..92b6b3d
--- /dev/null
+++ b/src/model/optimizer/optimizer.cc
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/optimizer.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+
+void Optimizer::Register(const string& name, const ParamSpec& specs) {
+  if (specs.has_constraint()) {
+    CHECK(constraints_.find(name) == constraints_.end())
+        << "Parameter with name = " << name << " has already registered";
+    constraints_[name] = new Constraint(specs.constraint());
+  }
+  if (specs.has_regularizer()) {
+    CHECK(regularizers_.find(name) == regularizers_.end())
+        << "Parameter with name = " << name << " has already registered";
+    regularizers_[name] = new Regularizer(specs.regularizer());
+  }
+  if (specs.has_lr_mult()) {
+    CHECK(learning_rate_multplier_.find(name) == learning_rate_multplier_.end())
+        << "Parameter with name = " << name << " has already registered";
+    learning_rate_multplier_[name] = specs.lr_mult();
+  }
+  /*
+  if (specs.has_lr_generator()) {
+    LOG(FATAL) << "Not implemented yet";
+  }
+  */
+}
+
+void Optimizer::Apply(int step, const string& name, Tensor* grad,
+                      Tensor* param) {
+  // TODO(wangwei) need to consider the order of constraint and regularizer
+  if (regularizers_.find(name) != regularizers_.end())
+    regularizers_.at(name)->Apply(step, param, grad);
+  if (constraints_.find(name) != constraints_.end())
+    constraints_.at(name)->Apply(step, param, grad);
+  float lr = learning_rate_generator_(step);
+  if (learning_rate_multplier_.find(name) != learning_rate_multplier_.end())
+    lr *= learning_rate_multplier_.at(name);
+  Apply(step, lr, name, grad, param);
+}
+
+void Regularizer::Setup(const RegularizerConf& conf) {
+  type_ = conf.type();
+  coefficient_ = conf.coefficient();
+}
+
+void Regularizer::Apply(int step, Tensor* value, Tensor* grad) {
+  if (type_ == "L2" || type_ == "l2") {
+    (*grad) += (*value) * coefficient_;
+  } else {
+    CHECK(type_ == "NotSet") << "Unknown regularizer type = " << type_;
+  }
+}
+
+void Regularizer::Apply(int step, const vector<Tensor*>& values,
+                        const vector<Tensor*>& grads) {
+  LOG(FATAL) << "Not implemented yet";
+}
+
+void Constraint::Setup(const ConstraintConf& conf) {
+  type_ = conf.type();
+  threshold_ = conf.threshold();
+}
+
+void Constraint::Apply(int step, Tensor* value, Tensor* grad) {
+  // TODO(wangwei) implement L2 and hard constraint
+  CHECK(type_ == "NotSet") << "Unknown regularizer type = " << type_;
+}
+
+void Constraint::Apply(int step, const vector<Tensor*>& values,
+                       const vector<Tensor*>& grads) {
+  LOG(FATAL) << "Not implemented yet";
+}
+
+}  // namespace singa
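The L2 branch above implements standard weight decay: for an objective L(w) + (lambda/2)*||w||^2, the gradient handed on to the update becomes g + lambda*w. A scalar sanity check (plain C++; the numbers are illustrative):

#include <cstdio>

int main() {
  float w = 1.0f, g = 0.5f;
  const float lambda = 0.01f, lr = 0.1f;
  g += lambda * w;           // Regularizer::Apply (L2): grad += coefficient * value
  w -= lr * g;               // SGD::Apply without momentum
  std::printf("%.4f\n", w);  // 1 - 0.1 * (0.5 + 0.01) = 0.9490
  return 0;
}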

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2dac3808/src/model/optimizer/sgd.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/sgd.cc b/src/model/optimizer/sgd.cc
new file mode 100644
index 0000000..49c17c9
--- /dev/null
+++ b/src/model/optimizer/sgd.cc
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_SGD_H_
+#define SRC_MODEL_OPTIMIZER_SGD_H_
+#include "singa/model/optimizer.h"
+#include <functional>
+namespace singa {
+
+void SGD::Setup(const OptimizerConf& conf) {
+  if (conf.has_momentum()) {
+    float m = conf.momentum();
+    SetMomentumGenerator([m](int step) { return m; });
+  }
+}
+
+void SGD::Apply(int step, float lr, const string& name, Tensor* grad,
+                Tensor* value) {
+  (*grad) *= lr;
+  if (momentum_generator_) {
+    float mom = momentum_generator_(step);
+    if (mom != 0) {
+      if (history_gradient_.find(name) == history_gradient_.end())
+        history_gradient_[name].ResetLike(*value);
+      Tensor& history = history_gradient_[name];
+      history *= mom;
+      history += *grad;
+      (*value) -= history;
+      return;
+    }
+  }
+  (*value) -= *grad;
+}
+}  // namespace singa
+#endif  // SRC_MODEL_OPTIMIZER_SGD_H_
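In equations, SGD::Apply above realizes the classical momentum update. With momentum m_t = momentum_generator_(t), learning rate \eta, gradient g_t, and history h_t (h_0 = 0):

    h_t = m_t \, h_{t-1} + \eta \, g_t, \qquad w_t = w_{t-1} - h_t

With m_t = 0 this reduces to plain SGD, w_t = w_{t-1} - \eta g_t, matching the in-place sequence grad *= lr; history = mom * history + grad; value -= history.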

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2dac3808/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 66296d5..1b18703 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -52,7 +52,7 @@ message BlobProto {
 }
 
 message FillerConf {
-  // The filler type.
+  // The filler type, case insensitive
   optional string type = 1 [default = 'constant'];
   optional float value = 2 [default = 0]; // the value in constant filler
   optional float min = 3 [default = 0]; // the min value in uniform filler
@@ -72,6 +72,37 @@ message FillerConf {
   optional VarianceNorm variance_norm = 8 [default = FAN_IN];
 }
 
+/// SINGA message
+message OptimizerConf {
+  // case insensitive
+  optional string type = 1 [default = "sgd"];
+
+  // used by RMSprop and Adadelta
+  optional float rho = 2 [default = 0.001];
+
+  // used by Adam and AdamMax
+  optional float beta_1 = 3 [default = 0.9];
+  optional float beta_2 = 4 [default = 0.999];
+
+  // used by vanilla sgd and nesterov
+  optional float momentum = 5 [default = 0.9];
+}
+
+message ConstraintConf {
+  // case-insensitive constraint type to limit the parameter value/gradient scale
+  optional string type = 1 [default = "l2"];
+  // e.g., the threshold for limiting the parameter scale.
+  optional float threshold = 2;
+}
+
+/// SINGA message
+message RegularizerConf {
+  // case-insensitive regularizer type, e.g., L2
+  optional string type = 1 [default = "l2"];
+  // e.g., the weight decay for L2 regularizer
+  optional float coefficient = 2;
+}
+
 // Specifies training parameters (multipliers on global learning constants,
 // and the name and other settings used for weight sharing).
 message ParamSpec {
@@ -101,6 +132,8 @@ message ParamSpec {
   // SINGA uses this field internally. Users just configure the fillers in
   // Layer specific conf message as caffe (style).
   optional FillerConf filler = 20;
+  optional ConstraintConf constraint = 21;
+  optional RegularizerConf regularizer = 22;
 }
 
 enum Phase {
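The new messages can be filled in from C++ through the protobuf-generated setters; a minimal sketch wiring OptimizerConf into SGD::Setup (accessor names follow protobuf conventions for the fields above; the momentum value is illustrative):

#include "singa/model/optimizer.h"
#include "singa/proto/model.pb.h"

int main() {
  singa::OptimizerConf conf;
  conf.set_type("sgd");
  conf.set_momentum(0.8f);  // consumed by SGD::Setup via has_momentum()

  singa::SGD sgd;
  sgd.Setup(conf);          // installs a constant momentum generator
  return 0;
}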

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2dac3808/test/singa/test_sgd.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
new file mode 100644
index 0000000..a660556
--- /dev/null
+++ b/test/singa/test_sgd.cc
@@ -0,0 +1,150 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/model/optimizer.h"
+#include "singa_config.h"
+
+TEST(SGD, ApplyWithoutMomentum) {
+  singa::SGD sgd;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.1, 0.1, 0.1, 0.1};
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  float lr = 0.1f;
+  sgd.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<const float*>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv1[i], v[i] - g[i] * lr);
+  }
+
+
+  lr /= 2;
+  grad.CopyDataFromHostPtr(g, 4);
+  sgd.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv2[i], newv1[i] - g[i] * lr);
+  }
+}
+
+
+TEST(SGD, ApplyWithMomentum) {
+  singa::SGD sgd;
+  float lr = 0.1f;
+  auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+  sgd.SetMomentumGenerator(func);
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  sgd.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<const float*>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv1[i], v[i] - g[i] * lr);
+  }
+
+  grad.CopyDataFromHostPtr(g, 4);
+  sgd.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv2[i], newv1[i] - (g[i] * lr + g[i] * lr * func(1)));
+  }
+}
+
+#ifdef USE_CUDA
+TEST(SGD, ApplyWithoutMomentumCuda) {
+  singa::SGD sgd;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.1, 0.1, 0.1, 0.1};
+
+  singa::CudaGPU dev;
+  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  float lr = 0.1f;
+  sgd.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<const float*>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv1[i], v[i] - g[i] * lr);
+  }
+
+
+  lr /= 2;
+  grad.CopyDataFromHostPtr(g, 4);
+  sgd.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv2[i], newv1[i] - g[i] * lr);
+  }
+}
+
+
+TEST(SGD, ApplyWithMomentumCuda) {
+  singa::SGD sgd;
+  float lr = 0.1f;
+  auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+  sgd.SetMomentumGenerator(func);
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::CudaGPU dev;
+  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  sgd.Apply(0, lr, "xx", &grad, &value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<const float*>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv1[i], v[i] - g[i] * lr);
+  }
+
+  grad.CopyDataFromHostPtr(g, 4);
+  sgd.Apply(1, lr, "xx", &grad, &value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<const float*>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv2[i], newv1[i] - (g[i] * lr + g[i] * lr * func(1)));
+  }
+}
+#endif