Posted to commits@singa.apache.org by ka...@apache.org on 2016/06/13 09:19:49 UTC

[1/3] incubator-singa git commit: SINGA-184 Add Cross Entropy loss computation

Repository: incubator-singa
Updated Branches:
  refs/heads/dev 26df5ac03 -> 21e4b2d79


SINGA-184 Add Cross Entropy loss computation

Implement Cross Entropy loss.
Passes cpplint.py; the test code compiles.
TODO: verify the test.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/efd7b627
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/efd7b627
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/efd7b627

Branch: refs/heads/dev
Commit: efd7b627bacb4acd6a3322468350f2b5399f725b
Parents: 3e2507b
Author: kaiping <ka...@comp.nus.edu.sg>
Authored: Fri May 27 12:09:30 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Tue May 31 22:14:09 2016 +0800

----------------------------------------------------------------------
 src/model/loss/cross_entropy.h   | 105 ++++++++++++++++++++++++++++++++++
 test/singa/test_cross_entropy.cc |  66 +++++++++++++++++++++
 2 files changed, 171 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/efd7b627/src/model/loss/cross_entropy.h
----------------------------------------------------------------------
diff --git a/src/model/loss/cross_entropy.h b/src/model/loss/cross_entropy.h
new file mode 100644
index 0000000..815b795
--- /dev/null
+++ b/src/model/loss/cross_entropy.h
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LOSS_CROSS_ENTROPY_H_
+#define SRC_MODEL_LOSS_CROSS_ENTROPY_H_
+#include <stack>
+#include "singa/model/loss.h"
+
+namespace singa {
+
+/// CrossEntropy computes the cross entropy loss and its gradient w.r.t. the prediction.
+class CrossEntropy : public Loss<Tensor> {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is sum {-log(prob_of_truth)}
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is: if the entry x corresponds to ground truth,
+  /// then softmax(x) - 1; else, softmax(x)
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., softmax(prediction), target
+  std::stack<Tensor> buf_;
+};
+
+Tensor CrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward twice without calling Backward."
+                      << " The calling pattern is [Forward|Evaluate] Backward";
+
+  size_t batchsize = 1;
+  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
+  size_t dim = prediction.Size() / batchsize;
+  // a temporary Softmax layer for forward computation
+//  LayerConf conf; // TODO(kaiping): this is currently commented
+//  Softmax softmax_tmp;
+//  softmax_tmp.Setup(conf);
+//  Tensor softmax = softmax_tmp.Forward(0, prediction);
+
+  Tensor softmax(Shape{batchsize, dim});  // TODO(kaiping): Delete
+//  softmax.SetValue<float>(0.5f); // TODO(kaiping): Delete
+
+  softmax.Reshape(Shape{batchsize, dim});
+  // buffer intermediate data
+  buf_.push(softmax);
+  buf_.push(target);
+
+  // Compute loss for each sample
+  Tensor loss(Shape{batchsize, 1});
+  float * pre_ptr = reinterpret_cast<float*>(softmax.blob()->mutable_data());
+  float * truth_ptr = reinterpret_cast<float*>(target.blob()->mutable_data());
+  float * loss_ptr = reinterpret_cast<float*>(loss.blob()->mutable_data());
+  for (size_t i = 0; i < batchsize; i++) {
+    int ilabel = static_cast<int>(truth_ptr[i]);
+    CHECK_GE(ilabel, 0);
+    float prob_of_truth = pre_ptr[ilabel];
+    loss_ptr[i] = -log(prob_of_truth);
+    pre_ptr += dim;  // change to the next sample
+  }
+  return loss;
+}
+
+Tensor CrossEntropy::Backward() {
+  const Tensor& target = buf_.top();
+  buf_.pop();
+  Tensor softmax = buf_.top();
+  buf_.pop();
+
+  size_t batchsize = 1;
+  if (softmax.nDim() > 1)
+    batchsize = softmax.shape().at(0);
+  size_t dim = softmax.Size() / batchsize;
+  float * truth_ptr = reinterpret_cast<float*>(target.blob()->mutable_data());
+  float * pre_ptr = reinterpret_cast<float*>(softmax.blob()->mutable_data());
+  for (size_t i = 0; i < batchsize; i++) {
+    int ilabel = static_cast<int>(truth_ptr[i]);
+    // CHECK_GE(ilabel, 0);
+    pre_ptr[ilabel] -= 1.0;
+    pre_ptr += dim;  // change to the next sample
+  }
+  return softmax;
+}
+}  // namespace singa
+
+#endif  // SRC_MODEL_LOSS_CROSS_ENTROPY_H_
+
+
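
The class above buffers softmax(prediction) together with the target labels in Forward and turns the buffered softmax into the gradient in Backward. As a quick standalone illustration (not SINGA API; it assumes the prediction has already been passed through softmax, the step that is still stubbed out with TODOs above), the same arithmetic on raw arrays looks roughly like this:

#include <cmath>
#include <cstdio>
#include <vector>

// loss[i] = -log(prob[i][label[i]]), where prob is softmax(prediction)
std::vector<float> CrossEntropyForward(const std::vector<float>& prob,
                                       const std::vector<int>& label,
                                       size_t batchsize, size_t dim) {
  std::vector<float> loss(batchsize);
  for (size_t i = 0; i < batchsize; i++)
    loss[i] = -std::log(prob[i * dim + label[i]]);
  return loss;
}

// grad = softmax(prediction) - onehot(label), computed in place on prob
void CrossEntropyBackward(std::vector<float>* prob,
                          const std::vector<int>& label,
                          size_t batchsize, size_t dim) {
  for (size_t i = 0; i < batchsize; i++)
    (*prob)[i * dim + label[i]] -= 1.0f;
}

int main() {
  // two samples, four classes, uniform probability 0.25 (as in the unit test)
  std::vector<float> prob(8, 0.25f);
  std::vector<int> label = {0, 2};
  for (float l : CrossEntropyForward(prob, label, 2, 4))
    std::printf("loss %f\n", l);          // -log(0.25) ~= 1.3863
  CrossEntropyBackward(&prob, label, 2, 4);
  std::printf("grad[0] %f\n", prob[0]);   // 0.25 - 1 = -0.75
}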

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/efd7b627/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
new file mode 100644
index 0000000..9bb2321
--- /dev/null
+++ b/test/singa/test_cross_entropy.cc
@@ -0,0 +1,66 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+#include "singa/core/device.h"
+#include "../src/model/loss/cross_entropy.h"
+
+using singa::Tensor;
+class TestCrossEntropy : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    p.Reshape(singa::Shape{2, 4});
+    t.Reshape(singa::Shape{2, 1});
+    p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
+    t.CopyDataFromHostPtr(tdat, sizeof(tdat) / sizeof(float));
+  }
+  const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+  const float tdat[2] = {0.0, 2.0};
+
+  singa::Tensor p, t;
+};
+
+TEST_F(TestCrossEntropy, CppForward) {
+  singa::CrossEntropy cross_entropy;
+  const Tensor& loss = cross_entropy.Forward(p, t);
+  auto ldat = loss.data<const float*>();
+
+  const float result_test = -log(0.25);
+  EXPECT_FLOAT_EQ(ldat[0], result_test);
+  EXPECT_FLOAT_EQ(ldat[1], result_test);
+}
+
+TEST_F(TestCrossEntropy, CppBackward) {
+  singa::CrossEntropy cross_entropy;
+  cross_entropy.Forward(p, t);
+  const Tensor& grad = cross_entropy.Backward();
+
+  auto gdat = grad.data<const float*>();
+  EXPECT_FLOAT_EQ(gdat[0], -0.75);
+  EXPECT_FLOAT_EQ(gdat[1], 0.25);
+  EXPECT_FLOAT_EQ(gdat[2], 0.25);
+  EXPECT_FLOAT_EQ(gdat[3], 0.25);
+  EXPECT_FLOAT_EQ(gdat[4], 0.25);
+  EXPECT_FLOAT_EQ(gdat[5], 0.25);
+  EXPECT_FLOAT_EQ(gdat[6], -0.75);
+  EXPECT_FLOAT_EQ(gdat[7], 0.25);
+}
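
A quick check of the expected constants above: each row of pdat has four equal entries, so softmax over a row gives 1/4 = 0.25 per class, and the per-sample loss is -log(0.25), roughly 1.3863 (the result_test value). The gradient is softmax minus the one-hot target: 0.25 - 1 = -0.75 at the true class (flat positions 0 and 4 + 2 = 6 for the two samples) and 0.25 everywhere else, which matches the EXPECT_FLOAT_EQ values. Note these expectations rely on Forward really computing softmax(prediction); in this commit that step is still a TODO placeholder in cross_entropy.h.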


[3/3] incubator-singa git commit: SINGA-184 Add Cross Entropy loss computation

Posted by ka...@apache.org.
SINGA-184 Add Cross Entropy loss computation

Merge with asf dev.
Fix the bug in the CUDA cross-entropy kernels caused by the target type
(it should use int instead of float).
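
For context on why the target type matters: the CUDA kernels in this merge index the probability matrix with the label value itself, p[sample * dim + t[sample]], so the buffer handed to the kernel as t must genuinely hold ints. A minimal sketch (not SINGA code, and only one plausible way the mismatch shows up) of what reinterpreting float label bits as int would yield:

#include <cstdio>
#include <cstring>

int main() {
  float label = 2.0f;          // a class label stored as float
  int as_int;
  std::memcpy(&as_int, &label, sizeof(int));   // what a kernel reading int* would see
  std::printf("%d\n", as_int); // prints 1073741824, not 2
  return 0;
}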


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/21e4b2d7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/21e4b2d7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/21e4b2d7

Branch: refs/heads/dev
Commit: 21e4b2d79a4bd3acfb8b487cf96c197da464ae70
Parents: ec17aca 26df5ac
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Jun 13 13:04:23 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 13:04:23 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                          |   4 +-
 cmake/Dependencies.cmake                |   7 +-
 cmake/ProtoBuf.cmake                    | 116 ----
 include/singa/core/common.h             |   2 +-
 include/singa/core/tensor.h             | 400 +++++++-------
 src/CMakeLists.txt                      |  27 +-
 src/core/tensor/math_kernel.cu          | 707 +++++++++++++------------
 src/core/tensor/math_kernel.h           |  93 ++--
 src/core/tensor/tensor.cc               | 761 ++++++++++++++-------------
 src/core/tensor/tensor_math.h           | 404 +++++++-------
 src/core/tensor/tensor_math_cpp.h       | 610 +++++++++++++++------
 src/core/tensor/tensor_math_cuda.h      | 423 +++++++++++----
 src/model/layer/activation.cc           |  10 +-
 src/model/layer/batchnorm.cc            |  70 +++
 src/model/layer/batchnorm.h             |  84 +++
 src/model/layer/cudnn_activation.cc     |  13 +-
 src/model/layer/cudnn_batchnorm.cc      | 214 ++++++++
 src/model/layer/cudnn_batchnorm.h       |  60 +++
 src/model/layer/cudnn_convolution.cc    | 183 ++++---
 src/model/layer/cudnn_lrn.cc            | 118 +++++
 src/model/layer/cudnn_lrn.h             |  56 ++
 src/model/layer/cudnn_pooling.cc        |   7 +-
 src/model/layer/cudnn_softmax.cc        |   4 +-
 src/model/layer/dense.cc                |  86 +++
 src/model/layer/dense.h                 |  70 +++
 src/model/layer/flatten.cc              |  55 ++
 src/model/layer/flatten.h               |  53 ++
 src/model/layer/lrn.cc                  |  59 +++
 src/model/layer/lrn.h                   |  70 +++
 src/model/layer/prelu.cc                | 139 +++++
 src/model/layer/prelu.h                 |  59 +++
 src/model/layer/softmax.cc              |  10 +-
 src/model/loss/softmax_cross_entropy.cc |   8 +-
 src/proto/model.proto                   |  39 +-
 test/CMakeLists.txt                     |   1 +
 test/singa/test_activation.cc           |   8 +-
 test/singa/test_cross_entropy.cc        |   4 +-
 test/singa/test_cudnn_activation.cc     |   6 +-
 test/singa/test_cudnn_batchnorm.cc      | 257 +++++++++
 test/singa/test_cudnn_convolution.cc    | 181 +++++++
 test/singa/test_cudnn_lrn.cc            | 205 ++++++++
 test/singa/test_cudnn_softmax.cc        |   6 +-
 test/singa/test_dense.cc                | 249 +++++++++
 test/singa/test_flatten.cc              | 144 +++++
 test/singa/test_mse.cc                  |   1 -
 test/singa/test_prelu.cc                | 137 +++++
 test/singa/test_sgd.cc                  |   2 +-
 test/singa/test_softmax.cc              |  12 +-
 test/singa/test_tensor_math.cc          | 303 ++++++++++-
 49 files changed, 4873 insertions(+), 1664 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/CMakeLists.txt
----------------------------------------------------------------------
diff --cc CMakeLists.txt
index fbe3adc,fbe3adc..a9d9b17
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -10,7 -10,7 +10,9 @@@ LIST(APPEND CMAKE_MODULE_PATH ${PROJECT
  IF(UNIX OR APPLE)
    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
  ENDIF()
--
++IF(CMAKE_BUILD_TYPE=Debug)
++  SET(NVCC_FLAG "${NVCC_FLAG} -g -G ")
++ENDIF()
  #message(STATUS "${CMAKE_CXX_FLAGS}")
  SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
  #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --cc include/singa/core/tensor.h
index 865e1e4,cd750c5..8cfa705
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@@ -212,199 -213,171 +213,180 @@@ Tensor Reshape(const Tensor &in, Shape 
  
  /// Copy 'num' elements of src to dst.
  /// The first 'src_offset' ('dst_offset') elements will be skipped.
- void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                     size_t src_offset = 0, size_t dst_offset = 0);
- 
- // ==================Simple Linear Algebra Operations=========================
- Tensor Abs(const Tensor &t);
- Tensor Exp(const Tensor &t);
- Tensor Log(const Tensor &t);
- Tensor ReLU(const Tensor &t);
- Tensor Sigmoid(const Tensor &t);
- Tensor Sign(const Tensor &t);
- Tensor Sqrt(const Tensor &t);
- Tensor Square(const Tensor &t);
- Tensor Tanh(const Tensor &t);
+ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                     const size_t src_offset = 0, const size_t dst_offset = 0);
+ 
+ // =============Element-wise operations====================================
+ Tensor Abs(const Tensor &in);
+ Tensor Exp(const Tensor &in);
+ Tensor Log(const Tensor &in);
+ Tensor ReLU(const Tensor &in);
+ Tensor Sigmoid(const Tensor &in);
+ Tensor Sign(const Tensor &in);
+ Tensor Sqrt(const Tensor &in);
+ Tensor Square(const Tensor &in);
+ Tensor Tanh(const Tensor &in);
+ 
+ /// Element-wise operation, out[i]=in[i]^x
+ template <typename SType>
+ Tensor Pow(const Tensor &in, const SType x);
+ /// Element-wise operation, out[i]=in[i]^x
+ template <typename SType>
+ void Pow(const Tensor &in, const SType x, Tensor *out);
+ /// Element-wise operation, out[i]=base[i]^exp[i]
+ Tensor Pow(const Tensor &base, const Tensor &exp);
+ /// Element-wise operation, out[i]=base[i]^exp[i]
+ void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
  
+ /// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f
  template <typename SType>
- SType Sum(const Tensor &t);
- /// Sum elements in the Tensor, currently only support vector and matrix.
- /// if 'axis' is 0, sum all rows into a single row
- /// if 'axis' is 1, sum all columns into a single column
- /// TODO(wangwei) support arbitrary Tensor like numpy.sum
- Tensor Sum(const Tensor &t, int axis);
+ Tensor operator<(const Tensor &in, const SType x);
+ template <typename SType>
+ void LT(const Tensor &in, const SType x, Tensor *out);
  
- /// Average elements in the Tensor, currently only support vector and matrix.
- /// if 'axis' is 0, average all rows into a single row
- /// if 'axis' is 1, average all columns into a single column
- /// TODO(wangwei) support arbitrary Tensor like numpy.average
- Tensor Average(const Tensor &t, int axis);
- /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
- Tensor SoftMax(const Tensor &in);
- /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
- void SoftMax(const Tensor &in, Tensor *out);
+ /// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator<=(const Tensor &in, const SType x);
+ template <typename SType>
+ void LE(const Tensor &in, const SType x, Tensor *out);
 -
+ /// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator>(const Tensor &in, const SType x);
+ template <typename SType>
+ void GT(const Tensor &in, const SType x, Tensor *out);
  
- /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
- /// and shape_[axis+1]*...*shape_[nDim()] columns.
- /// and do softmax along each row.
- // Tensor Softmax(const Tensor& t, int axis = -1);
- // void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
- 
- /// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator<(const Tensor &t, const DType x);
- template <typename DType>
- void LT(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator<=(const Tensor &t, const DType x);
- template <typename DType>
- void LE(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator>(const Tensor &t, const DType x);
- template <typename DType>
- void GT(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator>=(const Tensor &t, const DType x);
- template <typename DType>
- void GE(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise opeartion, ret[i]=t[i]^x
- template <typename DType>
- Tensor Pow(const Tensor &t, DType x);
- /// Element-wise opeartion, ret[i]=t[i]^x
- template <typename DType>
- void Pow(const Tensor &t, DType x, Tensor *ret);
- /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
- Tensor Pow(const Tensor &base, Tensor exp);
- /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
- void Pow(const Tensor &base, const Tensor &exp, Tensor *ret);
+ /// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator>=(const Tensor &in, const SType x);
+ template <typename SType>
+ void GE(const Tensor &in, const SType x, Tensor *out);
  
  Tensor operator+(const Tensor &lhs, const Tensor &rhs);
- void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator-(const Tensor &lhs, const Tensor &rhs);
- void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator*(const Tensor &lhs, const Tensor &rhs);
- void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator/(const Tensor &lhs, const Tensor &rhs);
- void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- 
- template <typename DType>
- Tensor operator+(const Tensor &t, DType x);
- template <typename DType>
- void Add(const Tensor &t, DType x, Tensor *ret);
- 
- template <typename DType>
- Tensor operator-(const Tensor &t, DType x);
- template <typename DType>
- void Sub(const Tensor &t, DType x, Tensor *ret);
+ void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  
- template <typename DType>
- Tensor operator*(const Tensor &t, DType x);
- template <typename DType>
- void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
- 
- template <typename DType>
- Tensor operator/(const Tensor &t, DType x);
- template <typename DType>
- void Div(const Tensor &t, DType x, Tensor *ret);
- 
- // ================Blas operations============================================
- // We fix the scalar argument type to be float.
+ template <typename SType>
+ Tensor operator+(const Tensor &in, const SType x);
+ template <typename SType>
+ void Add(const Tensor &in, const SType x, Tensor *out);
  
- // ===== Level 1
- // TODO(wangwei) make amax/amin/asum a member function of tensor
- // void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
- // void Asum(Tensor Context* ctx);
+ template <typename SType>
+ Tensor operator-(const Tensor &in, const SType x);
+ template <typename SType>
+ void Sub(const Tensor &in, const SType x, Tensor *out);
  
- // template <typename DType>
- // void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+ template <typename SType>
+ Tensor operator*(const Tensor &in, const SType x);
+ template <typename SType>
+ void EltwiseMult(const Tensor &in, const SType x, Tensor *out);
  
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape.  result = A * B
- Tensor Mult(const Tensor &A, const Tensor &B);
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape.  C = A * B
- void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+ /// For each element e of Tensor 'in', compute e / x
+ template <typename SType>
+ Tensor operator/(const Tensor &in, const SType x);
+ /// For each element e of Tensor 'in', compute e / x into out
+ template <typename SType>
+ void Div(const Tensor &in, const SType x, Tensor *out);
  
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
- void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs,
-           const float beta, Tensor *C);
+ /// For each element e of Tensor 'in', compute x/e
+ template <typename SType>
+ Tensor Div(const SType x, const Tensor &in);
+ /// For each element e of Tensor 'in', compute x/e into 'out'
+ template <typename SType>
+ void Div(const SType x, const Tensor &in, Tensor *out);
  
- // ================Random operations==========================================
- /// For each element x set x = 1 if random() < p; otherwise x = 1.
- void Bernoulli(float p, Tensor *t);
- /// Fill in Tensor 't' following uniform distribution.
- void Uniform(float low, float high, Tensor *t);
- /// Fill in Tensor 't' following Gaussian distribution.
- void Gaussian(float mean, float std, Tensor *t);
+ template <typename SType>
+ SType Sum(const Tensor &in);
 -
+ // ============Matrix (row/column) operations==================================
+ /// Average elements in the Tensor, currently only support vector and matrix.
+ /// if 'axis' is 0, average all rows into a single row
+ /// if 'axis' is 1, average all columns into a single column
+ /// TODO(wangwei) support arbitrary Tensor like numpy.average
+ Tensor Average(const Tensor &in, const int axis);
 -/// Sum elements in the Tensor, currently only support vector and matrix.
 -/// if 'axis' is 0, sum all rows into a single row
 -/// if 'axis' is 1, sum all columns into a single column
 -/// TODO(wangwei) support arbitrary Tensor like numpy.sum
 -Tensor Sum(const Tensor &in, const int axis);
 -/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
 -/// and shape_[axis]*...*shape_[nDim()] columns.
 -/// and do softmax along each row.
 -Tensor SoftMax(const Tensor &in, const int axis = 0);
 -void SoftMax(const Tensor &in, const int axis, Tensor *out);
  
- // follow the consistency guide
- // https://issues.apache.org/jira/browse/SINGA-182
- // ============Matrix vector operations=======================================
  /// Add column 'v' with each column of matrix M
  void AddColumn(const Tensor &v, Tensor *M);
- void AddColumn(const float alpha, const float beta, const Tensor &v,
+ /// For each column 'c' of matrix out, do c=alpha*v + beta*c
+ template <typename SType>
+ void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                 Tensor *out);
- /// Sub column 'v' by each column of matrix M
- void SubColumn(const Tensor &v, Tensor *M);
- /// Multiply column 'v' and each column of matrix M; write results into 'out'
- void MultColumn(const Tensor &v, Tensor *M);
- /// Divide column 'v' by each column of matrix M; write results into 'out'
- void DivColumn(const Tensor &v, Tensor *M);
- 
  /// Add row 'v' with each row of matrix M; write results into 'out'
  void AddRow(const Tensor &v, Tensor *out);
- void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
- /// Sub row 'v' by each row of matrix M; write results into 'out'
- void SubRow(const Tensor &v, Tensor *M);
- /// Multiply row 'v' with each row of matrix M; write results into 'out'
- void MultRow(const Tensor &v, Tensor *M);
+ /// For each row 'r' of matrix out, do r=alpha*v + beta*r
+ template <typename SType>
+ void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M);
+ /// Divide column 'v' by each column of matrix M; write results into 'out'
+ void DivColumn(const Tensor &v, Tensor *M);
  /// Divide row 'v' by each row of matrix M; write results into 'out'
  void DivRow(const Tensor &v, Tensor *M);
- 
- /// Sum all rows of matrix M into a single row as 'out'
- void SumRows(const Tensor &M, Tensor *out);
+ /// Multiply column 'v' and each column of matrix M; write results into 'out'
+ void MultColumn(const Tensor &v, Tensor *M);
+ /// Multiply row 'v' with each row of matrix M; write results into 'out'
+ void MultRow(const Tensor &v, Tensor *M);
++/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
++Tensor SoftMax(const Tensor &in);
++/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
++void SoftMax(const Tensor &in, Tensor *out);
+ /// Sub column 'v' by each column of matrix M
+ void SubColumn(const Tensor &v, Tensor *M);
+ /// Sub row 'v' by each row of matrix M; write results into 'out'
+ void SubRow(const Tensor &v, Tensor *M);
  /// Sum all columns of matrix M into a single column as 'out'
  void SumColumns(const Tensor &M, Tensor *out);
+ /// Sum all rows of matrix M into a single row as 'out'
+ void SumRows(const Tensor &M, Tensor *out);
+ 
++/// Sum elements in the Tensor, currently only support vector and matrix.
++/// if 'axis' is 0, sum all rows into a single row
++/// if 'axis' is 1, sum all columns into a single column
++/// TODO(wangwei) support arbitrary Tensor like numpy.sum
++Tensor Sum(const Tensor &in, const int axis);
++
+ // ================Random operations==========================================
+ /// For each element x set x = 1 if random() < p; otherwise x = 0.
+ template <typename SType>
+ void Bernoulli(const SType p, Tensor *out);
+ /// Fill in Tensor 'out' following Gaussian distribution.
+ template <typename SType>
+ void Gaussian(const SType mean, const SType std, Tensor *out);
+ /// Fill in Tensor 'out' following uniform distribution.
+ template <typename SType>
+ void Uniform(const SType low, const SType high, Tensor *out);
+ 
+ // ================Blas operations============================================
+ // TODO(wangwei) make amax/amin/asum a member function of tensor
  
- /// For each element x of Tensor 'in', compute alpha/x
+ /// out = alpha*in + out
  template <typename SType>
- Tensor Div(const SType alpha, const Tensor &in);
+ void Axpy(SType alpha, const Tensor &in, Tensor *out);
  
- /// For each element x of Tensor 'in', compute alpha/x into 'out'
+ /// Do matrix vector multiplication or matrix matrix multiplication depending
+ /// on the Tensor shape.  result = A * B
+ Tensor Mult(const Tensor &A, const Tensor &B);
+ /// Do matrix vector multiplication or matrix matrix multiplication depending
+ /// on the Tensor shape.  C = A * B
+ void Mult(const Tensor &A, const Tensor &B, Tensor *C);
 -
+ /// Do matrix vector multiplication or matrix matrix multiplication depending
+ /// on the Tensor shape. out = alpha lhs * rhs + beta * out
  template <typename SType>
- void Div(const SType alpha, const Tensor &in, Tensor *out);
- 
- /*
- /// Multiply each column of the lhs matrix with the rhs column
- Tensor MultColumn(const Tensor &lhs, const Tensor &rhs);
- void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Multiply each row of the lhs matrix with the rhs row
- Tensor MultRow(const Tensor &lhs, const Tensor &rhs);
- void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Div each row of the lhs matrix with the rhs column
- Tensor DivColumn(const Tensor &lhs, const Tensor &rhs);
- void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Divide each row of the lhs matrix by the rhs row
- Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
- void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- */
+ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+           Tensor *C);
 +
++// *****************
++// Misc.
++// ****************
 +/// Compute the cross entropy loss given the prediction probability 'p' and
 +/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
 +/// or 2-d matrix. 'loss' is a 1-d vector; the loss is computed into 'loss'.
- void ComputeCrossEntropy(const Tensor& t, Tensor* p);
++void ComputeCrossEntropy(const Tensor& p, const Tensor& t, Tensor* loss);
 +/// Compute the dx, given prediction probability 'p' (p=softmax(x)) and
 +/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
 +/// or 2-d matrix. dx has the same shape as 'p' and is computed into 'p'.
 +void SoftmaxCrossEntropyBwd(const Tensor& t, Tensor* p);
  }  // namespace singa
  
  #endif  // SINGA_CORE_TENSOR_H_
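
A rough usage sketch of the two functions declared above, mirroring the setup in test_cross_entropy.cc. This is illustrative only: it assumes the label tensor has been created with an integer data type and that CopyDataFromHostPtr accepts int data, neither of which is shown in this diff:

#include "singa/core/tensor.h"

void Example() {
  singa::Tensor p(singa::Shape{2, 4}), t(singa::Shape{2, 1}), loss(singa::Shape{2, 1});
  const float pdat[8] = {0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f};
  const int tdat[2] = {0, 2};
  p.CopyDataFromHostPtr(pdat, 8);
  t.CopyDataFromHostPtr(tdat, 2);           // assumes an int-typed label tensor
  singa::ComputeCrossEntropy(p, t, &loss);  // loss[i] = -log(p[i][tdat[i]])
  singa::SoftmaxCrossEntropyBwd(t, &p);     // overwrites p with dx = p - onehot(t)
}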

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/CMakeLists.txt
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.cu
index f12763e,b618f9b..21ebdd8
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@@ -309,142 -258,170 +258,202 @@@ __global__ void KernelLE(const int num
    }
  }
  
- __global__ static void kernel_set_value(float *data, float value, int n) {
-   int index = blockIdx.x * blockDim.x + threadIdx.x;
-   int num_threads = blockDim.x * gridDim.x;
-   for (; index < n; index += num_threads) {
-     data[index] = value;
+ __global__ void KernelLT(const int num, const float *in, const float x,
+                          float *out) {
+   for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+        idx += blockDim.x * gridDim.x) {
+     out[idx] = in[idx] < x ? 1.0f : 0.0f;
+   }
+ }
++__global__ void KernelComputeCrossEntropy(const size_t batchsize,
++                                          const size_t dim, const float *p,
++                                          const int *t, float *loss) {
++  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
++  size_t num_threads = blockDim.x * gridDim.x;
++  for (; sample < batchsize; sample += num_threads) {
++    float prob_of_truth = p[sample * dim + t[sample]];
++    loss[sample] = -std::log(max(prob_of_truth, FLT_MIN));
 +  }
 +}
  
- __global__ void kernel_threshold(const float *src_data, float *des_data,
-                                  float alpha, int n) {
-   int index = blockIdx.x * blockDim.x + threadIdx.x;
-   int num_threads = blockDim.x * gridDim.x;
-   for (; index < n; index += num_threads) {
-     des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
++__global__ void KernelSoftmaxCrossEntropyBwd(const size_t batchsize,
++                                             const size_t dim, const float *p,
++                                             const int *t, float *grad) {
++  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
++  size_t num_threads = blockDim.x * gridDim.x;
++  for (; sample < batchsize; sample += num_threads) {
++    size_t pos = sample * dim + t[sample];
++    grad[pos] = p[pos] - 1.0f;  // TODO(wangwei) handle p and grad being different blobs
 +  }
 +}
- void sum(int n, const float *in, float *out) {
-   int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-   //  here, we only need one block
-   int num_blocks = 1;
+ // ********************************
+ // Functions call kernels
+ // ********************************
+ 
+ void set(const size_t n, const float v, float *out, cudaStream_t s) {
+   KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
+ }
+ 
+ void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
+ void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
  
- void sum_row(int rows, int cols, int stride, const float *in, float *out) {
-   int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-   int num_blocks = cols;
+ void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
+ 
+ void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_row << <num_blocks, threads_per_block>>>
-       (in, out, rows, cols, stride);
+ void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
  
- void sum_col(int rows, int cols, int stride, const float *in, float *out) {
-   int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
-   int num_blocks = rows;
+ void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_col << <num_blocks, threads_per_block>>>
-       (in, out, rows, cols, stride);
+ void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void add_row(int rows, int cols, int stride, const float *in_row,
-              const float *in_mat, float *out) {
-   dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-   dim3 num_blocks(
-       cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
-       rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
-   kernel_add_vec_row << <num_blocks, threads_per_block>>>
-       (in_row, in_mat, out, rows, cols, stride);
+ 
+ void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void add(int n, const float *a, const float *b, float *out) {
-   kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
+   KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void sub(int n, const float *a, const float *b, float *out) {
-   kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void exp(int n, const float *in, float *out) {
-   kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void clamp(const size_t n, const float low, const float high, const float *in,
+            float *out, cudaStream_t s) {
+   KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
  }
  
- void log(int n, const float *in, float *out) {
-   kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void pow(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+   KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void sigmoid(int n, const float *in, float *out) {
-   kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void add(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+   KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void sigmoid_grad(int n, const float *in, float *out) {
-   kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void mult(const size_t n, const float *in, const float x, float *out,
+           cudaStream_t s) {
+   KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void relu(int n, const float *in, float *out) {
-   kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void div(const size_t n, const float x, const float *in, float *out,
+           cudaStream_t s) {
+   KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
  }
  
- void relu_grad(int n, const float *in, float *out) {
-   kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void threshold(const size_t n, const float x, const float *in, float *out,
+                cudaStream_t s) {
+   KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
  }
  
- void tanh(int n, const float *in, float *out) {
-   kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void gt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void ge(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void lt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void le(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
  }
  
- void tanh_grad(int n, const float *in, float *out) {
-   kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void pow(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void softplus(int n, const float *in, float *out) {
-   kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void add(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void softplus_grad(int n, const float *in, float *out) {
-   kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void sub(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void square(int n, const float *in, float *out) {
-   kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void mult(const size_t n, const float *in1, const float *in2, float *out,
+           cudaStream_t s) {
+   KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void square_grad(int n, const float *in, float *out) {
-   kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void div(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void sqrt(int n, const float *in, float *out) {
-   kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+   int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+   //  here, we only need one block
+   int num_blocks = 1;
+   KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
  }
 +
- void pow(int n, const float *a, const float *b, float *out) {
-   kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
++void ComputeCrossEntropy(size_t batchsize, const size_t dim, const float *p,
++                         const int *t, float *loss, cudaStream_t stream) {
++  KernelComputeCrossEntropy <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>>
++      (batchsize, dim, p, t, loss);
 +}
 +
- void mult(int n, const float *a, const float *b, float *out) {
-   kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
++void SoftmaxCrossEntropyBwd(size_t batchsize, const size_t dim, const float *p,
++                            const int *t, float *grad, cudaStream_t stream) {
++  KernelSoftmaxCrossEntropyBwd <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>>
++      (batchsize, dim, p, t, grad);
++}
+ /*
+ void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void mult(int n, const float *a, const float x, float *out) {
-   kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
+ void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void div(int n, const float *a, const float *b, float *out) {
-   kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ 
+ void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void set_value(int n, float v, float *out) {
-   kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
+ 
+ void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void threshold(int n, float alpha, const float *in, float *out) {
-   kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
+ void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- // follow the consistency guide for math API
- __global__ void KernelDiv(const size_t num, const float alpha, const float *in,
-                           float *out) {
-   for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-        idx += blockDim.x * gridDim.x) {
-     out[idx] = alpha / in[idx];
+ 
+ __global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                                int rows, int cols, int stride) {
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   int num_threads = blockDim.x * gridDim.x;
+   for (; index < rows; index += num_threads) {
+     dst_vec_data[index] = 0.0f;
+     for (int k = 0; k < cols; k++) {
+       dst_vec_data[index] += src_mat_data[index * stride + k];
+     }
    }
  }
  

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.h
index 09953e4,d8a58a5..976b78f
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@@ -31,72 -31,66 +31,73 @@@ namespace singa 
  
  // TODO(wangwei) make all function templates.
  namespace cuda {
- void sum(int n, const float *in, float *out);
  
- void sum_row(int rows, int cols, int stride, const float *in, float *out);
- 
- void sum_col(int rows, int cols, int stride, const float *in, float *out);
- 
- void add_row(int rows, int cols, int stride, const float *in_row,
-              const float *in_mat, float *out);
- 
- void add(int n, const float *a, const float *b, float *out);
- 
- void sub(int n, const float *a, const float *b, float *out);
- 
- void exp(int n, const float *in, float *out);
- 
- void log(int n, const float *in, float *out);
- 
- void sigmoid(int n, const float *in, float *out);
- 
- void sigmoid_grad(int n, const float *in, float *out);
- 
- void relu(int n, const float *in, float *out);
- 
- void relu_grad(int n, const float *in, float *out);
- 
- void tanh(int n, const float *in, float *out);
- 
- void tanh_grad(int n, const float *in, float *out);
+ // 0 input
+ void set(const size_t n, const float v, float *out, cudaStream_t s);
+ 
+ // 1 input
+ void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+ void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+ void log(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+ void square(const size_t n, const float *in, float *out, cudaStream_t s);
+ void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+ void relu(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
+ void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+ void clamp(const size_t n, const float low, const float high, const float *in,
+            float *out, cudaStream_t s);
+ 
+ void pow(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
  
- void softplus(int n, const float *in, float *out);
+ void add(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
  
- void softplus_grad(int n, const float *in, float *out);
+ void mult(const size_t n, const float *in, const float x, float *out,
+           cudaStream_t s);
  
- void square(int n, const float *in, float *out);
+ void div(const size_t n, const float x, const float *in, float *out,
+          cudaStream_t s);
  
- void square_grad(int n, const float *in, float *out);
+ void threshold(const size_t n, const float x, const float *in, float *out,
+                cudaStream_t s);
  
- void sqrt(int n, const float *in, float *out);
+ void gt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void ge(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void lt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void le(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
  
- void pow(int n, const float *a, const float *b, float *out);
+ // 2 inputs
+ void pow(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void mult(int n, const float *a, const float *b, float *out);
+ void add(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void mult(int n, const float *a, const float x, float *out);
+ void sub(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void div(int n, const float *a, const float *b, float *out);
+ void mult(const size_t n, const float *in1, const float *in2, float *out,
+           cudaStream_t s);
  
- void set_value(int n, float v, float *out);
+ void div(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void threshold(int n, float alpha, const float *in, float *out);
+ void sum(const size_t n, const float *in, float *out, cudaStream_t s);
  
- // follow the consistency guide for math API
 +void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
 +                         const float *p, const int *t, float *loss,
 +                         cudaStream_t stream);
- void Div(const size_t num, const float x, const float *in, float *out,
-          cudaStream_t s);
- void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void Set(const size_t num, const float x, float *out, cudaStream_t s);
 +void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
 +                            const float *p, const int *t, float *grad,
 +                            cudaStream_t stream);
 +
  }  // cuda
  
  }  // namespace singa
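
The two declarations added at the end are host-side wrappers over grid-stride kernels in math_kernel.cu (one CUDA thread per sample, striding by blockDim.x * gridDim.x). A hedged call sketch, assuming p, t, loss and grad already point to device memory of the right sizes; allocation and host-to-device copies are omitted:

#include <cstddef>
#include "math_kernel.h"   // the header above, as seen from src/core/tensor

void RunCrossEntropyOnGpu(size_t batchsize, size_t dim, const float *p,
                          const int *t, float *loss, float *grad,
                          cudaStream_t stream) {
  // loss[i] = -log(max(p[i * dim + t[i]], FLT_MIN)); one thread per sample
  singa::cuda::ComputeCrossEntropy(batchsize, dim, p, t, loss, stream);
  // grad[i * dim + t[i]] = p[i * dim + t[i]] - 1.0f (softmax minus one-hot at
  // the true class); the other entries are expected to be filled from p.
  singa::cuda::SoftmaxCrossEntropyBwd(batchsize, dim, p, t, grad, stream);
}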

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor.cc
index 1ac25c6,e62386a..4e0d94b
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@@ -592,42 -557,8 +556,8 @@@ void AddRow(const SType alpha, const ST
      Mult(alpha, one, vmat, beta, M);
    }
  }
- void ComputeCrossEntropy(const Tensor& t, Tensor* p) {
-   CHECK_LE(p->nDim(), 2u);
-   CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
-   size_t batchsize = 1;
-   if (p->nDim() == 2u) batchsize = p->shape(0);
-   size_t dim = p->Size() / batchsize;
-   TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
-     p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
-       ComputeCrossEntropy<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
-                                        p->blob(), ctx);
-     }, {p->blob(), t.blob()}, {p->blob()});
-   });
- }
- 
- template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
-   Tensor out(in.shape(), in.device(), in.data_type());
-   Div(alpha, in, &out);
-   return out;
- }
- 
- template Tensor Div<float>(const float, const Tensor &);
- 
- template <typename SType>
- void Div(const SType alpha, const Tensor &in, Tensor *out) {
-   CheckDataTypeAndLang(in, *out);
-   CHECK(in.shape() == out->shape());
-   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-     // TODO(wangwei) type cast SType to DType;
-     in.device()->Exec(
-         [alpha, in, out](Context *ctx) {
-           Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
-         },
-         {in.blob()}, {out->blob()});
-   });
- }
- template void Div<float>(const float, const Tensor &, Tensor *);
 -template <>
++template
+ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
  
  /// Divide column 'v' by each column of matrix M; write results into 'out'
  void DivColumn(const Tensor &v, Tensor *M) {
@@@ -725,4 -639,92 +638,122 @@@ void SumRows(const Tensor &M, Tensor *v
      Mult(X, one, v);
    }
  }
+ // ====================Random operations=====================================
+ template <typename SType>
+ void Bernoulli(const SType p, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto prob = TypeCast<SType, DType>(p);
+     out->device()->Exec([prob, out](Context *ctx) {
+       Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Bernoulli<float>(const float p, Tensor *out);
+ 
+ template <typename SType>
+ void Uniform(const SType low, const SType high, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto l = TypeCast<SType, DType>(low);
+     auto h = TypeCast<SType, DType>(high);
+     out->device()->Exec([l, h, out](Context *ctx) {
+       Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Uniform<float>(const float low, const float high, Tensor *out);
+ 
+ template <typename SType>
+ void Gaussian(const SType mean, const SType std, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto m = TypeCast<SType, DType>(mean);
+     auto s = TypeCast<SType, DType>(std);
+     out->device()->Exec([m, s, out](Context *ctx) {
+       Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Gaussian<float>(const float mean, const float std, Tensor *out);
+ 
+ // ================Blas operations============================================
+ template <typename SType>
+ void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+     auto a = TypeCast<SType, DType>(alpha);
+     out->device()->Exec([a, in, out](Context *ctx) {
+       Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
+     }, {in.blob(), out->blob()}, {out->blob()});
+   });
+ }
 -template <>
 -void Axpy(const float alpha, const Tensor &in, Tensor *out);
++template void Axpy(const float alpha, const Tensor &in, Tensor *out);
+ 
+ Tensor Mult(const Tensor &A, const Tensor &B) {
+   Shape s;
+   s.push_back(A.shape(0));
+   if (B.nDim() == 2) s.push_back(B.shape(1));
+   Tensor out(s, A.device(), A.data_type());
+   Mult(A, B, &out);
+   return out;
+ }
+ 
+ void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+   Mult(1.0f, A, B, 0.0f, out);
+ }
+ 
+ template <typename SType>
+ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+           Tensor *C) {
+   CHECK_EQ(A.shape().size(), 2u);
+   if (B.nDim() == 1u) {
+     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+       auto a = TypeCast<SType, DType>(alpha);
+       auto b = TypeCast<SType, DType>(beta);
+       C->device()->Exec([a, A, b, B, C](Context *ctx) {
+         GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
+                           B.blob(), b, C->blob(), ctx);
+       }, {A.blob(), B.blob()}, {C->blob()});
+     });
+   } else {
+     CHECK(!C->transpose());
+     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+       auto a = TypeCast<SType, DType>(alpha);
+       auto b = TypeCast<SType, DType>(beta);
+       C->device()->Exec([a, A, b, B, C](Context *ctx) {
+         GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+                           A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
+       }, {A.blob(), B.blob()}, {C->blob()});
+     });
+   }
+ }
+ 
++
++// ************************
++// Misc.
++// ***********************
++void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
++  CHECK_LE(p.nDim(), 2u);
++  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
++  size_t batchsize = 1;
++  if (p.nDim() == 2u) batchsize = p.shape(0);
++  size_t dim = p.Size() / batchsize;
++  TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
++    p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) {
++      ComputeCrossEntropy<DType, Lang>(batchsize, dim, p.blob(), t.blob(),
++                                       loss->blob(), ctx);
++    }, {p.blob(), t.blob()}, {loss->blob()});
++  });
++}
++void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
++  CHECK_LE(p->nDim(), 2u);
++  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
++  size_t batchsize = 1;
++  if (p->nDim() == 2u)
++    batchsize = p->shape(0);
++  size_t dim = p->Size() / batchsize;
++  TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
++    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
++      SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
++                                          p->blob(), ctx);
++    }, {p->blob(), t.blob()}, {p->blob()});
++  });
++}
  }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math.h
index bcf4908,b86e1cb..12490d1
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@@ -269,109 -293,74 +293,95 @@@ void Scale(const size_t num, const DTyp
  template <typename DType, typename Lang>
  void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
           Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+   LOG(FATAL) << "Dot Not Implemented";
  }
  
- // ===== Level 2
- /// ret = alpha * op(A) * v + beta * ret.
- /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+ /// out = alpha * A * v + beta * out.
+ /// transA indicates if the internal data layout is transposed of A
  template <typename DType, typename Lang>
- void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
-           DType beta, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+           const Blob *A, const Blob *v, const DType beta, Blob *out,
+           Context *ctx) {
+   LOG(FATAL) << "GEMV Not Implemented";
  }
  
- // ===== Level 3
- 
- // ================Random functions===========================================
- /// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
- // Get the random generator from 'ctx'
- // If DType is not float, then convert the threshold to DType
- template <typename DType, typename Lang>
- void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
- // The random generator should be extracted from ctx.
- // If DType is not float, then convert the low and high to DType
+ /// multiply a matrix with a diagonal matrix constructed using values from 'v'.
+ /// if side_right is false, do M*v; else do v*M
  template <typename DType, typename Lang>
- void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
+           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
+   LOG(FATAL) << "DGMM Not Implemented";
  }
- // The random generator should be extracted from ctx.
- // If DType is not float, then convert the mean and std to DType
+ 
+ /// C = alpha * A * B + beta * C.
+ /// transA indicates whether the internal data layout of A is transposed
  template <typename DType, typename Lang>
- void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void GEMM(const bool transA, const bool transB, const size_t nrowA,
+           const size_t ncolB, const size_t ncolA, const DType alpha,
+           const Blob *A, const Blob *B, const DType beta, Blob *C,
+           Context *ctx) {
+   LOG(FATAL) << "GEMM Not Implemented";
  }
  
- // ========follow the consistency guide of math API
- 
 +/// Compute the cross entropy given the prediction probabilities 'p' and the
 +/// target labels 't', one loss value per sample.
 +// following the consistency guide.
 +template <typename DType, typename Lang>
 +void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
 +                         const Blob *p, const Blob *t, Blob *loss,
 +                         Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
- template <typename DType, typename Lang>
- void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
-          Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
 +
- /// multiply a matrix with a diagnoal matrix constructed using values from 'v'.
- /// if matrix_lef_side is true, do M*v; else do v*M
 +template <typename DType, typename Lang>
- void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
-           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
++void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
++                            const Blob *p, const Blob *t, Blob *grad,
++                            Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
 +
- /// C = alpha * A * B + beta * C.
- /// transA indicates if the internal data layout is transposed of A
+ // **************************************
+ // Matrix functions
+ // **************************************
+ /*
+ /// Add the vector v to every column of A as the column of out
  template <typename DType, typename Lang>
- void GEMM(const bool transA, const bool transB, const size_t nrowA,
-           const size_t ncolB, const size_t ncolA, const DType alpha,
-           const Blob *A, const Blob *B, const DType beta, Blob *C,
-           Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+             Blob *out, Context *ctx) {
+   LOG(FATAL) << "AddCol Not Implemented";
  }
- /// ret[i]=(input[i]<x)?1.f:0.f
+ // TODO(wangwei) unify AddRow and AddCol.
+ /// Add the vector v to every row of A as the row of out
  template <typename DType, typename Lang>
- void LT(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+             Blob *out, Context *ctx) {
+   LOG(FATAL) << "AddRow Not Implemented";
  }
- /// ret[i]=(input[i]<=x)?1.f:0.f
- template <typename DType, typename Lang>
- void LE(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
- /// ret[i]=(input[i]>x)?1.f:0.f
+ /// outer-product.
+ /// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
  template <typename DType, typename Lang>
- void GT(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+            Blob *out, Context *ctx) {
+   LOG(FATAL) << "Outer Not Implemented";
  }
- /// ret[i]=(input[i]>=x)?1.f:0.f
+ 
+ /// Sum the columns of the in matrix into a vector
  template <typename DType, typename Lang>
- void GE(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                 Context *ctx) {
+   LOG(FATAL) << "SumColumns Not Implemented";
  }
 +template <typename DType, typename Lang>
 +void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
 +
+ // TODO(wangwei) unify SumRow and SumCol.
+ /// Sum the rows of the in matrix into a vector
  template <typename DType, typename Lang>
- void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
-                             const Blob *p, const Blob *t, Blob *grad,
-                             Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+              Context *ctx) {
+   LOG(FATAL) << "SumRows Not Implemented";
  }
- 
+ */
  }  // namespace singa
  #endif  // SINGA_CORE_MATH_H_
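
The pattern in this header is one generic template per routine that aborts with a "Not Implemented" message, plus full specializations in tensor_math_cpp.h and tensor_math_cuda.h selected by a data-type parameter and a language tag. A rough standalone sketch of that dispatch scheme is shown below with stand-in tag structs and a single routine; none of the names are the SINGA definitions.

    // dispatch_sketch.cc -- illustrative only; the tag types are stand-ins.
    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    namespace lang {
    struct Cpp {};   // CPU backend tag (stand-in)
    struct Cuda {};  // GPU backend tag (stand-in)
    }  // namespace lang

    // Generic fallback: selected when no backend provides the routine.
    template <typename DType, typename Lang>
    void Scale(size_t num, DType alpha, DType* data) {
      std::fprintf(stderr, "Scale not implemented for this backend\n");
      std::abort();
    }

    // Full specialization for <float, lang::Cpp>: the CPU implementation.
    template <>
    void Scale<float, lang::Cpp>(size_t num, float alpha, float* data) {
      for (size_t i = 0; i < num; i++) data[i] *= alpha;
    }

    int main() {
      float v[3] = {1.f, 2.f, 3.f};
      Scale<float, lang::Cpp>(3, 2.f, v);           // dispatches to the CPU version
      std::printf("%g %g %g\n", v[0], v[1], v[2]);  // prints: 2 4 6
      // Scale<float, lang::Cuda>(3, 2.f, v);       // would hit the fallback and abort
      return 0;
    }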

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math_cpp.h
index 907c656,0b280a3..c5d092b
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@@ -323,32 -420,196 +422,229 @@@ void GEMM<float, lang::Cpp>(const bool 
    cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
                lda, BPtr, ldb, beta, CPtr, ldc);
  }
- #endif  // USE_CBLAS
+ 
+ #else
  
  template <>
- void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                            Context *ctx) {
+ void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                             Context *ctx) {
+   size_t maxPos = 0;
+   float maxVal = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     if (i == 0) {
+       maxVal = inPtr[i];
+     } else if (inPtr[i] > maxVal) {
+       maxVal = inPtr[i];
+       maxPos = i;
+     }
+   }
+   *out = maxPos;
+ }
+ template <>
+ void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                             Context *ctx) {
+   size_t minPos = 0;
+   float minVal = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     if (i == 0) {
+       minVal = inPtr[i];
+     } else if (inPtr[i] < minVal) {
+       minVal = inPtr[i];
+       minPos = i;
+     }
+   }
+   *out = minPos;
+ }
+ 
+ template <>
+ void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                             Context *ctx) {
+   float sum = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     sum += fabs(inPtr[i]);
+   }
+   *out = sum;
+ }
+ 
+ template <>
+ void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                             Blob *out, Context *ctx) {
    float *outPtr = static_cast<float *>(out->mutable_data());
-   for (size_t i = 0; i < num; i++) outPtr[i] = x;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     outPtr[i] += alpha * inPtr[i];
+   }
  }
+ 
+ template <>
+ void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                              Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   for (size_t i = 0; i < num; i++) {
+     outPtr[i] *= x;
+   }
+ }
+ 
+ template <>
+ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                            float *out, Context *ctx) {
+   float sum = 0;
+   const float *in1Ptr = static_cast<const float *>(in1->data());
+   const float *in2Ptr = static_cast<const float *>(in2->data());
+   for (size_t i = 0; i < num; i++) {
+     sum += in1Ptr[i] * in2Ptr[i];
+   }
+   *out = sum;
+ }
+ 
+ template <>
+ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                             const float alpha, const Blob *A, const Blob *v,
+                             const float beta, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < m; r++) {
+     float sum = 0;
+     for (size_t c = 0; c < n; c++) {
+       size_t idx = trans ? c * m + r : r * n + c;
+       sum += APtr[idx] * vPtr[c];
+     }
+     outPtr[r] = alpha * sum + beta * outPtr[r];
+   }
+ }
+ 
+ #endif  // USE_CBLAS
++template <>
++void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize,
++                                           const size_t dim, const Blob *p,
++                                           const Blob *t, Blob *loss,
++                                           Context *ctx) {
++  const float *pPtr = static_cast<const float *>(p->data());
++  const int *tPtr = static_cast<const int *>(t->data());
++  float *lossPtr = static_cast<float *>(loss->mutable_data());
++  for (size_t i = 0; i < batchsize; i++) {
++    int truth_idx = tPtr[i];
++    CHECK_GE(truth_idx, 0);
++    float prob_of_truth = pPtr[i * dim + truth_idx];
++    lossPtr[i] = -std::log(std::max(prob_of_truth, FLT_MIN));
++  }
++}
++
 +template <>
 +void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
 +                                              const size_t dim, const Blob *p,
 +                                              const Blob *t,
 +                                              Blob *grad, Context *ctx) {
 +  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
 +  // const float* pPtr = static_cast<const float*>(p->data());
-   const float *tPtr = static_cast<const float *>(t->data());
++  const int *tPtr = static_cast<const int *>(t->data());
 +  float *gradPtr = static_cast<float *>(grad->mutable_data());
 +
 +  for (size_t i = 0; i < batchsize; i++) {
 +    int truth_idx = static_cast<int>(tPtr[i]);
 +    CHECK_GE(truth_idx, 0);
 +    gradPtr[i * dim + truth_idx] -= 1.0;
 +  }
 +}
 +
  
+ // =========Matrix operations ================================================
+ /*
+ template <>
+ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *A, const Blob *v, Blob *out,
+                               Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+     }
+   }
+ }
+ 
+ template <>
+ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *A, const Blob *v, Blob *out,
+                               Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+     }
+   }
+ }
+ template <>
+ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                              const Blob *in2, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *in1Ptr = static_cast<const float *>(in1->data());
+   const float *in2Ptr = static_cast<const float *>(in2->data());
+   for (size_t r = 0; r < m; r++) {
+     size_t offset = r * n;
+     for (size_t c = 0; c < n; c++) {
+       outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+     }
+   }
+ }
+ template <>
+ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   float *bPtr = new float[ncol];
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     float denom = 0.f;
+     for (size_t c = 0; c < ncol; c++) {
+       bPtr[c] = exp(inPtr[offset + c]);
+       denom += bPtr[c];
+     }
+     for (size_t c = 0; c < ncol; c++) {
+       size_t idx = offset + c;
+       outPtr[idx] = bPtr[c] / denom;
+     }
+   }
+   delete[] bPtr;
+ }
+ 
+ template <>
+ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                   const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t c = 0; c < ncol; c++) {
+     outPtr[c] = 0.f;
+   }
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[c] += inPtr[offset + c];
+     }
+   }
+ }
+ 
+ template <>
+ void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     outPtr[r] = 0.f;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[r] += inPtr[offset + c];
+     }
+   }
+ }
+ */
  }  // namespace singa
  
  #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
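
Both the commented-out Softmax above and the exp-then-normalize approach exponentiate the raw inputs before dividing by the row sum, so very large logits can overflow to inf; the FLT_MIN clamp in ComputeCrossEntropy only guards the opposite underflow case. A standard remedy, sketched here as a standalone routine rather than as a change to this patch, is to subtract each row's maximum before calling exp, which leaves the result unchanged mathematically.

    // stable_softmax.cc -- standalone sketch of a max-subtracted row softmax.
    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // 'in' and 'out' are row-major nrow x ncol matrices.
    void StableRowSoftmax(size_t nrow, size_t ncol, const float* in, float* out) {
      for (size_t r = 0; r < nrow; r++) {
        const float* row = in + r * ncol;
        float* orow = out + r * ncol;
        float m = *std::max_element(row, row + ncol);  // row maximum
        float denom = 0.f;
        for (size_t c = 0; c < ncol; c++) denom += (orow[c] = std::exp(row[c] - m));
        for (size_t c = 0; c < ncol; c++) orow[c] /= denom;
      }
    }

    int main() {
      // Logits this large would overflow exp() without the max subtraction.
      std::vector<float> x = {1000.f, 1001.f, 1002.f};
      std::vector<float> p(3);
      StableRowSoftmax(1, 3, x.data(), p.data());
      std::printf("%f %f %f\n", p[0], p[1], p[2]);  // ~0.090 0.245 0.665
      return 0;
    }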

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math_cuda.h
index c69620c,e2597d5..9a8839e
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@@ -131,55 -398,6 +398,28 @@@ void GEMM<float, lang::Cuda>(const boo
                             BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
  }
  
 +template <>
- void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out, Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::GE(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out,  Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::GT(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out, Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::LE(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out,  Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::LT(num, inPtr, x, outPtr, ctx->stream);
- }
- 
- template<>
- void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
-                             Context *ctx) {
-   float *outPtr = static_cast<float *>(out->mutable_data());
-   cuda::Set(num, x, outPtr, ctx->stream);
++void ComputeCrossEntropy<float, lang::Cuda>(const size_t batchsize,
++                                            const size_t dim, const Blob *p,
++                                            const Blob *t, Blob *loss,
++                                            Context *ctx) {
++  const float *pPtr = static_cast<const float *>(p->data());
++  const int *tPtr = static_cast<const int *>(t->data());
++  float *lossPtr = static_cast<float *>(loss->mutable_data());
++  cuda::ComputeCrossEntropy(batchsize, dim, pPtr, tPtr, lossPtr, ctx->stream);
 +}
- 
 +template <>
 +void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
 +                                               const size_t dim, const Blob *p,
 +                                               const Blob *t, Blob *grad,
 +                                               Context *ctx) {
 +  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
 +  const float *pPtr = static_cast<const float *>(p->data());
 +  const int *tPtr = static_cast<const int *>(t->data());
 +  float *gradPtr = static_cast<float *>(grad->mutable_data());
 +  cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
 +                               ctx->stream);
 +}
- 
  }  // namespace singa
  
  #endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --cc src/model/layer/softmax.cc
index 8af1d76,b379fc1..f554f25
--- a/src/model/layer/softmax.cc
+++ b/src/model/layer/softmax.cc
@@@ -25,14 -25,16 +25,17 @@@ void Softmax::Setup(const LayerConf& co
  }
  
  const Tensor Softmax::Forward(int flag, const Tensor& input) {
+   Tensor output;
    if (input.nDim() == 1) {
-     buf_.push(SoftMax(input));
 -    Tensor tmp = Reshape(input, Shape{1, input.Size()});
 -      output = SoftMax(tmp, 0);
++    output = SoftMax(input);
    } else {
 -    output = SoftMax(input, axis_);
 +    size_t nrow = Product(input.shape(), 0, axis_);
 +    const Tensor& tmp = Reshape(input, Shape{nrow, input.Size() / nrow});
-     buf_.push(SoftMax(tmp));
++    output = SoftMax(tmp);
    }
-   return buf_.top();
+   if (flag & kTrain)
+     buf_.push(output);
+   return output;
  }
  
  const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,
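
Buffering the output only when the kTrain bit is set in 'flag' keeps inference-only forward calls from pushing onto buf_ with no matching Backward to pop. A tiny sketch of that bit-flag test follows, with a made-up Phase enum standing in for SINGA's flag constants.

    // flag_sketch.cc -- illustrative bit-flag check; the Phase values are assumptions.
    #include <cstdio>
    #include <stack>

    enum Phase { kEval = 1 << 0, kTrain = 1 << 1 };  // stand-in constants

    int main() {
      std::stack<float> buf;      // stands in for the layer's buf_
      int flag = kTrain;
      float output = 0.42f;       // pretend forward result
      if (flag & kTrain)          // buffer only when training
        buf.push(output);
      std::printf("buffered %zu value(s)\n", buf.size());
      return 0;
    }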

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/model/loss/softmax_cross_entropy.cc
----------------------------------------------------------------------
diff --cc src/model/loss/softmax_cross_entropy.cc
index 4ca323a,0000000..bed3348
mode 100644,000000..100644
--- a/src/model/loss/softmax_cross_entropy.cc
+++ b/src/model/loss/softmax_cross_entropy.cc
@@@ -1,53 -1,0 +1,53 @@@
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +#include <stack>
 +#include "singa/model/loss.h"
 +
 +namespace singa {
 +
- 
- Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
++Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction,
++                                    const Tensor& target) {
 +  CHECK(buf_.empty()) << "Do not call Forward twice in a row."
 +                      << " The calling pattern is [Forward|Evaluate] Backward";
 +  size_t batchsize = 1;
 +  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
 +  size_t dim = prediction.Size() / batchsize;
 +  const Tensor& input = Reshape(prediction, Shape{batchsize, dim});
 +  Tensor prob = SoftMax(input);
 +
 +  // buffer intermediate data
 +  buf_.push(prob);
 +  buf_.push(target);
-   Tensor loss = prob.Clone();
++  Tensor loss(Shape{batchsize}, prob.device(), prob.data_type());
 +
-   ComputeCrossEntropy(target, &loss);
++  ComputeCrossEntropy(prob, target, &loss);
 +  return loss;
 +}
 +
 +Tensor SoftmaxCrossEntropy::Backward() {
 +  const Tensor target = buf_.top();
 +  buf_.pop();
 +  Tensor prob = buf_.top();
 +  buf_.pop();
 +  SoftmaxCrossEntropyBwd(target, &prob);
 +  return prob;
 +}
 +}  // namespace singa
 +
 +
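
Forward pushes the probabilities first and the target second, so Backward must pop them in the reverse order: target, then prob. A small standalone illustration of why the LIFO order matters:

    // lifo_sketch.cc -- why Backward pops the target before the probabilities.
    #include <cassert>
    #include <stack>
    #include <string>

    int main() {
      std::stack<std::string> buf;
      buf.push("prob");    // pushed first in Forward
      buf.push("target");  // pushed second in Forward
      // Backward: the last value pushed comes out first.
      assert(buf.top() == "target");
      buf.pop();
      assert(buf.top() == "prob");
      buf.pop();
      return 0;
    }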

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cross_entropy.cc
index 6b8cb69,0000000..0eb36e5
mode 100644,000000..100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@@ -1,114 -1,0 +1,116 @@@
 +/************************************************************
 +*
 +* Licensed to the Apache Software Foundation (ASF) under one
 +* or more contributor license agreements.  See the NOTICE file
 +* distributed with this work for additional information
 +* regarding copyright ownership.  The ASF licenses this file
 +* to you under the Apache License, Version 2.0 (the
 +* "License"); you may not use this file except in compliance
 +* with the License.  You may obtain a copy of the License at
 +*
 +*   http://www.apache.org/licenses/LICENSE-2.0
 +*
 +* Unless required by applicable law or agreed to in writing,
 +* software distributed under the License is distributed on an
 +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 +* KIND, either express or implied.  See the License for the
 +* specific language governing permissions and limitations
 +* under the License.
 +*
 +*************************************************************/
 +
 +#include "gtest/gtest.h"
 +#include "singa/core/tensor.h"
 +#include "singa/core/device.h"
 +#include "singa/model/loss.h"
 +#include "singa_config.h"
 +
 +using singa::Tensor;
 +class TestSoftmaxCrossEntropy : public ::testing::Test {
 + protected:
 +  virtual void SetUp() {
 +    p.Reshape(singa::Shape{2, 4});
 +    t.Reshape(singa::Shape{2, 1});
 +  }
 +  const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
-   const float tdat[2] = {0.0, 2.0};
++  const int tdat[2] = {0, 2};
 +
 +  singa::Tensor p, t;
 +};
 +
 +TEST_F(TestSoftmaxCrossEntropy, CppForward) {
 +  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  const Tensor& loss = cross_entropy.Forward(p, t);
 +  auto ldat = loss.data<const float*>();
 +
 +  const float result_test = -log(0.25);
 +  EXPECT_FLOAT_EQ(ldat[0], result_test);
 +  EXPECT_FLOAT_EQ(ldat[1], result_test);
 +}
 +
 +TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
 +  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  cross_entropy.Forward(p, t);
 +  const Tensor& grad = cross_entropy.Backward();
 +
 +  auto gdat = grad.data<const float*>();
 +  EXPECT_FLOAT_EQ(gdat[0], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[1], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[2], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[3], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[4], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[5], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[6], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[7], 0.25);
 +}
 +
 +#ifdef USE_CUDA
 +
 +TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  singa::CudaGPU dev;
 +  p.ToDevice(&dev);
 +  t.ToDevice(&dev);
 +  p.CopyDataFromHostPtr(pdat, 8);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  Tensor loss = cross_entropy.Forward(p, t);
 +  loss.ToHost();
 +  auto ldat = loss.data<const float*>();
 +
 +  const float result_test = -log(0.25);
 +  EXPECT_FLOAT_EQ(ldat[0], result_test);
 +  EXPECT_FLOAT_EQ(ldat[1], result_test);
 +}
 +
 +TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  singa::CudaGPU dev;
 +  p.ToDevice(&dev);
 +  t.ToDevice(&dev);
 +  p.CopyDataFromHostPtr(pdat, 8);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  cross_entropy.Forward(p, t);
 +  Tensor grad = cross_entropy.Backward();
 +
 +  grad.ToHost();
 +  auto gdat = grad.data<const float*>();
 +  EXPECT_FLOAT_EQ(gdat[0], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[1], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[2], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[3], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[4], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[5], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[6], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[7], 0.25);
 +}
 +#endif  // USE_CUDA
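
The expected values in these tests follow from the uniform predictions: each sample has four equal logits, so every class gets probability 0.25, the per-sample loss is -log(0.25) ≈ 1.3863, and the gradient is 0.25 everywhere except the truth index, where it is 0.25 - 1 = -0.75. A standalone check of that arithmetic:

    // expected_values.cc -- reproduces the numbers asserted in the tests above.
    #include <cmath>
    #include <cstdio>

    int main() {
      const int dim = 4;              // four categories per sample
      const int truth = 0;            // truth index of the first sample (tdat[0])
      const float prob = 1.0f / dim;  // softmax of four equal logits -> 0.25 each
      std::printf("loss = %f (expected -log(0.25) ~ 1.386294)\n", -std::log(prob));
      for (int j = 0; j < dim; j++) {
        float grad = prob - (j == truth ? 1.0f : 0.0f);
        std::printf("grad[%d] = %f\n", j, grad);  // -0.75 at j==0, 0.25 elsewhere
      }
      return 0;
    }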

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_mse.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --cc test/singa/test_softmax.cc
index 09dfcd9,c087605..fff8510
--- a/test/singa/test_softmax.cc
+++ b/test/singa/test_softmax.cc
@@@ -36,13 -36,14 +36,14 @@@ TEST(Softmax, Setup) 
    EXPECT_EQ(2, sft.Axis());
  }
  
+ #ifdef USE_CBLAS
  TEST(Softmax, Forward) {
    const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
--  size_t n = sizeof(x) / sizeof(float);
    size_t row = 2;
    size_t col = 3;
++  size_t n = row * col;
    singa::Tensor in(singa::Shape{row, col});
--  in.CopyDataFromHostPtr<float>(x, n);
++  in.CopyDataFromHostPtr<float>(x, row * col);
  
    int axis = 1;
    Softmax sft;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --cc test/singa/test_tensor_math.cc
index 8368c55,38a9291..1092d69
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@@ -20,6 -22,263 +22,261 @@@ class TestTensorMath : public ::testing
    const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
  };
  
+ TEST_F(TestTensorMath, MemberAbs) {
+   Tensor aa = a.Clone();
+   Tensor bb = b.Clone();
+   Tensor cc = aa - bb;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+   EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+   EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+ 
+   Tensor p = Abs(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberExp) {
+   Tensor p = Exp(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+   EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+   EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLog) {
+   Tensor p = Log(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+   EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+   EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberReLU) {
+   Tensor aa = a.Clone();
+   Tensor cc = aa - 2.0f;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+ 
+   Tensor p = ReLU(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSigmoid) {
+   Tensor p = Sigmoid(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+   EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+   EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSign) {
+   Tensor aa = a.Clone();
+   Tensor cc = aa - 2.0f;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+ 
+   Tensor p = Sign(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_EQ(0.0f, dptr1[0]);
+   EXPECT_EQ(0.0f, dptr1[1]);
+   EXPECT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSqrt) {
+   Tensor p = Sqrt(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+   EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+   EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSquare) {
+   Tensor p = Square(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+   EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+   EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberTanh) {
+   Tensor p = Tanh(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+   EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+   EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, Sum) {
+   Tensor p1 = Sum(e, 0);
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
+ 
+   Tensor p2(Shape{3, 1});
+   p2 = Sum(e, 1);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+   EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+   EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
+ }
+ 
+ TEST_F(TestTensorMath, SoftMax) {
 -  Tensor p1(Shape{3, 2});
 -  p1 = SoftMax(e, 0);
++  Tensor p1 = SoftMax(Reshape(e, Shape{1, 6}));
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 6; i++) sum += exp(i + 1);
+   EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+   EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+   EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+   EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+   EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+   EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+ 
 -  Tensor p2(Shape{3, 2});
 -  p2 = SoftMax(e, 1);
++  Tensor p2 = SoftMax(e);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+   EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLT) {
+   Tensor p1 = a < 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLE) {
+   Tensor p1 = a <= 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGT) {
+   Tensor p1 = a > 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGE) {
+   Tensor p1 = a >= 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberPow) {
+   Tensor p1 = Pow(b, 3.0f);
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+   EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+   EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
+ 
+   // TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the
+   // function is complete
+   // Tensor p2 = Pow(a,b);
+   // const float *dptr2 = p2.data<const float *>();
+   // EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+   // EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+   // EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSub) {
+   Tensor p1 = a - b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberEltwiseMult) {
+   Tensor p1 = a * b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberDiv) {
+   Tensor p1 = a / b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
+ 
+   Tensor p2 = Div(10.0f, b);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+   EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+   EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
+ 
+   Tensor p3 = a / 8.0f;
+   const float *dptr3 = p3.data<const float *>();
+   EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+   EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+   EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberBernoulli) {
+   Tensor p1(Shape{10000});
+   Bernoulli(0.3f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 10000; i++) sum += dptr1[i];
+   float mean = sum / 10000;
+   EXPECT_NEAR(mean, 0.3f, 1e-2);
+ 
+   sum = 0;
+   for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 9999;
+   EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
+ }
+ 
+ TEST_F(TestTensorMath, MemberUniform) {
+   Tensor p1(Shape{10000});
+   Uniform(0.1f, 0.2f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 10000; i++) sum += dptr1[i];
+   float mean = sum / 10000;
+   EXPECT_NEAR(mean, 0.15f, 1e-3);
+ 
+   sum = 0;
+   for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 9999;
+   EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGaussian) {
+   Tensor p1(Shape{50000});
+   Gaussian(0.0f, 1.0f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 50000; i++) sum += dptr1[i];
+   float mean = sum / 50000;
+   EXPECT_NEAR(mean, 0.0, 1e-2);
+ 
+   sum = 0;
+   for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 49999;
+   EXPECT_NEAR(variance, 1.0, 1e-2);
+ }
+ 
  TEST_F(TestTensorMath, MemberAddTensor) {
    Tensor aa = a.Clone();
    aa += a;


[2/3] incubator-singa git commit: SINGA-184 Add Cross Entropy loss computation

Posted by ka...@apache.org.
SINGA-184 Add Cross Entropy loss computation

Update softmaxcrossentropy layer to support both cpp and cuda devices;

Fix bugs from crossentropy fwd and bwd; need the cuda version exp();


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ec17acab
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ec17acab
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ec17acab

Branch: refs/heads/dev
Commit: ec17acab49d595fdc48b2dae6f71901b5a4c8191
Parents: efd7b62
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri May 27 17:25:01 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 11:12:05 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h             |  17 +++--
 include/singa/model/loss.h              |  47 ++++++++++++
 src/CMakeLists.txt                      |   3 +-
 src/core/tensor/math_kernel.cu          |  37 +++++++++-
 src/core/tensor/math_kernel.h           |   9 ++-
 src/core/tensor/tensor.cc               |  52 +++++++++----
 src/core/tensor/tensor_math.h           |  24 ++++--
 src/core/tensor/tensor_math_cpp.h       |  50 +++++++++++--
 src/core/tensor/tensor_math_cuda.h      |  41 ++++++++---
 src/model/layer/softmax.cc              |   7 +-
 src/model/loss/cross_entropy.h          | 105 ---------------------------
 src/model/loss/mse.cc                   |  41 +++++++++++
 src/model/loss/mse.h                    |  66 -----------------
 src/model/loss/softmax_cross_entropy.cc |  53 ++++++++++++++
 test/singa/test_cross_entropy.cc        |  64 ++++++++++++++--
 test/singa/test_mse.cc                  |   6 +-
 test/singa/test_softmax.cc              |   9 +--
 17 files changed, 393 insertions(+), 238 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index bb8d7f8..865e1e4 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -239,11 +239,10 @@ Tensor Sum(const Tensor &t, int axis);
 /// if 'axis' is 1, average all columns into a single column
 /// TODO(wangwei) support arbitrary Tensor like numpy.average
 Tensor Average(const Tensor &t, int axis);
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
-/// and shape_[axis]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-Tensor SoftMax(const Tensor &t, int axis = 0);
-void SoftMax(const Tensor &t, int axis, Tensor *ret);
+/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
+Tensor SoftMax(const Tensor &in);
+/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
+void SoftMax(const Tensor &in, Tensor *out);
 
 /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
 /// and shape_[axis+1]*...*shape_[nDim()] columns.
@@ -398,6 +397,14 @@ Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
 void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
 */
 
+/// Compute the cross entropy loss given the prediction probability 'p' and
+/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
+/// or 2-d matrix. The per-sample loss values are written into 'p'.
+void ComputeCrossEntropy(const Tensor& t, Tensor* p);
+/// Compute the dx, given prediction probability 'p' (p=softmax(x)) and
+/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
+/// or 2-d matrix. dx has the same shape as 'p' and is written back into 'p'.
+void SoftmaxCrossEntropyBwd(const Tensor& t, Tensor* p);
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 6a23067..d188de0 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -18,6 +18,7 @@
 
 #ifndef SINGA_MODEL_LOSS_H_
 #define SINGA_MODEL_LOSS_H_
+#include <stack>
 #include "singa/proto/model.pb.h"
 #include "singa/core/tensor.h"
 namespace singa {
@@ -54,6 +55,52 @@ class Loss {
   /// Compute the gradients of the loss values w.r.t. the prediction.
   virtual Tensor Backward() = 0;
 };
+
+
+
+// ============= Mean Squared Error ===========================================
+/// MSE is for mean squared error or squared euclidean distance.
+class MSE : public Loss<Tensor> {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is 0.5/||prediction-target||^2
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is (prediction-target)/batchsize
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., prediction-target
+  std::stack<Tensor> buf_;
+};
+
+
+// =============== Softmax Cross Entropy ======================================
+/// Softmax + cross entropy for multi-category classification
+class SoftmaxCrossEntropy : public Loss<Tensor> {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is -log(p[idx_truth]), idx_truth is the truth
+  /// category's index and p[] is the probability for each category, computed
+  /// from Softmax(prediction).
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is: p[idx] - 1 if idx is the truth category's index; else,
+  /// p[idx]
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., probability for each category and
+  // the target (ground truth)
+  std::stack<Tensor> buf_;
+};
+
 }  // namespace singa
 
 #endif  // SINGA_MODEL_LOSS_H_
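
The gradient documented for SoftmaxCrossEntropy::Backward is the standard softmax-cross-entropy result. With logits x, probabilities p_j = softmax(x)_j and truth index t, the derivation (restated here for reference, not taken from the patch) is:

    L(x) = -\log p_t, \qquad p_j = \frac{e^{x_j}}{\sum_k e^{x_k}}

    \frac{\partial L}{\partial x_j}
      = -\frac{\partial}{\partial x_j}\Big( x_t - \log \sum_k e^{x_k} \Big)
      = \frac{e^{x_j}}{\sum_k e^{x_k}} - \mathbf{1}[j = t]
      = p_j - \mathbf{1}[j = t]

which is exactly p[idx] - 1 at the truth category's index and p[idx] elsewhere.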

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 28066de..23cae85 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -21,7 +21,7 @@ AUX_SOURCE_DIRECTORY(core/tensor core_source)
 FILE(GLOB_RECURSE cuda_source core "*.cu")
 set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
 set(CMAKE_CXX_FLAGS "")
-CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
+CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC ")
 #message(STATUS "FLAGS ${CMAKE_CXX_FLAGS}")
 #message(STATUS "CORE ${cuda_source}")
 #message(STATUS "OBJ ${cuda_objs}")
@@ -36,6 +36,7 @@ LIST(APPEND SINGA_LINKER_LIBS singa_core)
 AUX_SOURCE_DIRECTORY(model model_source)
 AUX_SOURCE_DIRECTORY(model/layer model_source)
 AUX_SOURCE_DIRECTORY(model/optimizer model_source)
+AUX_SOURCE_DIRECTORY(model/loss model_source)
 #MESSAGE(STATUS "MODEL ${model_source}")
 ADD_LIBRARY(singa_model SHARED ${model_source})
 TARGET_LINK_LIBRARIES(singa_model ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index aed6add..f12763e 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -485,8 +485,26 @@ __global__ void KernelSet(const size_t num, const float x, float *out) {
   }
 }
 
-void Set(const size_t num, const float x, float *out, cudaStream_t s) {
-  KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
+__global__
+void KernelComputeCrossEntropy(const size_t batchsize, const size_t dim, const float* p,
+    const int* t, float* loss) {
+  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t num_threads = blockDim.x * gridDim.x;
+  for (; sample < batchsize; sample += num_threads) {
+    float prob_of_truth = p[sample * dim + t[sample]];
+    loss[sample] = -std::log(max(prob_of_truth, FLT_MIN));
+  }
+}
+
+__global__
+void KernelSoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim, const float* p,
+    const int* t, float* grad) {
+  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t num_threads = blockDim.x * gridDim.x;
+  for (; sample < batchsize; sample += num_threads) {
+    size_t pos = sample * dim + t[sample];
+    grad[pos] = p[pos] - 1.0f;  // TODO(wangwei) Consider p and grad are diff
+  }
 }
 void Div(const size_t num, float alpha, const float *in, float *out,
          cudaStream_t s) {
@@ -510,6 +528,21 @@ void LE(const size_t num, const float *in, const float x, float *out,
   KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 }
 
+void ComputeCrossEntropy(size_t batchsize, const size_t dim, const float* p,
+    const int *t, float *loss, cudaStream_t stream) {
+  KernelComputeCrossEntropy<<<ceil(batchsize/CU1DBLOCKF), CU1DBLOCKF>>>(batchsize,
+      dim, p, t, loss);
+}
+
+void Set(const size_t num, const float x, float *out, cudaStream_t s) {
+  KernelSet<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, x, out);
+}
+
+void SoftmaxCrossEntropyBwd(size_t batchsize, const size_t dim, const float* p,
+    const int *t, float *grad, cudaStream_t stream) {
+  KernelSoftmaxCrossEntropyBwd<<<ceil(batchsize/CU1DBLOCKF), CU1DBLOCKF>>>(batchsize,
+      dim, p, t, grad);
+}
 }  // namespace cuda
 }  // namespace singa
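
Both kernels use the grid-stride idiom: a thread starts at blockIdx.x * blockDim.x + threadIdx.x and advances by the total number of launched threads, so every sample is handled exactly once even when batchsize exceeds the grid size. A host-side C++ simulation of that mapping is sketched below; the grid and block sizes are arbitrary, not the CU1DBLOCKF value.

    // grid_stride_sketch.cc -- simulates the kernels' sample-to-thread mapping on the host.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      const size_t batchsize = 10;  // more samples than threads
      const size_t grid_dim = 2, block_dim = 3;
      const size_t num_threads = grid_dim * block_dim;
      std::vector<int> visits(batchsize, 0);

      for (size_t block = 0; block < grid_dim; block++) {
        for (size_t thread = 0; thread < block_dim; thread++) {
          // the same loop each CUDA thread runs
          for (size_t sample = block * block_dim + thread; sample < batchsize;
               sample += num_threads) {
            visits[sample]++;
          }
        }
      }
      for (size_t i = 0; i < batchsize; i++)
        std::printf("sample %zu visited %d time(s)\n", i, visits[i]);  // each exactly once
      return 0;
    }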
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5c906a9..09953e4 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -83,13 +83,20 @@ void set_value(int n, float v, float *out);
 void threshold(int n, float alpha, const float *in, float *out);
 
 // follow the consistency guide for math API
+void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
+                         const float *p, const int *t, float *loss,
+                         cudaStream_t stream);
 void Div(const size_t num, const float x, const float *in, float *out,
          cudaStream_t s);
-void Set(const size_t num, const float x, float *out, cudaStream_t s);
 void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
+void Set(const size_t num, const float x, float *out, cudaStream_t s);
+void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
+                            const float *p, const int *t, float *grad,
+                            cudaStream_t stream);
+
 }  // cuda
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 5ae375c..1ac25c6 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -77,10 +77,9 @@ void Tensor::ResetLike(const Tensor &t) {
   }
 }
 
-void Tensor::Reshape(const Shape &shape) {
-  if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+void Tensor::Reshape(const Shape& shape) {
+  if (Product(shape) != Product(shape_)) {
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
   }
   shape_ = shape;
@@ -403,22 +402,21 @@ Tensor Average(const Tensor &t, int axis) {
   }
 }
 
-Tensor SoftMax(const Tensor &in, int axis) {
+Tensor SoftMax(const Tensor &in) {
   Tensor out(in.shape(), in.device(), in.data_type());
-  SoftMax(in, axis, &out);
+  SoftMax(in, &out);
   return out;
 }
 
-void SoftMax(const Tensor &in, int axis, Tensor *out) {
+void SoftMax(const Tensor &in, Tensor *out) {
+  CHECK_LE(in.nDim(), 2u);
+  Exp(in, out);
   size_t nrow = 1, ncol = in.Size(), size = ncol;
-  CHECK_GE(axis, 0);
-  if (axis > 0) {
-    nrow = Product(in.shape(), 0, axis);
-    CHECK_EQ(size % nrow, 0u) << "Size = " << size << " nrow = " << nrow;
+  if (in.nDim() == 2u) {
+    nrow = in.shape(0);
     ncol = size / nrow;
+    out->Reshape(Shape{nrow, ncol});
   }
-  Exp(in, out);
-  out->Reshape(Shape{nrow, ncol});
   Tensor sum(Shape{nrow}, in.device(), in.data_type());
   SumColumns(*out, &sum);
   DivColumn(sum, out);
@@ -594,6 +592,19 @@ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
     Mult(alpha, one, vmat, beta, M);
   }
 }
+void ComputeCrossEntropy(const Tensor& t, Tensor* p) {
+  CHECK_LE(p->nDim(), 2u);
+  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
+  size_t batchsize = 1;
+  if (p->nDim() == 2u) batchsize = p->shape(0);
+  size_t dim = p->Size() / batchsize;
+  TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
+    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
+      ComputeCrossEntropy<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
+                                       p->blob(), ctx);
+    }, {p->blob(), t.blob()}, {p->blob()});
+  });
+}
 
 template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
   Tensor out(in.shape(), in.device(), in.data_type());
@@ -665,7 +676,20 @@ void MultRow(const Tensor &v, Tensor *M) {
         {M->blob(), v.blob()}, {M->blob()});
   });
 }
-
+void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
+  CHECK_LE(p->nDim(), 2u);
+  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
+  size_t batchsize = 1;
+  if (p->nDim() == 2u)
+    batchsize = p->shape(0);
+  size_t dim = p->Size() / batchsize;
+  TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
+    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
+      SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
+                                          p->blob(), ctx);
+    }, {p->blob(), t.blob()}, {p->blob()});
+  });
+}
 void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); }
 
 void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index ff865e0..bcf4908 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -110,12 +110,6 @@ void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
-/// Do softmax for each row invidually
-template <typename DType, typename Lang>
-void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
@@ -312,11 +306,14 @@ void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
 
 // ========follow the consistency guide of math API
 
+/// Compute the cross entropy given the prediction probabilities 'p' and the
+/// target labels 't', one loss value per sample.
+// following the consistency guide.
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
+                         const Blob *p, const Blob *t, Blob *loss,
+                         Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
-/// Divide alpha by each element of 'in'.
 template <typename DType, typename Lang>
 void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
          Context *ctx) {
@@ -364,6 +361,17 @@ void GE(const size_t num, const Blob *in, const DType x, Blob *out,
         Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
+template <typename DType, typename Lang>
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lang>
+void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
+                            const Blob *p, const Blob *t, Blob *grad,
+                            Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
 
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 693f09c..907c656 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -17,7 +17,9 @@
  */
 #ifndef SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
 #define SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
+
 #include "./tensor_math.h"
+#include <cfloat>
 #include "singa/core/common.h"
 #include <math.h>
 
@@ -210,6 +212,22 @@ void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob *ret,
 
 // follow the consistency guide of math API
 template <>
+void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize,
+                                           const size_t dim, const Blob *p,
+                                           const Blob *t, Blob *loss,
+                                           Context *ctx) {
+  const float *pPtr = static_cast<const float *>(p->data());
+  const float *tPtr = static_cast<const float *>(t->data());
+  float *lossPtr = static_cast<float *>(loss->mutable_data());
+  for (size_t i = 0; i < batchsize; i++) {
+    int truth_idx = static_cast<int>(tPtr[i]);
+    CHECK_GE(truth_idx, 0);
+    float prob_of_truth = pPtr[i * dim + truth_idx];
+    lossPtr[i] = -std::log(std::max(prob_of_truth, FLT_MIN));
+  }
+}
+
+template <>
 void Div<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
                            Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -249,13 +267,6 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
     }
   }
 }
-
-template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
-}
 template <>
 void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
                           Blob *out, Context *ctx) {
@@ -312,9 +323,32 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
               lda, BPtr, ldb, beta, CPtr, ldc);
 }
-
 #endif  // USE_CBLAS
 
+template <>
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
+                                              const size_t dim, const Blob *p,
+                                              const Blob *t,
+                                              Blob *grad, Context *ctx) {
+  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
+  // const float* pPtr = static_cast<const float*>(p->data());
+  const float *tPtr = static_cast<const float *>(t->data());
+  float *gradPtr = static_cast<float *>(grad->mutable_data());
+
+  for (size_t i = 0; i < batchsize; i++) {
+    int truth_idx = static_cast<int>(tPtr[i]);
+    CHECK_GE(truth_idx, 0);
+    gradPtr[i * dim + truth_idx] -= 1.0;
+  }
+}
+
+
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
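
This revision still stores the labels as floats and truncates them with static_cast<int>; the follow-up change (and the updated test, which calls t.AsType(singa::kInt)) moves to an int tensor instead. If float-encoded labels ever have to be converted, rounding is the safer choice, as this small standalone sketch shows; std::lround is plain C++, not a SINGA helper.

    // label_cast_sketch.cc -- why rounding beats truncation for float-encoded labels.
    #include <cmath>
    #include <cstdio>

    int main() {
      const float labels[3] = {0.0f, 1.9999999f, 3.0000002f};  // slightly off integers
      for (int i = 0; i < 3; i++) {
        int truncated = static_cast<int>(labels[i]);  // 0, 1, 3
        long rounded = std::lround(labels[i]);        // 0, 2, 3
        std::printf("%f -> trunc %d, round %ld\n", labels[i], truncated, rounded);
      }
      return 0;
    }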

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 4a2ba66..c69620c 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -75,6 +75,17 @@ void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret,
 
 // follow the consistency guide of math API
 template <>
+void ComputeCrossEntropy<float, lang::Cuda>(const size_t batchsize,
+                                            const size_t dim, const Blob *p,
+                                            const Blob *t, Blob *loss,
+                                            Context *ctx) {
+  const float *pPtr = static_cast<const float *>(p->data());
+  const int *tPtr = static_cast<const int *>(t->data());
+  float *lossPtr = static_cast<float *>(loss->mutable_data());
+  cuda::ComputeCrossEntropy(batchsize, dim, pPtr, tPtr, lossPtr, ctx->stream);
+}
+
+template <>
 void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
                             Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -82,19 +93,13 @@ void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
   cuda::Div(num, alpha, inPtr, outPtr, ctx->stream);
 }
 
-template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cuda::Set(num, x, outPtr, ctx->stream);
-}
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
                              const size_t ncol, const Blob *M, const Blob *v,
                              Blob *out, Context *ctx) {
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   const float *MPtr = static_cast<const float *>(M->data());
   const float *vPtr = static_cast<const float *>(v->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -121,7 +126,7 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
   const float *APtr = static_cast<const float *>(A->data());
   const float *BPtr = static_cast<const float *>(B->data());
   float *CPtr = static_cast<float *>(C->mutable_data());
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
 }
@@ -155,9 +160,25 @@ void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
   cuda::LT(num, inPtr, x, outPtr, ctx->stream);
 }
 
+template<>
+void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cuda::Set(num, x, outPtr, ctx->stream);
+}
 
-
-
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
+                                               const size_t dim, const Blob *p,
+                                               const Blob *t, Blob *grad,
+                                               Context *ctx) {
+  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
+  const float *pPtr = static_cast<const float *>(p->data());
+  const int *tPtr = static_cast<const int *>(t->data());
+  float *gradPtr = static_cast<float *>(grad->mutable_data());
+  cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
+                               ctx->stream);
+}
 
 }  // namespace singa
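
Note: the GEMM wrapper above relies on the fact that a row-major buffer read as column-major is the transpose of the matrix, so a row-major C = A*B is obtained by asking cuBLAS for C^T = B^T * A^T with the operands swapped. A small CPU-only sketch (hypothetical reference code, no cuBLAS dependency) of the same swap against a naive column-major GEMM:

    // Sketch: computing a row-major C = A*B with a column-major GEMM by
    // swapping the operands, as the cuBLAS wrapper above does.
    #include <cstdio>

    // Naive column-major GEMM: C(m x n) = A(m x k) * B(k x n); leading dims = rows.
    void gemm_colmajor(int m, int n, int k,
                       const float* A, const float* B, float* C) {
      for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
          float s = 0.f;
          for (int p = 0; p < k; p++) s += A[p * m + i] * B[j * k + p];
          C[j * m + i] = s;
        }
    }

    int main() {
      // Row-major A (2x3) and B (3x2); we want row-major C (2x2) = A*B.
      const float A[6] = {1, 2, 3, 4, 5, 6};
      const float B[6] = {7, 8, 9, 10, 11, 12};
      float C[4];
      // The row-major buffers, read column-major, are A^T and B^T, so call the
      // column-major routine on the transposed problem: C^T = B^T * A^T.
      gemm_colmajor(/*m=*/2, /*n=*/2, /*k=*/3, B, A, C);
      // C, read row-major, is A*B = {{58, 64}, {139, 154}}.
      std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
      return 0;
    }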
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/softmax.cc b/src/model/layer/softmax.cc
index 813ebf0..8af1d76 100644
--- a/src/model/layer/softmax.cc
+++ b/src/model/layer/softmax.cc
@@ -26,10 +26,11 @@ void Softmax::Setup(const LayerConf& conf) {
 
 const Tensor Softmax::Forward(int flag, const Tensor& input) {
   if (input.nDim() == 1) {
-    Tensor tmp = Reshape(input, Shape{1, input.Size()});
-    buf_.push(SoftMax(tmp, 0));
+    buf_.push(SoftMax(input));
   } else {
-    buf_.push(SoftMax(input, axis_));
+    size_t nrow = Product(input.shape(), 0, axis_);
+    const Tensor& tmp = Reshape(input, Shape{nrow, input.Size() / nrow});
+    buf_.push(SoftMax(tmp));
   }
   return buf_.top();
 }
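
Note: the reshape in Softmax::Forward above folds all dimensions before axis_ into rows and the remaining ones into columns (e.g. shape {2, 2, 3} with axis_ = 1 becomes 2 x 6), so the softmax is always applied row-wise on a 2-D view. A minimal sketch of such a row-wise softmax (illustrative code using the usual max-subtraction for numerical stability; not necessarily how singa::SoftMax is implemented):

    // Sketch: row-wise softmax over an (nrow x ncol) flattened view.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      const size_t nrow = 2, ncol = 3;
      std::vector<float> x = {1.f, 2.f, 3.f, 1.f, 1.f, 1.f};
      std::vector<float> y(x.size());

      for (size_t r = 0; r < nrow; r++) {
        // Subtract the row maximum before exponentiating for stability.
        float mx = x[r * ncol];
        for (size_t c = 1; c < ncol; c++) mx = std::max(mx, x[r * ncol + c]);
        float sum = 0.f;
        for (size_t c = 0; c < ncol; c++) {
          y[r * ncol + c] = std::exp(x[r * ncol + c] - mx);
          sum += y[r * ncol + c];
        }
        for (size_t c = 0; c < ncol; c++) y[r * ncol + c] /= sum;
      }
      std::printf("row0: %f %f %f\n", y[0], y[1], y[2]);  // ~0.09 0.24 0.67
      return 0;
    }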

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/loss/cross_entropy.h
----------------------------------------------------------------------
diff --git a/src/model/loss/cross_entropy.h b/src/model/loss/cross_entropy.h
deleted file mode 100644
index 815b795..0000000
--- a/src/model/loss/cross_entropy.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SRC_MODEL_LOSS_CROSS_ENTROPY_H_
-#define SRC_MODEL_LOSS_CROSS_ENTROPY_H_
-#include <stack>
-#include "singa/model/loss.h"
-
-namespace singa {
-
-/// Cross entropy is for cross entropy loss.
-class CrossEntropy : public Loss<Tensor> {
- public:
-  /// Compute the loss values for each sample/instance given the prediction
-  /// and the target, which is sum {-log(prob_of_truth)}
-  /// Users can call Average(const Tensor&) to get the average
-  /// loss value over all samples in the batch.
-  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
-
-  /// Compute the gradients of the loss values w.r.t. the prediction,
-  /// which is: if the entry x corresponds to ground truth,
-  /// then softmax(x) - 1; else, softmax(x)
-  Tensor Backward() override;
-
- private:
-  // to buffer intermediate data, i.e., softmax(prediction), target
-  std::stack<Tensor> buf_;
-};
-
-Tensor CrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
-  CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
-                      << " The calling pattern is [Forward|Evaluate] Backward";
-
-  size_t batchsize = 1;
-  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
-  size_t dim = prediction.Size() / batchsize;
-  // a temporal Softmax layer for forward computation
-//  LayerConf conf; // TODO(kaiping): this is currently commented
-//  Softmax softmax_tmp;
-//  softmax_tmp.Setup(conf);
-//  Tensor softmax = softmax_tmp.Forward(0, prediction);
-
-  Tensor softmax(Shape{batchsize, dim});  // TODO(kaiping): Delete
-//  softmax.SetValue<float>(0.5f); // TODO(kaiping): Delete
-
-  softmax.Reshape(Shape{batchsize, dim});
-  // buffer intermediate data
-  buf_.push(softmax);
-  buf_.push(target);
-
-  // Compute loss for each sample
-  Tensor loss(Shape{batchsize, 1});
-  float * pre_ptr = reinterpret_cast<float*>(softmax.blob()->mutable_data());
-  float * truth_ptr = reinterpret_cast<float*>(target.blob()->mutable_data());
-  float * loss_ptr = reinterpret_cast<float*>(loss.blob()->mutable_data());
-  for (size_t i = 0; i < batchsize; i++) {
-    int ilabel = static_cast<int>(truth_ptr[i]);
-    CHECK_GE(ilabel, 0);
-    float prob_of_truth = pre_ptr[ilabel];
-    loss_ptr[i] = -log(prob_of_truth);
-    pre_ptr += dim;  // change to the next sample
-  }
-  return loss;
-}
-
-Tensor CrossEntropy::Backward() {
-  const Tensor& target = buf_.top();
-  buf_.pop();
-  Tensor softmax = buf_.top();
-  buf_.pop();
-
-  size_t batchsize = 1;
-  if (softmax.nDim() > 1)
-    batchsize = softmax.shape().at(0);
-  size_t dim = softmax.Size() / batchsize;
-  float * truth_ptr = reinterpret_cast<float*>(target.blob()->mutable_data());
-  float * pre_ptr = reinterpret_cast<float*>(softmax.blob()->mutable_data());
-  for (size_t i = 0; i < batchsize; i++) {
-    int ilabel = static_cast<int>(truth_ptr[i]);
-    // CHECK_GE(ilabel, 0);
-    pre_ptr[ilabel] -= 1.0;
-    pre_ptr += dim;  // change to the next sample
-  }
-  return softmax;
-}
-}  // namespace singa
-
-#endif  // SRC_MODEL_LOSS_CROSS_ENTROPY_H_
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/loss/mse.cc
----------------------------------------------------------------------
diff --git a/src/model/loss/mse.cc b/src/model/loss/mse.cc
new file mode 100644
index 0000000..a4bbb72
--- /dev/null
+++ b/src/model/loss/mse.cc
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/loss.h"
+
+namespace singa {
+
+Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward more than once in a row."
+                      << " The calling pattern is [Forward|Evaluate] Backward";
+  Tensor t = prediction - target;
+  size_t batchsize = 1;
+  if (t.nDim() > 1) batchsize = t.shape().at(0);
+  size_t dim = t.Size() / batchsize;
+  t.Reshape(Shape{batchsize, dim});
+  buf_.push(t);
+  // TODO(wangwei) use CastType for operator/
+  return Sum(Square(t), 1) * 0.5f;
+}
+
+Tensor MSE::Backward() {
+  Tensor ret = buf_.top();
+  buf_.pop();
+  return ret * (1.0f / ret.shape().at(0));
+}
+}  // namespace singa
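
Note: as a standalone sketch (illustrative code, not SINGA's Tensor API), the forward pass above computes 0.5 * ||prediction - target||^2 per sample and the backward pass rescales the buffered difference by 1/batchsize:

    // Sketch of MSE::Forward / MSE::Backward semantics on a (batchsize x dim) batch.
    #include <cstdio>
    #include <vector>

    int main() {
      const size_t batchsize = 2, dim = 3;
      std::vector<float> p = {1.f, 2.f, 3.f, 0.f, 0.f, 0.f};
      std::vector<float> t = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f};

      std::vector<float> diff(batchsize * dim), loss(batchsize, 0.f);
      for (size_t i = 0; i < batchsize; i++)
        for (size_t j = 0; j < dim; j++) {
          diff[i * dim + j] = p[i * dim + j] - t[i * dim + j];
          loss[i] += 0.5f * diff[i * dim + j] * diff[i * dim + j];  // Forward
        }

      std::vector<float> grad(batchsize * dim);
      for (size_t k = 0; k < batchsize * dim; k++)
        grad[k] = diff[k] / static_cast<float>(batchsize);  // Backward

      std::printf("loss = {%g, %g}, grad[1] = %g\n", loss[0], loss[1], grad[1]);
      // loss = {0.5*(0+1+4), 0.5*(1+1+1)} = {2.5, 1.5}; grad[1] = 1/2 = 0.5.
      return 0;
    }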

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/loss/mse.h
----------------------------------------------------------------------
diff --git a/src/model/loss/mse.h b/src/model/loss/mse.h
deleted file mode 100644
index 1a022f9..0000000
--- a/src/model/loss/mse.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SINGA_MODEL_LOSS_MSE_H_
-#define SINGA_MODEL_LOSS_MSE_H_
-#include <stack>
-#include "singa/model/loss.h"
-
-namespace singa {
-
-/// MSE is for mean squared error or squared euclidean distance.
-class MSE : public Loss<Tensor> {
- public:
-  /// Compute the loss values for each sample/instance given the prediction
-  /// and the target, which is 0.5/||prediction-target||^2
-  /// Users can call Average(const Tensor&) to get the average
-  /// loss value over all samples in the batch.
-  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
-
-  /// Compute the gradients of the loss values w.r.t. the prediction,
-  /// which is (prediction-target)/batchsize
-  Tensor Backward() override;
-
- private:
-  // to buffer intermediate data, i.e., prediction-target
-  std::stack<Tensor> buf_;
-};
-
-Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
-  CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
-                      << " The calling pattern is [Forward|Evaluate] Backward";
-  Tensor t = prediction - target;
-  size_t batchsize = 1;
-  if (t.nDim() > 1) batchsize = t.shape().at(0);
-  size_t dim = t.Size() / batchsize;
-  t.Reshape(Shape{batchsize, dim});
-  buf_.push(t);
-  // TODO(wangwei) use CastType for operator/
-  return Sum(Square(t), 1) * 0.5f;
-}
-
-Tensor MSE::Backward() {
-  Tensor ret = buf_.top();
-  buf_.pop();
-  return ret * (1.0f / ret.shape().at(0));
-}
-}  // namespace singa
-
-#endif  // SINGA_MODEL_LOSS_H_
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/src/model/loss/softmax_cross_entropy.cc
----------------------------------------------------------------------
diff --git a/src/model/loss/softmax_cross_entropy.cc b/src/model/loss/softmax_cross_entropy.cc
new file mode 100644
index 0000000..4ca323a
--- /dev/null
+++ b/src/model/loss/softmax_cross_entropy.cc
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stack>
+#include "singa/model/loss.h"
+
+namespace singa {
+
+
+Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward more than once in a row."
+                      << " The calling pattern is [Forward|Evaluate] Backward";
+  size_t batchsize = 1;
+  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
+  size_t dim = prediction.Size() / batchsize;
+  const Tensor& input = Reshape(prediction, Shape{batchsize, dim});
+  Tensor prob = SoftMax(input);
+
+  // buffer intermediate data
+  buf_.push(prob);
+  buf_.push(target);
+  Tensor loss = prob.Clone();
+
+  ComputeCrossEntropy(target, &loss);
+  return loss;
+}
+
+Tensor SoftmaxCrossEntropy::Backward() {
+  const Tensor target = buf_.top();
+  buf_.pop();
+  Tensor prob = buf_.top();
+  buf_.pop();
+  SoftmaxCrossEntropyBwd(target, &prob);
+  return prob;
+}
+}  // namespace singa
+
+
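
Note: because the softmax output and the target are buffered on a stack, the expected calling pattern is one Forward (or Evaluate) followed by one Backward. A usage sketch mirroring the CPU test updated below (same headers and data as the test; the main() wrapper is illustrative):

    // Usage sketch, mirroring test/singa/test_cross_entropy.cc (CPU path).
    #include "singa/core/tensor.h"
    #include "singa/model/loss.h"

    int main() {
      singa::Tensor p(singa::Shape{2, 4}), t(singa::Shape{2, 1});
      const float pdat[8] = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f};
      const float tdat[2] = {0.f, 2.f};  // ground-truth class indices
      p.CopyDataFromHostPtr(pdat, 8);
      t.CopyDataFromHostPtr(tdat, 2);

      singa::SoftmaxCrossEntropy loss;
      singa::Tensor l = loss.Forward(p, t);  // per sample: -log(softmax(p)[truth])
      singa::Tensor g = loss.Backward();     // softmax(p), minus 1 at the truth index
      return 0;
    }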

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
index 9bb2321..6b8cb69 100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@ -22,16 +22,15 @@
 #include "gtest/gtest.h"
 #include "singa/core/tensor.h"
 #include "singa/core/device.h"
-#include "../src/model/loss/cross_entropy.h"
+#include "singa/model/loss.h"
+#include "singa_config.h"
 
 using singa::Tensor;
-class TestCrossEntropy : public ::testing::Test {
+class TestSoftmaxCrossEntropy : public ::testing::Test {
  protected:
   virtual void SetUp() {
     p.Reshape(singa::Shape{2, 4});
     t.Reshape(singa::Shape{2, 1});
-    p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
-    t.CopyDataFromHostPtr(tdat, sizeof(pdat) / sizeof(float));
   }
   const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
   const float tdat[2] = {0.0, 2.0};
@@ -39,8 +38,11 @@ class TestCrossEntropy : public ::testing::Test {
   singa::Tensor p, t;
 };
 
-TEST_F(TestCrossEntropy, CppForward) {
-  singa::CrossEntropy cross_entropy;
+TEST_F(TestSoftmaxCrossEntropy, CppForward) {
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  singa::SoftmaxCrossEntropy cross_entropy;
   const Tensor& loss = cross_entropy.Forward(p, t);
   auto ldat = loss.data<const float*>();
 
@@ -49,8 +51,11 @@ TEST_F(TestCrossEntropy, CppForward) {
   EXPECT_FLOAT_EQ(ldat[1], result_test);
 }
 
-TEST_F(TestCrossEntropy, CppBackward) {
-  singa::CrossEntropy cross_entropy;
+TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  singa::SoftmaxCrossEntropy cross_entropy;
   cross_entropy.Forward(p, t);
   const Tensor& grad = cross_entropy.Backward();
 
@@ -64,3 +69,46 @@ TEST_F(TestCrossEntropy, CppBackward) {
   EXPECT_FLOAT_EQ(gdat[6], -0.75);
   EXPECT_FLOAT_EQ(gdat[7], 0.25);
 }
+
+#ifdef USE_CUDA
+
+TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
+  singa::SoftmaxCrossEntropy cross_entropy;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  Tensor loss = cross_entropy.Forward(p, t);
+  loss.ToHost();
+  auto ldat = loss.data<const float*>();
+
+  const float result_test = -log(0.25);
+  EXPECT_FLOAT_EQ(ldat[0], result_test);
+  EXPECT_FLOAT_EQ(ldat[1], result_test);
+}
+
+TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
+  singa::SoftmaxCrossEntropy cross_entropy;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  cross_entropy.Forward(p, t);
+  Tensor grad = cross_entropy.Backward();
+
+  grad.ToHost();
+  auto gdat = grad.data<const float*>();
+  EXPECT_FLOAT_EQ(gdat[0], -0.75);
+  EXPECT_FLOAT_EQ(gdat[1], 0.25);
+  EXPECT_FLOAT_EQ(gdat[2], 0.25);
+  EXPECT_FLOAT_EQ(gdat[3], 0.25);
+  EXPECT_FLOAT_EQ(gdat[4], 0.25);
+  EXPECT_FLOAT_EQ(gdat[5], 0.25);
+  EXPECT_FLOAT_EQ(gdat[6], -0.75);
+  EXPECT_FLOAT_EQ(gdat[7], 0.25);
+}
+#endif  // USE_CUDA
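
Note: the expected values in these tests follow from the uniform logits: the softmax of four equal inputs gives probability 0.25 per class, so each per-sample loss is -log(0.25) ~= 1.386, and the gradient equals the probabilities everywhere except at the ground-truth indices (classes 0 and 2 for the two samples), where it is 0.25 - 1 = -0.75.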

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 67f583c..a6bd1c3 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -22,8 +22,9 @@
 #include "gtest/gtest.h"
 #include "singa/core/tensor.h"
 #include "singa/core/device.h"
-#include "../src/model/loss/mse.h"
+#include "singa/model/loss.h"
 #include "singa_config.h"
+
 using singa::Tensor;
 class TestMSE : public ::testing::Test {
  protected:
@@ -66,6 +67,8 @@ TEST_F(TestMSE, CppBackward) {
     EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
 }
 #endif
+
+#ifdef USE_CUDA
 TEST_F(TestMSE, CudaForward) {
   singa::MSE mse;
   singa::CudaGPU dev;
@@ -98,3 +101,4 @@ TEST_F(TestMSE, CudaBackward) {
   for (size_t i = 0; i < grad.Size(); i++)
     EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
 }
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ec17acab/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_softmax.cc b/test/singa/test_softmax.cc
index da2a6ef..09dfcd9 100644
--- a/test/singa/test_softmax.cc
+++ b/test/singa/test_softmax.cc
@@ -55,7 +55,6 @@ TEST(Softmax, Forward) {
   const float* yptr = out.data<const float*>();
   EXPECT_EQ(n, out.Size());
 
-  float* y = new float[n];
   float* sigma = new float[row];
   for (size_t i = 0; i < row; i++)
     sigma[i] = 0.f;
@@ -63,11 +62,9 @@ TEST(Softmax, Forward) {
     sigma[i / col] += exp(x[i]);
   //EXPECT_EQ(0, sigma[1]);
   for (size_t i = 0; i < row; i++)
-    for (size_t j = 0; j < col; j++)
-      y[i * col + j] = exp(x[i * col + j]) / sigma[i];
-  EXPECT_FLOAT_EQ(y[0], yptr[0]);
-  EXPECT_FLOAT_EQ(y[4], yptr[4]);
-  EXPECT_FLOAT_EQ(y[5], yptr[5]);
+    for (size_t j = 0; j < col; j++) {
+      EXPECT_FLOAT_EQ(yptr[i * col + j], exp(x[i * col + j]) / sigma[i]);
+    }
 }
 
 TEST(Softmax, Backward) {