Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/12 07:27:53 UTC

[1/5] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Repository: incubator-singa
Updated Branches:
  refs/heads/dev 01aaf4900 -> 6d69047ad


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 94ca283..38a9291 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -5,17 +5,17 @@ using singa::Shape;
 using singa::Device;
 
 class TestTensorMath : public ::testing::Test {
-protected:
+ protected:
   virtual void SetUp() {
     a.Reshape(singa::Shape{6});
     b.Reshape(singa::Shape{6});
     c.Reshape(singa::Shape{6, 1});
     d.Reshape(singa::Shape{3, 2});
-		e.Reshape(singa::Shape{3, 2});
+    e.Reshape(singa::Shape{3, 2});
 
     a.CopyDataFromHostPtr<float>(dat1, 6);
     b.CopyDataFromHostPtr<float>(dat2, 6);
-		e.CopyDataFromHostPtr<float>(dat1, 6);
+    e.CopyDataFromHostPtr<float>(dat1, 6);
   }
   Tensor a, b, c, d, e;
   const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
@@ -23,264 +23,262 @@ protected:
 };
 
 TEST_F(TestTensorMath, MemberAbs) {
-	Tensor aa = a.Clone();
-	Tensor bb = b.Clone();
-	Tensor cc = aa - bb;
-	const float* dptr = cc.data<const float*>();
-	EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+  Tensor aa = a.Clone();
+  Tensor bb = b.Clone();
+  Tensor cc = aa - bb;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-0.1, dptr[0], 1e-5);
   EXPECT_NEAR(-0.1, dptr[1], 1e-5);
   EXPECT_NEAR(-0.1, dptr[2], 1e-5);
 
-	Tensor p = Abs(cc);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+  Tensor p = Abs(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(0.1, dptr1[0], 1e-5);
   EXPECT_NEAR(0.1, dptr1[1], 1e-5);
   EXPECT_NEAR(0.1, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberExp) {
-	Tensor p = Exp(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+  Tensor p = Exp(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
   EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
   EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberLog) {
-	Tensor p = Log(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+  Tensor p = Log(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
   EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
   EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberReLU) {
-	Tensor aa = a.Clone();
-	Tensor cc = aa - 2.0f;
-	const float* dptr = cc.data<const float*>();
-	EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
 
-	Tensor p = ReLU(cc);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+  Tensor p = ReLU(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
   EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
   EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberSigmoid) {
-	Tensor p = Sigmoid(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(1.0f/(1.0f + exp(-1.0f)), dptr1[0], 1e-5);
-  EXPECT_NEAR(1.0f/(1.0f + exp(-2.0f)), dptr1[1], 1e-5);
-  EXPECT_NEAR(1.0f/(1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+  Tensor p = Sigmoid(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberSign) {
-	Tensor aa = a.Clone();
-	Tensor cc = aa - 2.0f;
-	const float* dptr = cc.data<const float*>();
-	EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
 
-	Tensor p = Sign(cc);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_EQ(0.0f, dptr1[0]);
+  Tensor p = Sign(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_EQ(0.0f, dptr1[0]);
   EXPECT_EQ(0.0f, dptr1[1]);
   EXPECT_EQ(1.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberSqrt) {
-	Tensor p = Sqrt(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+  Tensor p = Sqrt(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
   EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
   EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberSquare) {
-	Tensor p = Square(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+  Tensor p = Square(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(1.0, dptr1[0], 1e-5);
   EXPECT_NEAR(4.0, dptr1[1], 1e-5);
   EXPECT_NEAR(9.0, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberTanh) {
-	Tensor p = Tanh(a);
-	const float* dptr1 = p.data<const float*>();
-	EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+  Tensor p = Tanh(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
   EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
   EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, Sum) {
-	Tensor p1 = Sum(e, 0);
+  Tensor p1 = Sum(e, 0);
   const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
-	EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
+  EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
 
-	Tensor p2(Shape{3,1});
-	p2 = Sum(e, 1);
+  Tensor p2(Shape{3, 1});
+  p2 = Sum(e, 1);
   const float *dptr2 = p2.data<const float *>();
-	EXPECT_FLOAT_EQ(3.0f,dptr2[0]);
-	EXPECT_FLOAT_EQ(7.0f,dptr2[1]);
-	EXPECT_FLOAT_EQ(11.0f,dptr2[2]);
+  EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+  EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+  EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
 }
 
 TEST_F(TestTensorMath, SoftMax) {
-	Tensor p1(Shape{3,2});
-	p1 = SoftMax(e,0);
+  Tensor p1(Shape{3, 2});
+  p1 = SoftMax(e, 0);
   const float *dptr1 = p1.data<const float *>();
-	float sum = 0;
-	for(int i = 0; i < 6; i++) sum += exp(i+1);
-	EXPECT_NEAR(exp(1)/sum, dptr1[0],1e-5);
-	EXPECT_NEAR(exp(3)/sum, dptr1[2],1e-5);
-	EXPECT_NEAR(exp(5)/sum, dptr1[4],1e-5);
-	EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
-	EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
-	EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
-
-	Tensor p2(Shape{3,2});
-	p2 = SoftMax(e,1);
+  float sum = 0;
+  for (int i = 0; i < 6; i++) sum += exp(i + 1);
+  EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+  EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+  EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+  EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+
+  Tensor p2(Shape{3, 2});
+  p2 = SoftMax(e, 1);
   const float *dptr2 = p2.data<const float *>();
-	EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
-	EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
+  EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+  EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberLT) {
-	Tensor p1 = a < 2.0f;
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
-	EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
-	EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+  Tensor p1 = a < 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberLE) {
-	Tensor p1 = a <= 2.0f;
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
-	EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
-	EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+  Tensor p1 = a <= 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberGT) {
-	Tensor p1 = a > 2.0f;
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
-	EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
-	EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+  Tensor p1 = a > 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberGE) {
-	Tensor p1 = a >= 2.0f;
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
-	EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
-	EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+  Tensor p1 = a >= 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
 }
 
 TEST_F(TestTensorMath, MemberPow) {
-	Tensor p1 = Pow(b,3.0f);
-	const float *dptr1 = p1.data<const float *>();
-	EXPECT_FLOAT_EQ(pow(1.1f,3.0f), dptr1[0]);
-	EXPECT_FLOAT_EQ(pow(2.1f,3.0f), dptr1[1]);
-	EXPECT_FLOAT_EQ(pow(3.1f,3.0f), dptr1[2]);
+  Tensor p1 = Pow(b, 3.0f);
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+  EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+  EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
 
-	//TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the function is complete
-	//Tensor p2 = Pow(a,b);
-	//const float *dptr2 = p2.data<const float *>();
-	//EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
-	//EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
-	//EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+  // TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the
+  // function is complete
+  // Tensor p2 = Pow(a,b);
+  // const float *dptr2 = p2.data<const float *>();
+  // EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+  // EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+  // EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
 }
 
-
 TEST_F(TestTensorMath, MemberSub) {
-	Tensor p1 = a - b;
-	const float* dptr1 = p1.data<const float*>();
-	EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+  Tensor p1 = a - b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
   EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
   EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberEltwiseMult) {
-	Tensor p1 = a * b;
-	const float* dptr1 = p1.data<const float*>();
-	EXPECT_NEAR(1.0*1.1, dptr1[0], 1e-5);
-  EXPECT_NEAR(2.0*2.1, dptr1[1], 1e-5);
-  EXPECT_NEAR(3.0*3.1, dptr1[2], 1e-5);
+  Tensor p1 = a * b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberDiv) {
-	Tensor p1 = a / b;
-	const float* dptr1 = p1.data<const float*>();
-	EXPECT_NEAR(1.0/1.1, dptr1[0], 1e-5);
-  EXPECT_NEAR(2.0/2.1, dptr1[1], 1e-5);
-  EXPECT_NEAR(3.0/3.1, dptr1[2], 1e-5);
+  Tensor p1 = a / b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
 
-	Tensor p2 = Div(10.0f,b);
-	const float* dptr2 = p2.data<const float*>();
-	EXPECT_NEAR(10.0/1.1, dptr2[0], 1e-5);
-  EXPECT_NEAR(10.0/2.1, dptr2[1], 1e-5);
-  EXPECT_NEAR(10.0/3.1, dptr2[2], 1e-5);
+  Tensor p2 = Div(10.0f, b);
+  const float *dptr2 = p2.data<const float *>();
+  EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+  EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+  EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
 
-	Tensor p3 = a / 8.0f;
-	const float* dptr3 = p3.data<const float*>();
-	EXPECT_NEAR(1.0/8.0, dptr3[0], 1e-5);
-  EXPECT_NEAR(2.0/8.0, dptr3[1], 1e-5);
-  EXPECT_NEAR(3.0/8.0, dptr3[2], 1e-5);
+  Tensor p3 = a / 8.0f;
+  const float *dptr3 = p3.data<const float *>();
+  EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+  EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+  EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
 }
 
 TEST_F(TestTensorMath, MemberBernoulli) {
-	Tensor p1(Shape{10000});
-	Bernoulli(0.3f, &p1);
-	const float* dptr1 = p1.data<const float*>();
-	float sum = 0;
-	for(int i = 0; i < 10000; i++) sum += dptr1[i];
-	float mean = sum/10000;
-	EXPECT_NEAR(mean, 0.3f, 1e-2);
+  Tensor p1(Shape{10000});
+  Bernoulli(0.3f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.3f, 1e-2);
 
-	sum = 0;
-	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
-	float variance = sum/9999;
-	EXPECT_NEAR(variance, 0.3*0.7, 1e-2);
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
 }
 
 TEST_F(TestTensorMath, MemberUniform) {
-	Tensor p1(Shape{10000});
-	Uniform(0.1f,0.2f,&p1);
-	const float* dptr1 = p1.data<const float*>();
-	float sum = 0;
-	for(int i = 0; i < 10000; i++) sum += dptr1[i];
-	float mean = sum/10000;
-	EXPECT_NEAR(mean, 0.15f, 1e-3);
+  Tensor p1(Shape{10000});
+  Uniform(0.1f, 0.2f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.15f, 1e-3);
 
-	sum = 0;
-	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
-	float variance = sum/9999;
-	EXPECT_NEAR(variance, 0.01f/12, 1e-3);
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
 }
 
 TEST_F(TestTensorMath, MemberGaussian) {
-	Tensor p1(Shape{50000});
-	Gaussian(0.0f,1.0f,&p1);
-	const float* dptr1 = p1.data<const float*>();
-	float sum = 0;
-	for(int i = 0; i < 50000; i++) sum += dptr1[i];
-	float mean = sum/50000;
-	EXPECT_NEAR(mean, 0.0, 1e-2);
+  Tensor p1(Shape{50000});
+  Gaussian(0.0f, 1.0f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 50000; i++) sum += dptr1[i];
+  float mean = sum / 50000;
+  EXPECT_NEAR(mean, 0.0, 1e-2);
 
-	sum = 0;
-	for(int i = 0; i < 50000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
-	float variance = sum/49999;
-	EXPECT_NEAR(variance, 1.0, 1e-2);
+  sum = 0;
+  for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 49999;
+  EXPECT_NEAR(variance, 1.0, 1e-2);
 }
 
-
-
 TEST_F(TestTensorMath, MemberAddTensor) {
   Tensor aa = a.Clone();
   aa += a;
@@ -333,8 +331,7 @@ TEST_F(TestTensorMath, SetValue) {
   Tensor t(Shape{4});
   t.SetValue(0.3f);
   const float *ptr = t.data<const float *>();
-  for (int i = 0; i < 4; i++)
-    EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+  for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
 }
 
 TEST_F(TestTensorMath, Reshape) {
@@ -344,10 +341,15 @@ TEST_F(TestTensorMath, Reshape) {
   const float *ptr = t.data<const float *>();
   EXPECT_EQ(p.shape(0), 4u);
   EXPECT_EQ(p.shape(1), 1u);
-  for (int i = 0; i < 4; i++)
-    EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+  for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
 }
 #ifdef USE_CBLAS
+TEST_F(TestTensorMath, L2Cpp) {
+  float l2 = a.L2();
+  float target = 0.0f;
+  for (size_t i = 0; i < a.Size(); i++) target += dat1[i] * dat1[i];
+  EXPECT_FLOAT_EQ(l2, sqrt(target));
+}
 TEST_F(TestTensorMath, MultCpp) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   Tensor t(Shape{2, 2});
@@ -368,8 +370,7 @@ TEST_F(TestTensorMath, MultCpp) {
   Tensor s(Shape{4, 2});
   s.CopyDataFromHostPtr(y, 8);
   const float *sPtr = s.data<const float *>();
-  for (int i = 0; i < 8; i++)
-    EXPECT_FLOAT_EQ(sPtr[i], y[i]);
+  for (int i = 0; i < 8; i++) EXPECT_FLOAT_EQ(sPtr[i], y[i]);
   Tensor D = Mult(d, s.T());
   const float *DPtr = D.data<const float *>();
   for (int i = 0; i < 3; i++) {
@@ -423,7 +424,6 @@ TEST_F(TestTensorMath, SubColumnCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, DivColumnCpp) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
@@ -438,7 +438,6 @@ TEST_F(TestTensorMath, DivColumnCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, AddRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
@@ -453,7 +452,6 @@ TEST_F(TestTensorMath, AddRowCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, SubRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
@@ -468,7 +466,6 @@ TEST_F(TestTensorMath, SubRowCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, MultRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
@@ -483,7 +480,6 @@ TEST_F(TestTensorMath, MultRowCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, SumRowsCpp) {
   Tensor t(Shape{2});
   d.CopyDataFromHostPtr(dat1, 6);
@@ -498,7 +494,6 @@ TEST_F(TestTensorMath, SumRowsCpp) {
   }
 }
 
-
 TEST_F(TestTensorMath, SumColumnsCpp) {
   Tensor t(Shape{3});
   d.CopyDataFromHostPtr(dat1, 6);
@@ -514,6 +509,15 @@ TEST_F(TestTensorMath, SumColumnsCpp) {
 }
 #endif
 #ifdef USE_CUDA
+TEST_F(TestTensorMath, L2Cuda) {
+  singa::CudaGPU dev;
+  Tensor t(Shape{3, 2}, &dev);
+  t.CopyDataFromHostPtr(dat1, 6);
+  float l2 = t.L2();
+  float target = 0.0f;
+  for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
+  EXPECT_FLOAT_EQ(l2, sqrt(target));
+}
 TEST_F(TestTensorMath, MultCuda) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   singa::CudaGPU dev;
@@ -582,7 +586,6 @@ TEST_F(TestTensorMath, AddColumnCuda) {
   }
 }
 
-
 TEST_F(TestTensorMath, SubColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   singa::CudaGPU dev;
@@ -757,4 +760,5 @@ TEST_F(TestTensorMath, SumColumnCuda) {
     EXPECT_FLOAT_EQ(tptr[i], tmp);
   }
 }
+
 #endif
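
The three random-op tests above compare the sample mean and the unbiased sample variance (hence the n - 1 divisors 9999 and 49999) against closed-form moments: Bernoulli(p) has mean p and variance p(1-p) = 0.3 * 0.7, Uniform(a, b) has mean (a+b)/2 = 0.15 and variance (b-a)^2/12 = 0.01/12, and Gaussian(mu, sigma) has mean mu = 0 and variance sigma^2 = 1. A minimal standalone sketch of the estimator the tests rely on (SampleMoments is a hypothetical helper, not part of the SINGA API):

    #include <cstddef>

    // Sample mean and unbiased sample variance (divisor n - 1), the two
    // statistics checked in MemberBernoulli/MemberUniform/MemberGaussian.
    void SampleMoments(const float *data, size_t n, float *mean,
                       float *variance) {
      float sum = 0.f;
      for (size_t i = 0; i < n; i++) sum += data[i];
      *mean = sum / n;
      sum = 0.f;
      for (size_t i = 0; i < n; i++)
        sum += (data[i] - *mean) * (data[i] - *mean);
      *variance = sum / (n - 1);  // unbiased (Bessel-corrected) estimator
    }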


[3/5] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index ec7a892..2c5c272 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -25,12 +25,11 @@
 #include <cblas.h>
 #endif
 
-/// TODO(wangwei) Clean the implementations following the comments in
-/// tensor_math.h.
 namespace singa {
 
-template<>
-void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+template <>
+void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -39,180 +38,150 @@ void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context
 }
 
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] + x;
+  }
 }
 
-// sum all elements of input into out
-// TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
-  float s = 0.f;
-  const float *inPtr = static_cast<const float *>(in->data());
+void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
-    s += inPtr[i];
+    outPtr[i] = in1Ptr[i] + in2Ptr[i];
   }
-  *out = s;
 }
 
 template <>
-void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Clamp<float, lang::Cpp>(const size_t num, const float low,
+                             const float high, const Blob *in, Blob *out,
+                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float*>(in->data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; 
+    if (inPtr[i] > high) {
+      outPtr[i] = high;
+    } else if (inPtr[i] < low) {
+      outPtr[i] = low;
+    } else {
+      outPtr[i] = inPtr[i];
+    }
   }
 }
 
 template <>
-void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(in2Ptr[i], 0.f);
+    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+  }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
+                           Blob *out, Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = exp(inPtr[i]);
+    CHECK_NE(inPtr[i], 0.f);
+    outPtr[i] = x / inPtr[i];
   }
 }
 
 template <>
-void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in,
+                                   const float x, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
-    outPtr[i] = log(inPtr[i]);
+    outPtr[i] = inPtr[i] * x;
   }
 }
 
 template <>
-void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1,
+                                   const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] * in2Ptr[i];
+  }
+}
+template <>
+void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
-    outPtr[i] = sqrt(inPtr[i]);
+    outPtr[i] = exp(inPtr[i]);
   }
 }
 
 template <>
-void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] * inPtr[i];
+    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
   }
 }
 
 template <>
-void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = tanh(inPtr[i]);
+    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
   }
 }
-
 template <>
-void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
   }
 }
-
 template <>
-void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = log(inPtr[i]);
   }
 }
-
 template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
-             Blob *out, Context *ctx) {
+void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-	float *bPtr = new float[ncol];
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-		float denom = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-			bPtr[c] = exp(inPtr[offset + c]);
-			denom += bPtr[c];
-    }
-		for (size_t c = 0; c < ncol; c++) {
-			size_t idx = offset + c;
-			outPtr[idx] = bPtr[c] / denom;
-		}
-  }
-	delete bPtr;
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
-             Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		outPtr[r] = 0.f;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[r] += inPtr[offset + c];
-		}
-	}
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t c = 0; c < ncol; c++) {
-		outPtr[c] = 0.f;
-	}
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-				outPtr[c] += inPtr[offset + c];
-		}
-	}
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());                              
-  const float *vPtr = static_cast<const float *>(v->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[offset + c] = APtr[offset + c] + vPtr[c];
-		}
-	}
-}
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());                              
-  const float *vPtr = static_cast<const float *>(v->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[offset + c] = APtr[offset + c] + vPtr[r];
-		}
-	}
-}
-
-template <>
-void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t i = 0; i < num; i++) {
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+  }
+}
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
     outPtr[i] = pow(inPtr[i], x);
   }
 }
@@ -220,252 +189,230 @@ void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob
 template <>
 void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
                            Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr= static_cast<const float *>(in1->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
     outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
   }
 }
-
 template <>
-void Clamp<float, lang::Cpp>(const size_t num, const float low, const float high, const Blob *in,
-														 Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t i = 0; i < num; i++) {
-		if (inPtr[i] > high) {
-			outPtr[i] = high;
-		}
-		else if (inPtr[i] < low) {
-			outPtr[i] = low;
-		}
-		else {
-			outPtr[i] = inPtr[i];			
-		}
-	}
+void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+  }
 }
-
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x, 
-													 Blob *out, Context *ctx) {
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+template <>
+void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] + x;
+    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
   }
 }
 
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] + in2Ptr[i];
+    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f;
   }
 }
 
 template <>
-void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] - in2Ptr[i];
+    CHECK_GE(inPtr[i], 0.f);  // sqrt(0) is a valid input
+    outPtr[i] = sqrt(inPtr[i]);
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
+void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] * x;
+    outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
+void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] * in2Ptr[i];
+    outPtr[i] = in1Ptr[i] - in2Ptr[i];
   }
 }
 
+// sum all elements of input into out
+// TODO(wangwei) optimize using omp
 template <>
-void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                           Context *ctx) {
+  float s = 0.f;
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-		CHECK_NE(in2Ptr[i],0.f);
-    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+    s += inPtr[i];
   }
+  *out = s;
 }
 
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in, 
-         								  Blob *out, Context *ctx) {
-	float *outPtr= static_cast<float *>(out->mutable_data());
+void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-		CHECK_NE(inPtr[i],0.f);
-    outPtr[i] = x / inPtr[i];
+    outPtr[i] = tanh(inPtr[i]);
   }
 }
 
+// =========Matrix operations ================================================
+
 template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
-           Blob *out, Context *ctx) {
-	float *outPtr= static_cast<float *>(out->mutable_data());
-	const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-	for (size_t r = 0; r < m ; r++) {
-		size_t offset = r * n;
-		for (size_t c = 0; c < n; c++) {
-			outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
-		}
-	}
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
 }
 
 template <>
-void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
   }
 }
-
 template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                             const Blob *in2, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
   }
 }
-
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
   }
+  delete[] bPtr;  // allocated with new[], so use delete[]
 }
 
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Blob *in, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
   }
 }
 
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
-	size_t maxPos = 0;
-	float maxVal = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		if (i == 0) {
-			maxVal = inPtr[i]; 
-		}
-		else if (inPtr[i] > maxVal) {
-			maxVal = inPtr[i];
-			maxPos = i;
-		}
-	}
-	*out = maxPos;
-}
-
-template <>
-void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
-	size_t minPos = 0;
-	float minVal = 0;
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		if (i == 0) {
-			minVal = inPtr[i]; 
-		}
-		else if (inPtr[i] > minVal) {
-			minVal = inPtr[i];
-			minPos = i;
-		}
-	}
-	*out = minPos;
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
 }
 
+// ===============Random operations==========================================
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
-	float sum = 0;
-	const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		sum += fabs(inPtr[i]);
-	}
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
+                                 Context *ctx) {
+  std::bernoulli_distribution distribution(p);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+  }
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
-          							 	  Blob *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
+                                const float std, Blob *out, Context *ctx) {
+  std::normal_distribution<float> distribution(mean, std);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {	
-		outPtr[i] += alpha * inPtr[i];
-	}
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
 }
-
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-	for (size_t i = 0; i < num; i++) {
-		outPtr[i] *= x;
-	}
+void Uniform<float, lang::Cpp>(const size_t num, const float low,
+                               const float high, Blob *out, Context *ctx) {
+  std::uniform_real_distribution<float> distribution(low, high);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
 }
 
-//template <>
-//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-//         									 float *out, Context *ctx) {
-//	float sum = 0;
-//	const float *in1Ptr = static_cast<const float *>(in1->data());
-//	const float *in2Ptr = static_cast<const float *>(in2->data());
-//	for (size_t i = 0; i < num; i++) {
-//		sum += in1Ptr[i] * in2Ptr[i];
-//	}
-//}
-
-template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, const float alpha,
-          const Blob *A, const Blob *v, const float beta, 
-					Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-	const float* APtr = static_cast<const float *>(A->data());
-	const float* vPtr = static_cast<const float *>(v->data());
-	for (size_t r = 0; r < m; r++) {
-		float sum = 0; 
-		for (size_t c = 0; c < n; c++) {
-			size_t idx = trans ? c * m + r : r * n + c;	
-			sum += APtr[idx] * vPtr[c];
-		}
-		outPtr[r] = alpha * sum + beta * outPtr[r];
-	}
-}
+// ====================Blas operations======================================
 
 template <>
 void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
@@ -491,37 +438,21 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
   }
 }
 
+#ifdef USE_CBLAS
 template <>
-void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, Context *ctx) {
-  std::bernoulli_distribution distribution(p);
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                            Blob *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
-  }
+  cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
 }
-
-template <>
-void Uniform<float, lang::Cpp>(const size_t num, const float low, const float high, Blob *out,
-                               Context *ctx) {
-  std::uniform_real_distribution<float> distribution(low, high);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-  }
-}
-
 template <>
-void Gaussian<float, lang::Cpp>(const size_t num, const float mean, const float std, Blob *out,
-                                Context *ctx) {
-  std::normal_distribution<float> distribution(mean, std);
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-  }
+  cblas_sscal(num, x, outPtr, 1);
 }
 
-
-#ifdef USE_CBLAS
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
                            float *out, Context *ctx) {
@@ -529,6 +460,21 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
   const float *in2Ptr = static_cast<const float *>(in2->data());
   *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
 }
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  if (!trans) {
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+                beta, outPtr, 1);
+  } else {
+    cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
+                outPtr, 1);
+  }
+}
 
 template <>
 void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
@@ -548,6 +494,98 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
               lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
+#else
+
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t maxPos = 0;
+  float maxVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      maxVal = inPtr[i];
+    } else if (inPtr[i] > maxVal) {
+      maxVal = inPtr[i];
+      maxPos = i;
+    }
+  }
+  *out = maxPos;
+}
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t minPos = 0;
+  float minVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      minVal = inPtr[i];
+    } else if (inPtr[i] < minVal) {
+      minVal = inPtr[i];
+      minPos = i;
+    }
+  }
+  *out = minPos;
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  float sum = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += fabs(inPtr[i]);
+  }
+  *out = sum;
+}
+
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] += alpha * inPtr[i];
+  }
+}
+
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] *= x;
+  }
+}
+
+template <>
+void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           float *out, Context *ctx) {
+  float sum = 0;
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += in1Ptr[i] * in2Ptr[i];
+  }
+  *out = sum;
+}
+
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < m; r++) {
+    float sum = 0;
+    for (size_t c = 0; c < n; c++) {
+      size_t idx = trans ? c * m + r : r * n + c;
+      sum += APtr[idx] * vPtr[c];
+    }
+    outPtr[r] = alpha * sum + beta * outPtr[r];
+  }
+}
+
 #endif  // USE_CBLAS
 }  // namespace singa
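
Each routine in the file above is a full specialization of a generic function template declared in tensor_math.h, keyed on a data type and a backend tag (lang::Cpp here, lang::Cuda for the GPU file); the Tensor front end dispatches by instantiating, e.g., Abs<float, lang::Cpp>. A minimal sketch of that dispatch pattern, with simplified types (the tag structs, the Scale example, and the fallback body are illustrative, not the exact declarations from tensor_math.h):

    #include <cstddef>
    #include <cstdio>

    namespace sketch {
    struct Cpp {};   // backend tag, analogous to singa::lang::Cpp
    struct Cuda {};  // backend tag, analogous to singa::lang::Cuda

    // Generic declaration: any <DType, Lang> pair without a matching
    // specialization falls through to a loud runtime error.
    template <typename DType, typename Lang>
    void Scale(const size_t num, const DType x, DType *out) {
      std::fprintf(stderr, "Scale is not implemented for this backend\n");
    }

    // Full specialization, selected when a caller writes
    // Scale<float, Cpp>(num, x, out).
    template <>
    void Scale<float, Cpp>(const size_t num, const float x, float *out) {
      for (size_t i = 0; i < num; i++) out[i] *= x;
    }
    }  // namespace sketch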
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 4a2ba66..f9841a3 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -26,75 +26,100 @@
 #include "singa/core/common.h"
 
 namespace singa {
-
-// TODO(wangwei) Clean implementations following comments in tensor_math_cpp.h.
-// TODO(wangwei) optimize using stream
+// =================Elementwise operations===================================
 template <>
-void Add<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
-                            Blob *ret, Context *ctx) {
-  const float *a = static_cast<const float *>(lhs->data());
-  const float *b = static_cast<const float *>(rhs->data());
-  float *c = static_cast<float *>(ret->mutable_data());
-  cuda::add(count, a, b, c);
+void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
+                            Blob *out, Context *ctx) {
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cuda::add(num, in1Ptr, in2Ptr, outPtr);
 }
 
-// TODO(wangwei) optimize using stream
+// follows the consistency guide of the math API
 template <>
-void Sub<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
-                            Blob *ret, Context *ctx) {
-  const float *a = static_cast<const float *>(lhs->data());
-  const float *b = static_cast<const float *>(rhs->data());
-  float *c = static_cast<float *>(ret->mutable_data());
-  cuda::sub(count, a, b, c);
+void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::Div(num, x, inPtr, outPtr, ctx->stream);
 }
 
 template <>
-void EltwiseMult<float, lang::Cuda>(int count, const Blob *input, float x,
-                                    Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  cuda::mult(count, lptr, x, dptr);
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in,
+                                    const float x, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::mult(num, inPtr, x, outPtr);
 }
-// TODO(wangwei) optimize using stream
 template <>
-void Square<float, lang::Cuda>(int count, const Blob *input, Blob *ret,
-                               Context *ctx) {
-  const float *in = static_cast<const float *>(input->data());
-  float *out = static_cast<float *>(ret->mutable_data());
-  cuda::square(count, in, out);
+void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
 }
-
-// sum all elements of input into ret
-// TODO(wangwei) optimize using stream
 template <>
-void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret,
-                            Context *ctx) {
-  const float *in = static_cast<const float *>(input->data());
-  cuda::sum(count, in, ret);
+void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
 }
-
-// follow the consistency guide of math API
 template <>
-void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
-                            Blob *out, Context *ctx) {
+void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  cuda::Div(num, alpha, inPtr, outPtr, ctx->stream);
+  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
 }
-
 template <>
 void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   cuda::Set(num, x, outPtr, ctx->stream);
 }
+// TODO(wangwei) optimize using stream
+template <>
+void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out,
+                               Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::square(num, inPtr, outPtr);
+}
+// TODO(wangwei) optimize using stream
+template <>
+void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  cuda::sub(num, in1Ptr, in2Ptr, outPtr);
+}
+// sum all elements of input into out
+// TODO(wangwei) optimize using stream
+template <>
+void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::sum(num, inPtr, out);
+}
+
+// =========================Blas operations==================================
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
                              const size_t ncol, const Blob *M, const Blob *v,
                              Blob *out, Context *ctx) {
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   const float *MPtr = static_cast<const float *>(M->data());
   const float *vPtr = static_cast<const float *>(v->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -106,6 +131,22 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
                              vPtr, 1, outPtr, ncol));
   }
 }
+template <>
+void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
+                             const float alpha, const Blob *A, const Blob *v,
+                             const float beta, Blob *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  if (!trans)
+    CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
+                             1, &beta, outPtr, 1));
+  else
+    CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, APtr, m, vPtr,
+                             1, &beta, outPtr, 1));
+}
+
 // http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
 template <>
 void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
@@ -121,44 +162,11 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
   const float *APtr = static_cast<const float *>(A->data());
   const float *BPtr = static_cast<const float *>(B->data());
   float *CPtr = static_cast<float *>(C->mutable_data());
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
 }
 
-template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out, Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out,  Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out, Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out,  Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
-}
-
-
-
-
-
 }  // namespace singa
 
 #endif  // USE_CUDA

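Note on the DGMM/GEMV/GEMM wrappers above: cuBLAS is column-major, so the
wrappers either swap the operands (GEMM) or flip the transpose flag (GEMV)
to work on row-major buffers. A minimal standalone sketch, not part of the
patch and with illustrative names, of computing a row-major C = A * B
through column-major cuBLAS:

  // Row-major C = A * B; A is nrowA x ncolA, B is ncolA x ncolB.
  #include <cublas_v2.h>
  void RowMajorGemm(cublasHandle_t handle, int nrowA, int ncolA, int ncolB,
                    const float *A, const float *B, float *C) {
    const float alpha = 1.0f, beta = 0.0f;
    // cuBLAS reads each row-major buffer as its own transpose, so passing
    // B first computes (B^T A^T) = (A B)^T, i.e. C in row-major layout.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, ncolB, nrowA, ncolA,
                &alpha, B, ncolB, A, ncolA, &beta, C, ncolB);
  }
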
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 823445f..94ca283 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -117,12 +117,11 @@ TEST_F(TestTensorMath, MemberTanh) {
 }
 
 TEST_F(TestTensorMath, Sum) {
-	Tensor p1(Shape{1,2});
-	p1 = Sum(e, 0);
+	Tensor p1 = Sum(e, 0);
   const float *dptr1 = p1.data<const float *>();
 	EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
 	EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
-	
+
 	Tensor p2(Shape{3,1});
 	p2 = Sum(e, 1);
   const float *dptr2 = p2.data<const float *>();
@@ -143,9 +142,9 @@ TEST_F(TestTensorMath, SoftMax) {
 	EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
 	EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
 	EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
-	
+
 	Tensor p2(Shape{3,2});
-	p2 = SoftMax(e,1); 
+	p2 = SoftMax(e,1);
   const float *dptr2 = p2.data<const float *>();
 	EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
 	EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
@@ -237,12 +236,12 @@ TEST_F(TestTensorMath, MemberDiv) {
 
 TEST_F(TestTensorMath, MemberBernoulli) {
 	Tensor p1(Shape{10000});
-	Bernoulli(0.3,&p1);
+	Bernoulli(0.3f, &p1);
 	const float* dptr1 = p1.data<const float*>();
 	float sum = 0;
 	for(int i = 0; i < 10000; i++) sum += dptr1[i];
 	float mean = sum/10000;
-	EXPECT_NEAR(mean, 0.3, 1e-2);
+	EXPECT_NEAR(mean, 0.3f, 1e-2);
 
 	sum = 0;
 	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
@@ -267,7 +266,7 @@ TEST_F(TestTensorMath, MemberUniform) {
 
 TEST_F(TestTensorMath, MemberGaussian) {
 	Tensor p1(Shape{50000});
-	Gaussian(0.0,1.0,&p1);
+	Gaussian(0.0f,1.0f,&p1);
 	const float* dptr1 = p1.data<const float*>();
 	float sum = 0;
 	for(int i = 0; i < 50000; i++) sum += dptr1[i];



[2/5] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
SINGA-182 Clean math function APIs and implementations

Merge branch 'cuda' from #jinyangturbo.
Clean the CUDA-related code (tensor_math_cuda.h, kernel_math.h and kernel_math.cu)
by unifying the function arguments (names and argument order).
The functions still need to be reordered.
Add Nrm2 for L2 norm using cblas and cublas.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6d69047a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6d69047a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6d69047a

Branch: refs/heads/dev
Commit: 6d69047addc46e5c9f381b7e1d4cebd20ce9b2e3
Parents: 564c88a
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Jun 12 12:08:48 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |   2 +
 src/core/tensor/math_kernel.cu     | 656 +++++++++++++++++---------------
 src/core/tensor/math_kernel.h      |  93 ++---
 src/core/tensor/tensor.cc          |  14 +
 src/core/tensor/tensor_math.h      | 140 ++++---
 src/core/tensor/tensor_math_cpp.h  | 227 ++++++-----
 src/core/tensor/tensor_math_cuda.h | 384 +++++++++++++++----
 test/singa/test_tensor_math.cc     | 346 ++++++++---------
 8 files changed, 1092 insertions(+), 770 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 82bbe81..cd750c5 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -173,6 +173,8 @@ class Tensor {
   template <typename SType>
   Tensor &operator/=(const SType x);
 
+  float L2() const;
+
  protected:
   bool transpose_ = false;
   DataType data_type_ = kFloat32;

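Note: the new L2() accessor is the user-facing entry to the Nrm2 routine
mentioned in the commit message. A usage sketch with illustrative values:

  singa::Tensor t(singa::Shape{4});
  const float v[4] = {3.0f, 4.0f, 0.0f, 0.0f};
  t.CopyDataFromHostPtr<float>(v, 4);
  float norm = t.L2();  // Euclidean norm of v, expected 5.0f
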
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index aed6add..b618f9b 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -35,36 +35,16 @@
 namespace singa {
 // Cuda Kernel Functions
 namespace cuda {
-__global__ void kernel_softmax_loss(const float *prob, const int *label,
-                                    float *loss, int n, int dim) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    float prob_of_truth = prob[index * dim + label[index]];
-    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
-  }
-}
-
-__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
-                                        int dim, float scale) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    int pos = index * dim + label[index];
-    grad[pos] = (grad[pos] - 1.0f) * scale;
-  }
-}
-
-__global__ void kernel_sum_vec(const float *data, float *sum, int n) {
+__global__ void KernelSum(const size_t n, const float *in, float *out) {
   int THREADS = blockDim.x;
 
   __shared__ float aux[CU1DBLOCK];
   int steps = (n - 1) / THREADS + 1;
-  aux[threadIdx.x] = data[threadIdx.x];
+  aux[threadIdx.x] = in[threadIdx.x];
 
   for (int i = 1; i < steps; ++i) {
     if (threadIdx.x + i * THREADS < n) {
-      aux[threadIdx.x] += data[threadIdx.x + i * THREADS];
+      aux[threadIdx.x] += in[threadIdx.x + i * THREADS];
     }
   }
 
@@ -83,432 +63,484 @@ __global__ void kernel_sum_vec(const float *data, float *sum, int n) {
   }
 
   __syncthreads();
-  *sum = aux[0];
+  *out = aux[0];
 }
 
-__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
-                               int rows, int cols, int stride) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < rows; index += num_threads) {
-    dst_vec_data[index] = 0.0f;
-    for (int k = 0; k < cols; k++) {
-      dst_vec_data[index] += src_mat_data[index * stride + k];
-    }
+__global__ void KernelAdd(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] + in2[i];
   }
 }
 
-__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
-                               int rows, int cols, int stride) {
-  int j = blockIdx.x;
-  int THREADS = blockDim.x;
-  if (j >= cols) {
-    return;
-  }
-
-  __shared__ float aux[CU1DBLOCK];
-  int steps = (rows - 1) / THREADS + 1;
-  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
-  for (int i = 1; i < steps; ++i) {
-    if (threadIdx.x + i * THREADS < rows) {
-      aux[threadIdx.x] +=
-          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
-    }
+__global__ void KernelAdd(const size_t n, const float *in, const float x,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] + x;
   }
+}
 
-  int total_threads = THREADS;
-  __syncthreads();
-  while (total_threads > 1) {
-    int half_point = ((1 + total_threads) >> 1);
-    if (threadIdx.x < half_point) {
-      if (threadIdx.x + half_point < total_threads) {
-        aux[threadIdx.x] += aux[threadIdx.x + half_point];
-      }
-    }
-    __syncthreads();
-    total_threads = ((total_threads + 1) >> 1);
+__global__ void KernelSub(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] - in2[i];
   }
-
-  __syncthreads();
-  dst_vec_data[j] = aux[0];
 }
 
-__global__ void kernel_add_vec_row(const float *src_vec_data,
-                                   const float *src_mat_data,
-                                   float *des_mat_data, int rows, int cols,
-                                   int stride) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int j = blockIdx.y * blockDim.y + threadIdx.y;
-  int num_threads_x = blockDim.x * gridDim.x;
-  int num_threads_y = blockDim.y * gridDim.y;
-  int index = 0;
-  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
-    index = j * stride + i;
-    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+__global__ void KernelExp(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::exp(in[i]);
   }
 }
-__global__ void kernel_add(const float *src1, const float *src2, float *out,
-                           int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    out[index] = src1[index] + src2[index];
+
+__global__ void KernelLog(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::log(in[i]);
   }
 }
 
-__global__ void kernel_sub(const float *src1, const float *src2, float *out,
-                           int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    out[index] = src1[index] - src2[index];
+__global__ void KernelSigmoid(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = 1.0f / (1.0f + expf(-in[i]));
   }
 }
-__global__ void kernel_exp(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::exp(src_data[index]);
+__global__ void KernelSign(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    if (in[i] > 0.0f)
+      out[i] = 1.0f;
+    else if (in[i] < 0.0f)
+      out[i] = -1.0f;
+    else
+      out[i] = 0.0f;
   }
 }
 
-__global__ void kernel_log(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::log(src_data[index]);
+__global__ void KernelClamp(const size_t n, const float low, const float high,
+                            const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    if (in[i] > high)
+      out[i] = high;
+    else if (in[i] < low)
+      out[i] = low;
+    else
+      out[i] = in[i];
   }
 }
 
-__global__ void kernel_sigmoid(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+__global__ void KernelRelu(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = max(in[i], 0.0f);
   }
 }
 
-__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
-                                    int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] * (1.0f - src_data[index]);
+__global__ void KernelAbs(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = max(in[i], -in[i]);
   }
 }
 
-__global__ void kernel_relu(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = max(src_data[index], 0.0f);
+__global__ void KernelTanh(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = tanhf(in[i]);
   }
 }
 
-__global__ void kernel_relu_grad(const float *src_data, float *des_data,
-                                 int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+__global__ void KernelSoftplus(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = logf(1 + expf(in[i]));
   }
 }
-
-__global__ void kernel_tanh(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = tanhf(src_data[index]);
+__global__ void KernelSquare(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] * in[i];
   }
 }
-
-__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
-                                 int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+__global__ void KernelSqrt(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::sqrt(in[i]);
   }
 }
 
-__global__ void kernel_softplus(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = logf(1 + expf(src_data[index]));
+__global__ void KernelPow(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::pow(in1[i], in2[i]);
   }
 }
 
-__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
-                                     int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+__global__ void KernelPow(const size_t n, const float *in, const float x,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::pow(in[i], x);
   }
 }
 
-__global__ void kernel_square(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] * src_data[index];
+__global__ void KernelMult(const size_t n, const float *in1, const float *in2,
+                           float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] * in2[i];
   }
 }
 
-__global__ void kernel_square_grad(const float *src_data, float *des_data,
-                                   int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 2 * src_data[index];
+__global__ void KernelMult(const size_t n, const float *in, const float x,
+                           float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] * x;
   }
 }
 
-__global__ void kernel_sqrt(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::sqrt(src_data[index]);
+__global__ void KernelDiv(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] / in2[i];
   }
 }
-
-__global__ void kernel_pow(const float *src_data_a, const float *src_data_b,
-                           float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::pow(src_data_a[index], src_data_b[index]);
+__global__ void KernelDiv(const size_t n, const float x, const float *in,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = x / in[i];
   }
 }
-
-__global__ void kernel_mult(const float *src_data_a, const float *src_data_b,
-                            float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] * src_data_b[index];
+__global__ static void KernelSet(const size_t n, const float x, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = x;
   }
 }
 
-__global__ void kernel_mult(const float *src_data_a, const float x,
-                            float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] * x;
+__global__ void KernelThreshold(const size_t n, const float x, const float *in,
+                                float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] < x ? 1.0f : 0.0f;
   }
 }
 
-__global__ void kernel_div(const float *src_data_a, const float *src_data_b,
-                           float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] / src_data_b[index];
+__global__ void KernelGE(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
   }
 }
-
-__global__ static void kernel_set_value(float *data, float value, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    data[index] = value;
+__global__ void KernelGT(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] > x ? 1.0f : 0.0f;
   }
 }
-
-__global__ void kernel_threshold(const float *src_data, float *des_data,
-                                 float alpha, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
+__global__ void KernelLE(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] <= x ? 1.0f : 0.0f;
   }
 }
-void sum(int n, const float *in, float *out) {
-  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-  //  here, we only need one block
-  int num_blocks = 1;
 
-  kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
+__global__ void KernelLT(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] < x ? 1.0f : 0.0f;
+  }
 }
 
-void sum_row(int rows, int cols, int stride, const float *in, float *out) {
-  int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-  int num_blocks = cols;
+// ********************************
+// Functions that call the kernels
+// ********************************
 
-  kernel_sum_row << <num_blocks, threads_per_block>>>
-      (in, out, rows, cols, stride);
+void set(const size_t n, const float v, float *out, cudaStream_t s) {
+  KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
 }
 
-void sum_col(int rows, int cols, int stride, const float *in, float *out) {
-  int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
-  int num_blocks = rows;
+void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
 
-  kernel_sum_col << <num_blocks, threads_per_block>>>
-      (in, out, rows, cols, stride);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void add_row(int rows, int cols, int stride, const float *in_row,
-             const float *in_mat, float *out) {
-  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-  dim3 num_blocks(
-      cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
-      rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
-  kernel_add_vec_row << <num_blocks, threads_per_block>>>
-      (in_row, in_mat, out, rows, cols, stride);
+
+void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void add(int n, const float *a, const float *b, float *out) {
-  kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void sub(int n, const float *a, const float *b, float *out) {
-  kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void exp(int n, const float *in, float *out) {
-  kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+
+void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
 
-void log(int n, const float *in, float *out) {
-  kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
 
-void sigmoid(int n, const float *in, float *out) {
-  kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
+  KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void clamp(const size_t n, const float low, const float high, const float *in,
+           float *out, cudaStream_t s) {
+  KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
 }
 
-void sigmoid_grad(int n, const float *in, float *out) {
-  kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s) {
+  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void relu(int n, const float *in, float *out) {
-  kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s) {
+  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void relu_grad(int n, const float *in, float *out) {
-  kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void tanh(int n, const float *in, float *out) {
-  kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void div(const size_t n, const float x, const float *in, float *out,
+          cudaStream_t s) {
+  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
 }
 
-void tanh_grad(int n, const float *in, float *out) {
-  kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void threshold(const size_t n, const float x, const float *in, float *out,
+               cudaStream_t s) {
+  KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
 }
 
-void softplus(int n, const float *in, float *out) {
-  kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void gt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void ge(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void lt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void le(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 }
 
-void softplus_grad(int n, const float *in, float *out) {
-  kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void square(int n, const float *in, float *out) {
-  kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void square_grad(int n, const float *in, float *out) {
-  kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void sqrt(int n, const float *in, float *out) {
-  kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void pow(int n, const float *a, const float *b, float *out) {
-  kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void mult(int n, const float *a, const float *b, float *out) {
-  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+  //  here, we only need one block
+  int num_blocks = 1;
+  KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
+}
+/*
+void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void mult(int n, const float *a, const float x, float *out) {
-  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
+void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void div(int n, const float *a, const float *b, float *out) {
-  kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void set_value(int n, float v, float *out) {
-  kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
+
+void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void threshold(int n, float alpha, const float *in, float *out) {
-  kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
+void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-// follow the consistency guide for math API
-__global__ void KernelDiv(const size_t num, const float alpha, const float *in,
-                          float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = alpha / in[idx];
+
+__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < rows; index += num_threads) {
+    dst_vec_data[index] = 0.0f;
+    for (int k = 0; k < cols; k++) {
+      dst_vec_data[index] += src_mat_data[index * stride + k];
+    }
   }
 }
 
-__global__ void KernelGE(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
+__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int j = blockIdx.x;
+  int THREADS = blockDim.x;
+  if (j >= cols) {
+    return;
   }
-}
-__global__ void KernelGT(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] > x ? 1.0f : 0.0f;
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (rows - 1) / THREADS + 1;
+  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < rows) {
+      aux[threadIdx.x] +=
+          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
+    }
   }
-}
-__global__ void KernelLE(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] <= x ? 1.0f : 0.0f;
+
+  int total_threads = THREADS;
+  __syncthreads();
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
   }
+
+  __syncthreads();
+  dst_vec_data[j] = aux[0];
 }
 
-__global__ void KernelLT(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] < x ? 1.0f : 0.0f;
+
+__global__ void kernel_add_vec_row(const float *src_vec_data,
+                                   const float *src_mat_data,
+                                   float *des_mat_data, int rows, int cols,
+                                   int stride) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int num_threads_x = blockDim.x * gridDim.x;
+  int num_threads_y = blockDim.y * gridDim.y;
+  int index = 0;
+  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
+    index = j * stride + i;
+    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
   }
 }
 
-__global__ void KernelSet(const size_t num, const float x, float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = x;
+__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
+                                    int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * (1.0f - src_data[index]);
   }
 }
 
-void Set(const size_t num, const float x, float *out, cudaStream_t s) {
-  KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
+
+__global__ void kernel_relu_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+  }
 }
-void Div(const size_t num, float alpha, const float *in, float *out,
-         cudaStream_t s) {
-  KernelDiv << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, alpha, in, out);
+
+__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+  }
 }
 
-void GT(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelGT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+
+__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
+                                     int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
 }
-void GE(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelGE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void KernelSquareGrad(const float *src_data, float *des_data,
+                                   int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 2 * src_data[index];
+  }
 }
-void LT(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelLT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void kernel_softmax_loss(const float *prob, const int *label,
+                                    float *loss, int n, int dim) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    float prob_of_truth = prob[index * dim + label[index]];
+    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
+  }
 }
-void LE(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
+                                        int dim, float scale) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    int pos = index * dim + label[index];
+    grad[pos] = (grad[pos] - 1.0f) * scale;
+  }
 }
+*/
+
 
 }  // namespace cuda
 }  // namespace singa

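Note: every rewritten kernel above follows the same grid-stride idiom, which
keeps a kernel correct for any launch geometry. The pattern in isolation,
with an illustrative name not taken from the patch:

  __global__ void KernelScale(const size_t n, const float x, float *out) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x) {
      out[i] *= x;  // each thread strides over the whole array
    }
  }
  // launched with the block geometry used throughout the file:
  //   KernelScale<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(n, x, out);

KernelSum is the one exception: it runs a single-block shared-memory tree
reduction over aux[], which is why its launcher fixes num_blocks to 1.
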
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5c906a9..d8a58a5 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -31,65 +31,66 @@ namespace singa {
 
 // TODO(wangwei) make all function templates.
 namespace cuda {
-void sum(int n, const float *in, float *out);
 
-void sum_row(int rows, int cols, int stride, const float *in, float *out);
-
-void sum_col(int rows, int cols, int stride, const float *in, float *out);
-
-void add_row(int rows, int cols, int stride, const float *in_row,
-             const float *in_mat, float *out);
-
-void add(int n, const float *a, const float *b, float *out);
-
-void sub(int n, const float *a, const float *b, float *out);
-
-void exp(int n, const float *in, float *out);
-
-void log(int n, const float *in, float *out);
-
-void sigmoid(int n, const float *in, float *out);
-
-void sigmoid_grad(int n, const float *in, float *out);
-
-void relu(int n, const float *in, float *out);
-
-void relu_grad(int n, const float *in, float *out);
-
-void tanh(int n, const float *in, float *out);
-
-void tanh_grad(int n, const float *in, float *out);
+// 0 input
+void set(const size_t n, const float v, float *out, cudaStream_t s);
+
+// 1 input
+void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+void log(const size_t n, const float *in, float *out, cudaStream_t s);
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+void square(const size_t n, const float *in, float *out, cudaStream_t s);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s);
+void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+void clamp(const size_t n, const float low, const float high, const float *in,
+           float *out, cudaStream_t s);
+
+void pow(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s);
 
-void softplus(int n, const float *in, float *out);
+void add(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s);
 
-void softplus_grad(int n, const float *in, float *out);
+void mult(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
 
-void square(int n, const float *in, float *out);
+void div(const size_t n, const float x, const float *in, float *out,
+         cudaStream_t s);
 
-void square_grad(int n, const float *in, float *out);
+void threshold(const size_t n, const float x, const float *in, float *out,
+               cudaStream_t s);
 
-void sqrt(int n, const float *in, float *out);
+void gt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void ge(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void lt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void le(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
 
-void pow(int n, const float *a, const float *b, float *out);
+// 2 inputs
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void mult(int n, const float *a, const float *b, float *out);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void mult(int n, const float *a, const float x, float *out);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void div(int n, const float *a, const float *b, float *out);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
 
-void set_value(int n, float v, float *out);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void threshold(int n, float alpha, const float *in, float *out);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s);
 
-// follow the consistency guide for math API
-void Div(const size_t num, const float x, const float *in, float *out,
-         cudaStream_t s);
-void Set(const size_t num, const float x, float *out, cudaStream_t s);
-void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 }  // cuda
 
 }  // namespace singa

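Note: the header is now grouped by arity (0-, 1- and 2-input functions) and
every function takes a trailing cudaStream_t. A call-site sketch with
assumed buffers and sizes:

  cudaStream_t s = ctx->stream;           // stream carried by the Context
  singa::cuda::set(n, 0.0f, out, s);      // 0 input
  singa::cuda::relu(n, in, out, s);       // 1 input
  singa::cuda::add(n, in1, in2, out, s);  // 2 inputs

As the TODO notes in tensor_math_cuda.h indicate, the stream argument is
threaded through the API but the kernel launches do not pass it yet.
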
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index f4e9da2..e62386a 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -219,6 +219,8 @@ GenUnaryScalarArgMemberFn(operator+=, Add);
 GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFn(operator/=, Div);
 
+
+
 // ====================Tensor Operations=======================================
 void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
                     const size_t dst_offset, const size_t src_offset) {
@@ -309,6 +311,18 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
   } while (0)
 
 // =============Element-wise operations====================================
+/// L2 norm; named L2 instead of Nrm2 to avoid a name conflict.
+float Tensor::L2() const {
+  float nrm = 0.0f;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    device_->Exec([&nrm, this](Context *ctx) {
+      DType ret;
+      Nrm2<DType, Lang>(this->Size(), this->blob(), &ret, ctx);
+      nrm = TypeCast<DType, float>(ret);
+    }, {this->blob()}, {});
+  });
+  return nrm;
+}
 template <typename SType>
 void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));

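Note: L2() dispatches to the backend Nrm2 added by this commit ("using cblas
and cublas"). A hedged sketch of what the cuBLAS specialization would look
like; the actual code lives in tensor_math_cuda.h and may differ in details
such as error checking:

  template <>
  void Nrm2<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
                               Context *ctx) {
    auto handle = ctx->cublas_handle;
    const float *inPtr = static_cast<const float *>(in->data());
    // cublasSnrm2 writes the Euclidean norm of num floats into *out
    cublasSnrm2(handle, num, inPtr, 1, out);
  }
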
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index b5d0ba9..b86e1cb 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -48,41 +48,45 @@ namespace singa {
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Blob level math functions.
 
-// =============Element-wise operations====================================
+// **************************************
+// Element-wise functions
+// **************************************
+
 /// out[i] = |in[i]|
 template <typename DType, typename Lang>
 void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
-/// out = in + x
+/// out[i] = in[i] + x
 template <typename DType, typename Lang>
 void Add(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Add Not Implemented";
 }
 
-/// out = in1 + in2
+/// out[i] = in1[i] + in2[i]
 template <typename DType, typename Lang>
 void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Add-Pair Not Implemented";
 }
-/// Element-wise operation, clamp every element into [low, high]
-/// if x>high, then x=high; if x<low, then x=low.
+/// Clamp every element into [low, high]
+/// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
 template <typename DType, typename Lang>
 void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
            Blob *out, Context *ctx) {
   LOG(FATAL) << "Clamp Not Implemented";
 }
 
-/// out = x / in
+/// out[i] = x / in[i]
 template <typename DType, typename Lang>
 void Div(const size_t num, const DType x, const Blob *in, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Div Not Implemented";
 }
 
+/// out[i] = in[i] / x
 template <typename DType, typename Lang>
 void Div(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
@@ -90,21 +94,21 @@ void Div(const size_t num, const Blob *in, const DType x, Blob *out,
   EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
-/// out = in1 / in2
+/// out[i] = in1[i] / in2[i]
 template <typename DType, typename Lang>
 void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
-/// out = in * x
+/// out[i] = in[i] * x
 template <typename DType, typename Lang>
 void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
-/// out = in2 * in2
+/// out[i] = in1[i] * in2[i]
 template <typename DType, typename Lang>
 void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
                  Context *ctx) {
@@ -146,31 +150,32 @@ void GT(const size_t num, const Blob *in, const DType x, Blob *out,
         Context *ctx) {
   LOG(FATAL) << "GT Not Implemented";
 }
-/// Element-wise operation, do v^x for every v from the in tensor
+/// out[i] = pow(in[i], x)
 template <typename DType, typename Lang>
 void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
-/// Element-wise operation, do v^x for every v from the lhs and every x from rhs
+/// out[i]=pow(in1[i], in2[i])
 template <typename DType, typename Lang>
 void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
-/// Element-wise operation, out[i]=max(0, in[i])
+/// out[i]=max(0, in[i])
 template <typename DType, typename Lang>
 void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "ReLU Not Implemented";
 }
 
+/// out[i] = x
 template <typename DType, typename Lang>
 void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Set Not Implemented";
 }
-/// Element-wise operation, out[i]=sigmoid([in[i])
+/// out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
 void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sigmoid Not Implemented";
@@ -181,85 +186,47 @@ template <typename DType, typename Lang>
 void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sign Not Implemented";
 }
-/// Element-wise operation, out[i]=sqrt([in[i])
+/// out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
 void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sqrt Not Implemented";
 }
 
-/// Element-wise operation, out[i]=square([in[i])
+/// out[i]=square(in[i])
 template <typename DType, typename Lang>
 void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Square Not Implemented";
+  EltwiseMult<DType, Lang>(num, in, in, out, ctx);
 }
 
-/// out =  in - x
+/// out[i] =  in[i] - x
 template <typename DType, typename Lang>
 void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   Add<DType, Lang>(num, in, -x, out, ctx);
 }
 
-/// out = in1 - in2
+/// out[i] = in1[i] - in2[i]
 template <typename DType, typename Lang>
 void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Sub-Pair Not Implemented";
 }
+
 /// sum all elements of in into out
 template <typename DType, typename Lang>
 void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Sum Not Implemented";
 }
 
-/// Element-wise operation, out[i]=tanh([in[i])
+/// out[i]=tanh(in[i])
 template <typename DType, typename Lang>
 void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Tanh Not Implemented";
 }
 
-// =========== Matrix operations ===========================================
-/// Add the vector v to every column of A as the column of out
-template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddCol Not Implemented";
-}
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out
-template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddRow Not Implemented";
-}
-/// outer-product.
-/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
-template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
-           Blob *out, Context *ctx) {
-  LOG(FATAL) << "Outer Not Implemented";
-}
-// Do softmax for each row invidually
-template <typename DType, typename Lang>
-void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-             Context *ctx) {
-  LOG(FATAL) << "Softmax Not Implemented";
-}
-/// Sum the columns of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-                Context *ctx) {
-  LOG(FATAL) << "SumColumns Not Implemented";
-}
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-             Context *ctx) {
-  LOG(FATAL) << "SumRows Not Implemented";
-}
-
-// ================Random functions===========================================
+// **************************************
+// Random functions
+// **************************************
 /// Each element of out is 1 with probability p and 0 with probability 1-p, 0 <= p <= 1
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
@@ -282,7 +249,10 @@ void Uniform(const size_t num, const float low, const float high, Blob *out,
   LOG(FATAL) << "Uniform Not Implemented";
 }
 
-// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
 void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
@@ -307,12 +277,19 @@ void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
+/// out = ||in||_2, i.e., the L2 norm.
+template <typename DType, typename Lang>
+void Nrm2(const size_t num, const Blob *in, float *out, Context *ctx) {
+  LOG(FATAL) << "Nrm2 Not Implemented";
+}
+
 /// out *= x
 template <typename DType, typename Lang>
 void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Scale Not Implemented";
 }
 
+/// inner product of array in1 and in2
 template <typename DType, typename Lang>
 void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
          Context *ctx) {
@@ -346,5 +323,44 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
   LOG(FATAL) << "GEMM Not Implemented";
 }
 
+// **************************************
+// Matrix functions
+// **************************************
+/*
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
+}
+/// outer-product.
+/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
+template <typename DType, typename Lang>
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Outer Not Implemented";
+}
+
+/// Sum the columns of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
+}
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
+}
+*/
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
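
A note for readers following the API cleanup in this header: every kernel is
declared as a generic template whose body is a fatal log naming the function,
and real work is supplied by per-backend specializations (lang::Cpp,
lang::Cuda). A minimal standalone sketch of that dispatch pattern, with a
plain float array standing in for Blob/Context and fprintf/abort standing in
for glog (illustrative only, not SINGA's actual code):

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    namespace lang { struct Cpp {}; struct Cuda {}; }

    // Generic fallback: names the function in the fatal message,
    // mirroring the "Scale Not Implemented" convention above.
    template <typename DType, typename Lang>
    void Scale(const size_t num, const DType x, DType *out) {
      std::fprintf(stderr, "Scale Not Implemented\n");
      std::abort();
    }

    // Backend specialization; only <float, lang::Cpp> is provided here.
    template <>
    void Scale<float, lang::Cpp>(const size_t num, const float x, float *out) {
      for (size_t i = 0; i < num; i++) out[i] *= x;
    }

    int main() {
      float v[3] = {1.f, 2.f, 3.f};
      Scale<float, lang::Cpp>(3, 2.f, v);     // dispatches to the CPU loop
      // Scale<float, lang::Cuda>(3, 2.f, v); // would hit the fatal fallback
      std::printf("%g %g %g\n", v[0], v[1], v[2]);  // prints: 2 4 6
      return 0;
    }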

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 2c5c272..0b280a3 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -241,7 +241,7 @@ void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
     outPtr[i] = sqrt(inPtr[i]);
   }
 }
-
+/*
 template <>
 void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
                               Context *ctx) {
@@ -251,6 +251,7 @@ void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
     outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
+*/
 
 template <>
 void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -287,101 +288,6 @@ void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
   }
 }
 
-// =========Matrix operations ================================================
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Blob *A, const Blob *v, Blob *out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
-    }
-  }
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Blob *A, const Blob *v, Blob *out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
-    }
-  }
-}
-template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
-                             const Blob *in2, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t r = 0; r < m; r++) {
-    size_t offset = r * n;
-    for (size_t c = 0; c < n; c++) {
-      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
-    }
-  }
-}
-template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *bPtr = new float[ncol];
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    float denom = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-      bPtr[c] = exp(inPtr[offset + c]);
-      denom += bPtr[c];
-    }
-    for (size_t c = 0; c < ncol; c++) {
-      size_t idx = offset + c;
-      outPtr[idx] = bPtr[c] / denom;
-    }
-  }
-  delete bPtr;
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t c = 0; c < ncol; c++) {
-    outPtr[c] = 0.f;
-  }
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[c] += inPtr[offset + c];
-    }
-  }
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    outPtr[r] = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[r] += inPtr[offset + c];
-    }
-  }
-}
-
 // ===============Random operations==========================================
 template <>
 void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
@@ -440,18 +346,26 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
 
 #ifdef USE_CBLAS
 template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_isamax(num, inPtr, 1);
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_sasum(num, inPtr, 1);
+}
+
+template <>
 void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
                             Blob *out, Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
   cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
 }
-template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                             Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cblas_sscal(num, x, outPtr, 1);
-}
 
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -461,6 +375,19 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
   *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
 }
 template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cblas_sscal(num, x, outPtr, 1);
+}
+template <>
+void Nrm2<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_snrm2(num, inPtr, 1);
+}
+
+template <>
 void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
                             const float alpha, const Blob *A, const Blob *v,
                             const float beta, Blob *out, Context *ctx) {
@@ -587,6 +514,102 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
 }
 
 #endif  // USE_CBLAS
+
+// =========Matrix operations ================================================
+/*
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
+  }
+}
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                             const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
+  }
+}
+template <>
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
+  }
+  delete[] bPtr;
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
+  }
+}
+
+template <>
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
+}
+*/
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
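
For readers checking the CBLAS-backed specializations above against the
comments in tensor_math.h: cblas_snrm2 returns the Euclidean norm itself
(the square root of the inner product), and cblas_sasum the sum of absolute
values. A small standalone sketch, independent of the Blob/Context plumbing
(link against a CBLAS implementation, e.g. -lopenblas; illustrative only):

    #include <cblas.h>
    #include <cmath>
    #include <cstdio>

    int main() {
      const float x[4] = {1.f, -2.f, 2.f, 4.f};
      float nrm  = cblas_snrm2(4, x, 1);       // ||x||_2 = sqrt(25) = 5
      float dot  = cblas_sdot(4, x, 1, x, 1);  // <x, x> = 25
      float asum = cblas_sasum(4, x, 1);       // sum |x_i| = 9
      std::printf("nrm2=%g sqrt(dot)=%g asum=%g\n",
                  nrm, std::sqrt(dot), asum);
      return 0;
    }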

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index f9841a3..e2597d5 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -24,105 +24,336 @@
 #include "./math_kernel.h"
 #include "singa/utils/cuda_utils.h"
 #include "singa/core/common.h"
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
 
 namespace singa {
-// =================Elementwise operations===================================
+
+/// out[i] = |in[i]|
+template <>
+void Abs<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::abs(num, inPtr, outPtr, ctx->stream);
+}
+/// out = in + x
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::add(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 + in2
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::add(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
+template <>
+void Clamp<float, lang::Cuda>(const size_t num, const float low,
+                              const float high, const Blob* in, Blob* out,
+                              Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
+}
+/// out = in1 / in2
+template <>
+void Div<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+template <>
+void Div<float, lang::Cuda>(const size_t num, const float x, const Blob* in,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::div(num, x, inPtr, outPtr, ctx->stream);
+}
+
+/// out = in * x
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in,
+                                    const float x, Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::mult(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 * in2
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in1,
+                                    const Blob* in2, Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Base is e. out[i]=e^in[i]
+template <>
+void Exp<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::exp(num, inPtr, outPtr, ctx->stream);
+}
+
+template <>
+void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::ge(num, inPtr, x, outPtr, ctx->stream);
+}
+
+template <>
+void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::gt(num, inPtr, x, outPtr, ctx->stream);
+}
+
+template <>
+void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::le(num, inPtr, x, outPtr, ctx->stream);
+}
+
+/// Natural logarithm, the base is e, Neper number: out[i]=ln(in[i]).
+template <>
+void Log<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::log(num, inPtr, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::lt(num, inPtr, x, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i] = in[i]^x
+template <>
+void Pow<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::pow(num, inPtr, x, outPtr, ctx->stream);
+}
+/// Element-wise operation, out[i] = in1[i]^in2[i]
 template <>
-void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
-                            Blob *out, Context *ctx) {
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cuda::add(num, in1Ptr, in2Ptr, outPtr);
+void Pow<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
 }
 
-// follow the consistency guide of math API
+/// Element-wise operation, out[i]=max(0, in[i])
 template <>
-void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in,
-                            Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::Div(num, x, inPtr, outPtr, ctx->stream);
+void ReLU<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::relu(num, inPtr, outPtr, ctx->stream);
 }
 
+/// out[i] = x
 template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in,
-                                    const float x, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::mult(num, inPtr, x, outPtr);
+void Set<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+                            Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::set(num, x, outPtr, ctx->stream);
 }
+/// Element-wise operation, out[i]=sigmoid(in[i])
 template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
+void Sigmoid<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                                Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
 }
+// out[i] = sign(in[i])
 template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
+void Sign<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sign(num, inPtr, outPtr, ctx->stream);
 }
+
+/// Element-wise operation, out[i]=sqrt(in[i])
+template <>
+void Sqrt<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sqrt(num, inPtr, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i]=in[i]^2
 template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
+void Square<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                               Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::square(num, inPtr, outPtr, ctx->stream);
 }
+/// out = in1 - in2
 template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
+void Sub<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sub(num, inPtr1, inPtr2, outPtr, ctx->stream);
 }
+
+/// sum all elements of input into out
 template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cuda::Set(num, x, outPtr, ctx->stream);
+void Sum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::sum(num, inPtr, out, ctx->stream);
 }
-// TODO(wangwei) optimize using stream
+
+/// Element-wise operation, out[i]=tanh(in[i])
 template <>
-void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out,
-                               Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::square(num, inPtr, outPtr);
+void Tanh<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::tanh(num, inPtr, outPtr, ctx->stream);
 }
-// TODO(wangwei) optimize using stream
+
+// ================Random functions===========================================
+/// Each element of out would be 1 with prob p and 0 with prob 1-p; 0 <= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
 template <>
-void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
-                            Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  cuda::sub(num, in1Ptr, in2Ptr, outPtr);
+void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Blob* out,
+                                  Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+  cuda::threshold(num, p, outPtr, outPtr, ctx->stream);
 }
-// sum all elements of input into ret
-// TODO(wangwei) optimize using stream
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
 template <>
-void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
-                            Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::sum(num, inPtr, out);
+void Uniform<float, lang::Cuda>(const size_t num, const float low,
+                                const float high, Blob* out, Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+  cuda::mult(num, outPtr, high - low, outPtr, ctx->stream);
+  cuda::add(num, outPtr, low, outPtr, ctx->stream);
+}
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
+template <>
+void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
+                                 const float std, Blob* out, Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateNormal(rgen, outPtr, num, mean, std));
 }
 
 // =========================Blas operations==================================
+// ref to http://docs.nvidia.com/cuda/cublas
+template <>
+void Amax<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  int idx = 1;
+  CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
+  *out = idx - 1;  // cublas index starts from 1
+}
+
+/// return the index of the element with the min value.
+template <>
+void Amin<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  int idx = 1;
+  CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
+  *out = idx - 1;
+}
+
+/// out = sum |x| for all x in in
+template <>
+void Asum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
+}
+
+/// out = alpha * in + out
+template <>
+void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
+                             const Blob* in, Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
+}
+
+/// out = \sum_i in1[i] * in2[i]
+template <>
+void Dot<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            float* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
+}
+template <>
+void Nrm2<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                             Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  const float* inPtr = static_cast<const float*>(in->data());
+  CUBLAS_CHECK(cublasSnrm2(handle, num, inPtr, 1, out));
+}
+template <>
+void Scale<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+                              Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CUBLAS_CHECK(cublasSscal(handle, num, &x, outPtr, 1));
+}
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
-                             const size_t ncol, const Blob *M, const Blob *v,
-                             Blob *out, Context *ctx) {
+                             const size_t ncol, const Blob* M, const Blob* v,
+                             Blob* out, Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float *MPtr = static_cast<const float *>(M->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float* MPtr = static_cast<const float*>(M->data());
+  const float* vPtr = static_cast<const float*>(v->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
   if (side_right) {
     CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
                              vPtr, 1, outPtr, ncol));
@@ -133,11 +364,11 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
 }
 template <>
 void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
-                             const float alpha, const Blob *A, const Blob *v,
-                             const float beta, Blob *out, Context *ctx) {
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+                             const float alpha, const Blob* A, const Blob* v,
+                             const float beta, Blob* out, Context* ctx) {
+  const float* APtr = static_cast<const float*>(A->data());
+  const float* vPtr = static_cast<const float*>(v->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   if (!trans)
     CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
@@ -152,16 +383,16 @@ template <>
 void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
                              const size_t nrowA, const size_t ncolB,
                              const size_t ncolA, const float alpha,
-                             const Blob *A, const Blob *B, const float beta,
-                             Blob *C, Context *ctx) {
+                             const Blob* A, const Blob* B, const float beta,
+                             Blob* C, Context* ctx) {
   auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
   auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
   int lda = transA ? nrowA : ncolA;
   int ldb = transB ? ncolA : ncolB;
   int ldc = ncolB;
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *BPtr = static_cast<const float *>(B->data());
-  float *CPtr = static_cast<float *>(C->mutable_data());
+  const float* APtr = static_cast<const float*>(A->data());
+  const float* BPtr = static_cast<const float*>(B->data());
+  float* CPtr = static_cast<float*>(C->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
@@ -171,4 +402,3 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
 
 #endif  // USE_CUDA
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
-
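
The NOTE above about cuBLAS being column-major is load-bearing: GEMM passes B
before A so that the column-major product B^T * A^T, which equals (A*B)^T,
lands in C's buffer already laid out in row-major order. A self-contained
sketch of the same trick on a 2x3 by 3x2 product (error checks elided for
brevity; illustrative only, not SINGA's code):

    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      const int m = 2, k = 3, n = 2;
      const float A[m * k] = {1, 2, 3, 4, 5, 6};     // row-major, 2x3
      const float B[k * n] = {7, 8, 9, 10, 11, 12};  // row-major, 3x2
      float C[m * n] = {0};
      float *dA, *dB, *dC;
      cudaMalloc(&dA, sizeof(A));
      cudaMalloc(&dB, sizeof(B));
      cudaMalloc(&dC, sizeof(C));
      cudaMemcpy(dA, A, sizeof(A), cudaMemcpyHostToDevice);
      cudaMemcpy(dB, B, sizeof(B), cudaMemcpyHostToDevice);
      cublasHandle_t h;
      cublasCreate(&h);
      const float alpha = 1.f, beta = 0.f;
      // B first, then A; leading dimensions are the row-major widths,
      // exactly as in the GEMM specialization above.
      cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                  dB, n, dA, k, &beta, dC, n);
      cudaMemcpy(C, dC, sizeof(C), cudaMemcpyDeviceToHost);
      std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 / 139 154
      cublasDestroy(h);
      cudaFree(dA); cudaFree(dB); cudaFree(dC);
      return 0;
    }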


[5/5] incubator-singa git commit: SINGA-168 Implement Cpp Math functions APIs

Posted by zh...@apache.org.
SINGA-168 Implement Cpp Math functions APIs

Update error log for tensor_math.h to include the function name, e.g.
"Foo is not implemented".

Add Tensor Math Cpp Implementation and Test Cases


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/07c49da5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/07c49da5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/07c49da5

Branch: refs/heads/dev
Commit: 07c49da5b1ee6582780f5faef6c6bf3418a7a0b6
Parents: 01aaf49
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Fri Jun 3 20:46:16 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 src/core/tensor/tensor_math.h     | 293 +++++++++----------
 src/core/tensor/tensor_math_cpp.h | 508 ++++++++++++++++++++++++---------
 test/singa/test_tensor_math.cc    | 264 ++++++++++++++++-
 3 files changed, 774 insertions(+), 291 deletions(-)
----------------------------------------------------------------------
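
The commit pairs each Cpp kernel with test cases. As a flavor of what those
checks verify, here is a hypothetical standalone assertion on the elementwise
Sigmoid loop added below (plain asserts rather than the project's gtest
harness; the helper name SigmoidCpp is made up for illustration):

    #include <cassert>
    #include <cmath>
    #include <cstddef>

    // Mirrors the Sigmoid<float, lang::Cpp> loop in the diff below.
    void SigmoidCpp(size_t num, const float *in, float *out) {
      for (size_t i = 0; i < num; i++)
        out[i] = 1.f / (1.f + std::exp(-in[i]));
    }

    int main() {
      const float in[3] = {-1.f, 0.f, 1.f};
      float out[3];
      SigmoidCpp(3, in, out);
      assert(std::fabs(out[1] - 0.5f) < 1e-6f);          // sigmoid(0) = 0.5
      assert(std::fabs(out[0] + out[2] - 1.f) < 1e-6f);  // sigmoid(-x)+sigmoid(x)=1
      return 0;
    }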


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index ff865e0..1bf6fc7 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -50,277 +50,259 @@ namespace singa {
 // ================Linear algebra functions====================================
 /// ret[i] = |input[i]|
 template <typename DType, typename Lang>
-void Abs(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Abs Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Set(int count, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Set Not Implemented";
 }
+
 /// sum all elements of input into ret
 template <typename DType, typename Lang>
-void Sum(int count, const Blob *input, DType *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Sum Not Implemented";
 }
 
 /// ret[i] = sign(input[i])
 template <typename DType, typename Lang>
-void Sign(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sign Not Implemented";
 }
 
 /// Base is e, Neper number. ret[i]=exp(input[i])
 template <typename DType, typename Lang>
-void Exp(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Exp Not Implemented";
 }
 
 /// Natural logarithm, the base is e, Neper number: ret[i]=log(input[i]).
 template <typename DType, typename Lang>
-void Log(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Log Not Implemented";
 }
-
 /// Element-wise operation, ret[i]=sqrt(input[i])
 template <typename DType, typename Lang>
-void Sqrt(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sqrt Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=square(input[i])
 template <typename DType, typename Lang>
-void Square(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Square Not Implemented";
 }
 
 /// Element-wise operation, ret[i]=tanh(input[i])
 template <typename DType, typename Lang>
-void Tanh(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Tanh Not Implemented";
 }
 /// Element-wise operation, ret[i]=max(0, input[i])
 template <typename DType, typename Lang>
-void ReLU(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "ReLU Not Implemented";
 }
 /// Element-wise operation, ret[i]=sigmoid(input[i])
 template <typename DType, typename Lang>
-void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
-/// Do softmax for each row individually
+// Do softmax for each row individually
 template <typename DType, typename Lang>
-void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Softmax(const size_t nrow, const size_t ncol, const Blob *in, 
+	     Blob *out, Context *ctx) {
+  LOG(FATAL) << "Softmax Not Implemented";
 }
 
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, 
+	     Blob *out, Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
 }
 
 /// Sum the columns of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(int nrow, int ncol, const Blob *input, Blob *ret,
-                Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, 
+	        Blob *out, Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
 }
 
 // TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of ret
+/// Add the vector v to every row of A as the row of out 
 template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
-            Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
 }
 
-/// Add the vector v to every column of A as the column of ret
+/// Add the vector v to every column of A as the column of out
 template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
-            Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lang>
-void Pow(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Pow(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Pow(const size_t num, const Blob *in1, const Blob *in2, 
+	 Blob *out, Context *ctx) {
+  LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Clamp(int count, DType low, DType high, const Blob *input, Blob *ret,
-           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Clamp Not Implemented";
 }
 
 /// ret = input + x
 template <typename DType, typename Lang>
-void Add(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Add(const size_t num, const Blob *in, const DType x, 
+	 Blob *out, Context *ctx) {
+  LOG(FATAL) << "Add Not Implemented";
 }
+
+/// ret = lhs + rhs
+template <typename DType, typename Lang>
+void Add(const size_t num, const Blob *in1, const Blob *in2, 
+	 Blob *out, Context *ctx) {
+  LOG(FATAL) << "Add-Pair Not Implemented";
+}
+
 /// ret =  input - x
 template <typename DType, typename Lang>
-void Sub(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  Add<DType, Lang>(count, input, -x, ret, ctx);
+void Sub(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  Add<DType, Lang>(num, in, -x, out, ctx);
 }
-/// ret = input * x
+
+/// ret = lhs - rhs
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob *input, DType x, Blob *ret,
-                 Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sub(const size_t num, const Blob *in1, const Blob *in2, 
+	 Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sub-Pair Not Implemented";
 }
-/// ret = input / x
+
+/// ret = input * x
 template <typename DType, typename Lang>
-void Div(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  EltwiseMult<DType, Lang>(count, input, DType(1) / x, ret, ctx);
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
-/// ret = lhs + rhs
+/// ret = lhs * rhs
 template <typename DType, typename Lang>
-void Add(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, 
+		 Blob *out, Context *ctx) {
+  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
 
-/// ret = lhs - rhs
+/// ret = input / x
 template <typename DType, typename Lang>
-void Sub(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const DType x, const Blob *in, 
+	 Blob *out, Context *ctx) { 
+  LOG(FATAL) << "Div Not Implemented";
 }
 
-/// ret = lhs * rhs
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob *lhs, const Blob *rhs, Blob *ret,
-                 Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  CHECK_NE(x, 0.f);
+  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
 /// ret = lhs / rhs
 template <typename DType, typename Lang>
-void Div(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const Blob *in1, const Blob *in2, 
+	 Blob *out, Context *ctx) {
+  LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
 /// outer-product.
 /// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(int m, int n, const Blob *lhs, const Blob *rhs, Blob *ret,
-           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2, 
+	   Blob *out, Context *ctx) {
+  LOG(FATAL) << "Outer Not Implemented";
 }
 
 /// ret[i]=(input[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void LT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void LT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "LT Not Implemented";
 }
 /// ret[i]=(input[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void LE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void LE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "LE Not Implemented";
 }
 /// ret[i]=(input[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void GT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "GT Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+/// ret[i]=(input[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void GE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "GE Not Implemented";
 }
 
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // ===== Level 1
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(int count, const Blob *input, int *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+  LOG(FATAL) << "Amax Not Implemented";
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(int count, const Blob *input, int *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+  LOG(FATAL) << "Amin Not Implemented";
 }
 /// ret = sum |x| for all x in input
 template <typename DType, typename Lang>
-void Asum(int count, const Blob *input, DType *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Asum Not Implemented";
 }
 
 /// ret = alpha * input + ret
 template <typename DType, typename Lang>
-void Axpy(int count, DType alpha, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Axpy(const size_t num, const DType alpha, const Blob *in, 
+	  Blob *out, Context *ctx) {
+  LOG(FATAL) << "Axpy Not Implemented";
 }
 
 /// ret *= x
 template <typename DType, typename Lang>
-void Scale(int count, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Scale Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
-         Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Dot(const size_t num, const Blob *in1, const Blob *in2, 
+	 DType *out, Context *ctx) {
+  LOG(FATAL) << "Dot Not Implemented";
 }
 
 // ===== Level 2
 /// ret = alpha * op(A) * v + beta * ret.
 /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
 template <typename DType, typename Lang>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
-          DType beta, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== Level 3
-
-// ================Random functions===========================================
-/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-// Get the random generator from 'ctx'
-// If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lang>
-void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lang>
-void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lang>
-void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ========follow the consistency guide of math API
-
-template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// Divide alpha by each element of 'in'.
-template <typename DType, typename Lang>
-void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
-         Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha, 
+	  const Blob *A, const Blob *v,
+          const DType beta, Blob *out, Context *ctx) {
+  LOG(FATAL) << "GEMV Not Implemented";
 }
 
 /// multiply a matrix with a diagonal matrix constructed using values from 'v'.
@@ -328,7 +310,7 @@ void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
 template <typename DType, typename Lang>
 void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG(FATAL) << "DGMM Not Implemented";
 }
 
 /// C = alpha * A * B + beta * C.
@@ -338,32 +320,37 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
           const size_t ncolB, const size_t ncolA, const DType alpha,
           const Blob *A, const Blob *B, const DType beta, Blob *C,
           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG(FATAL) << "GEMM Not Implemented";
 }
-/// ret[i]=(input[i]<x)?1.f:0.f
-template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// ret[i]=(input[i]<=x)?1.f:0.f
+
+
+// ===== Level 3
+
+// ================Random functions===========================================
+/// Each element of ret would be 1 with prob p and 0 with prob 1-p; 0 <= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Bernoulli Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Uniform(const size_t num, const float low, const float high, 
+	     Blob *out, Context *ctx) {
+  LOG(FATAL) << "Uniform Not Implemented";
 }
-/// ret[i]=(input[i]>=x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Gaussian(const size_t num, const float mean, const float std, 
+	      Blob *out, Context *ctx) {
+  LOG(FATAL) << "Gaussian Not Implemented";
 }
 
+
+
+
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
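
One detail worth noting in the header above: the scalar variants of Sub and
Div are not backend kernels at all; they are written once, for every backend,
by delegating to Add with -x and to EltwiseMult with 1/x. A minimal sketch of
that composition, with plain float arrays standing in for Blob/Context
(illustrative only):

    #include <cstddef>

    namespace lang { struct Cpp {}; }

    // Primary template declared only; backends provide specializations.
    template <typename DType, typename Lang>
    void Add(const size_t num, const DType *in, const DType x, DType *out);

    template <>
    void Add<float, lang::Cpp>(const size_t num, const float *in,
                               const float x, float *out) {
      for (size_t i = 0; i < num; i++) out[i] = in[i] + x;
    }

    // Sub delegates to Add for every backend; no per-backend kernel needed.
    template <typename DType, typename Lang>
    void Sub(const size_t num, const DType *in, const DType x, DType *out) {
      Add<DType, Lang>(num, in, -x, out);
    }

    int main() {
      const float in[3] = {3.f, 4.f, 5.f};
      float out[3];
      Sub<float, lang::Cpp>(3, in, 1.f, out);  // out = {2, 3, 4}
      return 0;
    }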

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 693f09c..ec7a892 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -27,195 +27,317 @@
 
 /// TODO(wangwei) Clean the implementations following the comments in
 /// tensor_math.h.
-/// For Blob argument xxx, name its pointer as xxxPtr.
 namespace singa {
+
+template<>
+void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = fabs(inPtr[i]);
+  }
+}
+
 template <>
-void Square<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                              Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *in = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = in[i] * in[i];
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+
+// sum all elements of input into out
+// TODO(wangwei) optimize using omp
+template <>
+void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
+  float s = 0.f;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    s += inPtr[i];
   }
+  *out = s;
 }
 
 template <>
-void Add<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] + rptr[i];
+void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float*>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; 
   }
 }
 
 template <>
-void Add<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] + x;
+void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = exp(inPtr[i]);
   }
 }
 
 template <>
-void Sub<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] - rptr[i];
+void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = log(inPtr[i]);
   }
 }
 
-// sum all elements of input into ret
-// TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(int count, const Blob *input, float *ret,
-                           Context *ctx) {
-  float s = 0.f;
-  const float *in = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    s += in[i];
+void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_GE(inPtr[i], 0.f);  // sqrt is defined for zero as well
+    outPtr[i] = sqrt(inPtr[i]);
   }
-  *ret = s;
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *input, float x,
-                                   Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] * x;
+void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                                   Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] * rptr[i];
+void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = tanh(inPtr[i]);
   }
 }
 
 template <>
-void Exp<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = exp(lptr[i]);
+void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
   }
 }
 
 template <>
-void Log<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    CHECK_GT(lptr[i], 0.f);
-    dptr[i] = log(lptr[i]);
+void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                               Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
   }
 }
 
 template <>
-void Tanh<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                            Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = tanh(lptr[i]);
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
   }
+  delete[] bPtr;  // matches the new[] allocation above
 }
 
 template <>
-void ReLU<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                            Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = (lptr[i] >= 0.f) ? lptr[i] : 0.f;
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
+  }
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
+  }
+}
+
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
+}
+
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = pow(inPtr[i], x);
   }
 }
 
 template <>
-void Sigmoid<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                               Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = 1.f / (1.f + exp(-lptr[i]));
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
   }
 }
 
 template <>
-void Pow<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = pow(lptr[i], x);
+void Clamp<float, lang::Cpp>(const size_t num, const float low,
+                             const float high, const Blob *in, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (inPtr[i] > high) {
+      outPtr[i] = high;
+    } else if (inPtr[i] < low) {
+      outPtr[i] = low;
+    } else {
+      outPtr[i] = inPtr[i];
+    }
+  }
+}
+
+template <>
+void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] + x;
   }
 }
 
 template <>
-void Pow<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = pow(lptr[i], rptr[i]);
+void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] + in2Ptr[i];
   }
 }
 
 template <>
-void Bernoulli<float, lang::Cpp>(int count, float p, Blob *ret, Context *ctx) {
-  std::bernoulli_distribution distribution(p);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] - in2Ptr[i];
   }
 }
 
 template <>
-void Uniform<float, lang::Cpp>(int count, float low, float high, Blob *ret,
-                               Context *ctx) {
-  std::uniform_real_distribution<float> distribution(low, high);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in,
+                                   const float x, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] * x;
   }
 }
 
 template <>
-void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob *ret,
-                                Context *ctx) {
-  std::normal_distribution<float> distribution(mean, std);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1,
+                                   const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] * in2Ptr[i];
   }
 }
 
-// follow the consistency guide of math API
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(in2Ptr[i], 0.f);
+    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+  }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = alpha / inPtr[i];
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(inPtr[i], 0.f);
+    outPtr[i] = x / inPtr[i];
+  }
 }
+
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                             const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
+  }
+}
+
 template <>
 void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
                           Blob *out, Context *ctx) {
@@ -227,6 +349,125 @@ void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
 }
 
 template <>
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t maxPos = 0;
+  float maxVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      maxVal = inPtr[i];
+    } else if (inPtr[i] > maxVal) {
+      maxVal = inPtr[i];
+      maxPos = i;
+    }
+  }
+  *out = maxPos;
+}
+
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t minPos = 0;
+  float minVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      minVal = inPtr[i];
+    } else if (inPtr[i] < minVal) {  // a smaller value updates the minimum
+      minVal = inPtr[i];
+      minPos = i;
+    }
+  }
+  *out = minPos;
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  float sum = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += fabs(inPtr[i]);
+  }
+  *out = sum;  // write the accumulated absolute sum to the output
+}
+
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
+                            const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] += alpha * inPtr[i];
+  }
+}
+
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] *= x;
+  }
+}
+
+//template <>
+//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+//                           float *out, Context *ctx) {
+//  float sum = 0;
+//  const float *in1Ptr = static_cast<const float *>(in1->data());
+//  const float *in2Ptr = static_cast<const float *>(in2->data());
+//  for (size_t i = 0; i < num; i++) {
+//    sum += in1Ptr[i] * in2Ptr[i];
+//  }
+//  *out = sum;
+//}
+
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < m; r++) {
+    float sum = 0;
+    for (size_t c = 0; c < n; c++) {
+      size_t idx = trans ? c * m + r : r * n + c;
+      sum += APtr[idx] * vPtr[c];
+    }
+    outPtr[r] = alpha * sum + beta * outPtr[r];
+  }
+}
+
+template <>
 void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
                             const size_t ncol, const Blob *M, const Blob *v,
                             Blob *out, Context *ctx) {
@@ -251,41 +492,35 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
 }
 
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
-}
-template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
+                                 Context *ctx) {
+  std::bernoulli_distribution distribution(p);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
   }
 }
 
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+void Uniform<float, lang::Cpp>(const size_t num, const float low,
+                               const float high, Blob *out,
+                               Context *ctx) {
+  std::uniform_real_distribution<float> distribution(low, high);
+  float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
+                                const float std, Blob *out,
+                                Context *ctx) {
+  std::normal_distribution<float> distribution(mean, std);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
+
 #ifdef USE_CBLAS
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -314,7 +549,6 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
 }
 
 #endif  // USE_CBLAS
-
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_

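A note on the Softmax kernel above: it exponentiates the raw inputs, so a
large activation can overflow exp() to inf. A common variant subtracts each
row's maximum first; the constant factor exp(-max) cancels between numerator
and denominator, so the result is mathematically unchanged. A minimal
standalone sketch of that variant (plain C++; the SoftmaxRows name and
raw-pointer interface are illustrative, not part of the SINGA API):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Row-wise softmax over an nrow x ncol row-major matrix.
    // Subtracting the per-row max keeps exp() in a safe range.
    void SoftmaxRows(size_t nrow, size_t ncol, const float *in, float *out) {
      for (size_t r = 0; r < nrow; r++) {
        const float *row = in + r * ncol;
        float maxv = row[0];
        for (size_t c = 1; c < ncol; c++) maxv = std::max(maxv, row[c]);
        float denom = 0.f;
        for (size_t c = 0; c < ncol; c++) {
          out[r * ncol + c] = std::exp(row[c] - maxv);
          denom += out[r * ncol + c];
        }
        for (size_t c = 0; c < ncol; c++) out[r * ncol + c] /= denom;
      }
    }
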
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 170b96c..823445f 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -11,15 +11,277 @@ protected:
     b.Reshape(singa::Shape{6});
     c.Reshape(singa::Shape{6, 1});
     d.Reshape(singa::Shape{3, 2});
+		e.Reshape(singa::Shape{3, 2});
 
     a.CopyDataFromHostPtr<float>(dat1, 6);
     b.CopyDataFromHostPtr<float>(dat2, 6);
+		e.CopyDataFromHostPtr<float>(dat1, 6);
   }
-  Tensor a, b, c, d;
+  Tensor a, b, c, d, e;
   const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
 };
 
+TEST_F(TestTensorMath, MemberAbs) {
+	Tensor aa = a.Clone();
+	Tensor bb = b.Clone();
+	Tensor cc = aa - bb;
+	const float* dptr = cc.data<const float*>();
+	EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+
+	Tensor p = Abs(cc);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberExp) {
+	Tensor p = Exp(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLog) {
+	Tensor p = Log(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberReLU) {
+	Tensor aa = a.Clone();
+	Tensor cc = aa - 2.0f;
+	const float* dptr = cc.data<const float*>();
+	EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+	Tensor p = ReLU(cc);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSigmoid) {
+	Tensor p = Sigmoid(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(1.0f/(1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+  EXPECT_NEAR(1.0f/(1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f/(1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSign) {
+	Tensor aa = a.Clone();
+	Tensor cc = aa - 2.0f;
+	const float* dptr = cc.data<const float*>();
+	EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+	Tensor p = Sign(cc);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_EQ(0.0f, dptr1[0]);
+  EXPECT_EQ(0.0f, dptr1[1]);
+  EXPECT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberSqrt) {
+	Tensor p = Sqrt(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSquare) {
+	Tensor p = Square(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+  EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+  EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberTanh) {
+	Tensor p = Tanh(a);
+	const float* dptr1 = p.data<const float*>();
+	EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, Sum) {
+	Tensor p1(Shape{1,2});
+	p1 = Sum(e, 0);
+  const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
+	EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
+	
+	Tensor p2(Shape{3,1});
+	p2 = Sum(e, 1);
+  const float *dptr2 = p2.data<const float *>();
+	EXPECT_FLOAT_EQ(3.0f,dptr2[0]);
+	EXPECT_FLOAT_EQ(7.0f,dptr2[1]);
+	EXPECT_FLOAT_EQ(11.0f,dptr2[2]);
+}
+
+TEST_F(TestTensorMath, SoftMax) {
+	Tensor p1(Shape{3,2});
+	p1 = SoftMax(e,0);
+  const float *dptr1 = p1.data<const float *>();
+	float sum = 0;
+	for(int i = 0; i < 6; i++) sum += exp(i+1);
+	EXPECT_NEAR(exp(1)/sum, dptr1[0],1e-5);
+	EXPECT_NEAR(exp(3)/sum, dptr1[2],1e-5);
+	EXPECT_NEAR(exp(5)/sum, dptr1[4],1e-5);
+	EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
+	EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
+	EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
+	
+	Tensor p2(Shape{3,2});
+	p2 = SoftMax(e,1); 
+  const float *dptr2 = p2.data<const float *>();
+	EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
+	EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLT) {
+	Tensor p1 = a < 2.0f;
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+	EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+	EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberLE) {
+	Tensor p1 = a <= 2.0f;
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+	EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+	EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGT) {
+	Tensor p1 = a > 2.0f;
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+	EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+	EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGE) {
+	Tensor p1 = a >= 2.0f;
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+	EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+	EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberPow) {
+	Tensor p1 = Pow(b,3.0f);
+	const float *dptr1 = p1.data<const float *>();
+	EXPECT_FLOAT_EQ(pow(1.1f,3.0f), dptr1[0]);
+	EXPECT_FLOAT_EQ(pow(2.1f,3.0f), dptr1[1]);
+	EXPECT_FLOAT_EQ(pow(3.1f,3.0f), dptr1[2]);
+
+	//TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the function is complete
+	//Tensor p2 = Pow(a,b);
+	//const float *dptr2 = p2.data<const float *>();
+	//EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+	//EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+	//EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+}
+
+
+TEST_F(TestTensorMath, MemberSub) {
+	Tensor p1 = a - b;
+	const float* dptr1 = p1.data<const float*>();
+	EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberEltwiseMult) {
+	Tensor p1 = a * b;
+	const float* dptr1 = p1.data<const float*>();
+	EXPECT_NEAR(1.0*1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0*2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0*3.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberDiv) {
+	Tensor p1 = a / b;
+	const float* dptr1 = p1.data<const float*>();
+	EXPECT_NEAR(1.0/1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0/2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0/3.1, dptr1[2], 1e-5);
+
+	Tensor p2 = Div(10.0f,b);
+	const float* dptr2 = p2.data<const float*>();
+	EXPECT_NEAR(10.0/1.1, dptr2[0], 1e-5);
+  EXPECT_NEAR(10.0/2.1, dptr2[1], 1e-5);
+  EXPECT_NEAR(10.0/3.1, dptr2[2], 1e-5);
+
+	Tensor p3 = a / 8.0f;
+	const float* dptr3 = p3.data<const float*>();
+	EXPECT_NEAR(1.0/8.0, dptr3[0], 1e-5);
+  EXPECT_NEAR(2.0/8.0, dptr3[1], 1e-5);
+  EXPECT_NEAR(3.0/8.0, dptr3[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberBernoulli) {
+	Tensor p1(Shape{10000});
+	Bernoulli(0.3,&p1);
+	const float* dptr1 = p1.data<const float*>();
+	float sum = 0;
+	for(int i = 0; i < 10000; i++) sum += dptr1[i];
+	float mean = sum/10000;
+	EXPECT_NEAR(mean, 0.3, 1e-2);
+
+	sum = 0;
+	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+	float variance = sum/9999;
+	EXPECT_NEAR(variance, 0.3*0.7, 1e-2);
+}
+
+TEST_F(TestTensorMath, MemberUniform) {
+	Tensor p1(Shape{10000});
+	Uniform(0.1f,0.2f,&p1);
+	const float* dptr1 = p1.data<const float*>();
+	float sum = 0;
+	for(int i = 0; i < 10000; i++) sum += dptr1[i];
+	float mean = sum/10000;
+	EXPECT_NEAR(mean, 0.15f, 1e-3);
+
+	sum = 0;
+	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+	float variance = sum/9999;
+	EXPECT_NEAR(variance, 0.01f/12, 1e-3);
+}
+
+TEST_F(TestTensorMath, MemberGaussian) {
+	Tensor p1(Shape{50000});
+	Gaussian(0.0,1.0,&p1);
+	const float* dptr1 = p1.data<const float*>();
+	float sum = 0;
+	for(int i = 0; i < 50000; i++) sum += dptr1[i];
+	float mean = sum/50000;
+	EXPECT_NEAR(mean, 0.0, 1e-2);
+
+	sum = 0;
+	for(int i = 0; i < 50000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+	float variance = sum/49999;
+	EXPECT_NEAR(variance, 1.0, 1e-2);
+}
+
+
+
 TEST_F(TestTensorMath, MemberAddTensor) {
   Tensor aa = a.Clone();
   aa += a;

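The three random-op tests above validate the distributions through their
first two moments: Bernoulli(p) has mean p and variance p*(1-p) = 0.3*0.7 =
0.21; Uniform(low, high) has mean (low+high)/2 and variance (high-low)^2/12 =
0.01/12; Gaussian(mean, std) should reproduce mean and std^2. The test loops
compute exactly the sample mean and the unbiased sample variance, as in this
sketch (the Moments helper is illustrative, not part of the patch):

    #include <cstddef>

    // Sample mean and unbiased sample variance over n values.
    void Moments(const float *x, size_t n, float *mean, float *var) {
      float sum = 0.f;
      for (size_t i = 0; i < n; i++) sum += x[i];
      *mean = sum / n;
      sum = 0.f;
      for (size_t i = 0; i < n; i++) sum += (x[i] - *mean) * (x[i] - *mean);
      *var = sum / (n - 1);  // n-1 divisor, matching sum/9999 in the tests
    }
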

[4/5] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

Posted by zh...@apache.org.
SINGA-182 Clean math function APIs and implementations

Clean tensor.h/.cc and tensor_math.h, tensor_math_cpp.h:
re-order the functions by (type, name), where type is a) element-wise
function b) matrix function c) random function d) blas function

Implement GEMV using cblas and cublas.

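On the cblas side, GEMV maps onto cblas_sgemv; a sketch of the intended call
for a row-major matrix, assuming the same trans/alpha/beta semantics as the
CPU fallback shown earlier (GemvBlas is an illustrative wrapper name, not the
committed code):

    #include <cblas.h>
    #include <cstddef>

    // out = alpha * op(A) * v + beta * out, with op(A) = A or A^T.
    // M, N and lda describe A as stored, per the cblas convention.
    void GemvBlas(bool trans, size_t m, size_t n, float alpha, const float *A,
                  const float *v, float beta, float *out) {
      const int M = static_cast<int>(trans ? n : m);  // stored rows of A
      const int N = static_cast<int>(trans ? m : n);  // stored cols of A
      cblas_sgemv(CblasRowMajor, trans ? CblasTrans : CblasNoTrans, M, N,
                  alpha, A, N /* lda = stored row length */, v, 1, beta,
                  out, 1);
    }
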

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/564c88ad
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/564c88ad
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/564c88ad

Branch: refs/heads/dev
Commit: 564c88ad95e976e6067198c832f4fcd9a8878cd7
Parents: 07c49da
Author: wangwei <wa...@gmail.com>
Authored: Fri Jun 10 23:12:09 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        | 396 +++++++++---------
 src/core/tensor/tensor.cc          | 688 ++++++++++++++++----------------
 src/core/tensor/tensor_math.h      | 336 ++++++++--------
 src/core/tensor/tensor_math_cpp.h  | 640 +++++++++++++++--------------
 src/core/tensor/tensor_math_cuda.h | 158 ++++----
 test/singa/test_tensor_math.cc     |  15 +-
 6 files changed, 1131 insertions(+), 1102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index bb8d7f8..82bbe81 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -32,17 +32,6 @@ using std::tuple;
 namespace singa {
 
 typedef vector<size_t> Shape;
-typedef Shape::iterator ShapeIter;
-inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
-  if (len == 0)
-    len = shape.size();
-  CHECK_LE(len, shape.size());
-  size_t v = 1;
-  for (unsigned int i = start; i < len; i++)
-    v *= shape[i];
-  return v;
-}
-
 /// hardcode the width of types defined in DataType
 const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2, sizeof(int),
                              sizeof(char), sizeof(double)};
@@ -65,10 +54,10 @@ class Tensor {
  public:
   ~Tensor();
   Tensor();
-  explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
-  explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
-  Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32);
-  Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32);
+  explicit Tensor(Shape &&shape, const DataType dtype = kFloat32);
+  explicit Tensor(const Shape &shape, const DataType dtype = kFloat32);
+  Tensor(Shape &&shape, Device *dev, const DataType dtype = kFloat32);
+  Tensor(const Shape &shape, Device *dev, const DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(const Tensor &from);
@@ -82,10 +71,10 @@ class Tensor {
 
   Device *device() const { return device_; }
 
-  /// Return immutable Tensor values with given type.
-  template <typename DType>
-  DType data() const {
-    return static_cast<DType>(blob()->data());
+  /// return immutable Tensor values with given type.
+  template <typename SType>
+  SType data() const {
+    return static_cast<SType>(blob()->data());
   }
 
   /// data type, including kFloat16, kFloat32, kInt
@@ -93,7 +82,7 @@ class Tensor {
 
   const Shape &shape() const { return shape_; }
 
-  const size_t shape(size_t idx) const {
+  const size_t shape(const size_t idx) const {
     CHECK_LT(idx, shape_.size());
     return shape_.at(idx);
   }
@@ -102,13 +91,13 @@ class Tensor {
 
   bool transpose() const { return transpose_; }
 
-  /// Return number of total elements
+  /// return number of total elements
   size_t Size() const {
     CHECK_EQ(blob_->size() % SizeOf(data_type_), 0u);
     return blob_->size() / SizeOf(data_type_);
   }
 
-  /// Return memory size (i.e., Bytes)
+  /// return memory size (i.e., Bytes)
   size_t MemSize() const { return blob_->size(); }
 
   /// Reset the tensor shape, it may reallocate blob, if MemSize() changes.
@@ -121,7 +110,7 @@ class Tensor {
   void ResetLike(const Tensor &t);
 
   /// Reset the data type, it would reallocate blob if type changes.
-  void AsType(DataType type);
+  void AsType(const DataType type);
 
   /// Reset the device.
   /// If the target device is a diff device, then do deep data copy.
@@ -135,14 +124,14 @@ class Tensor {
   void SetValue(const SType x);
 
   /// For init the tensor values, copy 'num' elements.
-  template <typename DType>
-  void CopyDataFromHostPtr(const DType *src, size_t num);
+  template <typename SType>
+  void CopyDataFromHostPtr(const SType *src, const size_t num);
 
   /// Copy data from another Tensor which may be on a diff device.
   /// Meta data would not be copied!
   void CopyData(const Tensor &other);
 
-  /// Return an exactly the same Tensor with data been deep copied.
+  /// return an exactly the same Tensor with data been deep copied.
   Tensor Clone() const;
 
   // Tensor operations
@@ -152,42 +141,37 @@ class Tensor {
   Tensor T() const;
 
   /// Copy the meta info with data blob shared.
-  Tensor &operator=(const Tensor &t);
+  Tensor &operator=(const Tensor &in);
 
   /// Copy the meta info with data blob shared.
-  Tensor &operator=(Tensor &&t);
+  Tensor &operator=(Tensor &&in);
 
-  Tensor &operator+=(const Tensor &t);
-  // void operator+=(Tensor&& t);
-  Tensor &operator-=(const Tensor &t);
-  // void operator-=(Tensor&& t);
-  Tensor &operator*=(const Tensor &t);
-  // void operator*=(Tensor&& t);
-  Tensor &operator/=(const Tensor &t);
-  // void operator/=(Tensor&& t);
+  Tensor &operator+=(const Tensor &in);
+  // void operator+=(Tensor&& in);
+  Tensor &operator-=(const Tensor &in);
+  // void operator-=(Tensor&& in);
+  Tensor &operator*=(const Tensor &in);
+  // void operator*=(Tensor&& in);
+  Tensor &operator/=(const Tensor &in);
+  // void operator/=(Tensor&& in);
 
   // Scalar operations.
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator+=(DType x);
-
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator-=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator+=(const SType x);
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator*=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator-=(const SType x);
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator/=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator*=(const SType x);
 
-  /// save Tensor into a proto msg
-  // void ToProto(TensorProto* t);
-  /// load Tensor from proto msg
-  // void FromProto(const TensorProto& t);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator/=(const SType x);
 
  protected:
   bool transpose_ = false;
@@ -196,14 +180,29 @@ class Tensor {
   /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
   /// If you want to get an allocated Blob, use blob() instead of blob_.
   Blob *blob_ = nullptr;
-  Shape shape_;
+  Shape shape_ = {};
 };
 
+typedef Shape::iterator ShapeIter;
+inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
+  if (len == 0) len = shape.size();
+  CHECK_LE(len, shape.size());
+  size_t v = 1;
+  for (unsigned int i = start; i < len; i++) v *= shape[i];
+  return v;
+}
+
 inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
   CHECK_EQ(in1.data_type(), in2.data_type());
   CHECK_EQ(in1.device()->lang(), in2.device()->lang());
 }
 
+template <typename FromType, typename ToType>
+ToType TypeCast(const FromType &x) {
+  // TODO(wangwei) cast fp16; prevent some casts, e.g., float to char
+  return static_cast<ToType>(x);
+}
+
 Tensor Reshape(const Tensor &in, const Shape &s);
 Tensor Reshape(const Tensor &in, Shape &&s);
 
@@ -212,192 +211,171 @@ Tensor Reshape(const Tensor &in, Shape &&s);
 
 /// Copy 'num' elements of src to dst.
 /// The first 'src_offset' ('dst_offset') elements will be skipped.
-void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                    size_t src_offset = 0, size_t dst_offset = 0);
-
-// ==================Simple Linear Algebra Operations=========================
-Tensor Abs(const Tensor &t);
-Tensor Exp(const Tensor &t);
-Tensor Log(const Tensor &t);
-Tensor ReLU(const Tensor &t);
-Tensor Sigmoid(const Tensor &t);
-Tensor Sign(const Tensor &t);
-Tensor Sqrt(const Tensor &t);
-Tensor Square(const Tensor &t);
-Tensor Tanh(const Tensor &t);
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                    const size_t src_offset = 0, const size_t dst_offset = 0);
+
+// =============Element-wise operations====================================
+Tensor Abs(const Tensor &in);
+Tensor Exp(const Tensor &in);
+Tensor Log(const Tensor &in);
+Tensor ReLU(const Tensor &in);
+Tensor Sigmoid(const Tensor &in);
+Tensor Sign(const Tensor &in);
+Tensor Sqrt(const Tensor &in);
+Tensor Square(const Tensor &in);
+Tensor Tanh(const Tensor &in);
+
+/// Element-wise operation, out[i]=in[i]^x
+template <typename SType>
+Tensor Pow(const Tensor &in, const SType x);
+/// Element-wise operation, out[i]=in[i]^x
+template <typename SType>
+void Pow(const Tensor &in, const SType x, Tensor *out);
+/// Element-wise operation, out[i]=base[i]^exp[i]
+Tensor Pow(const Tensor &base, const Tensor &exp);
+/// Element-wise operation, out[i]=base[i]^exp[i]
+void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
 
+/// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f
 template <typename SType>
-SType Sum(const Tensor &t);
-/// Sum elements in the Tensor, currently only support vector and matrix.
-/// if 'axis' is 0, sum all rows into a single row
-/// if 'axis' is 1, sum all columns into a single column
-/// TODO(wangwei) support arbitrary Tensor like numpy.sum
-Tensor Sum(const Tensor &t, int axis);
+Tensor operator<(const Tensor &in, const SType x);
+template <typename SType>
+void LT(const Tensor &in, const SType x, Tensor *out);
 
-/// Average elements in the Tensor, currently only support vector and matrix.
-/// if 'axis' is 0, average all rows into a single row
-/// if 'axis' is 1, average all columns into a single column
-/// TODO(wangwei) support arbitrary Tensor like numpy.average
-Tensor Average(const Tensor &t, int axis);
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
-/// and shape_[axis]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-Tensor SoftMax(const Tensor &t, int axis = 0);
-void SoftMax(const Tensor &t, int axis, Tensor *ret);
+/// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator<=(const Tensor &in, const SType x);
+template <typename SType>
+void LE(const Tensor &in, const SType x, Tensor *out);
 
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
-/// and shape_[axis+1]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-// Tensor Softmax(const Tensor& t, int axis = -1);
-// void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
-
-/// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator<(const Tensor &t, const DType x);
-template <typename DType>
-void LT(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator<=(const Tensor &t, const DType x);
-template <typename DType>
-void LE(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator>(const Tensor &t, const DType x);
-template <typename DType>
-void GT(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator>=(const Tensor &t, const DType x);
-template <typename DType>
-void GE(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType>
-Tensor Pow(const Tensor &t, DType x);
-/// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType>
-void Pow(const Tensor &t, DType x, Tensor *ret);
-/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-Tensor Pow(const Tensor &base, Tensor exp);
-/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-void Pow(const Tensor &base, const Tensor &exp, Tensor *ret);
+/// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>(const Tensor &in, const SType x);
+template <typename SType>
+void GT(const Tensor &in, const SType x, Tensor *out);
+
+/// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>=(const Tensor &in, const SType x);
+template <typename SType>
+void GE(const Tensor &in, const SType x, Tensor *out);
 
 Tensor operator+(const Tensor &lhs, const Tensor &rhs);
-void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator-(const Tensor &lhs, const Tensor &rhs);
-void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator*(const Tensor &lhs, const Tensor &rhs);
-void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator/(const Tensor &lhs, const Tensor &rhs);
-void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 
-template <typename DType>
-Tensor operator+(const Tensor &t, DType x);
-template <typename DType>
-void Add(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator-(const Tensor &t, DType x);
-template <typename DType>
-void Sub(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator*(const Tensor &t, DType x);
-template <typename DType>
-void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator/(const Tensor &t, DType x);
-template <typename DType>
-void Div(const Tensor &t, DType x, Tensor *ret);
+template <typename SType>
+Tensor operator+(const Tensor &in, const SType x);
+template <typename SType>
+void Add(const Tensor &in, const SType x, Tensor *out);
 
-// ================Blas operations============================================
-// We fix the scalar argument type to be float.
+template <typename SType>
+Tensor operator-(const Tensor &in, const SType x);
+template <typename SType>
+void Sub(const Tensor &in, const SType x, Tensor *out);
 
-// ===== Level 1
-// TODO(wangwei) make amax/amin/asum a member function of tensor
-// void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
-// void Asum(Tensor Context* ctx);
+template <typename SType>
+Tensor operator*(const Tensor &in, const SType x);
+template <typename SType>
+void EltwiseMult(const Tensor &in, const SType x, Tensor *out);
 
-// template <typename DType>
-// void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+/// For each element e of Tensor 'in', compute e / x
+template <typename SType>
+Tensor operator/(const Tensor &in, const SType x);
+/// For each element e of Tensor 'in', compute e / x into out
+template <typename SType>
+void Div(const Tensor &in, const SType x, Tensor *out);
 
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape.  result = A * B
-Tensor Mult(const Tensor &A, const Tensor &B);
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape.  C = A * B
-void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+/// For each element e of Tensor 'in', compute x/e
+template <typename SType>
+Tensor Div(const SType x, const Tensor &in);
+/// For each element e of Tensor 'in', compute x/e into 'out'
+template <typename SType>
+void Div(const SType x, const Tensor &in, Tensor *out);
 
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
-void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs,
-          const float beta, Tensor *C);
+template <typename SType>
+SType Sum(const Tensor &in);
 
-// ================Random operations==========================================
-/// For each element x set x = 1 if random() < p; otherwise x = 1.
-void Bernoulli(float p, Tensor *t);
-/// Fill in Tensor 't' following uniform distribution.
-void Uniform(float low, float high, Tensor *t);
-/// Fill in Tensor 't' following Gaussian distribution.
-void Gaussian(float mean, float std, Tensor *t);
+// ============Matrix (row/column) operations==================================
+/// Average elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, average all rows into a single row
+/// if 'axis' is 1, average all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.average
+Tensor Average(const Tensor &in, const int axis);
+/// Sum elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, sum all rows into a single row
+/// if 'axis' is 1, sum all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.sum
+Tensor Sum(const Tensor &in, const int axis);
+/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
+/// and shape_[axis]*...*shape_[nDim()] columns.
+/// and do softmax along each row.
+Tensor SoftMax(const Tensor &in, const int axis = 0);
+void SoftMax(const Tensor &in, const int axis, Tensor *out);
 
-// follow the consistency guide
-// https://issues.apache.org/jira/browse/SINGA-182
-// ============Matrix vector operations=======================================
 /// Add column 'v' with each column of matrix M
 void AddColumn(const Tensor &v, Tensor *M);
-void AddColumn(const float alpha, const float beta, const Tensor &v,
+/// For each column 'c' of matrix out, do c=alpha*v + beta*c
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                Tensor *out);
-/// Sub column 'v' by each column of matrix M
-void SubColumn(const Tensor &v, Tensor *M);
-/// Multiply column 'v' and each column of matrix M; write results into 'out'
-void MultColumn(const Tensor &v, Tensor *M);
-/// Divide column 'v' by each column of matrix M; write results into 'out'
-void DivColumn(const Tensor &v, Tensor *M);
-
 /// Add row 'v' with each row of matrix M; write results into 'out'
 void AddRow(const Tensor &v, Tensor *out);
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
-/// Sub row 'v' by each row of matrix M; write results into 'out'
-void SubRow(const Tensor &v, Tensor *M);
-/// Multiply row 'v' with each row of matrix M; write results into 'out'
-void MultRow(const Tensor &v, Tensor *M);
+/// For each row 'r' of matrix out, do r=alpha*v + beta*r
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M);
+/// Divide column 'v' by each column of matrix M; write results into 'out'
+void DivColumn(const Tensor &v, Tensor *M);
 /// Divide row 'v' by each row of matrix M; write results into 'out'
 void DivRow(const Tensor &v, Tensor *M);
-
-/// Sum all rows of matrix M into a single row as 'out'
-void SumRows(const Tensor &M, Tensor *out);
+/// Multiply column 'v' and each column of matrix M; write results into 'out'
+void MultColumn(const Tensor &v, Tensor *M);
+/// Multiply row 'v' with each row of matrix M; write results into 'out'
+void MultRow(const Tensor &v, Tensor *M);
+/// Sub column 'v' by each column of matrix M
+void SubColumn(const Tensor &v, Tensor *M);
+/// Sub row 'v' by each row of matrix M; write results into 'out'
+void SubRow(const Tensor &v, Tensor *M);
 /// Sum all columns of matrix M into a single column as 'out'
 void SumColumns(const Tensor &M, Tensor *out);
+/// Sum all rows of matrix M into a single row as 'out'
+void SumRows(const Tensor &M, Tensor *out);
 
-/// For each element x of Tensor 'in', compute alpha/x
+// ================Random operations==========================================
 /// For each element x set x = 1 if random() < p; otherwise x = 0.
 template <typename SType>
-Tensor Div(const SType alpha, const Tensor &in);
+void Bernoulli(const SType p, Tensor *out);
+/// Fill in Tensor 'out' following Gaussian distribution.
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out);
+/// Fill in Tensor 'out' following uniform distribution.
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out);
 
-/// For each element x of Tensor 'in', compute alpha/x into 'out'
+// ================Blas operations============================================
+// TODO(wangwei) make amax/amin/asum a member function of tensor
+
+/// out = alpha*in + out
 template <typename SType>
-void Div(const SType alpha, const Tensor &in, Tensor *out);
-
-/*
-/// Multiply each column of the lhs matrix with the rhs column
-Tensor MultColumn(const Tensor &lhs, const Tensor &rhs);
-void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Multiply each row of the lhs matrix with the rhs row
-Tensor MultRow(const Tensor &lhs, const Tensor &rhs);
-void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Div each row of the lhs matrix with the rhs column
-Tensor DivColumn(const Tensor &lhs, const Tensor &rhs);
-void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Divide each row of the lhs matrix by the rhs row
-Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
-void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-*/
+void Axpy(SType alpha, const Tensor &in, Tensor *out);
+
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape.  result = A * B
+Tensor Mult(const Tensor &A, const Tensor &B);
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape.  C = A * B
+void Mult(const Tensor &A, const Tensor &B, Tensor *C);
 
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape. C = alpha * A * B + beta * C
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+          Tensor *C);
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_

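Taken together, the header now groups element-wise, matrix, random, and blas
routines. A short usage sketch against the declarations above (it assumes the
default host device and kFloat32 data type that the constructors default to;
Example is an illustrative function, not part of the patch):

    #include "singa/core/tensor.h"
    using singa::Shape;
    using singa::Tensor;

    void Example() {
      Tensor in(Shape{3, 2});
      const float dat[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
      in.CopyDataFromHostPtr(dat, 6);

      Tensor prob = SoftMax(in, 1);  // softmax along each of the 3 rows
      Tensor mask = in > 3.0f;       // element-wise compare, 1.f/0.f entries
      Tensor rows = Sum(in, 0);      // sum all rows into a single row
      singa::Axpy(0.5f, in, &prob);  // blas: prob = 0.5 * in + prob
    }
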
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 5ae375c..f4e9da2 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -26,61 +26,61 @@ namespace singa {
 
 Tensor::~Tensor() {
   // LOG(ERROR) << "~";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
   blob_ = nullptr;
 }
 
 Tensor::Tensor() { device_ = &defaultDevice; }
 
-Tensor::Tensor(const Shape &shape, DataType dtype)
+Tensor::Tensor(const Shape &shape, const DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape &&shape, DataType dtype)
+Tensor::Tensor(Shape &&shape, const DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Shape &shape, Device *device, DataType dtype)
+Tensor::Tensor(const Shape &shape, Device *device, const DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape &&shape, Device *device, DataType dtype)
+Tensor::Tensor(Shape &&shape, Device *device, const DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Tensor &t)
-    : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
-      blob_(t.blob()), shape_(t.shape_) {
+Tensor::Tensor(const Tensor &in)
+    : transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      blob_(in.blob()),
+      shape_(in.shape_) {
   blob_->IncRefCount();
-  // LOG(ERROR) << "const&";
 }
 
-Tensor::Tensor(Tensor &&t)
-    : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
-      shape_(std::move(t.shape_)) {
-  blob_ = t.blob_;
-  t.blob_ = nullptr;
-  // LOG(ERROR) << "&&";
+Tensor::Tensor(Tensor &&in)
+    : transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      shape_(std::move(in.shape_)) {
+  blob_ = in.blob_;
+  in.blob_ = nullptr;
 }
 
-void Tensor::ResetLike(const Tensor &t) {
-  if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
-    shape_ = t.shape_;
-    device_ = t.device_;
-    data_type_ = t.data_type_;
-    blob_ = device_->NewBlob(t.MemSize());
+void Tensor::ResetLike(const Tensor &in) {
+  if (blob_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+    shape_ = in.shape_;
+    device_ = in.device_;
+    data_type_ = in.data_type_;
+    blob_ = device_->NewBlob(in.MemSize());
   }
 }
 
 void Tensor::Reshape(const Shape &shape) {
   if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
   }
   shape_ = shape;
@@ -88,17 +88,15 @@ void Tensor::Reshape(const Shape &shape) {
 
 void Tensor::Reshape(Shape &&shape) {
   if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
   }
   shape_ = std::move(shape);
 }
 
-void Tensor::AsType(DataType type) {
+void Tensor::AsType(const DataType type) {
   if (data_type_ != type) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape_) * SizeOf(type));
     data_type_ = type;
   }
@@ -109,8 +107,7 @@ void Tensor::ToDevice(Device *dst) {
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
     tmp.CopyData(*this);
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = tmp.blob_;
     tmp.blob_ = nullptr;
     device_ = dst;
@@ -120,7 +117,7 @@ void Tensor::ToDevice(Device *dst) {
 void Tensor::ToHost() { ToDevice(device_->host()); }
 
 template <typename DType>
-void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
+void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num) {
   CHECK_EQ(sizeof(DType), SizeOf(data_type_))
       << "data_type is " << DataType_Name(data_type_)
       << " user given type is of size " << sizeof(DType);
@@ -130,8 +127,8 @@ void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
     LOG(WARNING) << "Copy data from null host ptr";
   }
 }
-template void Tensor::CopyDataFromHostPtr(const float *src, size_t num);
-template void Tensor::CopyDataFromHostPtr(const int *src, size_t num);
+template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num);
+template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num);
 
 void Tensor::CopyData(const Tensor &src) {
   CHECK_EQ(Size(), src.Size());
@@ -162,29 +159,27 @@ Tensor Tensor::T() const {
   return t;
 }
 
-Tensor &Tensor::operator=(const Tensor &t) {
+Tensor &Tensor::operator=(const Tensor &in) {
   // LOG(ERROR) << "= const &";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
-  transpose_ = t.transpose_;
-  data_type_ = t.data_type_;
-  shape_ = t.shape_;
-  device_ = t.device_;
-  blob_ = t.blob();
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  transpose_ = in.transpose_;
+  data_type_ = in.data_type_;
+  shape_ = in.shape_;
+  device_ = in.device_;
+  blob_ = in.blob();
   blob_->IncRefCount();
   return *this;
 }
 
-Tensor &Tensor::operator=(Tensor &&t) {
+Tensor &Tensor::operator=(Tensor &&in) {
   // LOG(ERROR) << "= &&";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
-  transpose_ = t.transpose_;
-  data_type_ = t.data_type_;
-  shape_ = std::move(t.shape_);
-  device_ = t.device_;
-  blob_ = t.blob_;
-  t.blob_ = nullptr;
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  transpose_ = in.transpose_;
+  data_type_ = in.data_type_;
+  shape_ = std::move(in.shape_);
+  device_ = in.device_;
+  blob_ = in.blob_;
+  in.blob_ = nullptr;
   return *this;
 }
 
@@ -200,10 +195,10 @@ Tensor Reshape(const Tensor &in, Shape &&s) {
   return out;
 }
 
-#define GenUnaryTensorArgMemberFn(op, fn)                                \
-  Tensor &Tensor::op(const Tensor &t) {                                        \
-    fn(*this, t, this);                                                        \
-    return *this;                                                              \
+#define GenUnaryTensorArgMemberFn(op, fn) \
+  Tensor &Tensor::op(const Tensor &in) {  \
+    fn(*this, in, this);                  \
+    return *this;                         \
   }
 
 GenUnaryTensorArgMemberFn(operator+=, Add);
@@ -211,12 +206,13 @@ GenUnaryTensorArgMemberFn(operator-=, Sub);
 GenUnaryTensorArgMemberFn(operator*=, EltwiseMult);
 GenUnaryTensorArgMemberFn(operator/=, Div);
 
-#define GenUnaryScalarArgMemberFn(op, fn)                                \
-  template <typename DType> Tensor &Tensor::op(DType x) {                      \
-    fn(*this, x, this);                                                        \
-    return *this;                                                              \
-  }                                                                            \
-  template Tensor &Tensor::op<float>(float x)
+#define GenUnaryScalarArgMemberFn(op, fn) \
+  template <typename DType>               \
+  Tensor &Tensor::op(const DType x) {     \
+    fn(*this, x, this);                   \
+    return *this;                         \
+  }                                       \
+  template Tensor &Tensor::op<float>(const float x)
 
 GenUnaryScalarArgMemberFn(operator-=, Sub);
 GenUnaryScalarArgMemberFn(operator+=, Add);
@@ -224,103 +220,105 @@ GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFn(operator/=, Div);
 
 // ====================Tensor Operations=======================================
-void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                    size_t dst_offset, size_t src_offset) {
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                    const size_t dst_offset, const size_t src_offset) {
   auto width = SizeOf(src.data_type());
   CHECK_EQ(width, SizeOf(dst->data_type()));
   size_t nBytes = num * width;
-  dst_offset *= width;
-  src_offset *= width;
-  CHECK_GE(src.MemSize(), src_offset + nBytes);
-  CHECK_GE(dst->MemSize(), dst_offset + nBytes);
+  auto d_offset = dst_offset * width;
+  auto s_offset = src_offset * width;
+  CHECK_GE(src.MemSize(), s_offset + nBytes);
+  CHECK_GE(dst->MemSize(), d_offset + nBytes);
 
   Device *src_dev = src.device(), *dst_dev = dst->device();
   Blob *from = src.blob(), *to = dst->blob();
   if (dst_dev->lang() != src_dev->lang()) {
     // let the non-cpp device conduct the copy op
     if (dst_dev->lang() == kCpp) {
-      src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, dst_offset,
-                              src_offset);
+      src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, d_offset,
+                              s_offset);
     } else if (src_dev->lang() == kCpp) {
-      dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, dst_offset,
-                              src_offset);
+      dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, d_offset,
+                              s_offset);
     } else {
       LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
     }
   } else {
     auto direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
-    src_dev->CopyDataToFrom(to, from, nBytes, direct, dst_offset, src_offset);
+    src_dev->CopyDataToFrom(to, from, nBytes, direct, d_offset, s_offset);
   }
 }
 //============================================================================
 /// typedef DType according to type value.
 /// DType would be used in the code block __VA_ARGS__.
-#define TYPE_SWITCH(type, DType, ...)                                          \
-  do {                                                                         \
-    switch (type) {                                                            \
-    case kFloat32: {                                                           \
-      typedef float DType;                                                     \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case kInt: {                                                               \
-      typedef int DType;                                                       \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case kChar: {                                                              \
-      typedef char DType;                                                      \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    default:                                                                   \
-      LOG(FATAL) << "Unknow data type = " << DataType_Name(type);              \
-    }                                                                          \
+#define TYPE_SWITCH(type, DType, ...)                               \
+  do {                                                              \
+    switch (type) {                                                 \
+      case kFloat32: {                                              \
+        typedef float DType;                                        \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kInt: {                                                  \
+        typedef int DType;                                          \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kChar: {                                                 \
+        typedef char DType;                                         \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      default:                                                      \
+        LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
+    }                                                               \
   } while (0)
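+// Illustrative use of TYPE_SWITCH; ZeroFill is a hypothetical helper, not
+// part of SINGA. The block is compiled once per case, with DType bound to
+// float, int or char according to the runtime 'type' value:
+//   void ZeroFill(DataType type, void *data, size_t num) {
+//     TYPE_SWITCH(type, DType, {
+//       DType *ptr = static_cast<DType *>(data);
+//       for (size_t i = 0; i < num; i++) ptr[i] = DType(0);
+//     });
+//   }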
 
 /// typedef DType and Lang according to data type and device programming
 /// language respectively.
 /// type is from DataType, and lang is from LangType.
 /// DType and Lang would be used in __VA_ARGS__.
-#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)                       \
-  do {                                                                         \
-    const int _SwitchShift = 3;                                                \
-    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);                     \
-    switch (_SwitchHash) {                                                     \
-    case ((kFloat32 << _SwitchShift) + kCuda): {                               \
-      typedef float DType;                                                     \
-      typedef lang::Cuda Lang;                                                 \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case ((kFloat32 << _SwitchShift) + kCpp): {                                \
-      typedef float DType;                                                     \
-      typedef lang::Cpp Lang;                                                  \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case ((kFloat32 << _SwitchShift) + kOpencl): {                             \
-      typedef float DType;                                                     \
-      typedef lang::Opencl Lang;                                               \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    default:                                                                   \
-      LOG(FATAL) << "Unknown combination of data type "                        \
-                 << DataType_Name(dtype) << " and language "                   \
-                 << LangType_Name(ltype);                                      \
-    }                                                                          \
+#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)       \
+  do {                                                         \
+    const int _SwitchShift = 3;                                \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);     \
+    switch (_SwitchHash) {                                     \
+      case ((kFloat32 << _SwitchShift) + kCuda): {             \
+        typedef float DType;                                   \
+        typedef lang::Cuda Lang;                               \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kCpp): {              \
+        typedef float DType;                                   \
+        typedef lang::Cpp Lang;                                \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kOpencl): {           \
+        typedef float DType;                                   \
+        typedef lang::Opencl Lang;                             \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      default:                                                 \
+        LOG(FATAL) << "Unknown combination of data type "      \
+                   << DataType_Name(dtype) << " and language " \
+                   << LangType_Name(ltype);                    \
+    }                                                          \
   } while (0)
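+// Worked example of the dispatch key: _SwitchShift = 3 reserves the low
+// three bits for the language tag, so (dtype, ltype) maps to the unique
+// key dtype * 8 + ltype whenever ltype < 8. E.g., assuming kFloat32 = 0
+// and kCuda = 1 (illustrative values), _SwitchHash = (0 << 3) + 1 = 1,
+// which only the (kFloat32, kCuda) case label can produce.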
 
-template <typename SType> void Tensor::SetValue(const SType x) {
+// =============Element-wise operations====================================
+template <typename SType>
+void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
   auto size = Size();
   auto ptr = blob_;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     // cast x to DType
-    device_->Exec(
-        [size, x, ptr](Context *ctx) { Set<DType, Lang>(size, x, ptr, ctx); },
-        {}, {ptr});
+    device_->Exec([size, x, ptr](Context *ctx) {
+      Set<DType, Lang>(size, x, ptr, ctx);
+    }, {}, {ptr});
   });
 }
 template void Tensor::SetValue<float>(const float x);
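+// Usage sketch: SetValue enqueues the Set kernel on the tensor's device,
+// e.g.
+//   Tensor t;
+//   t.Reshape(Shape{2, 3});
+//   t.SetValue(0.5f);  // every element becomes 0.5f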
@@ -328,21 +326,19 @@ template void Tensor::SetValue<float>(const float x);
 #define EltwiseUnaryTensorFn(fn, t, ret)                               \
   do {                                                                 \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
-      ret->device()->Exec(                                             \
-          [t, ret](Context* ctx) {                                     \
-            fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);     \
-          },                                                           \
-          {t.blob()}, {ret->blob()});                                  \
+      ret->device()->Exec([t, ret](Context * ctx) {                    \
+        fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);         \
+      }, {t.blob()}, {ret->blob()});                                   \
     });                                                                \
   } while (0)
 
-#define GenUnaryTensorFn(fn)                          \
-  Tensor fn(const Tensor &t) {                        \
-    Tensor ret(t.shape(), t.device(), t.data_type()); \
-    auto *retptr = &ret;                              \
-    EltwiseUnaryTensorFn(fn, t, retptr);              \
-    return ret;                                       \
-  }                                                   \
+#define GenUnaryTensorFn(fn)                             \
+  Tensor fn(const Tensor &in) {                          \
+    Tensor ret(in.shape(), in.device(), in.data_type()); \
+    auto *retptr = &ret;                                 \
+    EltwiseUnaryTensorFn(fn, in, retptr);                \
+    return ret;                                          \
+  }                                                      \
   void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); }
 
 GenUnaryTensorFn(Abs);
@@ -355,33 +351,89 @@ GenUnaryTensorFn(Sqrt);
 GenUnaryTensorFn(Square);
 GenUnaryTensorFn(Tanh);
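+// For exposition, GenUnaryTensorFn(Abs) expands to the following pair of
+// overloads (EltwiseUnaryTensorFn left unexpanded):
+//   Tensor Abs(const Tensor &in) {
+//     Tensor ret(in.shape(), in.device(), in.data_type());
+//     auto *retptr = &ret;
+//     EltwiseUnaryTensorFn(Abs, in, retptr);
+//     return ret;
+//   }
+//   void Abs(const Tensor &in, Tensor *out) {
+//     EltwiseUnaryTensorFn(Abs, in, out);
+//   }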
 
-// TODO(wangwei) conside async exec
-template <> float Sum<float>(const Tensor &t) {
-  float s = 0.0f;
-  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
-    t.device()->Exec(
-        [t, &s](Context *ctx) {
-          Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx);
-        },
-        {t.blob()}, {});
-  });
-  return s;
-}
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
+  do {                                                                         \
+    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {     \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
+      ret->device()->Exec([lhs, rhs, ret](Context * ctx) {                     \
+        fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), ctx); \
+      }, {lhs.blob(), rhs.blob()}, {ret->blob()});                             \
+    });                                                                        \
+  } while (0)
 
-Tensor Sum(const Tensor &M, int axis) {
-  if (axis == 0) {
-    Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
-    SumRows(M, &out);
-    return out;
-  } else {
-    CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
-    Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
-    SumColumns(M, &out);
-    return out;
+#define GenBinaryTensorFn(op, fn)                              \
+  Tensor op(const Tensor &lhs, const Tensor &rhs) {            \
+    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());    \
+    fn(lhs, rhs, &ret);                                        \
+    return ret;                                                \
+  }                                                            \
+  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
+    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                  \
   }
+
+GenBinaryTensorFn(operator+, Add);
+GenBinaryTensorFn(operator-, Sub);
+GenBinaryTensorFn(operator*, EltwiseMult);
+GenBinaryTensorFn(operator/, Div);
+GenBinaryTensorFn(Pow, Pow);
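+// With these overloads tensor arithmetic reads naturally; a sketch,
+// assuming x and y have matching shapes, devices and data types:
+//   Tensor z = x + y;      // via GenBinaryTensorFn(operator+, Add)
+//   Tensor w = Pow(x, y);  // w[i] = x[i]^y[i]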
+
+#define EltwiseTensorScalarFn(fn, t, x, ret)                            \
+  do {                                                                  \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {  \
+      static_assert(std::is_same<SType, DType>::value,                  \
+                    "The Scalar type must match the Tensor data type"); \
+      ret->device()->Exec([t, x, ret](Context * ctx) {                  \
+        fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);       \
+      }, {t.blob()}, {ret->blob()});                                    \
+    });                                                                 \
+  } while (0)
+
+#define GenTensorScalarFn(op, fn)                             \
+  template <typename SType>                                   \
+  Tensor op(const Tensor &in, const SType x) {                \
+    Tensor ret(in.shape(), in.device(), in.data_type());      \
+    fn(in, x, &ret);                                          \
+    return ret;                                               \
+  }                                                           \
+  template <typename SType>                                   \
+  void fn(const Tensor &in, const SType x, Tensor *ret) {     \
+    EltwiseTensorScalarFn(fn, in, x, ret);                    \
+  }                                                           \
+  template Tensor op<float>(const Tensor &in, const float x); \
+  template void fn<float>(const Tensor &in, const float x, Tensor *ret)
+
+GenTensorScalarFn(operator+, Add);
+GenTensorScalarFn(operator-, Sub);
+GenTensorScalarFn(operator*, EltwiseMult);
+GenTensorScalarFn(operator/, Div);
+GenTensorScalarFn(Pow, Pow);
+GenTensorScalarFn(operator<, LT);
+GenTensorScalarFn(operator<=, LE);
+GenTensorScalarFn(operator>, GT);
+GenTensorScalarFn(operator>=, GE);
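+// The comparison overloads return a dense 0/1 mask of the same shape and
+// data type, e.g. m = (x > 0.0f) sets m[i] to 1.0f where x[i] > 0 and to
+// 0.0f elsewhere.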
+template <typename SType>
+Tensor Div(const SType alpha, const Tensor &in) {
+  Tensor out(in.shape(), in.device(), in.data_type());
+  Div(alpha, in, &out);
+  return out;
 }
+template Tensor Div<float>(const float, const Tensor &);
 
-Tensor Average(const Tensor &t, int axis) {
+template <typename SType>
+void Div(const SType alpha, const Tensor &in, Tensor *out) {
+  CheckDataTypeAndLang(in, *out);
+  CHECK(in.shape() == out->shape());
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    // TODO(wangwei) type cast SType to DType;
+    in.device()->Exec([alpha, in, out](Context *ctx) {
+      Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
+    }, {in.blob()}, {out->blob()});
+  });
+}
+template void Div<float>(const float, const Tensor &, Tensor *);
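+// Note the asymmetric overload above: Div(alpha, in) computes
+// alpha / in[i] element-wise, so Div(1.0f, x) yields the reciprocal of x.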
+
+// =============Matrix operations============================================
+Tensor Average(const Tensor &M, int axis) {
   // operator/ only has implementation for float scalar type, hence it is
   // necessary to cast the denominator to a float.
   // TODO(wangwei) implement a function to cast the scalar type involved in Tensor
@@ -396,10 +448,34 @@ Tensor Average(const Tensor &t, int axis) {
   //    ....
   // }
   if (axis == 0) {
-    return Sum(t, 0) / (1.0f * t.shape().at(0));
+    return Sum(M, 0) / (1.0f * M.shape(0));
   } else {
     CHECK_EQ(axis, 1);
-    return Sum(t, 1) / (1.0f * t.shape().at(1));
+    return Sum(M, 1) / (1.0f * M.shape(1));
+  }
+}
+// TODO(wangwei) consider async exec
+template <>
+float Sum<float>(const Tensor &in) {
+  float s = 0.0f;
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    in.device()->Exec([in, &s](Context *ctx) {
+      Sum<DType, Lang>(in.Size(), in.blob(), &s, ctx);
+    }, {in.blob()}, {});
+  });
+  return s;
+}
+
+Tensor Sum(const Tensor &M, int axis) {
+  if (axis == 0) {
+    Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
+    SumRows(M, &out);
+    return out;
+  } else {
+    CHECK_EQ(axis, 1) << "Unsupported Sum over axis = " << axis;
+    Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
+    SumColumns(M, &out);
+    return out;
   }
 }
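+// Worked check of the axis convention: for M = [[1, 2], [3, 4]],
+// Sum(M, 0) reduces over rows via SumRows, giving {1 + 3, 2 + 4} = {4, 6},
+// and Average(M, 0) then divides by M.shape(0) = 2, giving {2, 3}.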
 
@@ -424,141 +500,10 @@ void SoftMax(const Tensor &in, int axis, Tensor *out) {
   DivColumn(sum, out);
 }
 
-#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
-  do {                                                                         \
-    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {     \
-      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
-      ret->device()->Exec(                                                     \
-          [lhs, rhs, ret](Context *ctx) {                                      \
-            fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),   \
-                            ctx);                                              \
-          },                                                                   \
-          {lhs.blob(), rhs.blob()}, {ret->blob()});                            \
-    });                                                                        \
-  } while (0)
-
-#define GenBinaryTensorFn(op, fn)                                        \
-  Tensor op(const Tensor &lhs, const Tensor &rhs) {                            \
-    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());                    \
-    fn(lhs, rhs, &ret);                                                        \
-    return ret;                                                                \
-  }                                                                            \
-  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {                 \
-    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                                  \
-  }
-
-GenBinaryTensorFn(operator+, Add);
-GenBinaryTensorFn(operator-, Sub);
-GenBinaryTensorFn(operator*, EltwiseMult);
-GenBinaryTensorFn(operator/, Div);
-GenBinaryTensorFn(Pow, Pow);
-
-#define EltwiseTensorScalarFn(fn, t, x, ret)                                   \
-  do {                                                                         \
-    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {         \
-      static_assert(std::is_same<SType, DType>::value,                         \
-                    "The Scalar type must match the Tensor data type");        \
-      ret->device()->Exec(                                                     \
-          [t, x, ret](Context *ctx) {                                          \
-            fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);          \
-          },                                                                   \
-          {t.blob()}, {ret->blob()});                                          \
-    });                                                                        \
-  } while (0)
-
-#define GenTensorScalarFn(op, fn)                                        \
-  template <typename SType> Tensor op(const Tensor &t, SType x) {              \
-    Tensor ret(t.shape(), t.device(), t.data_type());                          \
-    fn(t, x, &ret);                                                            \
-    return ret;                                                                \
-  }                                                                            \
-  template <typename SType> void fn(const Tensor &t, SType x, Tensor *ret) {   \
-    EltwiseTensorScalarFn(fn, t, x, ret);                                      \
-  }                                                                            \
-  template Tensor op<float>(const Tensor &t, float x);                         \
-  template void fn<float>(const Tensor &t, const float x, Tensor *ret)
-
-GenTensorScalarFn(operator+, Add);
-GenTensorScalarFn(operator-, Sub);
-GenTensorScalarFn(operator*, EltwiseMult);
-GenTensorScalarFn(operator/, Div);
-GenTensorScalarFn(Pow, Pow);
-GenTensorScalarFn(operator<, LT);
-GenTensorScalarFn(operator<=, LE);
-GenTensorScalarFn(operator>, GT);
-GenTensorScalarFn(operator>=, GE);
-
-// ================Blas operations============================================
-Tensor Mult(const Tensor &lhs, const Tensor &rhs) {
-  Tensor ret(Shape{lhs.shape(0), rhs.shape(1)}, lhs.device(), lhs.data_type());
-  Mult(lhs, rhs, &ret);
-  return ret;
-}
-
-void Mult(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {
-  Mult(1.0f, lhs, rhs, 0.0f, ret);
-}
-
-void Mult(const float alpha, const Tensor &A, const Tensor &B, const float beta,
-          Tensor *C) {
-  CHECK_EQ(A.shape().size(), 2u);
-  if (B.nDim() == 1u) {
-    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
-      C->device()->Exec(
-          [alpha, A, beta, B, C](Context *ctx) {
-            GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), alpha,
-                              A.blob(), B.blob(), beta, C->blob(), ctx);
-          },
-          {A.blob(), B.blob()}, {C->blob()});
-    });
-  } else {
-    CHECK(!C->transpose());
-    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
-      C->device()->Exec(
-          [alpha, A, beta, B, C](Context *ctx) {
-            GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0),
-                              B.shape(1), A.shape(1), alpha, A.blob(), B.blob(),
-                              beta, C->blob(), ctx);
-          },
-          {A.blob(), B.blob()}, {C->blob()});
-    });
-  }
-}
-
-void Bernoulli(float p, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [p, t](Context *ctx) {
-          Bernoulli<DType, Lang>(t->Size(), p, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-void Uniform(float low, float high, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [low, high, t](Context *ctx) {
-          Uniform<DType, Lang>(t->Size(), low, high, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-void Gaussian(float mean, float std, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [mean, std, t](Context *ctx) {
-          Gaussian<DType, Lang>(t->Size(), mean, std, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-// ======follow the consistency guide
 void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
 /// Add column 'v' onto each column of matrix M;
-void AddColumn(const float alpha, const float beta, const Tensor &v,
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                Tensor *M) {
   if (M->transpose()) {
     Tensor X = M->T();
@@ -570,15 +515,19 @@ void AddColumn(const float alpha, const float beta, const Tensor &v,
     CHECK_EQ(nb_row, v.Size());
 
     Tensor one(Shape{1, nb_col}, M->device(), M->data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Tensor vmat = Reshape(v, Shape{nb_row, 1});
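+    // Rank-1 update: with 'one' a 1 x nb_col row of ones,
+    // M = alpha * vmat * one + beta * M broadcasts v across every column.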
     Mult(alpha, vmat, one, beta, M);
   }
 }
+template void AddColumn<float>(const float alpha, const float beta,
+                               const Tensor &v, Tensor *M);
+
 void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
 
 /// Add alpha * v to every row of matrix M, scaling the original M by beta.
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
   if (M->transpose()) {
     Tensor X = M->T();
     AddColumn(v, &X);
@@ -594,29 +543,8 @@ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
     Mult(alpha, one, vmat, beta, M);
   }
 }
-
-template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
-  Tensor out(in.shape(), in.device(), in.data_type());
-  Div(alpha, in, &out);
-  return out;
-}
-
-template Tensor Div<float>(const float, const Tensor &);
-
-template <typename SType>
-void Div(const SType alpha, const Tensor &in, Tensor *out) {
-  CheckDataTypeAndLang(in, *out);
-  CHECK(in.shape() == out->shape());
-  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-    // TODO(wangwei) type cast SType to DType;
-    in.device()->Exec(
-        [alpha, in, out](Context *ctx) {
-          Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
-        },
-        {in.blob()}, {out->blob()});
-  });
-}
-template void Div<float>(const float, const Tensor &, Tensor *);
+template void AddRow<float>(const float alpha, const float beta,
+                            const Tensor &v, Tensor *M);
 
 /// Divide each column of matrix M by the vector 'v' element-wise, in place.
 void DivColumn(const Tensor &v, Tensor *M) {
@@ -640,12 +568,10 @@ void MultColumn(const Tensor &v, Tensor *M) {
   CHECK_EQ(v.Size(), M->shape(0));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
-    v.device()->Exec(
-        [M, v](Context *ctx) {
-          DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(),
-                            v.blob(), M->blob(), ctx);
-        },
-        {M->blob(), v.blob()}, {M->blob()});
+    v.device()->Exec([M, v](Context *ctx) {
+      DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(), v.blob(),
+                        M->blob(), ctx);
+    }, {M->blob(), v.blob()}, {M->blob()});
   });
 }
 
@@ -657,12 +583,10 @@ void MultRow(const Tensor &v, Tensor *M) {
   CHECK_EQ(v.Size(), M->shape(1));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
-    v.device()->Exec(
-        [M, v](Context *ctx) {
-          DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
-                            M->blob(), ctx);
-        },
-        {M->blob(), v.blob()}, {M->blob()});
+    v.device()->Exec([M, v](Context *ctx) {
+      DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
+                        M->blob(), ctx);
+    }, {M->blob(), v.blob()}, {M->blob()});
   });
 }
 
@@ -680,8 +604,8 @@ void SumColumns(const Tensor &M, Tensor *v) {
     size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
     CHECK_EQ(nb_row, v->Size());
 
-    Tensor one(Shape{nb_col, 1}, M.device(), M.data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Tensor one(Shape{nb_col}, M.device(), M.data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Mult(M, one, v);
   }
 }
@@ -695,10 +619,98 @@ void SumRows(const Tensor &M, Tensor *v) {
     size_t nb_row = M.shape(0), nb_col = M.shape(1);
     CHECK_EQ(nb_col, v->Size());
 
-    Tensor one(Shape{nb_row, 1}, M.device(), M.data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Tensor one(Shape{nb_row}, M.device(), M.data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Tensor X = M.T();
     Mult(X, one, v);
   }
 }
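+// Both SumColumns and SumRows reduce through a ones vector: since
+// (M * one)[i] = sum_j M[i][j] * 1, a single GEMV call does the reduction.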
+// ====================Random operations=====================================
+template <typename SType>
+void Bernoulli(const SType p, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto prob = TypeCast<SType, DType>(p);
+    out->device()->Exec([prob, out](Context *ctx) {
+      Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Bernoulli<float>(const float p, Tensor *out);
+
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto l = TypeCast<SType, DType>(low);
+    auto h = TypeCast<SType, DType>(high);
+    out->device()->Exec([l, h, out](Context *ctx) {
+      Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Uniform<float>(const float low, const float high, Tensor *out);
+
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto m = TypeCast<SType, DType>(mean);
+    auto s = TypeCast<SType, DType>(std);
+    out->device()->Exec([m, s, out](Context *ctx) {
+      Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Gaussian<float>(const float mean, const float std, Tensor *out);
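+// Illustrative initialization using the three samplers above (the shapes
+// and parameter values are arbitrary):
+//   Tensor w;
+//   w.Reshape(Shape{64, 32});
+//   Gaussian(0.0f, 0.01f, &w);  // w[i] ~ N(0, 0.01^2)
+//   Tensor mask;
+//   mask.Reshape(w.shape());
+//   Bernoulli(0.8f, &mask);     // 0/1 mask with P(1) = 0.8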
+
+// ================Blas operations============================================
+template <typename SType>
+void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    auto a = TypeCast<SType, DType>(alpha);
+    out->device()->Exec([a, in, out](Context *ctx) {
+      Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
+    }, {in.blob(), out->blob()}, {out->blob()});
+  });
+}
+template void Axpy<float>(const float alpha, const Tensor &in, Tensor *out);
+
+Tensor Mult(const Tensor &A, const Tensor &B) {
+  Shape s;
+  s.push_back(A.shape(0));
+  if (B.nDim() == 2) s.push_back(B.shape(1));
+  Tensor out(s, A.device(), A.data_type());
+  Mult(A, B, &out);
+  return out;
+}
+
+void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+  Mult(1.0f, A, B, 0.0f, out);
+}
+
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+          Tensor *C) {
+  CHECK_EQ(A.shape().size(), 2u);
+  if (B.nDim() == 1u) {
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+      auto a = TypeCast<SType, DType>(alpha);
+      auto b = TypeCast<SType, DType>(beta);
+      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+        GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
+                          B.blob(), b, C->blob(), ctx);
+      }, {A.blob(), B.blob()}, {C->blob()});
+    });
+  } else {
+    CHECK(!C->transpose());
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+      auto a = TypeCast<SType, DType>(alpha);
+      auto b = TypeCast<SType, DType>(beta);
+      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+        GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+                          A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
+      }, {A.blob(), B.blob()}, {C->blob()});
+    });
+  }
+}
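+// Mult dispatches on the rank of B: for A of shape (m, k), a 1-D B of
+// length k takes the GEMV path and yields a length-m vector, while a 2-D
+// B of shape (k, n) takes the GEMM path and yields an (m, n) matrix.
+// alpha and beta follow the BLAS convention C = alpha * A * B + beta * C.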
+
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 1bf6fc7..b5d0ba9 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -29,12 +29,14 @@ namespace singa {
 /// device programming language, e.g., kCpp, kCuda
 ///
 /// TODO(wangwei) Clean the functions to make the function APIs consistent:
-/// 1. All function names should be like XxxYyy or XY, i.e., capitablize the first
+/// 1. All function names should be like XxxYyy or XY, i.e., capitalize the
+/// first
 ///    letter.
 /// 2. Order functions based on function name in alphabetical order.
-/// 3. Function arguments order is [const basic type] [const Blob] [mutable Blob].
+/// 3. Function arguments order is [const basic type] [const Blob] [mutable
+/// Blob].
 /// 4. Function argument names, use 'num' for total number of elements in
-///    elementwise operations; use 'in1' 'in2' for input blobs; use 'out' for
+///    elementwise operations; use 'in1' 'in2' for input blobs; use 'out' for
 ///    output blob or value. With exceptions for some functions, e.g.,
 ///      Scale(const float alpha, const Blob* in, Blob* out);
 ///    For such cases, use x, v, alpha, etc for scalar types.
@@ -46,262 +48,283 @@ namespace singa {
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Blob level math functions.
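+/// For instance, a hypothetical 'Shift' following rules 3, 4 and 7 would
+/// be declared as:
+///   void Shift(const size_t num, const DType x, const Blob *in, Blob *out,
+///              Context *ctx);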
 
-
-// ================Linear algebra functions====================================
-/// ret[i] = |input[i]|
+// =============Element-wise operations====================================
+/// out[i] = |in[i]|
 template <typename DType, typename Lang>
 void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
+/// out = in + x
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Set Not Implemented";
+void Add(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add Not Implemented";
 }
 
-/// sum all elements of input into ret
+/// out = in1 + in2
 template <typename DType, typename Lang>
-void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
-  LOG(FATAL) << "Sum Not Implemented";
+void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add-Pair Not Implemented";
 }
-
-/// ret[i] = sign(input[i])
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sign Not Implemented";
+void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Clamp Not Implemented";
 }
 
-/// Base is e, Neper number. ret[i]=exp(input[i])
+/// out = x / in
 template <typename DType, typename Lang>
-void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Exp Not Implemented";
+void Div(const size_t num, const DType x, const Blob *in, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div Not Implemented";
 }
 
-/// Natual logarithm, the base is e, Neper number ret[i]=log(input[i]).
-template <typename DType, typename Lang>
-void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Log Not Implemented";
-}
-/// Element-wise operation, ret[i]=sqrt([input[i])
+/// out = in / x. This reuses the scalar EltwiseMult defined further below;
+/// because this header orders functions alphabetically, EltwiseMult must be
+/// forward declared before its first use here.
+template <typename DType, typename Lang>
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+                 Context *ctx);
+
 template <typename DType, typename Lang>
-void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sqrt Not Implemented";
+void Div(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  CHECK_NE(x, 0.f);
+  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
-/// Element-wise operation, ret[i]=square([input[i])
+/// out = in1 / in2
 template <typename DType, typename Lang>
-void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Square Not Implemented";
+void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
-/// Element-wise operation, ret[i]=tanh([input[i])
+/// out = in * x
 template <typename DType, typename Lang>
-void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Tanh Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult Not Implemented";
 }
-/// Element-wise operation, ret[i]=max(0, input[i])
+
+/// out = in1 * in2
 template <typename DType, typename Lang>
-void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "ReLU Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
-/// Element-wise operation, ret[i]=sigmoid([input[i])
+
+/// Base is e, Neper number. out[i]=exp(in[i])
 template <typename DType, typename Lang>
-void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sigmoid Not Implemented";
+void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Exp Not Implemented";
 }
 
-// Do softmax for each row invidually
+/// out[i]=(in[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void Softmax(const size_t nrow, const size_t ncol, const Blob *in, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "Softmax Not Implemented";
+void LE(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "LE Not Implemented";
 }
-
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the input matrix into a vector
+/// Natural logarithm, the base is e (Neper number): out[i]=log(in[i]).
 template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "SumRows Not Implemented";
+void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Log Not Implemented";
 }
-
-/// Sum the columns of the input matrix into a vector
+/// out[i]=(in[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, 
-	        Blob *out, Context *ctx) {
-  LOG(FATAL) << "SumColumns Not Implemented";
+void LT(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "LT Not Implemented";
 }
-
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out 
+/// out[i]=(in[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddRow Not Implemented";
+void GE(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "GE Not Implemented";
 }
-
-/// Add the vector v to every column of A as the column of out
+/// out[i]=(in[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddCol Not Implemented";
+void GT(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "GT Not Implemented";
 }
-
-/// Element-wise operation, do v^x for every v from the input tensor
+/// Element-wise operation, do v^x for every v from the 'in' tensor
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
+void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
-/// Element-wise operation, clamp every element into [low, high]
-/// if x>high, then x=high; if x<low, then x=low.
+/// Element-wise operation, out[i]=max(0, in[i])
 template <typename DType, typename Lang>
-void Clamp(const size_t num, const DType low, const DType high, const Blob *in, 	   Blob *out, Context *ctx) {
-  LOG(FATAL) << "Clamp Not Implemented";
+void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "ReLU Not Implemented";
 }
 
-/// ret = input + x
 template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in, const DType x, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Add Not Implemented";
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Set Not Implemented";
 }
-
-/// ret = lhs + rhs
+/// Element-wise operation, out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Add-Pair Not Implemented";
+void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
-/// ret =  input - x
+/// out[i] = sign(in[i])
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  Add<DType, Lang>(num, in, -x, out, ctx);
+void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sign Not Implemented";
 }
-
-/// ret = lhs - rhs
+/// Element-wise operation, out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sub-Pair Not Implemented";
+void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sqrt Not Implemented";
 }
 
-/// ret = input * x
+/// Element-wise operation, out[i]=square(in[i])
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
-                 Context *ctx) {
-  LOG(FATAL) << "EltwiseMult Not Implemented";
+void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Square Not Implemented";
 }
 
-/// ret = lhs * rhs
+/// out = in - x
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, 
-		 Blob *out, Context *ctx) {
-  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
+void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  Add<DType, Lang>(num, in, -x, out, ctx);
 }
 
-/// ret = input / x
+/// out = in1 - in2
 template <typename DType, typename Lang>
-void Div(const size_t num, const DType x, const Blob *in, 
-	 Blob *out, Context *ctx) { 
-  LOG(FATAL) << "Div Not Implemented";
+void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Sub-Pair Not Implemented";
 }
-
+/// Sum all elements of 'in' into 'out'
 template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  CHECK_NE(x,0.f);
-  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
+void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Sum Not Implemented";
 }
 
-/// ret = lhs / rhs
+/// Element-wise operation, out[i]=tanh(in[i])
 template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Div-Pair Not Implemented";
+void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Tanh Not Implemented";
 }
 
+// =========== Matrix operations ===========================================
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
+}
 /// outer-product.
-/// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
+/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2, 
-	   Blob *out, Context *ctx) {
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
   LOG(FATAL) << "Outer Not Implemented";
 }
-
-/// ret[i]=(input[i]<x)?1.f:0.f
+// Do softmax for each row individually
 template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "LT Not Implemented";
+void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "Softmax Not Implemented";
 }
-/// ret[i]=(input[i]<=x)?1.f:0.f
+/// Sum the columns of the 'in' matrix into a vector
 template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "LE Not Implemented";
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the 'in' matrix into a vector
 template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "GT Not Implemented";
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
+}
+
+// ================Random functions===========================================
+/// Each element of out would be 1 with prob p and 0 with prob 1-p; 0 <= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
+template <typename DType, typename Lang>
+void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Bernoulli Not Implemented";
 }
-/// ret[i]=(input[i]>=x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "GE Not Implemented";
+void Gaussian(const size_t num, const float mean, const float std, Blob *out,
+              Context *ctx) {
+  LOG(FATAL) << "Gaussian Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
+template <typename DType, typename Lang>
+void Uniform(const size_t num, const float low, const float high, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "Uniform Not Implemented";
 }
 
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
-// ===== Level 1
-/// return the index of the element with the max value.
+/// return the index of the element with the max value.
 template <typename DType, typename Lang>
 void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amax Not Implemented";
 }
 
-/// return the index of the element with the min value.
+/// return the index of the element with the min value.
 template <typename DType, typename Lang>
 void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amin Not Implemented";
 }
-/// ret = sum |x| for all x in input
+/// out = sum |x| for all x in 'in'
 template <typename DType, typename Lang>
 void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Asum Not Implemented";
 }
 
-/// ret = alpha * input + ret
+/// out = alpha * in + out
 template <typename DType, typename Lang>
-void Axpy(const size_t num, const DType alpha, const Blob *in, 
-	  Blob *out, Context *ctx) {
+void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
+          Context *ctx) {
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
-/// ret *= x
+/// out *= x
 template <typename DType, typename Lang>
 void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Scale Not Implemented";
 }
 
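+/// out = the dot product of in1 and in2, i.e., sum_i in1[i] * in2[i]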
 template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2, 
-	 DType *out, Context *ctx) {
+void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+         Context *ctx) {
   LOG(FATAL) << "Dot Not Implemented";
 }
 
-// ===== Level 2
-/// ret = alpha * op(A) * v + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+/// out = alpha * A * v + beta * out.
+/// 'trans' indicates whether the internal data layout of A is transposed
 template <typename DType, typename Lang>
-void GEMV(bool trans, const size_t m, const size_t n, const DType alpha, 
-	  const Blob *A, const Blob *v,
-          const DType beta, Blob *out, Context *ctx) {
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+          const Blob *A, const Blob *v, const DType beta, Blob *out,
+          Context *ctx) {
   LOG(FATAL) << "GEMV Not Implemented";
 }
 
@@ -323,34 +346,5 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
   LOG(FATAL) << "GEMM Not Implemented";
 }
 
-
-// ===== Level 3
-
-// ================Random functions===========================================
-/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-// Get the random generator from 'ctx'
-// If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lang>
-void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Bernoulli Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lang>
-void Uniform(const size_t num, const float low, const float high, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "Uniform Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lang>
-void Gaussian(const size_t num, const float mean, const float std, 
-	      Blob *out, Context *ctx) {
-  LOG(FATAL) << "Gaussian Not Implemented";
-}
-
-
-
-
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_