You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/13 13:20:28 UTC

[35/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index ec7a892..2c5c272 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -25,12 +25,11 @@
 #include <cblas.h>
 #endif
 
-/// TODO(wangwei) Clean the implementations following the comments in
-/// tensor_math.h.
 namespace singa {
 
-template<>
-void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+template <>
+void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
@@ -39,180 +38,150 @@ void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context
 }
 
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] + x;
+  }
 }
 
-// sum all elements of input into out
-// TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
-  float s = 0.f;
-  const float *inPtr = static_cast<const float *>(in->data());
+void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
-    s += inPtr[i];
+    outPtr[i] = in1Ptr[i] + in2Ptr[i];
   }
-  *out = s;
 }
 
 template <>
-void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Clamp<float, lang::Cpp>(const size_t num, const float low,
+                             const float high, const Blob *in, Blob *out,
+                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float*>(in->data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; 
+    if (inPtr[i] > high) {
+      outPtr[i] = high;
+    } else if (inPtr[i] < low) {
+      outPtr[i] = low;
+    } else {
+      outPtr[i] = inPtr[i];
+    }
   }
 }
 
 template <>
-void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(in2Ptr[i], 0.f);
+    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+  }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
+                           Blob *out, Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = exp(inPtr[i]);
+    CHECK_NE(inPtr[i], 0.f);
+    outPtr[i] = x / inPtr[i];
   }
 }
 
 template <>
-void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in,
+                                   const float x, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
-    outPtr[i] = log(inPtr[i]);
+    outPtr[i] = inPtr[i] * x;
   }
 }
 
 template <>
-void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1,
+                                   const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] * in2Ptr[i];
+  }
+}
+template <>
+void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
-    outPtr[i] = sqrt(inPtr[i]);
+    outPtr[i] = exp(inPtr[i]);
   }
 }
 
 template <>
-void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] * inPtr[i];
+    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
   }
 }
 
 template <>
-void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = tanh(inPtr[i]);
+    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
   }
 }
-
 template <>
-void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
   }
 }
-
 template <>
-void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                           Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = log(inPtr[i]);
   }
 }
-
 template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
-             Blob *out, Context *ctx) {
+void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-	float *bPtr = new float[ncol];
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-		float denom = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-			bPtr[c] = exp(inPtr[offset + c]);
-			denom += bPtr[c];
-    }
-		for (size_t c = 0; c < ncol; c++) {
-			size_t idx = offset + c;
-			outPtr[idx] = bPtr[c] / denom;
-		}
-  }
-	delete bPtr;
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
-             Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		outPtr[r] = 0.f;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[r] += inPtr[offset + c];
-		}
-	}
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t c = 0; c < ncol; c++) {
-		outPtr[c] = 0.f;
-	}
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-				outPtr[c] += inPtr[offset + c];
-		}
-	}
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());                              
-  const float *vPtr = static_cast<const float *>(v->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[offset + c] = APtr[offset + c] + vPtr[c];
-		}
-	}
-}
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());                              
-  const float *vPtr = static_cast<const float *>(v->data());                              
-	for (size_t r = 0; r < nrow; r++) {
-		size_t offset = r * ncol;
-		for (size_t c = 0; c < ncol; c++) {
-			outPtr[offset + c] = APtr[offset + c] + vPtr[r];
-		}
-	}
-}
-
-template <>
-void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t i = 0; i < num; i++) {
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+  }
+}
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
     outPtr[i] = pow(inPtr[i], x);
   }
 }
@@ -220,252 +189,230 @@ void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob
 template <>
 void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
                            Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr= static_cast<const float *>(in1->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
     outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
   }
 }
-
 template <>
-void Clamp<float, lang::Cpp>(const size_t num, const float low, const float high, const Blob *in,
-														 Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());                              
-	for (size_t i = 0; i < num; i++) {
-		if (inPtr[i] > high) {
-			outPtr[i] = high;
-		}
-		else if (inPtr[i] < low) {
-			outPtr[i] = low;
-		}
-		else {
-			outPtr[i] = inPtr[i];			
-		}
-	}
+void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+  }
 }
-
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x, 
-													 Blob *out, Context *ctx) {
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+template <>
+void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] + x;
+    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
   }
 }
 
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] + in2Ptr[i];
+    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f;
   }
 }
 
 template <>
-void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                           Blob *out, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] - in2Ptr[i];
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = sqrt(inPtr[i]);
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
+void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] * x;
+    outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
+void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] * in2Ptr[i];
+    outPtr[i] = in1Ptr[i] - in2Ptr[i];
   }
 }
 
+// sum all elements of input into out
+// TODO(wangwei) optimize using omp
 template <>
-void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-                                   Blob *out, Context *ctx) {
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                           Context *ctx) {
+  float s = 0.f;
+  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-		CHECK_NE(in2Ptr[i],0.f);
-    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+    s += inPtr[i];
   }
+  *out = s;
 }
 
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in, 
-         								  Blob *out, Context *ctx) {
-	float *outPtr= static_cast<float *>(out->mutable_data());
+void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-		CHECK_NE(inPtr[i],0.f);
-    outPtr[i] = x / inPtr[i];
+    outPtr[i] = tanh(inPtr[i]);
   }
 }
 
+// =========Matrix operations ================================================
+
 template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
-           Blob *out, Context *ctx) {
-	float *outPtr= static_cast<float *>(out->mutable_data());
-	const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-	for (size_t r = 0; r < m ; r++) {
-		size_t offset = r * n;
-		for (size_t c = 0; c < n; c++) {
-			outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
-		}
-	}
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
 }
 
 template <>
-void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
   }
 }
-
 template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                             const Blob *in2, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
   }
 }
-
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
   }
+  delete bPtr;
 }
 
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Blob *in, Blob *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
   }
 }
 
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
-	size_t maxPos = 0;
-	float maxVal = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		if (i == 0) {
-			maxVal = inPtr[i]; 
-		}
-		else if (inPtr[i] > maxVal) {
-			maxVal = inPtr[i];
-			maxPos = i;
-		}
-	}
-	*out = maxPos;
-}
-
-template <>
-void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
-	size_t minPos = 0;
-	float minVal = 0;
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		if (i == 0) {
-			minVal = inPtr[i]; 
-		}
-		else if (inPtr[i] > minVal) {
-			minVal = inPtr[i];
-			minPos = i;
-		}
-	}
-	*out = minPos;
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
 }
 
+// ===============Random operations==========================================
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
-	float sum = 0;
-	const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {
-		sum += fabs(inPtr[i]);
-	}
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
+                                 Context *ctx) {
+  std::bernoulli_distribution distribution(p);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+  }
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
-          							 	  Blob *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
+                                const float std, Blob *out, Context *ctx) {
+  std::normal_distribution<float> distribution(mean, std);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-	for (size_t i = 0; i < num; i++) {	
-		outPtr[i] += alpha * inPtr[i];
-	}
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
 }
-
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-	for (size_t i = 0; i < num; i++) {
-		outPtr[i] *= x;
-	}
+void Uniform<float, lang::Cpp>(const size_t num, const float low,
+                               const float high, Blob *out, Context *ctx) {
+  std::uniform_real_distribution<float> distribution(low, high);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
 }
 
-//template <>
-//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-//         									 float *out, Context *ctx) {
-//	float sum = 0;
-//	const float *in1Ptr = static_cast<const float *>(in1->data());
-//	const float *in2Ptr = static_cast<const float *>(in2->data());
-//	for (size_t i = 0; i < num; i++) {
-//		sum += in1Ptr[i] * in2Ptr[i];
-//	}
-//}
-
-template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, const float alpha,
-          const Blob *A, const Blob *v, const float beta, 
-					Blob *out, Context *ctx) {
-	float *outPtr = static_cast<float *>(out->mutable_data());
-	const float* APtr = static_cast<const float *>(A->data());
-	const float* vPtr = static_cast<const float *>(v->data());
-	for (size_t r = 0; r < m; r++) {
-		float sum = 0; 
-		for (size_t c = 0; c < n; c++) {
-			size_t idx = trans ? c * m + r : r * n + c;	
-			sum += APtr[idx] * vPtr[c];
-		}
-		outPtr[r] = alpha * sum + beta * outPtr[r];
-	}
-}
+// ====================Blas operations======================================
 
 template <>
 void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
@@ -491,37 +438,21 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
   }
 }
 
+#ifdef USE_CBLAS
 template <>
-void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, Context *ctx) {
-  std::bernoulli_distribution distribution(p);
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                            Blob *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
-  }
+  cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
 }
-
-template <>
-void Uniform<float, lang::Cpp>(const size_t num, const float low, const float high, Blob *out,
-                               Context *ctx) {
-  std::uniform_real_distribution<float> distribution(low, high);
-  float *outPtr= static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-  }
-}
-
 template <>
-void Gaussian<float, lang::Cpp>(const size_t num, const float mean, const float std, Blob *out,
-                                Context *ctx) {
-  std::normal_distribution<float> distribution(mean, std);
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-  }
+  cblas_sscal(num, x, outPtr, 1);
 }
 
-
-#ifdef USE_CBLAS
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
                            float *out, Context *ctx) {
@@ -529,6 +460,21 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
   const float *in2Ptr = static_cast<const float *>(in2->data());
   *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
 }
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  if (!trans) {
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+                beta, outPtr, 1);
+  } else {
+    cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
+                outPtr, 1);
+  }
+}
 
 template <>
 void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
@@ -548,6 +494,98 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
               lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
+#else
+
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t maxPos = 0;
+  float maxVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      maxVal = inPtr[i];
+    } else if (inPtr[i] > maxVal) {
+      maxVal = inPtr[i];
+      maxPos = i;
+    }
+  }
+  *out = maxPos;
+}
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t minPos = 0;
+  float minVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      minVal = inPtr[i];
+    } else if (inPtr[i] > minVal) {
+      minVal = inPtr[i];
+      minPos = i;
+    }
+  }
+  *out = minPos;
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  float sum = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += fabs(inPtr[i]);
+  }
+}
+
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] += alpha * inPtr[i];
+  }
+}
+
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] *= x;
+  }
+}
+
+template <>
+void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           float *out, Context *ctx) {
+  float sum = 0;
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += in1Ptr[i] * in2Ptr[i];
+  }
+}
+
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < m; r++) {
+    float sum = 0;
+    for (size_t c = 0; c < n; c++) {
+      size_t idx = trans ? c * m + r : r * n + c;
+      sum += APtr[idx] * vPtr[c];
+    }
+    outPtr[r] = alpha * sum + beta * outPtr[r];
+  }
+}
+
 #endif  // USE_CBLAS
 }  // namespace singa
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 4a2ba66..f9841a3 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -26,75 +26,100 @@
 #include "singa/core/common.h"
 
 namespace singa {
-
-// TODO(wangwei) Clean implementations following comments in tensor_math_cpp.h.
-// TODO(wangwei) optimize using stream
+// =================Elementwise operations===================================
 template <>
-void Add<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
-                            Blob *ret, Context *ctx) {
-  const float *a = static_cast<const float *>(lhs->data());
-  const float *b = static_cast<const float *>(rhs->data());
-  float *c = static_cast<float *>(ret->mutable_data());
-  cuda::add(count, a, b, c);
+void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
+                            Blob *out, Context *ctx) {
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cuda::add(num, in1Ptr, in2Ptr, outPtr);
 }
 
-// TODO(wangwei) optimize using stream
+// follow the consistency guide of math API
 template <>
-void Sub<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
-                            Blob *ret, Context *ctx) {
-  const float *a = static_cast<const float *>(lhs->data());
-  const float *b = static_cast<const float *>(rhs->data());
-  float *c = static_cast<float *>(ret->mutable_data());
-  cuda::sub(count, a, b, c);
+void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in,
+                            Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::Div(num, x, inPtr, outPtr, ctx->stream);
 }
 
 template <>
-void EltwiseMult<float, lang::Cuda>(int count, const Blob *input, float x,
-                                    Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  cuda::mult(count, lptr, x, dptr);
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in,
+                                    const float x, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::mult(num, inPtr, x, outPtr);
 }
-// TODO(wangwei) optimize using stream
 template <>
-void Square<float, lang::Cuda>(int count, const Blob *input, Blob *ret,
-                               Context *ctx) {
-  const float *in = static_cast<const float *>(input->data());
-  float *out = static_cast<float *>(ret->mutable_data());
-  cuda::square(count, in, out);
+void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
 }
-
-// sum all elements of input into ret
-// TODO(wangwei) optimize using stream
 template <>
-void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret,
-                            Context *ctx) {
-  const float *in = static_cast<const float *>(input->data());
-  cuda::sum(count, in, ret);
+void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
 }
-
-// follow the consistency guide of math API
 template <>
-void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
-                            Blob *out, Context *ctx) {
+void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {  // float specialization for the Cuda backend
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  cuda::Div(num, alpha, inPtr, outPtr, ctx->stream);
+  cuda::LE(num, inPtr, x, outPtr, ctx->stream);  // forwards num, input, scalar x and output buffer on ctx->stream
+}
+// Passes `num`, the float view of `in`, the scalar `x` and the float view of
+// `out` to cuda::LT on ctx->stream.
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *dst = static_cast<float *>(out->mutable_data());
+  const float *src = static_cast<const float *>(in->data());
+  cuda::LT(num, src, x, dst, ctx->stream);
 }
-
 // Passes `num`, the scalar `x` and the float view of `out` to cuda::Set on
 // ctx->stream.
 template <>
 void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   cuda::Set(num, x, outPtr, ctx->stream);
 }
+// Passes `num` and the float views of `in` and `out` to cuda::square.
+// `ctx` is not used by this wrapper.
+// TODO(wangwei) optimize using stream
+template <>
+void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out,
+                               Context *ctx) {
+  float *dst = static_cast<float *>(out->mutable_data());
+  const float *src = static_cast<const float *>(in->data());
+  cuda::square(num, src, dst);
+}
+// Passes `num` and the float views of `in1`, `in2` and `out` to cuda::sub.
+// `ctx` is not used by this wrapper.
+// TODO(wangwei) optimize using stream
+template <>
+void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
+                            Blob *out, Context *ctx) {
+  float *dst = static_cast<float *>(out->mutable_data());
+  const float *lhs = static_cast<const float *>(in1->data());
+  const float *rhs = static_cast<const float *>(in2->data());
+  cuda::sub(num, lhs, rhs, dst);
+}
+// sum all elements of `in` into `out`: passes `num`, the float view of `in`
+// and the caller's `out` pointer to cuda::sum. `ctx` is not used here.
+// TODO(wangwei) optimize using stream
+template <>
+void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *src = static_cast<const float *>(in->data());
+  cuda::sum(num, src, out);
+}
+
+// =========================Blas operations==================================
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
                              const size_t ncol, const Blob *M, const Blob *v,
                              Blob *out, Context *ctx) {
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   const float *MPtr = static_cast<const float *>(M->data());
   const float *vPtr = static_cast<const float *>(v->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
@@ -106,6 +131,22 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
                              vPtr, 1, outPtr, ncol));
   }
 }
+// Matrix-vector product through cublasSgemv, accumulating into `out` with
+// scalars `alpha` and `beta`.
+// cublas uses column major order, so the two cases are issued as:
+//   trans == true : CUBLAS_OP_N with dimensions (m, n) and leading dimension m
+//   trans == false: CUBLAS_OP_T with dimensions (n, m) and leading dimension n
+template <>
+void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
+                             const float alpha, const Blob *A, const Blob *v,
+                             const float beta, Blob *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  if (trans) {
+    CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, APtr, m, vPtr,
+                             1, &beta, outPtr, 1));
+  } else {
+    CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
+                             1, &beta, outPtr, 1));
+  }
+}
+
 // http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
 template <>
 void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
@@ -121,44 +162,11 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
   const float *APtr = static_cast<const float *>(A->data());
   const float *BPtr = static_cast<const float *>(B->data());
   float *CPtr = static_cast<float *>(C->mutable_data());
-  auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
 }
 
-template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out, Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out,  Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out, Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                   Blob* out,  Context *ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
-}
-
-
-
-
-
 }  // namespace singa
 
 #endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 823445f..94ca283 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -117,12 +117,11 @@ TEST_F(TestTensorMath, MemberTanh) {
 }
 
 TEST_F(TestTensorMath, Sum) {
-	Tensor p1(Shape{1,2});
-	p1 = Sum(e, 0);
+	Tensor p1 = Sum(e, 0);
   const float *dptr1 = p1.data<const float *>();
 	EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
 	EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
-	
+
 	Tensor p2(Shape{3,1});
 	p2 = Sum(e, 1);
   const float *dptr2 = p2.data<const float *>();
@@ -143,9 +142,9 @@ TEST_F(TestTensorMath, SoftMax) {
 	EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
 	EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
 	EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
-	
+
 	Tensor p2(Shape{3,2});
-	p2 = SoftMax(e,1); 
+	p2 = SoftMax(e,1);
   const float *dptr2 = p2.data<const float *>();
 	EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
 	EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
@@ -237,12 +236,12 @@ TEST_F(TestTensorMath, MemberDiv) {
 
 TEST_F(TestTensorMath, MemberBernoulli) {
 	Tensor p1(Shape{10000});
-	Bernoulli(0.3,&p1);
+	Bernoulli(0.3f, &p1);
 	const float* dptr1 = p1.data<const float*>();
 	float sum = 0;
 	for(int i = 0; i < 10000; i++) sum += dptr1[i];
 	float mean = sum/10000;
-	EXPECT_NEAR(mean, 0.3, 1e-2);
+	EXPECT_NEAR(mean, 0.3f, 1e-2);
 
 	sum = 0;
 	for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
@@ -267,7 +266,7 @@ TEST_F(TestTensorMath, MemberUniform) {
 
 TEST_F(TestTensorMath, MemberGaussian) {
 	Tensor p1(Shape{50000});
-	Gaussian(0.0,1.0,&p1);
+	Gaussian(0.0f,1.0f,&p1);
 	const float* dptr1 = p1.data<const float*>();
 	float sum = 0;
 	for(int i = 0; i < 50000; i++) sum += dptr1[i];