Posted to commits@singa.apache.org by wa...@apache.org on 2016/04/07 15:12:17 UTC

[1/4] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

Repository: incubator-singa
Updated Branches:
  refs/heads/master 369d87960 -> 8329aa0c3


SINGA-80 New Blob Level and Address Level Math Operation Interface

-------

Add gtest for blob-level functions
and fix bugs in math_blob.h


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/247002d3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/247002d3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/247002d3

Branch: refs/heads/master
Commit: 247002d3027b5ac45dda735f62cc2f8df7c2dcac
Parents: 0233049
Author: jinyangturbo <pk...@gmail.com>
Authored: Wed Mar 16 00:16:01 2016 -0700
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Apr 2 21:33:27 2016 +0800

----------------------------------------------------------------------
 include/singa/utils/blob.h      |  15 +
 include/singa/utils/math_blob.h |  70 +++--
 src/test/test_math.cc           | 587 +++++++++++++++++++++++++++++++++++
 3 files changed, 649 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/247002d3/include/singa/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
index 3351cff..9defeac 100644
--- a/include/singa/utils/blob.h
+++ b/include/singa/utils/blob.h
@@ -276,6 +276,21 @@ class Blob {
     ret.transpose_ = !transpose_;
     return ret;
   }
+  // Check whether two blobs have exactly the same content.
+  bool check_equal(Blob* other) const {
+    if (transpose() != other->transpose()) return false;
+    if (count() != other->count()) return false;
+    if (shape().size() != other->shape().size()) return false;
+    for (int i = 0; i < shape().size(); i++) {
+      if (shape(i) != other->shape(i)) return false;
+    }
+    const Dtype * a = cpu_data();
+    const Dtype * b = other->cpu_data();
+    for (int i = 0; i < count(); i++) {
+      if (a[i] != b[i]) return false;
+    }
+    return true;
+  }
 
  protected:
   std::shared_ptr<SyncedMemory> data_ = nullptr;

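For context, a minimal sketch of how the new check_equal() helper is exercised; the shape-based Blob constructor and SetValue() are taken from the tests added later in this commit:

  #include "singa/utils/blob.h"

  // Two blobs compare equal iff the transpose flag, element count, shape
  // and every element match (see check_equal() above).
  void CheckEqualExample() {
    singa::Blob<float>* a = new singa::Blob<float>(10);
    singa::Blob<float>* b = new singa::Blob<float>(10);
    a->SetValue(2);
    b->SetValue(2);
    bool same = a->check_equal(b);   // true: same shape, same content
    b->SetValue(3);
    same = a->check_equal(b);        // false: contents now differ
  }
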
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/247002d3/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
index 55ba44b..35985f1 100644
--- a/include/singa/utils/math_blob.h
+++ b/include/singa/utils/math_blob.h
@@ -32,14 +32,16 @@
 #include "singa/utils/context.h"
 
 namespace singa {
+
+#define NO_GPU LOG(FATAL) << "Not compiled with GPU";
 /**
 * \file math_blob.h is not tested thoroughly.
 * Only GEMM(), MMDot(), MVSumRow() and MVAddRow() are used now.
  */
 /************* BLAS level 1 *****************/
 /**
- * Scale each element of A with alpha, and put the result into B.
- * Bi = alpha*Ai
+ * Scale each element of A with alpha, and put the result into A.
+ * Ai = alpha*Ai
  * Use blas scale internally.
  */
 template<typename Dtype>
@@ -52,6 +54,8 @@ void Scale(Dtype alpha, Blob<Dtype> * B) {
 #ifdef USE_GPU
     gpu_scale(context->cublas_handle(device), B->count(), alpha,
         B->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif
   }
 }
@@ -70,7 +74,9 @@ void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
     gpu_axpy(context->cublas_handle(device), A.count(), alpha, A.gpu_data(),
         B->mutable_gpu_data());
-#endif  // USE_GPU
+#else
+    NO_GPU;
+#endif
   }
 }
 
@@ -111,6 +117,8 @@ void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A,
 #ifdef USE_GPU
     gpu_gemv(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n,
         alpha, beta, TranA, C->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -176,6 +184,8 @@ void GEMM(Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype>& B,
 #ifdef USE_GPU
     gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
         m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -216,9 +226,10 @@ Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) {
     res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
   } else {
 #ifdef USE_GPU
-    // gpu part
     res = gpu_dot(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
         n);
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
   return res;
@@ -242,12 +253,14 @@ void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
   auto context = Singleton<Context>::Instance();
   int device = context->device_id(std::this_thread::get_id());
   if (device < 0) {
-    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false,
-        C->mutable_cpu_data());
+    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, Dtype(1), Dtype(0), false,
+        false, C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
     gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
         m, n, 1, 1, 0, false, false, C->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -268,7 +281,7 @@ void Map(const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
     gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
 #else
-    LOG(ERROR) << "Not implemented";
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -310,7 +323,7 @@ void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
 #ifdef USE_GPU
     gpu_e_f<Op>(A.count(), A.gpu_data(), alpha, B->mutable_gpu_data());
 #else
-    LOG(FATAL) << "Not implemented";
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -328,9 +341,8 @@ void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
         C->mutable_cpu_data());
   } else {
-#ifdef USE_GPU
-    LOG(ERROR) << "Not implemented";
-#endif  // USE_GPU
+    // TODO(wangwei) implement gpu version.
+    NO_GPU;
   }
 }
 
@@ -353,7 +365,7 @@ void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
   CUDA_CHECK(cudaMemcpy(static_cast<Dtype*>(B->mutable_gpu_data()),
              A.gpu_data(), sizeof(Dtype) * A.count(), cudaMemcpyDefault));
 #else
-  LOG(FATAL) << "Not implemented";
+  NO_GPU;
 #endif
   }
 }
@@ -365,7 +377,7 @@ void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
  */
 template<typename Dtype>
 void Add(Dtype alpha,  const Blob<Dtype> & A, Blob<Dtype> * B) {
-  Map<singa::op::Add<Dtype>>(alpha, A, B);
+  Map<singa::op::Add<Dtype>, Dtype>(alpha, A, B);
 }
 
 /**
@@ -385,7 +397,7 @@ void Add(const Blob<Dtype> & A, const Blob<Dtype> & B,
  */
 template<typename Dtype>
 void Sub(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Sub<Dtype>>(alpha, A, B);
+  Map<singa::op::Sub<Dtype>, Dtype>(alpha, A, B);
 }
 
 /**
@@ -406,7 +418,7 @@ void Sub(const Blob<Dtype> & A, const Blob<Dtype> & B,
 template<typename Dtype>
 void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
     Blob<Dtype> * C) {
-  Map<singa::op::Mult<Dtype>>(A, B, C);
+  Map<singa::op::Mult<Dtype>, Dtype>(A, B, C);
   // TODO(wangwei) use MKL's vector func
 }
 
@@ -417,7 +429,7 @@ void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
 template<typename Dtype>
 void Div(const Blob<Dtype> & A, const Blob<Dtype> & B,
     Blob<Dtype> * C) {
-  Map<singa::op::Div<Dtype>>(A, B, C);
+  Map<singa::op::Div<Dtype>, Dtype>(A, B, C);
   // TODO(wangwei) use MKL's vector func
 }
 /**
@@ -481,6 +493,8 @@ void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
       singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(), B->mutable_gpu_data(),
           m, n, n);
+#else
+      NO_GPU;
 #endif  // USE_GPU
     }
   }
@@ -520,6 +534,8 @@ void MVAddRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
       singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(), B->mutable_gpu_data(),
           m, n, n);
+#else
+      NO_GPU;
 #endif  // USE_GPU
     }
   }
@@ -574,7 +590,8 @@ void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
   } else {
 #ifdef USE_GPU
     singa_gpu_sum_col(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
-    // gpu part (TODO check transpose case)
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -599,7 +616,8 @@ void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
   } else {
 #ifdef USE_GPU
     singa_gpu_sum_row(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
-    // gpu part (TODO check transpose case)
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -619,8 +637,9 @@ void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
     cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
     gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -640,6 +659,8 @@ void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
   } else {
 #ifdef USE_GPU
     gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -659,6 +680,8 @@ Dtype Asum(const Blob<Dtype>& A) {
 #ifdef USE_GPU
     ret = gpu_asum(context->cublas_handle(device), A.count(), A.gpu_data(), 1)
       / A.count();
+#else
+    NO_GPU;
 #endif
   }
   return ret;
@@ -679,7 +702,7 @@ void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
     gpu_sample_uniform(context->curand_generator(thread), A->count(), low, high,
         A->mutable_gpu_data());
 #else
-    LOG(FATAL) << "Not implemented";
+    NO_GPU;
 #endif
   }
 }
@@ -696,6 +719,8 @@ void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) {
 #ifdef USE_GPU
     gpu_sample_gaussian(context->curand_generator(thread), A->count(),
         mean, std, A->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif
   }
 }
@@ -712,8 +737,7 @@ void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) {
     cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(),
       B->mutable_cpu_data());
   } else {
-#ifdef USE_GPU
-#endif  // USE_GPU
+    NO_GPU;
   }
 }
 
@@ -727,7 +751,7 @@ void Zero(Blob<Dtype>* B) {
 #ifdef USE_GPU
     cudaMemset(B->mutable_gpu_data(), 0, B->count() * sizeof(float));
 #else
-    LOG(FATAL) << "Not implemented";
+    NO_GPU;
 #endif  // USE_GPU
   }
 }

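Note the pattern every hunk above repeats: the function asks the Context singleton for the calling thread's device id, runs the CPU kernel when the id is negative, and otherwise runs the GPU kernel, now falling through to the NO_GPU fatal log when the binary was built without USE_GPU. A sketch of that dispatch skeleton (Scale() shown; the CPU branch is not visible in this diff, so the exact cpu_scale signature here is an assumption):

  template<typename Dtype>
  void Scale(Dtype alpha, Blob<Dtype>* B) {
    auto context = Singleton<Context>::Instance();
    int device = context->device_id(std::this_thread::get_id());
    if (device < 0) {
      // CPU path; cpu_scale's signature is assumed, not shown in the diff.
      cpu_scale(B->count(), alpha, B->mutable_cpu_data());
    } else {
  #ifdef USE_GPU
      gpu_scale(context->cublas_handle(device), B->count(), alpha,
          B->mutable_gpu_data());
  #else
      NO_GPU;  // LOG(FATAL) << "Not compiled with GPU";
  #endif
    }
  }
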
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/247002d3/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index 2627b2e..6bb6001 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -20,6 +20,8 @@
 *************************************************************/
 #include <thread>
 #include "gtest/gtest.h"
+#include "singa/utils/blob.h"
+#include "singa/utils/math_blob.h"
 #include "singa/utils/math_addr.h"
 #include "singa/utils/math_kernel.h"
 #include "singa/utils/singa_op.h"
@@ -34,6 +36,591 @@
 using namespace singa;
 using namespace std;
 
+TEST(MathBlobTest, TestScale) {
+  Blob<float> *A = new Blob<float>(10);
+  Blob<float> *B = new Blob<float>(10);
+  A->SetValue(2);
+  B->SetValue(6);
+  Scale<float>(3.0, A);
+  ASSERT_EQ(A->check_equal(B), true);
+}
+
+TEST(MathBlobTest, TestAXPY) {
+  Blob<float> * A = new Blob<float>(10);
+  Blob<float> * B = new Blob<float>(10);
+  Blob<float> * C = new Blob<float>(10);
+  Blob<float> * D = new Blob<float>(10);
+  A->SetValue(2);
+  B->SetValue(3);
+  C->SetValue(7);
+  D->SetValue(2);
+  AXPY<float>(2.0, *A, B);
+  ASSERT_EQ(B->check_equal(C), true);
+  ASSERT_EQ(A->check_equal(D), true);
+}
+
+TEST(MathBlobTest, TestGEMV) {
+  float A[5][5] = {};
+  float AT[5][5] = {};
+  float B[5] = {};
+  float Res[5] = {};
+  for(int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      A[i][j] = i * j + i - j;
+      AT[j][i] = i * j + i - j;
+    }
+    B[i] = 5*i + 3;
+    Res[i] = i;
+  }
+
+  Blob<float> * BlobA = new Blob<float>(5, 5);
+  Blob<float> * BlobAT = new Blob<float>(5, 5);
+  Blob<float> * BlobB = new Blob<float>(5);
+  Blob<float> * BlobAB = new Blob<float>(5);
+  Blob<float> * BlobATB = new Blob<float>(5);
+  Blob<float> * BlobRes = new Blob<float>(5);
+
+  BlobA->set_cpu_data(A[0]);
+  BlobAT->set_cpu_data(AT[0]);
+  BlobAT->set_transpose(true);
+  BlobB->set_cpu_data(B);
+  BlobAB->set_cpu_data(Res);
+  BlobATB->set_cpu_data(Res);
+
+  for (int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      Res[i] += 2*A[i][j] * B[j];
+    }
+  }
+
+  BlobRes->set_cpu_data(Res);
+
+  GEMV<float>(2, 1, *BlobA, *BlobB, BlobAB);
+  GEMV<float>(2, 1, *BlobAT, *BlobB, BlobATB);
+
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMVDot) {
+  float A[5][5] = {};
+  float AT[5][5] = {};
+  float B[5] = {};
+  float Res[5] = {};
+  for(int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      A[i][j] = i * j + i - j;
+      AT[j][i] = i * j + i - j;
+    }
+    B[i] = 5*i -2;
+    Res[i] = 0;
+  }
+
+  Blob<float> * BlobA = new Blob<float>(5, 5);
+  Blob<float> * BlobAT = new Blob<float>(5, 5);
+  Blob<float> * BlobB = new Blob<float>(5);
+  Blob<float> * BlobAB = new Blob<float>(5);
+  Blob<float> * BlobATB = new Blob<float>(5);
+  Blob<float> * BlobRes = new Blob<float>(5);
+
+  BlobA->set_cpu_data(A[0]);
+  BlobAT->set_cpu_data(AT[0]);
+  BlobAT->set_transpose(true);
+  BlobB->set_cpu_data(B);
+  BlobAB->set_cpu_data(Res);
+  BlobATB->set_cpu_data(Res);
+
+  for (int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      Res[i] += A[i][j] * B[j];
+    }
+  }
+
+  BlobRes->set_cpu_data(Res);
+
+  MVDot<float>(*BlobA, *BlobB, BlobAB);
+  MVDot<float>(*BlobAT, *BlobB, BlobATB);
+
+  const float * addrRes = BlobAB->cpu_data();
+  for (int i = 0; i < 5; i++) {
+    ASSERT_EQ(addrRes[i], Res[i]);
+  }
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestGEMM) {
+  float A[5][5] = {};
+  float AT[5][5] = {};
+  float B[5][5]= {};
+  float BT[5][5]= {};
+  float Res[5][5]= {};
+  for(int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      A[i][j] = i * j + i - j;
+      AT[j][i] = i * j + i - j;
+      B[i][j] = - i * j + i * i - j * j;
+      BT[j][i] = - i * j + i * i - j * j;
+      Res[i][j] = i * j + i * i + j * j;
+    }
+  }
+
+  Blob<float> * BlobA = new Blob<float>(5, 5);
+  BlobA->set_cpu_data(A[0]);
+  Blob<float> * BlobAT = new Blob<float>(5, 5);
+  BlobAT->set_cpu_data(AT[0]);
+  BlobAT->set_transpose(true);
+  Blob<float> * BlobB = new Blob<float>(5, 5);
+  BlobB->set_cpu_data(B[0]);
+  Blob<float> * BlobBT = new Blob<float>(5, 5);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+  Blob<float> * BlobAB = new Blob<float>(5, 5);
+  BlobAB->set_cpu_data(Res[0]);
+  Blob<float> * BlobABT = new Blob<float>(5, 5);
+  BlobABT->set_cpu_data(Res[0]);
+  Blob<float> * BlobATB = new Blob<float>(5, 5);
+  BlobATB->set_cpu_data(Res[0]);
+  Blob<float> * BlobATBT = new Blob<float>(5, 5);
+  BlobATBT->set_cpu_data(Res[0]);
+
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 5; j++) {
+      Res[i][j] *= 2;
+      for (int k = 0; k < 5; k++) {
+        Res[i][j] += 3 * A[i][k]*B[k][j];
+      }
+    }
+  }
+
+  Blob<float> * BlobRes = new Blob<float>(5, 5);
+  BlobRes->set_cpu_data(Res[0]);
+
+  GEMM<float>(3, 2, *BlobA, *BlobB, BlobAB);
+  GEMM<float>(3, 2, *BlobA, *BlobBT, BlobABT);
+  GEMM<float>(3, 2, *BlobAT, *BlobB, BlobATB);
+  GEMM<float>(3, 2, *BlobAT, *BlobBT, BlobATBT);
+
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobABT->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATBT->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMMDot) {
+  float A[5][5] = {};
+  float AT[5][5] = {};
+  float B[5][5]= {};
+  float BT[5][5]= {};
+  float Res[5][5]= {};
+  for(int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      A[i][j] = i * j + i - j;
+      AT[j][i] = i * j + i - j;
+      B[i][j] = - i * j + i * i - j * j;
+      BT[j][i] = - i * j + i * i - j * j;
+      Res[i][j] = i * j + i * i + j * j;
+    }
+  }
+
+  Blob<float> * BlobA = new Blob<float>(5, 5);
+  BlobA->set_cpu_data(A[0]);
+  Blob<float> * BlobAT = new Blob<float>(5, 5);
+  BlobAT->set_cpu_data(AT[0]);
+  BlobAT->set_transpose(true);
+  Blob<float> * BlobB = new Blob<float>(5, 5);
+  BlobB->set_cpu_data(B[0]);
+  Blob<float> * BlobBT = new Blob<float>(5, 5);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+  Blob<float> * BlobAB = new Blob<float>(5, 5);
+  BlobAB->set_cpu_data(Res[0]);
+  Blob<float> * BlobABT = new Blob<float>(5, 5);
+  BlobABT->set_cpu_data(Res[0]);
+  Blob<float> * BlobATB = new Blob<float>(5, 5);
+  BlobATB->set_cpu_data(Res[0]);
+  Blob<float> * BlobATBT = new Blob<float>(5, 5);
+  BlobATBT->set_cpu_data(Res[0]);
+
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 5; j++) {
+      Res[i][j] = 0;
+      for (int k = 0; k < 5; k++) {
+        Res[i][j] += A[i][k]*B[k][j];
+      }
+    }
+  }
+
+  Blob<float> * BlobRes = new Blob<float>(5, 5);
+  BlobRes->set_cpu_data(Res[0]);
+
+  MMDot<float>(*BlobA, *BlobB, BlobAB);
+  MMDot<float>(*BlobA, *BlobBT, BlobABT);
+  MMDot<float>(*BlobAT, *BlobB, BlobATB);
+  MMDot<float>(*BlobAT, *BlobBT, BlobATBT);
+
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobABT->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATBT->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestVVDot) {
+  float A[10] = {};
+  float B[10] = {};
+  float prod = 0;
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+    B[i] = 2* i * i - 3* (i%4);
+    prod += A[i] * B[i];
+  }
+
+  Blob<float> * BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+  Blob<float> * BlobB = new Blob<float>(10);
+  BlobB->set_cpu_data(B);
+  float blobprod = VVDot<float>(*BlobA, *BlobB);
+  ASSERT_EQ(blobprod, prod);
+}
+
+TEST(MathBlobTest, TestOuterProduct) {
+  float A[10] = {};
+  float B[10] = {};
+  float AB[10][10] = {};
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+    B[i] = 2* i * i - 3* (i%4);
+  }
+  for(int i = 0; i < 10; i++) {
+    for(int j = 0; j < 10; j++) {
+      AB[i][j] = A[i]*B[j];
+    }
+  }
+  Blob<float> * BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+  Blob<float> * BlobB = new Blob<float>(10);
+  BlobB->set_cpu_data(B);
+  Blob<float> * BlobAB = new Blob<float>(10, 10);
+  // BlobAB->SetValue(3);
+  Blob<float> * BlobRes = new Blob<float>(10, 10);
+  BlobRes->set_cpu_data(AB[0]);
+  OuterProduct<float>(*BlobA, *BlobB, BlobAB);
+
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMapAB) {
+  float A[10] = {};
+  float Res[10] = {};
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+    Res[i] = A[i] * A[i];
+  }
+  Blob<float> * BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+  Blob<float> * BlobB = new Blob<float>(10);
+  Blob<float> * BlobRes = new Blob<float>(10);
+  BlobRes->set_cpu_data(Res);
+  Map<singa::op::Square<float>, float>(*BlobA, BlobB);
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMapABC) {
+  float A[10] = {};
+  float B[10] = {};
+  float Res[10] = {};
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+    B[i] = 2* i * i - 3* (i%4);
+    Res[i] = A[i] * B[i];
+  }
+  Blob<float> * BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+  Blob<float> * BlobB = new Blob<float>(10);
+  BlobB->set_cpu_data(B);
+  Blob<float> * BlobC = new Blob<float>(10);
+  Blob<float> * BlobRes = new Blob<float>(10);
+  BlobRes->set_cpu_data(Res);
+  Map<singa::op::Mult<float>, float>(*BlobA, *BlobB, BlobC);
+  ASSERT_EQ(BlobC->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestCopy) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10);
+  float A[10] = {};
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+  }
+  BlobA->set_cpu_data(A);
+  Copy<float>(*BlobA, BlobB);
+  ASSERT_EQ(BlobA->check_equal(BlobB), true);
+}
+
+TEST(MathBlobTest, TestAdd) {
+  Blob<float> *A = new Blob<float>(10);
+  Blob<float> *B = new Blob<float>(10);
+  Blob<float> *C = new Blob<float>(10);
+  Blob<float> *D = new Blob<float>(10);
+  A->SetValue(5);
+  B->SetValue(6);
+  D->SetValue(11);
+  Add<float>(*A, *B, C);
+  ASSERT_EQ(C->check_equal(D), true);
+}
+
+TEST(MathBlobTest, TestSub) {
+  Blob<float> *A = new Blob<float>(10);
+  Blob<float> *B = new Blob<float>(10);
+  Blob<float> *C = new Blob<float>(10);
+  Blob<float> *D = new Blob<float>(10);
+  A->SetValue(5);
+  B->SetValue(6);
+  D->SetValue(-1);
+  Sub<float>(*A, *B, C);
+  ASSERT_EQ(C->check_equal(D), true);
+}
+
+TEST(MathBlobTest, TestMVAddCol) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10, 10);
+  Blob<float> *BlobResT = new Blob<float>(10, 10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = i * j + i - j;
+      BT[j][i] = i * j + i - j;
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobB->set_cpu_data(B[0]);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+
+  for(int i = 0; i < 10; i++) {
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = 2.0 * A[i] + 3.0 * B[i][j];
+      BT[j][i] = 2.0 * A[i] + 3.0 * BT[j][i];
+    }
+  }
+
+  BlobRes->set_cpu_data(B[0]);
+  BlobResT->set_cpu_data(BT[0]);
+  BlobResT->set_transpose(true);
+
+  MVAddCol<float>(2.0, 3.0, *BlobA, BlobB);
+  MVAddCol<float>(2.0, 3.0, *BlobA, BlobBT);
+
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
+}
+
+TEST(MathBlobTest, TestMVAddRow) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10, 10);
+  Blob<float> *BlobResT = new Blob<float>(10, 10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = i * j + i - j;
+      BT[j][i] = i * j + i - j;
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobB->set_cpu_data(B[0]);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+
+  for(int i = 0; i < 10; i++) {
+    for(int j = 0; j < 10; j++) {
+      B[j][i] = 2.0 * A[i] + 3.0 * B[j][i];
+      BT[i][j] = 2.0 * A[i] + 3.0 * BT[i][j];
+    }
+  }
+
+  BlobRes->set_cpu_data(B[0]);
+  BlobResT->set_cpu_data(BT[0]);
+  BlobResT->set_transpose(true);
+
+  MVAddRow<float>(2.0, 3.0, *BlobA, BlobB);
+  MVAddRow<float>(2.0, 3.0, *BlobA, BlobBT);
+
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
+}
+
+TEST(MathBlobTest, TestRepmatCol) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10, 10);
+  Blob<float> *BlobResT = new Blob<float>(10, 10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = A[i];
+      BT[j][i] = A[i];
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobBT->set_transpose(true);
+
+  BlobRes->set_cpu_data(B[0]);
+  BlobResT->set_cpu_data(BT[0]);
+  BlobResT->set_transpose(true);
+
+  RepmatCol<float>(*BlobA, BlobB);
+  RepmatCol<float>(*BlobA, BlobBT);
+
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
+}
+
+TEST(MathBlobTest, TestRepmatRow) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10, 10);
+  Blob<float> *BlobResT = new Blob<float>(10, 10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[j][i] = A[i];
+      BT[i][j] = A[i];
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobBT->set_transpose(true);
+
+  BlobRes->set_cpu_data(B[0]);
+  BlobResT->set_cpu_data(BT[0]);
+  BlobResT->set_transpose(true);
+
+  RepmatRow<float>(*BlobA, BlobB);
+  RepmatRow<float>(*BlobA, BlobBT);
+
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
+}
+
+TEST(MathBlobTest, TestMVSumCol) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobACopy = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = i * j + i - j;
+      BT[j][i] = i * j + i - j;
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobACopy->set_cpu_data(A);
+  BlobB->set_cpu_data(B[0]);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+
+  for(int i = 0; i < 10; i++) {
+    A[i] *= 2.0;
+    for(int j = 0; j < 10; j++) {
+      A[i] += 3.0 * B[i][j];
+    }
+  }
+  BlobRes->set_cpu_data(A);
+
+  MVSumCol<float>(2.0, 3.0, *BlobB, BlobA);
+  MVSumCol<float>(2.0, 3.0, *BlobBT, BlobACopy);
+
+  ASSERT_EQ(BlobA->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobACopy->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMVSumRow) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobACopy = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[j][i] = i * j + i - j;
+      BT[i][j] = i * j + i - j;
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobACopy->set_cpu_data(A);
+  BlobB->set_cpu_data(B[0]);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+
+  for(int i = 0; i < 10; i++) {
+    A[i] *= 2.0;
+    for(int j = 0; j < 10; j++) {
+      A[i] += 3.0 * B[j][i];
+    }
+  }
+  BlobRes->set_cpu_data(A);
+
+  MVSumRow<float>(2.0, 3.0, *BlobB, BlobA);
+  MVSumRow<float>(2.0, 3.0, *BlobBT, BlobACopy);
+
+  ASSERT_EQ(BlobA->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobACopy->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestASum) {
+  float A[10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = ((i % 3) -1) * i;
+  }
+
+  Blob<float> *BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+
+  float BlobRes = Asum<float>(*BlobA);
+  float res = cblas_sasum(10, A, 1) / 10;
+
+  ASSERT_EQ(BlobRes, res);
+}
+
 TEST(MathTest, TestGemmCPU) {
   float A[3][2] = {};
   float B[3][2] = {};


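For readers skimming the new tests: each blob-level call is checked against a hand-rolled reference loop. GEMM<float>(3, 2, *BlobA, *BlobB, BlobAB), for instance, is verified against the standard BLAS contract C = alpha*A*B + beta*C, with the blob transpose flag selecting the transposed reading of A or B. The reference loop the tests build by hand boils down to:

  // Reference used by TestGEMM above: C = alpha * A * B + beta * C.
  void RefGemm(float alpha, float beta,
               const float A[5][5], const float B[5][5], float C[5][5]) {
    for (int i = 0; i < 5; i++) {
      for (int j = 0; j < 5; j++) {
        C[i][j] *= beta;
        for (int k = 0; k < 5; k++)
          C[i][j] += alpha * A[i][k] * B[k][j];
      }
    }
  }
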
[4/4] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

Posted by wa...@apache.org.
SINGA-80 New Blob Level and Address Level Math Operation Interface

Merge PR 134 for SINGA-80


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/8329aa0c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/8329aa0c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/8329aa0c

Branch: refs/heads/master
Commit: 8329aa0c3c613046b2bd8f70c3e43a765f74f602
Parents: 369d879 d452c1f
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu Apr 7 21:01:45 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu Apr 7 21:05:41 2016 +0800

----------------------------------------------------------------------
 include/singa/utils/blob.h      |  15 +
 include/singa/utils/math_addr.h |  12 +-
 include/singa/utils/math_blob.h |  88 ++++--
 src/test/test_math.cc           | 591 ++++++++++++++++++++++++++++++++++-
 4 files changed, 667 insertions(+), 39 deletions(-)
----------------------------------------------------------------------



[2/4] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

Posted by wa...@apache.org.
SINGA-80 New Blob Level and Address Level Math Operation Interface

Unify the signatures of the CPU and GPU functions.
Fix bugs in MVAddRow() and OuterProduct().
All tests run OK.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/8ade7d76
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/8ade7d76
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/8ade7d76

Branch: refs/heads/master
Commit: 8ade7d76dbe64b75088693febba7019e28d39c30
Parents: 247002d
Author: seaok <se...@gmail.com>
Authored: Fri Mar 18 15:00:44 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Apr 2 21:35:39 2016 +0800

----------------------------------------------------------------------
 include/singa/utils/math_addr.h | 6 +++---
 include/singa/utils/math_blob.h | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8ade7d76/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
index 524e13e..4a05cfd 100644
--- a/include/singa/utils/math_addr.h
+++ b/include/singa/utils/math_addr.h
@@ -234,13 +234,13 @@ void gpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
 }
 
 template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
+void gpu_e_f(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
   Op::CudaMap(alpha, A, B, n);
 }
 
 template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, const Dtype * B,
-    const Dtype alpha, const Dtype beta, Dtype * C) {
+void gpu_e_f(const int n, const Dtype alpha, const Dtype beta,
+	const Dtype * A, const Dtype * B, Dtype * C) {
   Op::CudaMap(alpha, beta, A, B, C, n);
 }
 // element-wise generalized operation defined in Op

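The reordered gpu_e_f() overloads keep scalars before arrays and forward to a static Op::CudaMap() hook; the ops in singa/utils/singa_op.h (op::Add, op::Sub, op::Mult, op::Square, ... as used by the tests) supply that hook. Purely as illustration, a hypothetical op in that style; only the CudaMap signature is pinned down by the calls above, the CPU-side member name Map is an assumption:

  // Hypothetical element-wise op computing B[i] = alpha * A[i].
  template<typename Dtype>
  struct ScaleOp {
    // CPU hook (name assumed): applied per element by cpu_e_f<Op>.
    inline static Dtype Map(Dtype alpha, Dtype a) { return alpha * a; }
  #ifdef USE_GPU
    // GPU hook, matching gpu_e_f's call Op::CudaMap(alpha, A, B, n);
    // a real op would launch a CUDA kernel here.
    static void CudaMap(Dtype alpha, const Dtype* A, Dtype* B, int n);
  #endif
  };
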
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8ade7d76/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
index 35985f1..50da1f0 100644
--- a/include/singa/utils/math_blob.h
+++ b/include/singa/utils/math_blob.h
@@ -258,7 +258,7 @@ void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
   } else {
 #ifdef USE_GPU
     gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
-        m, n, 1, 1, 0, false, false, C->mutable_gpu_data());
+        m, n, 1, Dtype(1), Dtype(0), false, false, C->mutable_gpu_data());
 #else
     NO_GPU;
 #endif  // USE_GPU
@@ -321,7 +321,7 @@ void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    gpu_e_f<Op>(A.count(), A.gpu_data(), alpha, B->mutable_gpu_data());
+    gpu_e_f<Op>(A.count(), alpha, A.gpu_data(), B->mutable_gpu_data());
 #else
     NO_GPU;
 #endif  // USE_GPU
@@ -491,8 +491,8 @@ void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
           B->mutable_cpu_data());
     } else {
 #ifdef USE_GPU
-      singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(), B->mutable_gpu_data(),
-          m, n, n);
+      gpu_gemm(context->cublas_handle(device), A.gpu_data(), one.gpu_data(), m, n, 1,
+		  alpha, beta, false, false, B->mutable_gpu_data());
 #else
       NO_GPU;
 #endif  // USE_GPU

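The MVAddCol() rewrite expresses the broadcast as a rank-1 GEMM: treating A as an m x 1 column and one (a blob of ones presumably built earlier in the function, outside this hunk) as a 1 x n row of ones, the call computes

  B[i][j] = alpha * A[i] + beta * B[i][j]

which is the same column-vector add the removed singa_gpu_add_vec_row call performed, but routed through cuBLAS.
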

[3/4] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

Posted by wa...@apache.org.
SINGA-80 New Blob Level and Address Level Math Operation Interface

Clean the files with cpplint.
Add a fatal log where GPU code is required but the binary was not compiled with GPU support.
A few TODOs are left in math_blob.h.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/d452c1fb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/d452c1fb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/d452c1fb

Branch: refs/heads/master
Commit: d452c1fb4128ef8a90198100033160826290b0c3
Parents: 8ade7d7
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Apr 4 11:12:32 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Apr 4 11:12:32 2016 +0800

----------------------------------------------------------------------
 include/singa/utils/blob.h      |  2 +-
 include/singa/utils/math_addr.h |  8 ++---
 include/singa/utils/math_blob.h | 14 ++++----
 src/test/test_math.cc           | 70 ++++++++++++++++++------------------
 4 files changed, 48 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d452c1fb/include/singa/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
index 9defeac..1a0a592 100644
--- a/include/singa/utils/blob.h
+++ b/include/singa/utils/blob.h
@@ -281,7 +281,7 @@ class Blob {
     if (transpose() != other->transpose()) return false;
     if (count() != other->count()) return false;
     if (shape().size() != other->shape().size()) return false;
-    for (int i = 0; i < shape().size(); i++) {
+    for (unsigned int i = 0; i < shape().size(); i++) {
       if (shape(i) != other->shape(i)) return false;
     }
     const Dtype * a = cpu_data();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d452c1fb/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
index 4a05cfd..cf1d227 100644
--- a/include/singa/utils/math_addr.h
+++ b/include/singa/utils/math_addr.h
@@ -78,7 +78,7 @@ void cpu_copy(const int n, const Dtype* A, Dtype *B) {
 }
 
 template<typename Dtype>
-Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) {
+Dtype cpu_dot(const int n, const Dtype * A, const Dtype * B) {
   Dtype sum = 0;
   for (int i = 0 ; i < n ; i++)
     sum += A[i] * B[i];
@@ -210,8 +210,8 @@ void gpu_scale(cublasHandle_t handle, const int n, const Dtype alpha,
 }
 
 template<typename Dtype>
-Dtype gpu_dot(cublasHandle_t handle, const Dtype * A, const Dtype * B,
-    const int n) {
+Dtype gpu_dot(cublasHandle_t handle, const int n, const Dtype * A,
+    const Dtype * B) {
   Dtype result = 0.0;
   cublasSdot(handle, n, A, 1, B, 1, &result);
   return result;
@@ -240,7 +240,7 @@ void gpu_e_f(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
 
 template<typename Op, typename Dtype>
 void gpu_e_f(const int n, const Dtype alpha, const Dtype beta,
-	const Dtype * A, const Dtype * B, Dtype * C) {
+  const Dtype * A, const Dtype * B, Dtype * C) {
   Op::CudaMap(alpha, beta, A, B, C, n);
 }
 // element-wise generalized operation defined in Op

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d452c1fb/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
index 50da1f0..abe7722 100644
--- a/include/singa/utils/math_blob.h
+++ b/include/singa/utils/math_blob.h
@@ -223,11 +223,11 @@ Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) {
   auto context = Singleton<Context>::Instance();
   int device = context->device_id(std::this_thread::get_id());
   if (device < 0) {
-    res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
+    res = cpu_dot(n, A.cpu_data(), B.cpu_data());
   } else {
 #ifdef USE_GPU
-    res = gpu_dot(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
-        n);
+    res = gpu_dot(context->cublas_handle(device), n, A.gpu_data(),
+        B.gpu_data());
 #else
     NO_GPU;
 #endif  // USE_GPU
@@ -302,8 +302,9 @@ void Map(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) {
     cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
     gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -491,8 +492,8 @@ void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
           B->mutable_cpu_data());
     } else {
 #ifdef USE_GPU
-      gpu_gemm(context->cublas_handle(device), A.gpu_data(), one.gpu_data(), m, n, 1,
-		  alpha, beta, false, false, B->mutable_gpu_data());
+      gpu_gemm(context->cublas_handle(device), A.gpu_data(), one.gpu_data(), m,
+          n, 1, alpha, beta, false, false, B->mutable_gpu_data());
 #else
       NO_GPU;
 #endif  // USE_GPU
@@ -737,6 +738,7 @@ void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) {
     cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(),
       B->mutable_cpu_data());
   } else {
+    // TODO(wangwei) implement the GPU version.
     NO_GPU;
   }
 }

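Both dot helpers now take the vector length first, in line with the BLAS habit of leading with the problem size; VVDot() here and TestDotGPU below were updated to match. Calls now read, for example:

  float c = cpu_dot(12, A, B);
  float g = gpu_dot<float>(context->cublas_handle(0), 12, A_gpu, B_gpu);
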
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d452c1fb/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index 6bb6001..9830703 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -30,7 +30,7 @@
 
 #ifdef USE_GPU
 #include <cuda_runtime.h>
-#include "cublas_v2.h"
+#include <cublas_v2.h>
 #endif
 
 using namespace singa;
@@ -64,8 +64,8 @@ TEST(MathBlobTest, TestGEMV) {
   float AT[5][5] = {};
   float B[5] = {};
   float Res[5] = {};
-  for(int i = 0; i < 5; i++) {
-    for(int j = 0; j < 5; j++) {
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 5; j++) {
       A[i][j] = i * j + i - j;
       AT[j][i] = i * j + i - j;
     }
@@ -88,7 +88,7 @@ TEST(MathBlobTest, TestGEMV) {
   BlobATB->set_cpu_data(Res);
 
   for (int i = 0; i < 5; i++) {
-    for(int j = 0; j < 5; j++) {
+    for (int j = 0; j < 5; j++) {
       Res[i] += 2*A[i][j] * B[j];
     }
   }
@@ -107,8 +107,8 @@ TEST(MathBlobTest, TestMVDot) {
   float AT[5][5] = {};
   float B[5] = {};
   float Res[5] = {};
-  for(int i = 0; i < 5; i++) {
-    for(int j = 0; j < 5; j++) {
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 5; j++) {
       A[i][j] = i * j + i - j;
       AT[j][i] = i * j + i - j;
     }
@@ -131,7 +131,7 @@ TEST(MathBlobTest, TestMVDot) {
   BlobATB->set_cpu_data(Res);
 
   for (int i = 0; i < 5; i++) {
-    for(int j = 0; j < 5; j++) {
+    for (int j = 0; j < 5; j++) {
       Res[i] += A[i][j] * B[j];
     }
   }
@@ -156,8 +156,8 @@ TEST(MathBlobTest, TestGEMM) {
   float B[5][5]= {};
   float BT[5][5]= {};
   float Res[5][5]= {};
-  for(int i = 0; i < 5; i++) {
-    for(int j = 0; j < 5; j++) {
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 5; j++) {
       A[i][j] = i * j + i - j;
       AT[j][i] = i * j + i - j;
       B[i][j] = - i * j + i * i - j * j;
@@ -214,8 +214,8 @@ TEST(MathBlobTest, TestMMDot) {
   float B[5][5]= {};
   float BT[5][5]= {};
   float Res[5][5]= {};
-  for(int i = 0; i < 5; i++) {
-    for(int j = 0; j < 5; j++) {
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 5; j++) {
       A[i][j] = i * j + i - j;
       AT[j][i] = i * j + i - j;
       B[i][j] = - i * j + i * i - j * j;
@@ -292,8 +292,8 @@ TEST(MathBlobTest, TestOuterProduct) {
     A[i] = i * i - 5* (i%2);
     B[i] = 2* i * i - 3* (i%4);
   }
-  for(int i = 0; i < 10; i++) {
-    for(int j = 0; j < 10; j++) {
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
       AB[i][j] = A[i]*B[j];
     }
   }
@@ -392,9 +392,9 @@ TEST(MathBlobTest, TestMVAddCol) {
   float A[10] = {};
   float B[10][10] = {};
   float BT[10][10] = {};
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] = 5*i -2;
-    for(int j = 0; j < 10; j++) {
+    for (int j = 0; j < 10; j++) {
       B[i][j] = i * j + i - j;
       BT[j][i] = i * j + i - j;
     }
@@ -405,8 +405,8 @@ TEST(MathBlobTest, TestMVAddCol) {
   BlobBT->set_cpu_data(BT[0]);
   BlobBT->set_transpose(true);
 
-  for(int i = 0; i < 10; i++) {
-    for(int j = 0; j < 10; j++) {
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
       B[i][j] = 2.0 * A[i] + 3.0 * B[i][j];
       BT[j][i] = 2.0 * A[i] + 3.0 * BT[j][i];
     }
@@ -433,9 +433,9 @@ TEST(MathBlobTest, TestMVAddRow) {
   float A[10] = {};
   float B[10][10] = {};
   float BT[10][10] = {};
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] = 5*i -2;
-    for(int j = 0; j < 10; j++) {
+    for (int j = 0; j < 10; j++) {
       B[i][j] = i * j + i - j;
       BT[j][i] = i * j + i - j;
     }
@@ -446,8 +446,8 @@ TEST(MathBlobTest, TestMVAddRow) {
   BlobBT->set_cpu_data(BT[0]);
   BlobBT->set_transpose(true);
 
-  for(int i = 0; i < 10; i++) {
-    for(int j = 0; j < 10; j++) {
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
       B[j][i] = 2.0 * A[i] + 3.0 * B[j][i];
       BT[i][j] = 2.0 * A[i] + 3.0 * BT[i][j];
     }
@@ -474,9 +474,9 @@ TEST(MathBlobTest, TestRepmatCol) {
   float A[10] = {};
   float B[10][10] = {};
   float BT[10][10] = {};
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] = 5*i -2;
-    for(int j = 0; j < 10; j++) {
+    for (int j = 0; j < 10; j++) {
       B[i][j] = A[i];
       BT[j][i] = A[i];
     }
@@ -506,9 +506,9 @@ TEST(MathBlobTest, TestRepmatRow) {
   float A[10] = {};
   float B[10][10] = {};
   float BT[10][10] = {};
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] = 5*i -2;
-    for(int j = 0; j < 10; j++) {
+    for (int j = 0; j < 10; j++) {
       B[j][i] = A[i];
       BT[i][j] = A[i];
     }
@@ -538,9 +538,9 @@ TEST(MathBlobTest, TestMVSumCol) {
   float A[10] = {};
   float B[10][10] = {};
   float BT[10][10] = {};
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] = 5*i -2;
-    for(int j = 0; j < 10; j++) {
+    for (int j = 0; j < 10; j++) {
       B[i][j] = i * j + i - j;
       BT[j][i] = i * j + i - j;
     }
@@ -552,9 +552,9 @@ TEST(MathBlobTest, TestMVSumCol) {
   BlobBT->set_cpu_data(BT[0]);
   BlobBT->set_transpose(true);
 
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] *= 2.0;
-    for(int j = 0; j < 10; j++) {
+    for (int j = 0; j < 10; j++) {
       A[i] += 3.0 * B[i][j];
     }
   }
@@ -577,9 +577,9 @@ TEST(MathBlobTest, TestMVSumRow) {
   float A[10] = {};
   float B[10][10] = {};
   float BT[10][10] = {};
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] = 5*i -2;
-    for(int j = 0; j < 10; j++) {
+    for (int j = 0; j < 10; j++) {
       B[j][i] = i * j + i - j;
       BT[i][j] = i * j + i - j;
     }
@@ -591,9 +591,9 @@ TEST(MathBlobTest, TestMVSumRow) {
   BlobBT->set_cpu_data(BT[0]);
   BlobBT->set_transpose(true);
 
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] *= 2.0;
-    for(int j = 0; j < 10; j++) {
+    for (int j = 0; j < 10; j++) {
       A[i] += 3.0 * B[j][i];
     }
   }
@@ -608,7 +608,7 @@ TEST(MathBlobTest, TestMVSumRow) {
 
 TEST(MathBlobTest, TestASum) {
   float A[10] = {};
-  for(int i = 0; i < 10; i++) {
+  for (int i = 0; i < 10; i++) {
     A[i] = ((i % 3) -1) * i;
   }
 
@@ -888,7 +888,7 @@ TEST(MathTest, TestDotGPU) {
   cudaMemcpy(B_gpu, B, 12*sizeof(float), cudaMemcpyHostToDevice);
   auto context = Singleton<Context>::Instance();
   context->SetupDevice(std::this_thread::get_id(), 0);
-  float gpu_ret = gpu_dot<float>(context->cublas_handle(0), A_gpu, B_gpu, 12);
+  float gpu_ret = gpu_dot<float>(context->cublas_handle(0), 12, A_gpu, B_gpu);
 
   float cpu_ret = 0.0f;
   for (int i = 0; i < 12; i++) {