Posted to commits@singa.apache.org by wa...@apache.org on 2020/01/23 02:13:08 UTC

[singa] branch dev updated: SINGA-502 Avoid moving data between host and gpu in SoftmaxCrossEntropy

This is an automated email from the ASF dual-hosted git repository.

wangwei pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/singa.git


The following commit(s) were added to refs/heads/dev by this push:
     new 7baf2c4  SINGA-502 Avoid moving data between host and gpu in SoftmaxCrossEntropy
     new b86add0  Merge pull request #577 from chrishkchris/SINGA-502
7baf2c4 is described below

commit 7baf2c40fccb5077b626a66d952c5a9e65bd3f01
Author: chrishkchris <ch...@yahoo.com.hk>
AuthorDate: Tue Jan 21 08:41:32 2020 +0000

    SINGA-502 Avoid moving data between host and gpu in SoftmaxCrossEntropy
---
 include/singa/core/tensor.h        |  1 +
 python/singa/autograd.py           | 15 +++++++--------
 src/api/core_tensor.i              |  1 +
 src/core/tensor/tensor.cc          | 13 +++++++++++++
 src/core/tensor/tensor_math.h      |  5 +++++
 src/core/tensor/tensor_math_cpp.h  | 13 +++++++++++++
 src/core/tensor/tensor_math_cuda.h | 13 +++++++++++++
 7 files changed, 53 insertions(+), 8 deletions(-)
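
In short, the previous loss code reduced on the device, pulled the scalar to the host with SumAsFloat, and then wrote it back into a fresh device tensor with SetFloatValue; the new SumAll keeps the reduction result on the device end to end. The before/after pattern, paraphrasing the SoftMaxCrossEntropy.forward hunk below:

    # before: device -> host -> device round trip for one scalar
    loss = CTensor((1,), self.p.device())
    ret = singa.CrossEntropyFwd(self.p, self.t)
    loss.SetFloatValue(singa.SumAsFloat(ret) / x.shape()[0])

    # after: the sum is produced and scaled in place on the device
    ret = singa.CrossEntropyFwd(self.p, self.t)
    loss = singa.SumAll(ret)
    loss /= x.shape()[0]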

diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 0c313bf..2810a54 100755
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -553,6 +553,7 @@ void SumRows(const Tensor &M, Tensor *out);
 /// if 'axis' is 1, sum all columns into a single column
 /// TODO(wangwei) support arbitrary Tensor like numpy.sum
 Tensor Sum(const Tensor &in, const int axis);
+Tensor SumAll(const Tensor &in);
 
 // ================Random operations==========================================
 /// For each element x set x = 1 if random() < p; otherwise x = 1.
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 8e245c6..cd6c6af 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -925,9 +925,8 @@ class CrossEntropy(Operation):
         Returns:
             loss (CTensor): scalar.
         """
-        loss = CTensor((1,))
-        loss_data = -singa.SumAsFloat(singa.__mul__(t, singa.Log(x)))
-        loss.SetFloatValue(loss_data / x.shape()[0])
+        loss = singa.SumAll(singa.__mul__(t, singa.Log(x)))
+        loss /= -x.shape()[0]
         self.x = x
         self.t = t
         self.input = (x, t)
@@ -944,7 +943,7 @@ class CrossEntropy(Operation):
                           dy = 1.0
         """
         dx = singa.__div__(self.t, self.x)
-        dx *= float(-1 / self.x.shape()[0])
+        dx *= float(-1.0 / self.x.shape()[0])
         if isinstance(dy, float):
             # dtype of dy: float
             dx *= dy
@@ -964,9 +963,9 @@ class SoftMaxCrossEntropy(Operation):
 
     def forward(self, x):
         self.p = singa.SoftMax(x)
-        loss = CTensor((1,), self.p.device())
         ret = singa.CrossEntropyFwd(self.p, self.t)
-        loss.SetFloatValue(singa.SumAsFloat(ret) / x.shape()[0])
+        loss = singa.SumAll(ret)
+        loss /= x.shape()[0]
         return loss
 
     def backward(self, dy=1.0):
@@ -987,8 +986,8 @@ class MeanSquareError(Operation):
     def forward(self, x, t):
         self.err = singa.__sub__(x, t)
         sqr = singa.Square(self.err)
-        loss = CTensor((1,), x.device())
-        loss.SetFloatValue(singa.SumAsFloat(sqr) / x.shape()[0] / 2)
+        loss = singa.SumAll(sqr)
+        loss /= (x.shape()[0] * 2)
         return loss
 
     def backward(self, dy=1.0):
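
With these hunks, all three loss operations (CrossEntropy, SoftMaxCrossEntropy, MeanSquareError) build their scalar loss on the same device as the input. A rough end-to-end sketch, assuming the existing functional wrapper autograd.softmax_cross_entropy and a CUDA-enabled build (only the final to_numpy call copies anything back to the host):

    import numpy as np
    from singa import autograd, tensor, device

    dev = device.create_cuda_gpu()                # assumption: at least one CUDA GPU
    x = tensor.Tensor((4, 10), dev)
    x.gaussian(0.0, 1.0)                          # random logits
    onehot = np.zeros((4, 10), dtype=np.float32)
    onehot[np.arange(4), [1, 3, 5, 7]] = 1.0      # one-hot targets
    t = tensor.from_numpy(onehot)
    t.to_device(dev)
    loss = autograd.softmax_cross_entropy(x, t)   # scalar loss tensor, stays on dev
    print(tensor.to_numpy(loss))
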
diff --git a/src/api/core_tensor.i b/src/api/core_tensor.i
index 844a813..d54beed 100755
--- a/src/api/core_tensor.i
+++ b/src/api/core_tensor.i
@@ -196,6 +196,7 @@ namespace singa{
   Tensor Sum(const Tensor &t, int axis);
   template <typename SType> SType Sum(const Tensor &t);
   %template(SumAsFloat) Sum<float>;
+  Tensor SumAll(const Tensor &t);
 
   Tensor Average(const Tensor &t, int axis);
   Tensor SoftMax(const Tensor &t);
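
The binding above makes SumAll callable from Python next to the existing Sum/SumAsFloat wrappers. A minimal sketch of the difference between the two reductions (singa here is the singa_wrap module, as in autograd.py; get_default_device is assumed to give the host CPU device):

    from singa import singa_wrap as singa
    from singa import tensor, device

    x = tensor.Tensor((3, 4), device.get_default_device())
    x.data.SetFloatValue(2.0)                # x.data is the underlying CTensor
    total_host = singa.SumAsFloat(x.data)    # plain Python float on the host (24.0)
    total_dev = singa.SumAll(x.data)         # 1-element tensor kept on x's device
    total_dev /= x.data.shape()[0]           # in-place arithmetic, as the loss code does
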
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index c8bafed..35282a7 100755
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -940,6 +940,19 @@ Tensor Sum(const Tensor &M, int axis) {
   }
 }
 
+Tensor SumAll(const Tensor &in) {
+  Tensor out({(size_t)1}, in.device(), in.data_type());
+  Tensor one(in.shape(), in.device(), in.data_type());
+  auto *outPtr = &out;
+  one.SetValue(1.0f);
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    one.device()->Exec([in, one, outPtr](Context * ctx) {
+      Dot<DType, Lang>(in, one, outPtr, ctx);
+    }, {in.block(), one.block()}, {outPtr->block()});
+  });
+  return out;
+}
+ 
 Tensor RowMax(const Tensor &in) {
   Tensor ret({in.shape(0)}, in.device(), in.data_type());
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
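
SumAll reuses the existing Dot kernel: it fills a same-shaped tensor with ones and takes the dot product of the input with it, so the whole reduction is a single BLAS call whose result lives on the input's device. The identity it relies on, written in NumPy purely for illustration:

    import numpy as np

    x = np.arange(12, dtype=np.float32).reshape(3, 4)
    ones = np.ones_like(x)
    # dot(flatten(x), flatten(ones)) equals the sum of all elements of x
    assert np.dot(x.ravel(), ones.ravel()) == x.sum()
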
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 595b1b9..a2dad9d 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -391,6 +391,11 @@ void Dot(const Tensor &in1, const Tensor &in2, DType *out,
          Context *ctx) {
   LOG(FATAL) << "Dot Not Implemented";
 }
+template <typename DType, typename Lang>
+void Dot(const Tensor &in1, const Tensor &in2, Tensor *out,
+         Context *ctx) {
+  LOG(FATAL) << "Dot Not Implemented";
+}
 
 /// out = alpha * A * v + beta * out.
 /// transA indicates if the internal data layout is transposed of A
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 7d10824..67615e5 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -649,6 +649,19 @@ void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
     LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet." ;
   }
 }
+template <>
+void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
+                           Tensor *out, Context *ctx) {
+  //check input tensor for strides first
+  if (!(in1.transpose()) && !(in2.transpose())) {
+    const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+    const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+    float* outPtr = static_cast<float*>(out->block()->mutable_data());
+    *outPtr = cblas_sdot(in1.Size(), in1Ptr, 1, in2Ptr, 1);
+  } else {
+    LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet." ;
+  }
+}
 
 template <>
 void Scale<float, lang::Cpp>(const float x, Tensor *out,
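
On the CPU backend there is no host/device boundary to cross; this specialization simply stores the cblas_sdot result into the 1-element output tensor instead of returning a float, so SumAll should agree with the existing SumAsFloat. A quick consistency check (a sketch; Tensor.uniform is assumed from the standard Python tensor API):

    from singa import singa_wrap as singa
    from singa import tensor, device

    x = tensor.Tensor((5, 6), device.get_default_device())
    x.uniform(0.0, 1.0)
    s = singa.SumAll(x.data)                 # 1-element CTensor on the CPU device
    assert abs(singa.SumAsFloat(s) - singa.SumAsFloat(x.data)) < 1e-4
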
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index ca6e706..1668e9c 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -834,6 +834,19 @@ void Dot<float, lang::Cuda>(const Tensor& in1,
   CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
 }
 template <>
+void Dot<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  auto handle = ctx->cublas_handle;
+  const size_t num = in1.Size();
+  CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
+  CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, outPtr));
+  CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
+}
+
+template <>
 void Nrm2<float, lang::Cuda>(const Tensor& in, float* out,
                              Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
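
On the CUDA path the only difference from the scalar Dot above it is the cuBLAS pointer mode: with CUBLAS_POINTER_MODE_DEVICE the result argument of cublasSdot is a device pointer, so the sum is written straight into out's GPU buffer and never visits the host (the mode is switched back to CUBLAS_POINTER_MODE_HOST afterwards). A small sketch of the resulting behaviour from Python, assuming a CUDA-enabled build:

    from singa import singa_wrap as singa
    from singa import tensor, device

    dev = device.create_cuda_gpu()           # assumption: at least one CUDA GPU available
    x = tensor.Tensor((1024,), dev)
    x.data.SetFloatValue(1.0)
    s = singa.SumAll(x.data)                 # cublasSdot writes the sum into s's GPU buffer
    print(singa.SumAsFloat(s))               # 1024.0; this read is the only host copy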