You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2020/01/23 02:13:08 UTC
[singa] branch dev updated: SINGA-502 Avoid moving data between
host and gpu in SoftmaxCrossEntropy
This is an automated email from the ASF dual-hosted git repository.
wangwei pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/singa.git
The following commit(s) were added to refs/heads/dev by this push:
new 7baf2c4 SINGA-502 Avoid moving data between host and gpu in SoftmaxCrossEntropy
new b86add0 Merge pull request #577 from chrishkchris/SINGA-502
7baf2c4 is described below
commit 7baf2c40fccb5077b626a66d952c5a9e65bd3f01
Author: chrishkchris <ch...@yahoo.com.hk>
AuthorDate: Tue Jan 21 08:41:32 2020 +0000
SINGA-502 Avoid moving data between host and gpu in SoftmaxCrossEntropy
---
include/singa/core/tensor.h | 1 +
python/singa/autograd.py | 15 +++++++--------
src/api/core_tensor.i | 1 +
src/core/tensor/tensor.cc | 13 +++++++++++++
src/core/tensor/tensor_math.h | 5 +++++
src/core/tensor/tensor_math_cpp.h | 13 +++++++++++++
src/core/tensor/tensor_math_cuda.h | 13 +++++++++++++
7 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 0c313bf..2810a54 100755
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -553,6 +553,7 @@ void SumRows(const Tensor &M, Tensor *out);
/// if 'axis' is 1, sum all columns into a single column
/// TODO(wangwei) support arbitrary Tensor like numpy.sum
Tensor Sum(const Tensor &in, const int axis);
+Tensor SumAll(const Tensor &in);
// ================Random operations==========================================
/// For each element x set x = 1 if random() < p; otherwise x = 1.
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 8e245c6..cd6c6af 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -925,9 +925,8 @@ class CrossEntropy(Operation):
Returns:
loss (CTensor): scalar.
"""
- loss = CTensor((1,))
- loss_data = -singa.SumAsFloat(singa.__mul__(t, singa.Log(x)))
- loss.SetFloatValue(loss_data / x.shape()[0])
+ loss = singa.SumAll(singa.__mul__(t, singa.Log(x)))
+ loss /= -x.shape()[0]
self.x = x
self.t = t
self.input = (x, t)
@@ -944,7 +943,7 @@ class CrossEntropy(Operation):
dy = 1.0
"""
dx = singa.__div__(self.t, self.x)
- dx *= float(-1 / self.x.shape()[0])
+ dx *= float(-1.0 / self.x.shape()[0])
if isinstance(dy, float):
# dtype of dy: float
dx *= dy
@@ -964,9 +963,9 @@ class SoftMaxCrossEntropy(Operation):
def forward(self, x):
self.p = singa.SoftMax(x)
- loss = CTensor((1,), self.p.device())
ret = singa.CrossEntropyFwd(self.p, self.t)
- loss.SetFloatValue(singa.SumAsFloat(ret) / x.shape()[0])
+ loss = singa.SumAll(ret)
+ loss /= x.shape()[0]
return loss
def backward(self, dy=1.0):
@@ -987,8 +986,8 @@ class MeanSquareError(Operation):
def forward(self, x, t):
self.err = singa.__sub__(x, t)
sqr = singa.Square(self.err)
- loss = CTensor((1,), x.device())
- loss.SetFloatValue(singa.SumAsFloat(sqr) / x.shape()[0] / 2)
+ loss = singa.SumAll(sqr)
+ loss /= (x.shape()[0] * 2)
return loss
def backward(self, dy=1.0):
diff --git a/src/api/core_tensor.i b/src/api/core_tensor.i
index 844a813..d54beed 100755
--- a/src/api/core_tensor.i
+++ b/src/api/core_tensor.i
@@ -196,6 +196,7 @@ namespace singa{
Tensor Sum(const Tensor &t, int axis);
template <typename SType> SType Sum(const Tensor &t);
%template(SumAsFloat) Sum<float>;
+ Tensor SumAll(const Tensor &t);
Tensor Average(const Tensor &t, int axis);
Tensor SoftMax(const Tensor &t);
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index c8bafed..35282a7 100755
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -940,6 +940,19 @@ Tensor Sum(const Tensor &M, int axis) {
}
}
+Tensor SumAll(const Tensor &in) {
+ Tensor out({(size_t)1}, in.device(), in.data_type());
+ Tensor one(in.shape(), in.device(), in.data_type());
+ auto *outPtr = &out;
+ one.SetValue(1.0f);
+ TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+ one.device()->Exec([in, one, outPtr](Context * ctx) {
+ Dot<DType, Lang>(in, one, outPtr, ctx);
+ }, {in.block(), one.block()}, {outPtr->block()});
+ });
+ return out;
+}
+
Tensor RowMax(const Tensor &in) {
Tensor ret({in.shape(0)}, in.device(), in.data_type());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 595b1b9..a2dad9d 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -391,6 +391,11 @@ void Dot(const Tensor &in1, const Tensor &in2, DType *out,
Context *ctx) {
LOG(FATAL) << "Dot Not Implemented";
}
+template <typename DType, typename Lang>
+void Dot(const Tensor &in1, const Tensor &in2, Tensor *out,
+ Context *ctx) {
+ LOG(FATAL) << "Dot Not Implemented";
+}
/// out = alpha * A * v + beta * out.
/// transA indicates if the internal data layout is transposed of A
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 7d10824..67615e5 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -649,6 +649,19 @@ void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
LOG(FATAL) << "Dot: one of the inputs is transposed. Not implemented yet.";
}
}
+template <>
+void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
+ Tensor *out, Context *ctx) {
+ //check input tensor for strides first
+ if (!(in1.transpose()) && !(in2.transpose())) {
+ const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+ const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ *outPtr = cblas_sdot(in1.Size(), in1Ptr, 1, in2Ptr, 1);
+ } else {
+ LOG(FATAL) << "Dot: one of the inputs is transposed. Not implemented yet.";
+ }
+}
template <>
void Scale<float, lang::Cpp>(const float x, Tensor *out,
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index ca6e706..1668e9c 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -834,6 +834,19 @@ void Dot<float, lang::Cuda>(const Tensor& in1,
CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
}
template <>
+void Dot<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ auto handle = ctx->cublas_handle;
+ const size_t num = in1.Size();
+ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
+ CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, outPtr));
+ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
+}
+
+template <>
void Nrm2<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream