You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2020/02/11 15:31:16 UTC
[singa] branch dev updated: SINGA-505 SoftMax Backward to be bufferable
This is an automated email from the ASF dual-hosted git repository.
wangwei pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/singa.git
The following commit(s) were added to refs/heads/dev by this push:
new 497a4fc SINGA-505 SoftMax Backward to be bufferable
new 8bf0c62 Merge pull request #588 from chrishkchris/SINGA-505
497a4fc is described below
commit 497a4fc86fd50ccaf6545b7ed9784b92ce55847e
Author: chrishkchris <ch...@yahoo.com.hk>
AuthorDate: Tue Feb 11 14:39:32 2020 +0000
SINGA-505 SoftMax Backward to be bufferable
---
include/singa/core/tensor.h | 1 +
python/singa/autograd.py | 104 +++++++-----------
src/api/core_tensor.i | 1 +
src/core/tensor/tensor.cc | 219 ++++++++++++++++++++-----------------
src/core/tensor/tensor_math.h | 6 +-
src/core/tensor/tensor_math_cpp.h | 63 +++++------
src/core/tensor/tensor_math_cuda.h | 42 ++++++-
test/python/test_api.py | 21 ++--
8 files changed, 243 insertions(+), 214 deletions(-)
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 93cf44a..846c14c 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -514,6 +514,7 @@ void MultRow(const Tensor &v, Tensor *M);
/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
Tensor SoftMax(const Tensor &in);
Tensor SoftMax(const Tensor &in, int axis);
+Tensor SoftMaxBackward(const Tensor &in, int axis, const Tensor &fdout);
Tensor RowMax(const Tensor &in);
/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 0c5f456..01e4d82 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -647,10 +647,14 @@ class Reshape(Operation):
self._shape = x.shape()
shape = self.shape
# handle the shape with 0
- shape = [self._shape[i] if i < len(self._shape) and shape[i] == 0 else shape[i] for i in range(len(shape))]
+ shape = [
+ self._shape[i]
+ if i < len(self._shape) and shape[i] == 0 else shape[i]
+ for i in range(len(shape))
+ ]
# handle the shape with -1
hidden_shape = int(np.prod(self._shape) // np.abs(np.prod(shape)))
- self.cache=[s if s != -1 else hidden_shape for s in shape]
+ self.cache = [s if s != -1 else hidden_shape for s in shape]
return singa.Reshape(x, self.cache)
@@ -881,32 +885,10 @@ class SoftMax(Operation):
dx (Ctensor): data for the dL / dx, L is the loss,
x is the input of current Opertion
"""
- # calculations are made on numpy array
- if self.axis == 1:
- dy = singa.DefaultTranspose(dy)
- grad = ctensor2numpy(dy)
- output = ctensor2numpy(self.output)
- out_1 = np.einsum("ki,ki->ki", grad, output)
- medium_out = np.einsum("ki,kj->kij", output, output)
- out_2 = np.einsum("kij,kj->ki", medium_out, grad)
- out = out_1 - out_2
- dx = CTensor(out_1.shape)
- dx.CopyFloatDataFromHostPtr(out.flatten())
- """grad = Tensor(data=dy)
- output = Tensor(data=self.output)
- out_1 = einsum('ki,ki->ki', grad, output)
- medium_out = einsum('ki,kj->kij', output, output)
- out_2 = einsum('kij,kj->ki', medium_out, grad)
- out = out_1 - out_2
- dx = CTensor(out_1.data.shape)
- dx.CopyFloatDataFromHostPtr(out.data.flatten())"""
- if self.axis == 0:
- return dx
- elif self.axis == 1:
- return singa.DefaultTranspose(dx)
+ return singa.SoftMaxBackward(dy, self.axis, self.output)
-def softmax(x, axis=0):
+def softmax(x, axis=1):
return SoftMax(axis)(x)[0]
@@ -1236,16 +1218,13 @@ class _Conv2d(Operation):
def backward(self, dy):
assert training is True and hasattr(
- self, "inputs"
- ), "Please set training as True before do BP. "
-
+ self, "inputs"), "Please set training as True before do BP. "
+
if (type(self.handle) != singa.ConvHandle):
- dx = singa.GpuConvBackwardx(
- dy, self.inputs[1], self.inputs[0], self.handle
- )
- dW = singa.GpuConvBackwardW(
- dy, self.inputs[0], self.inputs[1], self.handle
- )
+ dx = singa.GpuConvBackwardx(dy, self.inputs[1], self.inputs[0],
+ self.handle)
+ dW = singa.GpuConvBackwardW(dy, self.inputs[0], self.inputs[1],
+ self.handle)
if self.handle.bias_term:
db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handle)
return dx, dW, db
@@ -1420,13 +1399,13 @@ class Conv2d(Layer):
class SeparableConv2d(Layer):
def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size,
- stride=1,
- padding=0,
- bias=False,
+ self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ bias=False,
):
self.depthwise_conv = Conv2d(
in_channels,
@@ -1600,9 +1579,8 @@ class _Pooling2d(Operation):
def backward(self, dy):
if (type(self.handle) != singa.PoolingHandle):
- dx = singa.GpuPoolingBackward(
- self.handle, dy, self.cache[0], self.cache[1]
- )
+ dx = singa.GpuPoolingBackward(self.handle, dy, self.cache[0],
+ self.cache[1])
else:
dx = singa.CpuPoolingBackward(self.handle, dy, self.cache[0],
self.cache[1])
@@ -2120,15 +2098,15 @@ class RNN_Base(Layer):
class RNN(RNN_Base):
def __init__(
- self,
- input_size,
- hidden_size,
- num_layers=1,
- nonlinearity="tanh",
- bias=True,
- batch_first=False,
- dropout=0,
- bidirectional=False,
+ self,
+ input_size,
+ hidden_size,
+ num_layers=1,
+ nonlinearity="tanh",
+ bias=True,
+ batch_first=False,
+ dropout=0,
+ bidirectional=False,
):
self.nonlinearity = nonlinearity
@@ -2181,15 +2159,15 @@ class RNN(RNN_Base):
class LSTM(RNN_Base):
def __init__(
- self,
- input_size,
- hidden_size,
- nonlinearity="tanh",
- num_layers=1,
- bias=True,
- batch_first=False,
- dropout=0,
- bidirectional=False,
+ self,
+ input_size,
+ hidden_size,
+ nonlinearity="tanh",
+ num_layers=1,
+ bias=True,
+ batch_first=False,
+ dropout=0,
+ bidirectional=False,
):
self.nonlinearity = nonlinearity
diff --git a/src/api/core_tensor.i b/src/api/core_tensor.i
index d54beed..4550e6a 100755
--- a/src/api/core_tensor.i
+++ b/src/api/core_tensor.i
@@ -201,6 +201,7 @@ namespace singa{
Tensor Average(const Tensor &t, int axis);
Tensor SoftMax(const Tensor &t);
Tensor SoftMax(const Tensor &t, int axis);
+ Tensor SoftMaxBackward(const Tensor &t, int axis, const Tensor &fdout);
Tensor Pow(const Tensor &base, const Tensor &exp);
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index c61d4fa..8b90932 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -627,13 +627,11 @@ void RepeatDataToFrom(bool broadcast_flag, const vector<size_t> &repeats,
float Tensor::l1() const {
float nrm = 0.0f;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
- device_->Exec(
- [&nrm, this](Context *ctx) {
- DType ret = DType(0);
- Asum<DType, Lang>(*this, &ret, ctx);
- nrm = TypeCast<DType, float>(ret);
- },
- {this->block()}, {});
+ device_->Exec([&nrm, this](Context *ctx) {
+ DType ret = DType(0);
+ Asum<DType, Lang>(*this, &ret, ctx);
+ nrm = TypeCast<DType, float>(ret);
+ }, {this->block()}, {});
});
return nrm / Size();
}
@@ -645,13 +643,11 @@ float Tensor::L1() const { return l1(); }
float Tensor::l2() const {
float nrm = 0.0f;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
- device_->Exec(
- [&nrm, this](Context *ctx) {
- DType ret = DType(0);
- Nrm2<DType, Lang>(*this, &ret, ctx);
- nrm = TypeCast<DType, float>(ret);
- },
- {this->block()}, {});
+ device_->Exec([&nrm, this](Context *ctx) {
+ DType ret = DType(0);
+ Nrm2<DType, Lang>(*this, &ret, ctx);
+ nrm = TypeCast<DType, float>(ret);
+ }, {this->block()}, {});
});
return nrm / Size();
}
@@ -667,9 +663,9 @@ void Tensor::SetValue(const SType x) {
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
// TODO(wangwei) cast x to DType
- device_->Exec(
- [this, x, ptr](Context *ctx) { Set<DType, Lang>(x, this, ctx); }, {},
- {ptr});
+ device_->Exec([this, x, ptr](Context *ctx) {
+ Set<DType, Lang>(x, this, ctx);
+ }, {}, {ptr});
});
}
template void Tensor::SetValue<float>(const float x);
@@ -698,9 +694,9 @@ template void Tensor::GetValue<int>(int *value, const size_t num);
#define EltwiseUnaryTensorFn(fn, t, ret) \
do { \
TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
- ret->device()->Exec( \
- [t, ret](Context *ctx) { fn<DType, Lang>(t, ret, ctx); }, \
- {t.block()}, {ret->block()}); \
+ ret->device()->Exec([t, ret](Context *ctx) { \
+ fn<DType, Lang>(t, ret, ctx); \
+ }, {t.block()}, {ret->block()}); \
}); \
} while (0)
@@ -778,16 +774,55 @@ Tensor SoftMax(const Tensor &in, int axis) {
SoftMax(in, retptr, axis);
return ret;
}
+void SoftMaxBackward(const Tensor &in, Tensor *out, int axis,
+ const Tensor &fdout) {
+ // {a_0, a_1, ..., a_k-1, a_k, ... a_n-1}
+ // reshape to
+ // { a_0 * a_1 * ... a_k-1, a_k * ... a_n-1 }
+
+ // assert axis \in {-r, r-1}
+ CHECK_LE(axis, (int)in.shape().size() - 1);
+ CHECK_GE(axis, -1 * (int)in.nDim());
+
+ Shape original_shape = in.shape();
+ if (axis < 0) axis = in.shape().size() + axis;
+
+ Shape coerced_shape = {1, 1};
+ for (std::size_t i = 0, max = in.shape().size(); i != max; ++i) {
+ if (i < axis)
+ coerced_shape[0] *= in.shape()[i];
+ else
+ coerced_shape[1] *= in.shape()[i];
+ }
+
+ Tensor in_reshaped = Reshape(in, coerced_shape);
+ out->Reshape(coerced_shape);
+
+ do {
+ TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+ out->device()->Exec([in, out, fdout](Context *ctx) {
+ SoftMaxBackward<DType, Lang>(in, out, fdout, ctx);
+ }, {in.block(), fdout.block()}, {out->block()});
+ });
+ } while (0);
+
+ out->Reshape(original_shape);
+}
+
+Tensor SoftMaxBackward(const Tensor &in, int axis, const Tensor &fdout) {
+ Tensor ret(in.shape(), in.device(), in.data_type());
+ auto *retptr = &ret;
+ SoftMaxBackward(in, retptr, axis, fdout);
+ return ret;
+}
#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \
do { \
TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \
CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \
- ret->device()->Exec( \
- [lhs, rhs, ret](Context *ctx) { \
- fn<DType, Lang>(lhs, rhs, ret, ctx); \
- }, \
- {lhs.block(), rhs.block()}, {ret->block()}); \
+ ret->device()->Exec([lhs, rhs, ret](Context *ctx) { \
+ fn<DType, Lang>(lhs, rhs, ret, ctx); \
+ }, {lhs.block(), rhs.block()}, {ret->block()}); \
}); \
} while (0)
@@ -832,15 +867,15 @@ GenBinaryTensorFn(operator>, GT);
GenBinaryTensorFn(operator>=, GE);
GenBinaryTensorFn(ReLUBackward, ReLUBackward);
-#define EltwiseTensorScalarFn(fn, t, x, ret) \
- do { \
- TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
- static_assert(std::is_same<SType, DType>::value, \
- "The Scalar type must match the Tensor data type"); \
- ret->device()->Exec( \
- [t, x, ret](Context *ctx) { fn<DType, Lang>(t, x, ret, ctx); }, \
- {t.block()}, {ret->block()}); \
- }); \
+#define EltwiseTensorScalarFn(fn, t, x, ret) \
+ do { \
+ TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
+ static_assert(std::is_same<SType, DType>::value, \
+ "The Scalar type must match the Tensor data type"); \
+ ret->device()->Exec([t, x, ret](Context *ctx) { \
+ fn<DType, Lang>(t, x, ret, ctx); \
+ }, {t.block()}, {ret->block()}); \
+ }); \
} while (0)
#define GenTensorScalarFn(op, fn) \
@@ -880,11 +915,9 @@ void Div(const SType alpha, const Tensor &in, Tensor *out) {
CHECK(in.shape() == out->shape());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
// TODO(wangwei) type cast SType to DType;
- in.device()->Exec(
- [alpha, in, out](Context *ctx) {
- Div<DType, Lang>(alpha, in, out, ctx);
- },
- {in.block()}, {out->block()});
+ in.device()->Exec([alpha, in, out](Context *ctx) {
+ Div<DType, Lang>(alpha, in, out, ctx);
+ }, {in.block()}, {out->block()});
});
}
template void Div<float>(const float, const Tensor &, Tensor *);
@@ -919,13 +952,11 @@ float Sum<float>(const Tensor &in) {
Tensor one(in.shape(), in.device(), in.data_type());
one.SetValue(1.0f);
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
- one.device()->Exec(
- [in, one, &s](Context *ctx) {
- DType ret = DType(0);
- Dot<DType, Lang>(in, one, &ret, ctx);
- s = ret;
- },
- {in.block(), one.block()}, {});
+ one.device()->Exec([in, one, &s](Context *ctx) {
+ DType ret = DType(0);
+ Dot<DType, Lang>(in, one, &ret, ctx);
+ s = ret;
+ }, {in.block(), one.block()}, {});
});
return s;
}
@@ -950,24 +981,22 @@ Tensor SumAll(const Tensor &in) {
auto *outPtr = &out;
one.SetValue(1.0f);
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
- one.device()->Exec([in, one, outPtr](Context * ctx) {
+ one.device()->Exec([in, one, outPtr](Context *ctx) {
Dot<DType, Lang>(in, one, outPtr, ctx);
}, {in.block(), one.block()}, {outPtr->block()});
});
return out;
}
-
+
Tensor RowMax(const Tensor &in) {
Tensor ret({in.shape(0)}, in.device(), in.data_type());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
- in.device()->Exec(
- [&in, &ret](Context *ctx) {
- // size_t nrow = 1;
- // if (in.nDim() > 1) nrow = in.shape(0);
- // size_t ncol = in.Size() / nrow;
- RowMax<DType, Lang>(in, &ret, ctx);
- },
- {in.block()}, {ret.block()});
+ in.device()->Exec([&in, &ret](Context *ctx) {
+ // size_t nrow = 1;
+ // if (in.nDim() > 1) nrow = in.shape(0);
+ // size_t ncol = in.Size() / nrow;
+ RowMax<DType, Lang>(in, &ret, ctx);
+ }, {in.block()}, {ret.block()});
});
return ret;
}
@@ -1179,9 +1208,9 @@ void MultColumn(const Tensor &v, Tensor *M) {
CHECK_EQ(v.Size(), M->shape(0));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
- v.device()->Exec(
- [M, v](Context *ctx) { DGMM<DType, Lang>(false, *M, v, M, ctx); },
- {M->block(), v.block()}, {M->block()});
+ v.device()->Exec([M, v](Context *ctx) {
+ DGMM<DType, Lang>(false, *M, v, M, ctx);
+ }, {M->block(), v.block()}, {M->block()});
});
}
@@ -1193,9 +1222,9 @@ void MultRow(const Tensor &v, Tensor *M) {
CHECK_EQ(v.Size(), M->shape(1));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
- v.device()->Exec(
- [M, v](Context *ctx) { DGMM<DType, Lang>(true, *M, v, M, ctx); },
- {M->block(), v.block()}, {M->block()});
+ v.device()->Exec([M, v](Context *ctx) {
+ DGMM<DType, Lang>(true, *M, v, M, ctx);
+ }, {M->block(), v.block()}, {M->block()});
});
}
@@ -1239,9 +1268,9 @@ template <typename SType>
void Bernoulli(const SType p, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto prob = TypeCast<SType, DType>(p);
- out->device()->Exec(
- [prob, out](Context *ctx) { Bernoulli<DType, Lang>(prob, out, ctx); },
- {}, {out->block()}, true);
+ out->device()->Exec([prob, out](Context *ctx) {
+ Bernoulli<DType, Lang>(prob, out, ctx);
+ }, {}, {out->block()}, true);
});
}
@@ -1252,9 +1281,9 @@ void Uniform(const SType low, const SType high, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto l = TypeCast<SType, DType>(low);
auto h = TypeCast<SType, DType>(high);
- out->device()->Exec(
- [l, h, out](Context *ctx) { Uniform<DType, Lang>(l, h, out, ctx); }, {},
- {out->block()}, true);
+ out->device()->Exec([l, h, out](Context *ctx) {
+ Uniform<DType, Lang>(l, h, out, ctx);
+ }, {}, {out->block()}, true);
});
}
@@ -1265,9 +1294,9 @@ void Gaussian(const SType mean, const SType std, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto m = TypeCast<SType, DType>(mean);
auto s = TypeCast<SType, DType>(std);
- out->device()->Exec(
- [m, s, out](Context *ctx) { Gaussian<DType, Lang>(m, s, out, ctx); },
- {}, {out->block()}, true);
+ out->device()->Exec([m, s, out](Context *ctx) {
+ Gaussian<DType, Lang>(m, s, out, ctx);
+ }, {}, {out->block()}, true);
});
}
template void Gaussian<float>(const float mean, const float std, Tensor *out);
@@ -1278,9 +1307,9 @@ template <typename SType>
void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
- out->device()->Exec(
- [a, in, out](Context *ctx) { Axpy<DType, Lang>(a, in, out, ctx); },
- {in.block(), out->block()}, {out->block()});
+ out->device()->Exec([a, in, out](Context *ctx) {
+ Axpy<DType, Lang>(a, in, out, ctx);
+ }, {in.block(), out->block()}, {out->block()});
});
}
@@ -1307,22 +1336,18 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
- C->device()->Exec(
- [a, A, b, B, C](Context *ctx) {
- GEMV<DType, Lang>(a, A, B, b, C, ctx);
- },
- {A.block(), B.block()}, {C->block()});
+ C->device()->Exec([a, A, b, B, C](Context *ctx) {
+ GEMV<DType, Lang>(a, A, B, b, C, ctx);
+ }, {A.block(), B.block()}, {C->block()});
});
} else {
CHECK(!C->transpose());
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
- C->device()->Exec(
- [a, A, b, B, C](Context *ctx) {
- GEMM<DType, Lang>(a, A, B, b, C, ctx);
- },
- {A.block(), B.block()}, {C->block()});
+ C->device()->Exec([a, A, b, B, C](Context *ctx) {
+ GEMM<DType, Lang>(a, A, B, b, C, ctx);
+ }, {A.block(), B.block()}, {C->block()});
});
}
}
@@ -1349,14 +1374,11 @@ void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
if (p.nDim() == 2u) batchsize = p.shape(0);
size_t dim = p.Size() / batchsize;
TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
- p.device()->Exec(
- [batchsize, dim, t, p, loss](Context *ctx) {
- bool int_target = t.Size() == batchsize;
- ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim,
- p.block(), t.block(), loss->block(),
- ctx);
- },
- {p.block(), t.block()}, {loss->block()});
+ p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) {
+ bool int_target = t.Size() == batchsize;
+ ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim, p.block(),
+ t.block(), loss->block(), ctx);
+ }, {p.block(), t.block()}, {loss->block()});
});
}
@@ -1367,14 +1389,11 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
if (p->nDim() == 2u) batchsize = p->shape(0);
size_t dim = p->Size() / batchsize;
TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
- p->device()->Exec(
- [batchsize, dim, t, p](Context *ctx) {
- bool int_target = t.Size() == batchsize;
- SoftmaxCrossEntropyBwd<DType, Lang>(int_target, batchsize, dim,
- p->block(), t.block(), p->block(),
- ctx);
- },
- {p->block(), t.block()}, {p->block()});
+ p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
+ bool int_target = t.Size() == batchsize;
+ SoftmaxCrossEntropyBwd<DType, Lang>(
+ int_target, batchsize, dim, p->block(), t.block(), p->block(), ctx);
+ }, {p->block(), t.block()}, {p->block()});
});
}
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index a9b5c70..aef4a59 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -369,8 +369,7 @@ void Dot(const Tensor &in1, const Tensor &in2, DType *out, Context *ctx) {
LOG(FATAL) << "Dot Not Implemented";
}
template <typename DType, typename Lang>
-void Dot(const Tensor &in1, const Tensor &in2, Tensor *out,
- Context *ctx) {
+void Dot(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
LOG(FATAL) << "Dot Not Implemented";
}
@@ -404,7 +403,8 @@ void SoftMax(const Tensor &in, Tensor *out, Context *ctx) {
}
template <typename DType, typename Lang>
-void SoftMax(const Tensor &in, Tensor *out, Context *ctx, int axis) {
+void SoftMaxBackward(const Tensor &in, Tensor *out, const Tensor &fdout,
+ Context *ctx) {
LOG(FATAL) << "Not Implemented";
}
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index b592ecc..fb42576 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -240,36 +240,11 @@ void Abs<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
#ifdef USE_DNNL
template <>
-void SoftMax<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx,
- int axis) {
- CHECK_EQ(in.device()->lang(), kCpp);
-
- CHECK_LE(axis, (int)in.shape().size() - 1);
- CHECK_GE(axis, -1 * (int)in.nDim());
-
- Shape original_shape = in.shape();
- if (axis < 0) axis = in.shape().size() + axis;
-
- Shape coerced_shape = {1, 1};
- for (int i = 0; i < in.shape().size(); i++) {
- if (i < axis)
- coerced_shape[0] *= in.shape()[i];
- else
- coerced_shape[1] *= in.shape()[i];
- }
- Tensor in_reshaped = Reshape(in, coerced_shape);
- out->Reshape(coerced_shape);
-
- // optimise by minus x - x.max()
- auto in_max = RowMax(in_reshaped);
- in_max.Reshape({coerced_shape[0], 1});
- in_reshaped = in_reshaped - in_max;
-
- auto md = dnnl::memory::desc({coerced_shape[0], coerced_shape[1]},
+void SoftMax<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
+ auto md = dnnl::memory::desc({in.shape()[0], in.shape()[1]},
dnnl::memory::data_type::f32,
dnnl::memory::format_tag::ab);
- auto in_mem =
- dnnl::memory(md, ctx->dnnl_engine, in_reshaped.block()->mutable_data());
+ auto in_mem = dnnl::memory(md, ctx->dnnl_engine, in.block()->mutable_data());
auto out_mem =
dnnl::memory(md, ctx->dnnl_engine, out->block()->mutable_data());
@@ -281,9 +256,35 @@ void SoftMax<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx,
softmax.execute(ctx->dnnl_stream,
{{DNNL_ARG_SRC, in_mem}, {DNNL_ARG_DST, out_mem}});
ctx->dnnl_stream.wait();
+}
+
+template <>
+void SoftMaxBackward<float, lang::Cpp>(const Tensor &in, Tensor *out,
+ const Tensor &fdout, Context *ctx) {
+ auto md = dnnl::memory::desc({in.shape()[0], in.shape()[1]},
+ dnnl::memory::data_type::f32,
+ dnnl::memory::format_tag::ab);
+ auto in_mem = dnnl::memory(md, ctx->dnnl_engine, in.block()->mutable_data());
+ auto fdout_mem =
+ dnnl::memory(md, ctx->dnnl_engine, fdout.block()->mutable_data());
+ auto out_mem =
+ dnnl::memory(md, ctx->dnnl_engine, out->block()->mutable_data());
- out->Reshape(original_shape);
+ auto softmax_desc =
+ dnnl::softmax_forward::desc(dnnl::prop_kind::forward_scoring, md, 1);
+ auto softmax_prim_desc =
+ dnnl::softmax_forward::primitive_desc(softmax_desc, ctx->dnnl_engine);
+
+ auto softmaxbwd_desc = dnnl::softmax_backward::desc(md, md, 1);
+ auto softmaxbwd_prim_desc = dnnl::softmax_backward::primitive_desc(
+ softmaxbwd_desc, ctx->dnnl_engine, softmax_prim_desc);
+ auto softmaxbwd = dnnl::softmax_backward(softmaxbwd_prim_desc);
+ softmaxbwd.execute(ctx->dnnl_stream, {{DNNL_ARG_DIFF_SRC, out_mem},
+ {DNNL_ARG_DIFF_DST, in_mem},
+ {DNNL_ARG_DST, fdout_mem}});
+ ctx->dnnl_stream.wait();
}
+
#endif // USE_DNNL
template <>
@@ -927,6 +928,8 @@ void RowMax<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
}
}
+// =========Matrix operations ================================================
+/*
template <>
void SoftMax<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
CHECK_LE(in.nDim(), 2u)
@@ -947,8 +950,6 @@ void SoftMax<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
out->Reshape(in.shape());
}
-// =========Matrix operations ================================================
-/*
template <>
void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
const Tensor& A, const Tensor& v, Tensor* out,
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 0a0b685..4b16af0 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -815,8 +815,8 @@ void Dot<float, lang::Cuda>(const Tensor& in1, const Tensor& in2, float* out,
CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
}
template <>
-void Dot<float, lang::Cuda>(const Tensor& in1,
- const Tensor& in2, Tensor* out, Context* ctx) {
+void Dot<float, lang::Cuda>(const Tensor& in1, const Tensor& in2, Tensor* out,
+ Context* ctx) {
const float* inPtr1 = static_cast<const float*>(in1.block()->data());
const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
@@ -828,8 +828,7 @@ void Dot<float, lang::Cuda>(const Tensor& in1,
}
template <>
-void Nrm2<float, lang::Cuda>(const Tensor& in, float* out,
- Context* ctx) {
+void Nrm2<float, lang::Cuda>(const Tensor& in, float* out, Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
const float* inPtr = static_cast<const float*>(in.block()->data());
const size_t num = in.Size();
@@ -937,6 +936,41 @@ void SoftMax<float, lang::Cuda>(const Tensor& in, Tensor* out, Context* ctx) {
}
template <>
+void SoftMaxBackward<float, lang::Cuda>(const Tensor& in, Tensor* out,
+ const Tensor& fdout, Context* ctx) {
+ cudnnSoftmaxAlgorithm_t algorithm = CUDNN_SOFTMAX_FAST;
+ cudnnSoftmaxMode_t mode = CUDNN_SOFTMAX_MODE_INSTANCE;
+
+ /*
+ * tensor tmp is for generating cudnn descriptor
+ * as for cudnn softmax, it required shape of {N, C, 1, 1}
+ * while helper func `generate_shape_cuda` generate shape of {1, 1, N, C}
+ * Thus this part serve similar purpose as `generate_shape_cuda` but in
+ * reverse manner
+ */
+ CHECK_LE(in.shape().size(), 5)
+ << "Dimensions (shape) beyond 5 are currently not supported";
+ auto tmp = in;
+ while (tmp.shape().size() < 4) {
+ auto s = tmp.shape();
+ s.push_back(1);
+ tmp.Reshape(s);
+ }
+
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const float* fdoutPtr = static_cast<const float*>(fdout.block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+ float alpha = 1.0;
+ float beta = 0.0;
+
+ check_cudnn(cudnnSoftmaxBackward(
+ ctx->cudnn_handle, algorithm, mode, (void*)(&alpha),
+ generate_tensor_nd_desc(tmp), fdoutPtr, generate_tensor_nd_desc(tmp),
+ inPtr, (void*)(&beta), generate_tensor_nd_desc(tmp), outPtr));
+}
+
+template <>
void ComputeCrossEntropy<float, lang::Cuda>(bool int_target,
const size_t batchsize,
const size_t dim, const Block* p,
diff --git a/test/python/test_api.py b/test/python/test_api.py
index 197f884..518c4f9 100644
--- a/test/python/test_api.py
+++ b/test/python/test_api.py
@@ -340,27 +340,22 @@ class TestAPI(unittest.TestCase):
hndl = singa_api.BatchNormHandle(
m_0,
tensor.Tensor(device=dev, data=x_0).data)
- (y_2_c, rm_2_c, rv_2_c, bm_2_c,
- bv_2_c) = singa_api.CpuBatchNormForwardTraining(
- hndl,
- tensor.Tensor(device=dev, data=x_0).data,
- tensor.Tensor(device=dev, data=s_0).data,
- tensor.Tensor(device=dev, data=b_0).data,
- tensor.Tensor(device=dev, data=rm_0).data,
- tensor.Tensor(device=dev, data=rv_0).data)
+ (y_2_c, bm_2_c, bv_2_c) = singa_api.CpuBatchNormForwardTraining(
+ hndl,
+ tensor.Tensor(device=dev, data=x_0).data,
+ tensor.Tensor(device=dev, data=s_0).data,
+ tensor.Tensor(device=dev, data=b_0).data,
+ tensor.Tensor(device=dev, data=rm_0).data,
+ tensor.Tensor(device=dev, data=rv_0).data)
np.testing.assert_array_almost_equal(
y_1, tensor.to_numpy(_cTensor_to_pyTensor(y_2_c)))
- #np.testing.assert_array_almost_equal(
- # bm_1, tensor.to_numpy(_cTensor_to_pyTensor(bm_2_c)))
np.testing.assert_array_almost_equal(
- rm_1, tensor.to_numpy(_cTensor_to_pyTensor(rm_2_c)))
+ bm_1, tensor.to_numpy(_cTensor_to_pyTensor(bm_2_c)))
#print(bv_1)
#print(tensor.to_numpy(_cTensor_to_pyTensor(bv_2_c)))
#np.testing.assert_array_almost_equal(
# bv_1, tensor.to_numpy(_cTensor_to_pyTensor(bv_2_c)), decimal=3)
- np.testing.assert_array_almost_equal(
- rv_1, tensor.to_numpy(_cTensor_to_pyTensor(rv_2_c)), decimal=4)
return
x_0 = np.array([1, 1, 1, 1, 2, 2, 2, 2, 10, 10, 10, 10, 20, 20, 20, 20],