You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2020/02/03 13:29:49 UTC
[singa] branch dev updated: fix for both USE DNNL and USE CUDNN are
enabled, code format, test api all ok
This is an automated email from the ASF dual-hosted git repository.
wangwei pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/singa.git
The following commit(s) were added to refs/heads/dev by this push:
new c39e93a fix for both USE DNNL and USE CUDNN are enabled, code format, test api all ok
new da18df0 Merge pull request #590 from dcslin/fix-both-dnnl-cudnn-enabled
c39e93a is described below
commit c39e93a42005049d89d69e764f36c9e6d6c0f7d9
Author: dcslin <13...@users.noreply.github.com>
AuthorDate: Mon Feb 3 11:53:00 2020 +0000
fix for both USE DNNL and USE CUDNN are enabled, code format, test api all ok
---
src/core/tensor/tensor_math_cpp.h | 53 ++++----
src/model/operation/batchnorm.cc | 265 +++++++++++++++++++++----------------
src/model/operation/batchnorm.h | 23 ++--
src/model/operation/convolution.cc | 56 ++++----
src/model/operation/convolution.h | 13 +-
src/model/operation/pooling.cc | 128 +++++++++---------
test/python/test_api.py | 257 ++++++++++++++++++-----------------
7 files changed, 431 insertions(+), 364 deletions(-)
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 52473cd..b592ecc 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -240,44 +240,51 @@ void Abs<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
#ifdef USE_DNNL
template <>
-void SoftMax<float, lang::Cpp>(const Tensor& in, Tensor* out, Context *ctx, int axis) {
+void SoftMax<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx,
+ int axis) {
+ CHECK_EQ(in.device()->lang(), kCpp);
- CHECK_LE(axis, (int)in.shape().size()-1 );
- CHECK_GE(axis, -1*(int)in.nDim() );
+ CHECK_LE(axis, (int)in.shape().size() - 1);
+ CHECK_GE(axis, -1 * (int)in.nDim());
Shape original_shape = in.shape();
if (axis < 0) axis = in.shape().size() + axis;
Shape coerced_shape = {1, 1};
for (int i = 0; i < in.shape().size(); i++) {
- if (i < axis)
- coerced_shape[0] *= in.shape()[i];
- else
- coerced_shape[1] *= in.shape()[i];
+ if (i < axis)
+ coerced_shape[0] *= in.shape()[i];
+ else
+ coerced_shape[1] *= in.shape()[i];
}
Tensor in_reshaped = Reshape(in, coerced_shape);
out->Reshape(coerced_shape);
// optimise by minus x - x.max()
auto in_max = RowMax(in_reshaped);
- in_max.Reshape({coerced_shape[0],1});
+ in_max.Reshape({coerced_shape[0], 1});
in_reshaped = in_reshaped - in_max;
-
- auto md = dnnl::memory::desc({coerced_shape[0], coerced_shape[1]}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
- auto in_mem = dnnl::memory(md, ctx->dnnl_engine, in_reshaped.block()->mutable_data());
- auto out_mem = dnnl::memory(md, ctx->dnnl_engine, out->block()->mutable_data());
-
-
- auto softmax_desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_scoring, md, 1);
- auto softmax_prim_desc = dnnl::softmax_forward::primitive_desc(softmax_desc, ctx->dnnl_engine);
+ auto md = dnnl::memory::desc({coerced_shape[0], coerced_shape[1]},
+ dnnl::memory::data_type::f32,
+ dnnl::memory::format_tag::ab);
+ auto in_mem =
+ dnnl::memory(md, ctx->dnnl_engine, in_reshaped.block()->mutable_data());
+ auto out_mem =
+ dnnl::memory(md, ctx->dnnl_engine, out->block()->mutable_data());
+
+ auto softmax_desc =
+ dnnl::softmax_forward::desc(dnnl::prop_kind::forward_scoring, md, 1);
+ auto softmax_prim_desc =
+ dnnl::softmax_forward::primitive_desc(softmax_desc, ctx->dnnl_engine);
auto softmax = dnnl::softmax_forward(softmax_prim_desc);
- softmax.execute(ctx->dnnl_stream, {{DNNL_ARG_SRC, in_mem}, {DNNL_ARG_DST, out_mem}});
+ softmax.execute(ctx->dnnl_stream,
+ {{DNNL_ARG_SRC, in_mem}, {DNNL_ARG_DST, out_mem}});
ctx->dnnl_stream.wait();
out->Reshape(original_shape);
}
-#endif // USE_DNNL
+#endif // USE_DNNL
template <>
void Add<float, lang::Cpp>(const Tensor &in, const float x, Tensor *out,
@@ -665,16 +672,16 @@ void Dot<float, lang::Cpp>(const Tensor &in1, const Tensor &in2, float *out,
}
}
template <>
-void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
- Tensor *out, Context *ctx) {
- //check input tensor for strides first
+void Dot<float, lang::Cpp>(const Tensor &in1, const Tensor &in2, Tensor *out,
+ Context *ctx) {
+ // check input tensor for strides first
if (!(in1.transpose()) && !(in2.transpose())) {
const float *in1Ptr = static_cast<const float *>(in1.block()->data());
const float *in2Ptr = static_cast<const float *>(in2.block()->data());
- float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
*outPtr = cblas_sdot(in1.Size(), in1Ptr, 1, in2Ptr, 1);
} else {
- LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet." ;
+ LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet.";
}
}
diff --git a/src/model/operation/batchnorm.cc b/src/model/operation/batchnorm.cc
index 892f477..36c5ab1 100644
--- a/src/model/operation/batchnorm.cc
+++ b/src/model/operation/batchnorm.cc
@@ -41,72 +41,82 @@ BatchNormHandle::BatchNormHandle(const float momentum, const Tensor& input) {
}
#ifdef USE_DNNL
- epsilon = 1e-5f;
- x_dims = dnnl::memory::dims(input.shape().begin(), input.shape().end());
-
- // support f32 only
- auto dtype_ = memory::data_type::f32;
- memory::format_tag format_tag_ = get_dnnl_format_tag(input);
- x_md = dnnl::memory::desc({x_dims}, dtype_, format_tag_);
-
- // add to
- bn_fwd_training_d = new dnnl::batch_normalization_forward::desc(dnnl::prop_kind::forward_training,
- x_md, epsilon, dnnl::normalization_flags::use_scale_shift);
-
- auto eng = input.device()->context(0)->dnnl_engine;
- bn_fwd_training_pd = new dnnl::batch_normalization_forward::primitive_desc(*bn_fwd_training_d, eng);
-
-#endif // USE_DNNL
-
+ if (input.device()->lang() == kCpp) {
+ use_dnnl = true;
+ epsilon = 1e-5f;
+ x_dims = dnnl::memory::dims(input.shape().begin(), input.shape().end());
+
+ // support f32 only
+ auto dtype_ = memory::data_type::f32;
+ memory::format_tag format_tag_ = get_dnnl_format_tag(input);
+ x_md = dnnl::memory::desc({x_dims}, dtype_, format_tag_);
+
+ // add to
+ bn_fwd_training_d = new dnnl::batch_normalization_forward::desc(
+ dnnl::prop_kind::forward_training, x_md, epsilon,
+ dnnl::normalization_flags::use_scale_shift);
+
+ auto eng = input.device()->context(0)->dnnl_engine;
+ bn_fwd_training_pd = new dnnl::batch_normalization_forward::primitive_desc(
+ *bn_fwd_training_d, eng);
+ }
+#endif // USE_DNNL
};
BatchNormHandle::~BatchNormHandle() {
#ifdef USE_DNNL
- delete(bn_fwd_training_d);
- delete(bn_fwd_training_pd);
-#endif // USE_DNNL
+ if (use_dnnl) {
+ delete (bn_fwd_training_d);
+ delete (bn_fwd_training_pd);
+ }
+#endif // USE_DNNL
}
#ifdef USE_DNNL
-Tensor CpuBatchNormForwardInference(const BatchNormHandle &bnh, const Tensor& x, const Tensor& bnScale, const Tensor& bnBias,
- Tensor& running_mean, Tensor& running_var) {
-
+Tensor CpuBatchNormForwardInference(const BatchNormHandle& bnh, const Tensor& x,
+ const Tensor& bnScale, const Tensor& bnBias,
+ Tensor& running_mean, Tensor& running_var) {
CHECK_EQ(x.device()->lang(), kCpp);
Tensor y;
y.ResetLike(x);
Tensor w = get_bn_weight_from(bnScale, bnBias);
- y.device()->Exec([&y, &x, &running_mean, &running_var, &w, &bnh](Context * ctx) {
- auto eng = ctx->dnnl_engine;
- using namespace dnnl;
-
- auto x_mem = memory(bnh.x_md, eng, x.block()->mutable_data());
- auto y_mem = memory(bnh.x_md, eng, y.block()->mutable_data());
- // indicates using scale&bias and running mean&var
- auto flags_ = normalization_flags::use_scale_shift | normalization_flags::use_global_stats;
-
- auto bn_fwd_d = batch_normalization_forward::desc(prop_kind::forward_inference, bnh.x_md, bnh.epsilon, flags_);
- auto bn_fwd_pd = batch_normalization_forward::primitive_desc(bn_fwd_d, eng);
- auto m_mem = memory(bn_fwd_pd.mean_desc(), eng, running_mean.block()->mutable_data());
- auto v_mem = memory(bn_fwd_pd.variance_desc(), eng, running_var.block()->mutable_data());
- auto w_mem = memory(bn_fwd_pd.weights_desc(), eng, w.block()->mutable_data());
-
- // execution
- batch_normalization_forward(bn_fwd_pd).execute(ctx->dnnl_stream, {
- {DNNL_ARG_SRC, x_mem},
- {DNNL_ARG_DST, y_mem},
- {DNNL_ARG_SCALE_SHIFT, w_mem},
- {DNNL_ARG_MEAN, m_mem},
- {DNNL_ARG_VARIANCE, v_mem}
- });
- ctx->dnnl_stream.wait();
-
- },
- {x.block(), w.block(), running_mean.block(), running_var.block()},
- {y.block(), running_mean.block(), running_var.block()}
- );
+ y.device()->Exec(
+ [&y, &x, &running_mean, &running_var, &w, &bnh](Context* ctx) {
+ auto eng = ctx->dnnl_engine;
+ using namespace dnnl;
+
+ auto x_mem = memory(bnh.x_md, eng, x.block()->mutable_data());
+ auto y_mem = memory(bnh.x_md, eng, y.block()->mutable_data());
+ // indicates using scale&bias and running mean&var
+ auto flags_ = normalization_flags::use_scale_shift |
+ normalization_flags::use_global_stats;
+
+ auto bn_fwd_d = batch_normalization_forward::desc(
+ prop_kind::forward_inference, bnh.x_md, bnh.epsilon, flags_);
+ auto bn_fwd_pd =
+ batch_normalization_forward::primitive_desc(bn_fwd_d, eng);
+ auto m_mem = memory(bn_fwd_pd.mean_desc(), eng,
+ running_mean.block()->mutable_data());
+ auto v_mem = memory(bn_fwd_pd.variance_desc(), eng,
+ running_var.block()->mutable_data());
+ auto w_mem =
+ memory(bn_fwd_pd.weights_desc(), eng, w.block()->mutable_data());
+
+ // execution
+ batch_normalization_forward(bn_fwd_pd).execute(
+ ctx->dnnl_stream, {{DNNL_ARG_SRC, x_mem},
+ {DNNL_ARG_DST, y_mem},
+ {DNNL_ARG_SCALE_SHIFT, w_mem},
+ {DNNL_ARG_MEAN, m_mem},
+ {DNNL_ARG_VARIANCE, v_mem}});
+ ctx->dnnl_stream.wait();
+
+ },
+ {x.block(), w.block(), running_mean.block(), running_var.block()},
+ {y.block(), running_mean.block(), running_var.block()});
return y;
}
@@ -114,6 +124,7 @@ Tensor CpuBatchNormForwardInference(const BatchNormHandle &bnh, const Tensor& x,
const std::vector<Tensor> CpuBatchNormForwardTraining(
const BatchNormHandle& bnh, const Tensor& x, const Tensor& bnScale,
const Tensor& bnBias, Tensor& running_mean, Tensor& running_var) {
+ CHECK_EQ(x.device()->lang(), kCpp);
Tensor y;
y.ResetLike(x);
@@ -127,39 +138,44 @@ const std::vector<Tensor> CpuBatchNormForwardTraining(
// backward
Tensor w = get_bn_weight_from(bnScale, bnBias);
- y.device()->Exec([&x, &y, &mean, &var, &w, &running_mean, &running_var, &bnh](Context * ctx) {
-
- auto eng = ctx->dnnl_engine;
- using namespace dnnl;
-
- auto x_mem = memory(bnh.x_md, eng, x.block()->mutable_data());
- auto y_mem = memory(bnh.x_md, eng, y.block()->mutable_data());
- auto m_mem = memory(bnh.bn_fwd_training_pd->mean_desc(), eng, mean.block()->mutable_data());
- auto v_mem = memory(bnh.bn_fwd_training_pd->variance_desc(), eng, var.block()->mutable_data());
- auto w_mem = memory(bnh.bn_fwd_training_pd->weights_desc(), eng, w.block()->mutable_data());
-
- batch_normalization_forward(*bnh.bn_fwd_training_pd).execute(ctx->dnnl_stream, {
- {DNNL_ARG_SRC, x_mem},
- {DNNL_ARG_DST, y_mem},
- {DNNL_ARG_SCALE_SHIFT, w_mem},
- {DNNL_ARG_MEAN, m_mem},
- {DNNL_ARG_VARIANCE, v_mem}
- });
- ctx->dnnl_stream.wait();
-
- // local implemented running mean as mkldnn does not support it yet:
- // https://github.com/intel/mkl-dnn/issues/371
- // https://github.com/intel/mkl-dnn/issues/517
- // https://arxiv.org/pdf/1502.03167.pdf
- auto s=x.shape();
- s[1]=1;
- float p = Product(s); // for unbiased variance
- running_mean = running_mean * (1 - bnh.factor) + mean * bnh.factor;
- running_var = running_var * (1 - bnh.factor) + var * (p/(p-1)) * bnh.factor;
- },
- {x.block(), w.block(), running_mean.block(), running_var.block()},
- {y.block(), running_mean.block(), running_var.block(), mean.block(), var.block()}
- );
+ y.device()->Exec(
+ [&x, &y, &mean, &var, &w, &running_mean, &running_var,
+ &bnh](Context* ctx) {
+
+ auto eng = ctx->dnnl_engine;
+ using namespace dnnl;
+
+ auto x_mem = memory(bnh.x_md, eng, x.block()->mutable_data());
+ auto y_mem = memory(bnh.x_md, eng, y.block()->mutable_data());
+ auto m_mem = memory(bnh.bn_fwd_training_pd->mean_desc(), eng,
+ mean.block()->mutable_data());
+ auto v_mem = memory(bnh.bn_fwd_training_pd->variance_desc(), eng,
+ var.block()->mutable_data());
+ auto w_mem = memory(bnh.bn_fwd_training_pd->weights_desc(), eng,
+ w.block()->mutable_data());
+
+ batch_normalization_forward(*bnh.bn_fwd_training_pd)
+ .execute(ctx->dnnl_stream, {{DNNL_ARG_SRC, x_mem},
+ {DNNL_ARG_DST, y_mem},
+ {DNNL_ARG_SCALE_SHIFT, w_mem},
+ {DNNL_ARG_MEAN, m_mem},
+ {DNNL_ARG_VARIANCE, v_mem}});
+ ctx->dnnl_stream.wait();
+
+ // local implemented running mean as mkldnn does not support it yet:
+ // https://github.com/intel/mkl-dnn/issues/371
+ // https://github.com/intel/mkl-dnn/issues/517
+ // https://arxiv.org/pdf/1502.03167.pdf
+ auto s = x.shape();
+ s[1] = 1;
+ float p = Product(s); // for unbiased variance
+ running_mean = running_mean * (1 - bnh.factor) + mean * bnh.factor;
+ running_var =
+ running_var * (1 - bnh.factor) + var * (p / (p - 1)) * bnh.factor;
+ },
+ {x.block(), w.block(), running_mean.block(), running_var.block()},
+ {y.block(), running_mean.block(), running_var.block(), mean.block(),
+ var.block()});
return {y, running_mean, running_var, mean, var};
}
@@ -168,6 +184,14 @@ const std::vector<Tensor> CpuBatchNormBackwardx(
const BatchNormHandle& bnh, const Tensor& y, const Tensor& dy,
const Tensor& x, const Tensor& bnScale, const Tensor& bnBias,
const Tensor& mean, const Tensor& var) {
+ CHECK_EQ(x.device()->lang(), kCpp);
+ CHECK_EQ(y.device()->lang(), kCpp);
+ CHECK_EQ(dy.device()->lang(), kCpp);
+ CHECK_EQ(mean.device()->lang(), kCpp);
+ CHECK_EQ(var.device()->lang(), kCpp);
+ CHECK_EQ(bnScale.device()->lang(), kCpp);
+ CHECK_EQ(bnBias.device()->lang(), kCpp);
+
Tensor dx;
dx.ResetLike(dy);
@@ -179,37 +203,45 @@ const std::vector<Tensor> CpuBatchNormBackwardx(
Tensor dw;
dw.ResetLike(w);
- dx.device()->Exec([&dw, &x, &dx, &y, &dy, &w, &mean, &var, &bnh](Context * ctx) {
- auto eng = ctx->dnnl_engine;
- using namespace dnnl;
-
- auto x_mem = memory(bnh.x_md, eng, x.block()->mutable_data());
- auto dx_mem = memory(bnh.x_md, eng, dx.block()->mutable_data());
- auto y_mem = memory(bnh.x_md, eng, y.block()->mutable_data());
- auto dy_mem = memory(bnh.x_md, eng, dy.block()->mutable_data());
-
- auto m_mem = memory(bnh.bn_fwd_training_pd->mean_desc(), eng, mean.block()->mutable_data());
- auto v_mem = memory(bnh.bn_fwd_training_pd->variance_desc(), eng, var.block()->mutable_data());
- auto w_mem = memory(bnh.bn_fwd_training_pd->weights_desc(), eng, w.block()->mutable_data());
-
- auto bn_bwd_d = batch_normalization_backward::desc(prop_kind::backward, bnh.x_md, bnh.x_md, bnh.epsilon, normalization_flags::use_scale_shift);
- auto bn_bwd_pd = batch_normalization_backward::primitive_desc(bn_bwd_d, eng, *bnh.bn_fwd_training_pd);
-
- auto dw_mem = memory(bn_bwd_pd.diff_weights_desc(), eng, dw.block()->mutable_data());
-
- batch_normalization_backward(bn_bwd_pd).execute(ctx->dnnl_stream, {
- {DNNL_ARG_SRC, x_mem},
- {DNNL_ARG_DIFF_SRC, dx_mem},
- {DNNL_ARG_DIFF_DST, dy_mem},
- {DNNL_ARG_MEAN, m_mem},
- {DNNL_ARG_VARIANCE, v_mem},
- {DNNL_ARG_DIFF_SCALE_SHIFT, dw_mem},
- {DNNL_ARG_SCALE_SHIFT, w_mem}
- });
- ctx->dnnl_stream.wait();
+ dx.device()->Exec(
+ [&dw, &x, &dx, &y, &dy, &w, &mean, &var, &bnh](Context* ctx) {
+ auto eng = ctx->dnnl_engine;
+ using namespace dnnl;
+
+ auto x_mem = memory(bnh.x_md, eng, x.block()->mutable_data());
+ auto dx_mem = memory(bnh.x_md, eng, dx.block()->mutable_data());
+ auto y_mem = memory(bnh.x_md, eng, y.block()->mutable_data());
+ auto dy_mem = memory(bnh.x_md, eng, dy.block()->mutable_data());
+
+ auto m_mem = memory(bnh.bn_fwd_training_pd->mean_desc(), eng,
+ mean.block()->mutable_data());
+ auto v_mem = memory(bnh.bn_fwd_training_pd->variance_desc(), eng,
+ var.block()->mutable_data());
+ auto w_mem = memory(bnh.bn_fwd_training_pd->weights_desc(), eng,
+ w.block()->mutable_data());
+
+ auto bn_bwd_d = batch_normalization_backward::desc(
+ prop_kind::backward, bnh.x_md, bnh.x_md, bnh.epsilon,
+ normalization_flags::use_scale_shift);
+ auto bn_bwd_pd = batch_normalization_backward::primitive_desc(
+ bn_bwd_d, eng, *bnh.bn_fwd_training_pd);
+
+ auto dw_mem = memory(bn_bwd_pd.diff_weights_desc(), eng,
+ dw.block()->mutable_data());
+
+ batch_normalization_backward(bn_bwd_pd).execute(
+ ctx->dnnl_stream, {{DNNL_ARG_SRC, x_mem},
+ {DNNL_ARG_DIFF_SRC, dx_mem},
+ {DNNL_ARG_DIFF_DST, dy_mem},
+ {DNNL_ARG_MEAN, m_mem},
+ {DNNL_ARG_VARIANCE, v_mem},
+ {DNNL_ARG_DIFF_SCALE_SHIFT, dw_mem},
+ {DNNL_ARG_SCALE_SHIFT, w_mem}});
+ ctx->dnnl_stream.wait();
- }, {x.block(), dy.block(), mean.block(), var.block()},
- {dx.block(), dw.block()});
+ },
+ {x.block(), dy.block(), mean.block(), var.block()},
+ {dx.block(), dw.block()});
singa::Tensor dbnScale(bnScale.shape());
CopyDataToFrom(&dbnScale, dw, bnScale.Size(), 0, 0);
@@ -226,8 +258,7 @@ const std::vector<Tensor> CpuBatchNormBackwardx(
return {dx, dbnScale, dbnBias};
}
-#endif // USE_DNNL
-
+#endif // USE_DNNL
#ifdef USE_CUDNN
CudnnBatchNormHandle::CudnnBatchNormHandle(const float momentum,
diff --git a/src/model/operation/batchnorm.h b/src/model/operation/batchnorm.h
index 1cdc412..00b9b24 100644
--- a/src/model/operation/batchnorm.h
+++ b/src/model/operation/batchnorm.h
@@ -33,13 +33,14 @@
#include <singa/utils/dnnl_utils.h>
// combine scale and bias into weight format required by dnnl
-static inline singa::Tensor get_bn_weight_from(const singa::Tensor &s, const singa::Tensor &b) {
+static inline singa::Tensor get_bn_weight_from(const singa::Tensor &s,
+ const singa::Tensor &b) {
singa::Tensor w(singa::Shape{s.Size(), b.Size()});
CopyDataToFrom(&w, s, s.Size(), 0, 0);
CopyDataToFrom(&w, b, b.Size(), s.Size(), 0);
return w;
}
-#endif // USE_DNNL
+#endif // USE_DNNL
namespace singa {
@@ -55,7 +56,9 @@ class BatchNormHandle {
size_t height;
size_t width;
bool is_2d;
- //bool train = true;
+ // bool train = true;
+ bool use_dnnl =
+ false; // useful flag if both USE_CUDNN and USE_DNNL are enabled
#ifdef USE_DNNL
float epsilon;
@@ -64,13 +67,13 @@ class BatchNormHandle {
// as no default constructor, we need to declare it as pointer
dnnl::batch_normalization_forward::desc *bn_fwd_training_d;
dnnl::batch_normalization_forward::primitive_desc *bn_fwd_training_pd;
-#endif // USE_DNNL
+#endif // USE_DNNL
};
#ifdef USE_DNNL
-Tensor
-CpuBatchNormForwardInference(const BatchNormHandle &bnh, const Tensor &x, const Tensor &bnScale, const Tensor &bnBias,
- Tensor &running_mean, Tensor &running_var);
+Tensor CpuBatchNormForwardInference(const BatchNormHandle &bnh, const Tensor &x,
+ const Tensor &bnScale, const Tensor &bnBias,
+ Tensor &running_mean, Tensor &running_var);
const std::vector<Tensor> CpuBatchNormForwardTraining(
const BatchNormHandle &bnh, const Tensor &x, const Tensor &bnScale,
@@ -80,9 +83,7 @@ const std::vector<Tensor> CpuBatchNormBackwardx(
const BatchNormHandle &bnh, const Tensor &y, const Tensor &dy,
const Tensor &x, const Tensor &bnScale, const Tensor &bnBias,
const Tensor &mean, const Tensor &var);
-#endif // USE_DNNL
-
-
+#endif // USE_DNNL
#ifdef USE_CUDNN
@@ -115,4 +116,4 @@ const std::vector<Tensor> GpuBatchNormBackward(
} // namespace singa
-#endif // SINGA_MODEL_OPERATION_BATCHNORM_H_
+#endif // SINGA_MODEL_OPERATION_BATCHNORM_H_
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
index 21df4e8..d5bb470 100644
--- a/src/model/operation/convolution.cc
+++ b/src/model/operation/convolution.cc
@@ -63,7 +63,8 @@ ConvHandle::ConvHandle(const Tensor &input,
#ifdef USE_DNNL
if (input.device()->lang() == kCpp) {
- const int groups = 1; // only groups 1 is supported for now
+ use_dnnl = true;
+ const int groups = 1; // only groups 1 is supported for now
auto dtype_ = dnnl::memory::data_type::f32;
x_dims = dnnl::memory::dims{(int)input.shape(0), (int)in_channels,
@@ -95,15 +96,17 @@ ConvHandle::ConvHandle(const Tensor &input,
// singa api
db = new Tensor(Shape{num_filters}, input.device(), input.data_type());
}
-#endif // USE_DNNL
+#endif // USE_DNNL
}
ConvHandle::~ConvHandle() {
#ifdef USE_DNNL
- delete (conv_d);
- delete (conv_pd);
- delete (db);
-#endif // USE_DNNL
+ if (use_dnnl) {
+ delete (conv_d);
+ delete (conv_pd);
+ delete (db);
+ }
+#endif // USE_DNNL
}
Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b,
@@ -138,15 +141,15 @@ Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b,
convolution_forward(*ch.conv_pd)
.execute(ctx->dnnl_stream, {{DNNL_ARG_SRC, x_mem},
- {DNNL_ARG_WEIGHTS, w_mem},
- {DNNL_ARG_BIAS, b_mem},
- {DNNL_ARG_DST, y_mem}});
+ {DNNL_ARG_WEIGHTS, w_mem},
+ {DNNL_ARG_BIAS, b_mem},
+ {DNNL_ARG_DST, y_mem}});
ctx->dnnl_stream.wait();
},
{x.block(), W.block(), b.block()}, {output.block()});
return output;
-#else // cpp naive
+#else // cpp naive
Shape w_shape = W.shape();
Shape b_shape;
if (ch.bias_term) b_shape = b.shape();
@@ -178,20 +181,22 @@ Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b,
W.Reshape(w_shape);
if (ch.bias_term) b.Reshape(b_shape);
return output;
-#endif // USE_DNNL
+#endif // USE_DNNL
}
Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x,
const ConvHandle &ch) {
CHECK_EQ(dy.device()->lang(), kCpp);
+ CHECK_EQ(W.device()->lang(), kCpp);
+ CHECK_EQ(x.device()->lang(), kCpp);
CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
dy.shape(3) == ch.conv_width)
<< "input gradients shape should not change";
CHECK(W.shape(0) == ch.num_filters && W.shape(1) == ch.channels &&
- W.shape(2) == ch.kernel_h
- && W.shape(3) == ch.kernel_w) << "weights shape should not change";
+ W.shape(2) == ch.kernel_h && W.shape(3) == ch.kernel_w)
+ << "weights shape should not change";
#ifdef USE_DNNL
Tensor dx;
@@ -214,8 +219,8 @@ Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x,
convolution_backward_data(conv_bwd_data_pd)
.execute(ctx->dnnl_stream, {{DNNL_ARG_DIFF_DST, dy_mem},
- {DNNL_ARG_WEIGHTS, w_mem},
- {DNNL_ARG_DIFF_SRC, dx_mem}});
+ {DNNL_ARG_WEIGHTS, w_mem},
+ {DNNL_ARG_DIFF_SRC, dx_mem}});
ctx->dnnl_stream.wait();
},
@@ -223,7 +228,7 @@ Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x,
return dx;
-#else // NOT USE_DNNL
+#else // NOT USE_DNNL
Shape w_shape = W.shape();
W.Reshape(Shape{ch.num_filters, ch.col_height});
@@ -243,12 +248,14 @@ Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x,
}
W.Reshape(w_shape);
return dx;
-#endif // USE_DNNL
+#endif // USE_DNNL
}
Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W,
const ConvHandle &ch) {
CHECK_EQ(dy.device()->lang(), kCpp);
+ CHECK_EQ(x.device()->lang(), kCpp);
+ CHECK_EQ(W.device()->lang(), kCpp);
CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
dy.shape(3) == ch.conv_width)
@@ -280,16 +287,16 @@ Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W,
conv_dw_d, eng, *ch.conv_pd);
convolution_backward_weights(conv_dw_pd)
.execute(ctx->dnnl_stream, {{DNNL_ARG_DIFF_DST, dy_mem},
- {DNNL_ARG_SRC, x_mem},
- {DNNL_ARG_DIFF_WEIGHTS, dw_mem},
- {DNNL_ARG_DIFF_BIAS, db_mem}});
+ {DNNL_ARG_SRC, x_mem},
+ {DNNL_ARG_DIFF_WEIGHTS, dw_mem},
+ {DNNL_ARG_DIFF_BIAS, db_mem}});
ctx->dnnl_stream.wait();
},
{x.block(), dy.block(), W.block()}, {dW.block()});
return dW;
-#else // native cpp
+#else // native cpp
Tensor dW;
dW.ResetLike(W);
dW.SetValue(0.0f);
@@ -312,12 +319,13 @@ Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W,
}
dW.Reshape(w_shape);
return dW;
-#endif // USE_DNNL
+#endif // USE_DNNL
}
Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b,
const ConvHandle &ch) {
CHECK_EQ(dy.device()->lang(), kCpp);
+ CHECK_EQ(b.device()->lang(), kCpp);
CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
dy.shape(3) == ch.conv_width)
@@ -328,7 +336,7 @@ Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b,
#ifdef USE_DNNL
Tensor db = ch.db->Clone();
return db;
-#else // Native cpp
+#else // Native cpp
Tensor db;
db.ResetLike(b);
@@ -343,7 +351,7 @@ Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b,
SumRows(tmp3, &db);
return db;
-#endif // USE_DNNL
+#endif // USE_DNNL
};
#ifdef USE_CUDNN
diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h
index 7da4ff1..b6ff0ac 100644
--- a/src/model/operation/convolution.h
+++ b/src/model/operation/convolution.h
@@ -34,8 +34,7 @@
#ifdef USE_DNNL
#include <singa/utils/dnnl_utils.h>
-#endif // USE_DNNL
-
+#endif // USE_DNNL
namespace singa {
@@ -72,6 +71,9 @@ class ConvHandle {
size_t col_width;
size_t imagesize;
+ bool use_dnnl =
+ false; // useful flag if both USE_CUDNN and USE_DNNL are enabled
+
#ifdef USE_DNNL
dnnl::memory::data_type dtype;
dnnl::memory::dims b_dims;
@@ -80,17 +82,16 @@ class ConvHandle {
dnnl::memory::dims x_dims;
dnnl::memory::dims o_dims;
dnnl::memory::dims w_dims;
-
+
dnnl::memory::desc x_md;
dnnl::memory::desc w_md;
dnnl::memory::desc b_md;
dnnl::memory::desc y_md;
dnnl::convolution_forward::desc *conv_d;
dnnl::convolution_forward::primitive_desc *conv_pd;
-
- Tensor *db;
-#endif // USE_DNNL
+ Tensor *db;
+#endif // USE_DNNL
};
Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b,
diff --git a/src/model/operation/pooling.cc b/src/model/operation/pooling.cc
index 75e7f44..1d6f3a2 100644
--- a/src/model/operation/pooling.cc
+++ b/src/model/operation/pooling.cc
@@ -51,92 +51,98 @@ PoolingHandle::PoolingHandle(const Tensor &input,
is_max_pooling = is_max;
#ifdef USE_DNNL
- auto x_dims = dnnl::memory::dims(input.shape().begin(), input.shape().end());
- auto y_dims = dnnl::memory::dims({batchsize, channels, pooled_height, pooled_width});
- auto s_dims = dnnl::memory::dims(stride.begin(), stride.end());
- auto k_dims = dnnl::memory::dims(kernel_size.begin(), kernel_size.end());
-
- auto p_dims = dnnl::memory::dims(padding.begin(), padding.end());
-
- auto dtype_ = dnnl::memory::data_type::f32;
- auto format_tag_ = get_dnnl_format_tag(input);
- x_md = dnnl::memory::desc({x_dims}, dtype_, format_tag_);
- y_md = dnnl::memory::desc({y_dims}, dtype_, format_tag_);
-
- // allow max or avg (follow cudnn implementation convention)
- auto pooling_algo = dnnl::algorithm::pooling_avg_exclude_padding;
- if (is_max_pooling)
- pooling_algo = dnnl::algorithm::pooling_max;
-
- auto pool_fwd_d = dnnl::pooling_forward::desc( dnnl::prop_kind::forward_training, pooling_algo, x_md, y_md, s_dims, k_dims, p_dims, p_dims);
- auto pool_bwd_d = dnnl::pooling_backward::desc(pooling_algo, x_md,y_md,s_dims,k_dims,p_dims,p_dims);
-
- auto eng = input.device()->context(0)->dnnl_engine;
- pool_fwd_pd = dnnl::pooling_forward::primitive_desc(pool_fwd_d, eng);
- pool_bwd_pd = dnnl::pooling_backward::primitive_desc(pool_bwd_d, eng, pool_fwd_pd);
-
- auto ws_md = pool_fwd_pd.workspace_desc();
- ws_mem = dnnl::memory(ws_md, eng);
-#endif // USE_DNNL
+ if (input.device()->lang() == kCpp) {
+ auto x_dims =
+ dnnl::memory::dims(input.shape().begin(), input.shape().end());
+ auto y_dims =
+ dnnl::memory::dims({batchsize, channels, pooled_height, pooled_width});
+ auto s_dims = dnnl::memory::dims(stride.begin(), stride.end());
+ auto k_dims = dnnl::memory::dims(kernel_size.begin(), kernel_size.end());
+
+ auto p_dims = dnnl::memory::dims(padding.begin(), padding.end());
+
+ auto dtype_ = dnnl::memory::data_type::f32;
+ auto format_tag_ = get_dnnl_format_tag(input);
+ x_md = dnnl::memory::desc({x_dims}, dtype_, format_tag_);
+ y_md = dnnl::memory::desc({y_dims}, dtype_, format_tag_);
+
+ // allow max or avg (follow cudnn implementation convention)
+ auto pooling_algo = dnnl::algorithm::pooling_avg_exclude_padding;
+ if (is_max_pooling) pooling_algo = dnnl::algorithm::pooling_max;
+
+ auto pool_fwd_d = dnnl::pooling_forward::desc(
+ dnnl::prop_kind::forward_training, pooling_algo, x_md, y_md, s_dims,
+ k_dims, p_dims, p_dims);
+ auto pool_bwd_d = dnnl::pooling_backward::desc(
+ pooling_algo, x_md, y_md, s_dims, k_dims, p_dims, p_dims);
+
+ auto eng = input.device()->context(0)->dnnl_engine;
+ pool_fwd_pd = dnnl::pooling_forward::primitive_desc(pool_fwd_d, eng);
+ pool_bwd_pd =
+ dnnl::pooling_backward::primitive_desc(pool_bwd_d, eng, pool_fwd_pd);
+
+ auto ws_md = pool_fwd_pd.workspace_desc();
+ ws_mem = dnnl::memory(ws_md, eng);
+ }
+#endif // USE_DNNL
}
-PoolingHandle::~PoolingHandle() {
-}
+PoolingHandle::~PoolingHandle() {}
#ifdef USE_DNNL
Tensor CpuPoolingForward(const PoolingHandle &ph, const Tensor &x) {
+ CHECK_EQ(x.device()->lang(), kCpp);
Tensor y({(unsigned long)ph.batchsize, (unsigned long)ph.channels,
(unsigned long)ph.pooled_height, (unsigned long)ph.pooled_width},
x.device(), x.data_type());
- Tensor y({(unsigned long) ph.batchsize, (unsigned long) ph.channels, (unsigned long) ph.pooled_height,
- (unsigned long) ph.pooled_width
- }, x.device(), x.data_type());
+ y.device()->Exec(
+ [&y, &x, &ph](Context *ctx) {
+ auto eng = ctx->dnnl_engine;
+ using namespace dnnl;
- y.device()->Exec([&y, &x, &ph](Context * ctx) {
- auto eng = ctx->dnnl_engine;
- using namespace dnnl;
+ memory x_mem(ph.x_md, eng, x.block()->mutable_data());
+ memory y_mem(ph.y_md, eng, y.block()->mutable_data());
- memory x_mem(ph.x_md,eng,x.block()->mutable_data());
- memory y_mem(ph.y_md,eng,y.block()->mutable_data());
-
- pooling_forward(ph.pool_fwd_pd).execute(ctx->dnnl_stream, {
- {DNNL_ARG_SRC, x_mem},
- {DNNL_ARG_DST, y_mem},
- {DNNL_ARG_WORKSPACE, ph.ws_mem}
- });
- ctx->dnnl_stream.wait();
- }, {x.block()}, {y.block()});
+ pooling_forward(ph.pool_fwd_pd)
+ .execute(ctx->dnnl_stream, {{DNNL_ARG_SRC, x_mem},
+ {DNNL_ARG_DST, y_mem},
+ {DNNL_ARG_WORKSPACE, ph.ws_mem}});
+ ctx->dnnl_stream.wait();
+ },
+ {x.block()}, {y.block()});
return y;
}
-
Tensor CpuPoolingBackward(const PoolingHandle &ph, const Tensor &grad,
- const Tensor& x, const Tensor& y){
+ const Tensor &x, const Tensor &y) {
+ CHECK_EQ(x.device()->lang(), kCpp);
+ CHECK_EQ(grad.device()->lang(), kCpp);
+ CHECK_EQ(y.device()->lang(), kCpp);
Tensor in_grad;
in_grad.ResetLike(x);
- in_grad.device()->Exec([&in_grad, &grad, &ph](Context * ctx) {
- auto eng = ctx->dnnl_engine;
- using namespace dnnl;
+ in_grad.device()->Exec(
+ [&in_grad, &grad, &ph](Context *ctx) {
+ auto eng = ctx->dnnl_engine;
+ using namespace dnnl;
- memory dx_mem(ph.x_md,eng,in_grad.block()->mutable_data());
- memory dy_mem(ph.y_md,eng,grad.block()->mutable_data());
+ memory dx_mem(ph.x_md, eng, in_grad.block()->mutable_data());
+ memory dy_mem(ph.y_md, eng, grad.block()->mutable_data());
- pooling_backward(ph.pool_bwd_pd).execute(ctx->dnnl_stream,{
- {DNNL_ARG_DIFF_DST, dy_mem},
- {DNNL_ARG_DIFF_SRC, dx_mem},
- {DNNL_ARG_WORKSPACE, ph.ws_mem}
- });
- ctx->dnnl_stream.wait();
- }, {x.block(), y.block(), grad.block()}, {in_grad.block()});
+ pooling_backward(ph.pool_bwd_pd)
+ .execute(ctx->dnnl_stream, {{DNNL_ARG_DIFF_DST, dy_mem},
+ {DNNL_ARG_DIFF_SRC, dx_mem},
+ {DNNL_ARG_WORKSPACE, ph.ws_mem}});
+ ctx->dnnl_stream.wait();
+ },
+ {x.block(), y.block(), grad.block()}, {in_grad.block()});
return in_grad;
}
-#endif // USE_DNNL
-
+#endif // USE_DNNL
#ifdef USE_CUDNN
diff --git a/test/python/test_api.py b/test/python/test_api.py
index 4589f58..197f884 100644
--- a/test/python/test_api.py
+++ b/test/python/test_api.py
@@ -20,8 +20,8 @@
from __future__ import division
import unittest
-import numpy as np
import math
+import numpy as np
from singa import singa_wrap as singa_api
from singa import tensor
@@ -63,24 +63,6 @@ def _np_bn_testing(x, scale, bias, rm, rv, momentum=0.1, e=1e-5):
return scale * (x - rm) / np.sqrt(rv + e) + bias
-def _np_to_pyTensor(_np):
- return tensor.Tensor(device=gpu_dev, data=_np)
-
-
-def _cTensor_to_pyTensor(cTensor):
- new_t = tensor.Tensor()
- new_t.data = cTensor
- new_t.shape = tuple(new_t.data.shape())
- new_t.device = new_t.data.device()
- new_t.dtype = new_t.data.data_type()
- return new_t
-
-
-# helper
-def _np_to_pyTensor(_np):
- return tensor.Tensor(device=cpu_dev, data=_np)
-
-
def _cTensor_to_pyTensor(cTensor):
new_t = tensor.Tensor()
new_t.data = cTensor
@@ -90,10 +72,10 @@ def _cTensor_to_pyTensor(cTensor):
return new_t
-
class TestAPI(unittest.TestCase):
- def test_batchnorm_training(self):
+ def test_batchnorm_training_gpu(self):
+ dev = gpu_dev
def _run_training(x_0, s_0, b_0, rm_0, rv_0, m_0=0.1):
# np api
@@ -105,15 +87,16 @@ class TestAPI(unittest.TestCase):
momentum=m_0)
# singa api
- rm_t = _np_to_pyTensor(rm_0)
- rv_t = _np_to_pyTensor(rv_0)
- hndl = singa_api.CudnnBatchNormHandle(m_0,
- _np_to_pyTensor(x_0).data)
+ rm_t = tensor.Tensor(device=dev, data=rm_0)
+ rv_t = tensor.Tensor(device=dev, data=rv_0)
+ hndl = singa_api.CudnnBatchNormHandle(
+ m_0,
+ tensor.Tensor(device=dev, data=x_0).data)
(y_2_c, bm_2_c, bv_2_c) = singa_api.GpuBatchNormForwardTraining(
hndl,
- _np_to_pyTensor(x_0).data,
- _np_to_pyTensor(s_0).data,
- _np_to_pyTensor(b_0).data, rm_t.data, rv_t.data)
+ tensor.Tensor(device=dev, data=x_0).data,
+ tensor.Tensor(device=dev, data=s_0).data,
+ tensor.Tensor(device=dev, data=b_0).data, rm_t.data, rv_t.data)
np.testing.assert_array_almost_equal(
y_1, tensor.to_numpy(_cTensor_to_pyTensor(y_2_c)))
@@ -147,22 +130,24 @@ class TestAPI(unittest.TestCase):
rv_0 = np.random.random((1, c, 1, 1)).astype(np.float32)
_run_training(x_0, s_0, b_0, rm_0, rv_0, m_0=0.2)
- def test_batchnorm_testing(self):
+ def test_batchnorm_testing_gpu(self):
+ dev = gpu_dev
def _run_testing(x_0, s_0, b_0, rm_0, rv_0, m_0=0.1):
# np api
y_1 = _np_bn_testing(x_0, s_0, b_0, rm_0, rv_0, momentum=m_0)
# singa api
- hndl = singa_api.CudnnBatchNormHandle(m_0,
- _np_to_pyTensor(x_0).data)
+ hndl = singa_api.CudnnBatchNormHandle(
+ m_0,
+ tensor.Tensor(device=dev, data=x_0).data)
y_2_c = singa_api.GpuBatchNormForwardInference(
hndl,
- _np_to_pyTensor(x_0).data,
- _np_to_pyTensor(s_0).data,
- _np_to_pyTensor(b_0).data,
- _np_to_pyTensor(rm_0).data,
- _np_to_pyTensor(rv_0).data)
+ tensor.Tensor(device=dev, data=x_0).data,
+ tensor.Tensor(device=dev, data=s_0).data,
+ tensor.Tensor(device=dev, data=b_0).data,
+ tensor.Tensor(device=dev, data=rm_0).data,
+ tensor.Tensor(device=dev, data=rv_0).data)
#print(y_1)
#print(tensor.to_numpy(_cTensor_to_pyTensor(y_2_c)))
@@ -312,11 +297,13 @@ class TestAPI(unittest.TestCase):
s3 = [3, 5]
_test(s1, s2, axis1, axis2, s3, s_op, n_op, dev)
- def test_batchnorm_training(self):
+ def test_batchnorm_training_dnnl(self):
+ dev = cpu_dev
def _np_bn_training(x, scale, bias, rm, rv, momentum=0.1, e=1e-5):
channel = x.shape[1]
- np.testing.assert_array_almost_equal(scale.shape, (1, channel, 1, 1))
+ np.testing.assert_array_almost_equal(scale.shape,
+ (1, channel, 1, 1))
np.testing.assert_array_almost_equal(bias.shape, (1, channel, 1, 1))
np.testing.assert_array_almost_equal(rm.shape, (1, channel, 1, 1))
np.testing.assert_array_almost_equal(rv.shape, (1, channel, 1, 1))
@@ -350,15 +337,17 @@ class TestAPI(unittest.TestCase):
momentum=m_0)
# singa api
- hndl = singa_api.BatchNormHandle(m_0, _np_to_pyTensor(x_0).data)
+ hndl = singa_api.BatchNormHandle(
+ m_0,
+ tensor.Tensor(device=dev, data=x_0).data)
(y_2_c, rm_2_c, rv_2_c, bm_2_c,
bv_2_c) = singa_api.CpuBatchNormForwardTraining(
hndl,
- _np_to_pyTensor(x_0).data,
- _np_to_pyTensor(s_0).data,
- _np_to_pyTensor(b_0).data,
- _np_to_pyTensor(rm_0).data,
- _np_to_pyTensor(rv_0).data)
+ tensor.Tensor(device=dev, data=x_0).data,
+ tensor.Tensor(device=dev, data=s_0).data,
+ tensor.Tensor(device=dev, data=b_0).data,
+ tensor.Tensor(device=dev, data=rm_0).data,
+ tensor.Tensor(device=dev, data=rv_0).data)
np.testing.assert_array_almost_equal(
y_1, tensor.to_numpy(_cTensor_to_pyTensor(y_2_c)))
@@ -374,9 +363,8 @@ class TestAPI(unittest.TestCase):
rv_1, tensor.to_numpy(_cTensor_to_pyTensor(rv_2_c)), decimal=4)
return
- x_0 = np.array(
- [1, 1, 1, 1, 2, 2, 2, 2, 10, 10, 10, 10, 20, 20, 20, 20],
- dtype=np.float32).reshape((2, 2, 2, 2))
+ x_0 = np.array([1, 1, 1, 1, 2, 2, 2, 2, 10, 10, 10, 10, 20, 20, 20, 20],
+ dtype=np.float32).reshape((2, 2, 2, 2))
s_0 = np.array([1, 10], dtype=np.float32).reshape((1, 2, 1, 1))
b_0 = np.array([1, 10], dtype=np.float32).reshape((1, 2, 1, 1))
rm_0 = np.array([1, 10], dtype=np.float32).reshape((1, 2, 1, 1))
@@ -393,10 +381,13 @@ class TestAPI(unittest.TestCase):
rv_0 = np.random.random((1, c, 1, 1)).astype(np.float32)
_run_training(x_0, s_0, b_0, rm_0, rv_0, m_0=0.2)
- def test_batchnorm_testing(self):
+ def test_batchnorm_testing_dnnl(self):
+ dev = cpu_dev
+
def _np_bn_testing(x, scale, bias, rm, rv, momentum=0.1, e=1e-5):
channel = x.shape[1]
- np.testing.assert_array_almost_equal(scale.shape, (1, channel, 1, 1))
+ np.testing.assert_array_almost_equal(scale.shape,
+ (1, channel, 1, 1))
np.testing.assert_array_almost_equal(bias.shape, (1, channel, 1, 1))
np.testing.assert_array_almost_equal(rm.shape, (1, channel, 1, 1))
np.testing.assert_array_almost_equal(rv.shape, (1, channel, 1, 1))
@@ -407,14 +398,16 @@ class TestAPI(unittest.TestCase):
y_1 = _np_bn_testing(x_0, s_0, b_0, rm_0, rv_0, momentum=m_0)
# singa api
- hndl = singa_api.BatchNormHandle(m_0, _np_to_pyTensor(x_0).data)
+ hndl = singa_api.BatchNormHandle(
+ m_0,
+ tensor.Tensor(device=dev, data=x_0).data)
y_2_c = singa_api.CpuBatchNormForwardInference(
hndl,
- _np_to_pyTensor(x_0).data,
- _np_to_pyTensor(s_0).data,
- _np_to_pyTensor(b_0).data,
- _np_to_pyTensor(rm_0).data,
- _np_to_pyTensor(rv_0).data)
+ tensor.Tensor(device=dev, data=x_0).data,
+ tensor.Tensor(device=dev, data=s_0).data,
+ tensor.Tensor(device=dev, data=b_0).data,
+ tensor.Tensor(device=dev, data=rm_0).data,
+ tensor.Tensor(device=dev, data=rv_0).data)
#print(y_1)
#print(tensor.to_numpy(_cTensor_to_pyTensor(y_2_c)))
@@ -422,9 +415,8 @@ class TestAPI(unittest.TestCase):
y_1, tensor.to_numpy(_cTensor_to_pyTensor(y_2_c)), decimal=5)
return
- x_0 = np.array(
- [1, 1, 1, 1, 2, 2, 2, 2, 10, 10, 10, 10, 20, 20, 20, 20],
- dtype=np.float32).reshape((2, 2, 2, 2))
+ x_0 = np.array([1, 1, 1, 1, 2, 2, 2, 2, 10, 10, 10, 10, 20, 20, 20, 20],
+ dtype=np.float32).reshape((2, 2, 2, 2))
s_0 = np.array([1, 10], dtype=np.float32).reshape((1, 2, 1, 1))
b_0 = np.array([1, 10], dtype=np.float32).reshape((1, 2, 1, 1))
rm_0 = np.array([1, 10], dtype=np.float32).reshape((1, 2, 1, 1))
@@ -438,46 +430,53 @@ class TestAPI(unittest.TestCase):
rv_0 = np.random.random((1, c, 1, 1)).astype(np.float32)
_run_testing(x_0, s_0, b_0, rm_0, rv_0, m_0=1.0)
- def test_batchnorm_backward(self):
- N=1; C=3; H=2; W=2
+ def test_batchnorm_backward_dnnl(self):
+ dev = cpu_dev
+ N = 1
+ C = 3
+ H = 2
+ W = 2
- data_shape = [N,C,H,W]
- param_shape = [1,C,1,1]
- data = [1,2,3,4,5,6,7,8,9,10,11,12]
+ data_shape = [N, C, H, W]
+ param_shape = [1, C, 1, 1]
+ data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
- x_0 = np.array( data, dtype=np.float32).reshape(data_shape)
- y_0 = np.array( data, dtype=np.float32).reshape(data_shape)
- dy_0 = np.array( data, dtype=np.float32).reshape(data_shape)
- scale_0 = np.array([1]*C, dtype=np.float32).reshape(param_shape)
- bias_0 = np.array([0]*C, dtype=np.float32).reshape(param_shape)
+ x_0 = np.array(data, dtype=np.float32).reshape(data_shape)
+ y_0 = np.array(data, dtype=np.float32).reshape(data_shape)
+ dy_0 = np.array(data, dtype=np.float32).reshape(data_shape)
+ scale_0 = np.array([1] * C, dtype=np.float32).reshape(param_shape)
+ bias_0 = np.array([0] * C, dtype=np.float32).reshape(param_shape)
mean_0 = x_0.mean(axis=(0, 2, 3), keepdims=True)
var_0 = x_0.var(axis=(0, 2, 3), keepdims=True)
- hndl = singa_api.BatchNormHandle(0.1, _np_to_pyTensor(x_0).data)
+ hndl = singa_api.BatchNormHandle(
+ 0.1,
+ tensor.Tensor(device=dev, data=x_0).data)
(dx_2_c, _, _) = singa_api.CpuBatchNormBackwardx(
hndl,
- _np_to_pyTensor(y_0).data,
- _np_to_pyTensor(dy_0).data,
- _np_to_pyTensor(x_0).data,
- _np_to_pyTensor(scale_0).data,
- _np_to_pyTensor(bias_0).data,
- _np_to_pyTensor(mean_0).data,
- _np_to_pyTensor(var_0).data,
+ tensor.Tensor(device=dev, data=y_0).data,
+ tensor.Tensor(device=dev, data=dy_0).data,
+ tensor.Tensor(device=dev, data=x_0).data,
+ tensor.Tensor(device=dev, data=scale_0).data,
+ tensor.Tensor(device=dev, data=bias_0).data,
+ tensor.Tensor(device=dev, data=mean_0).data,
+ tensor.Tensor(device=dev, data=var_0).data,
)
dx_truth = np.array([[[[-1.0769e-05, -3.5985e-06],
- [ 3.5985e-06, 1.0769e-05]],
- [[-1.0769e-05, -3.5985e-06],
- [ 3.5985e-06, 1.0769e-05]],
- [[-1.0769e-05, -3.5985e-06],
- [ 3.5985e-06, 1.0769e-05]]]])
- np.testing.assert_array_almost_equal(tensor.to_numpy(_cTensor_to_pyTensor(dx_2_c)), dx_truth)
+ [3.5985e-06, 1.0769e-05]],
+ [[-1.0769e-05, -3.5985e-06],
+ [3.5985e-06, 1.0769e-05]],
+ [[-1.0769e-05, -3.5985e-06],
+ [3.5985e-06, 1.0769e-05]]]])
+ np.testing.assert_array_almost_equal(
+ tensor.to_numpy(_cTensor_to_pyTensor(dx_2_c)), dx_truth)
return
-
def test_softmax_api_dnnl_backend(self):
+
def _run_test(org_shape, axis, aft_shape):
x_0 = np.random.random(org_shape).astype(np.float32)
x_0 = x_0 + 1000
@@ -490,9 +489,8 @@ class TestAPI(unittest.TestCase):
x_0 = x_0.reshape(aft_shape)
x_0 = x_0 - np.max(x_0)
y1 = np.divide(np.exp(x_0),
- np.sum(np.exp(x_0),
- axis=1).reshape(x_0.shape[0],
- 1)) # 2d softmax
+ np.sum(np.exp(x_0), axis=1).reshape(x_0.shape[0],
+ 1)) # 2d softmax
y1 = y1.reshape(org_shape)
np.testing.assert_array_almost_equal(tensor.to_numpy(y0), y1)
@@ -519,50 +517,65 @@ class TestAPI(unittest.TestCase):
_run_test([2, 2, 2, 2], -4, [1, 16])
def test_dnnl_pooling_max(self):
- N=1; C=3; H=2; W=2
+ dev = cpu_dev
+ N = 1
+ C = 3
+ H = 2
+ W = 2
- data_shape = [N,C,H,W]
- param_shape = [1,C,1,1]
- data = [1,2,3,4,5,6,7,8,9,10,11,12]
+ data_shape = [N, C, H, W]
+ param_shape = [1, C, 1, 1]
+ data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
- x0 = np.array( data, dtype=np.float32).reshape(data_shape)
- x0_ct = _np_to_pyTensor(x0).data
+ x0 = np.array(data, dtype=np.float32).reshape(data_shape)
+ x0_ct = tensor.Tensor(device=dev, data=x0).data
- dy0 = np.array([1,2,3], dtype=np.float32).reshape([1,3,1,1])
- dy0_ct = _np_to_pyTensor(dy0).data
+ dy0 = np.array([1, 2, 3], dtype=np.float32).reshape([1, 3, 1, 1])
+ dy0_ct = tensor.Tensor(device=dev, data=dy0).data
- hndl = singa_api.PoolingHandle(x0_ct, [2, 2], [1,1], [0,0], True)
+ hndl = singa_api.PoolingHandle(x0_ct, [2, 2], [1, 1], [0, 0], True)
- y0_ct=singa_api.CpuPoolingForward(hndl, x0_ct);
- y1 = np.array([[[[ 4.]], [[ 8.]], [[12.]]]])
- np.testing.assert_array_almost_equal(tensor.to_numpy(_cTensor_to_pyTensor(y0_ct)), y1)
+ y0_ct = singa_api.CpuPoolingForward(hndl, x0_ct)
+ y1 = np.array([[[[4.]], [[8.]], [[12.]]]])
+ np.testing.assert_array_almost_equal(
+ tensor.to_numpy(_cTensor_to_pyTensor(y0_ct)), y1)
- dx0_ct = singa_api.CpuPoolingBackward(hndl, dy0_ct, x0_ct, y0_ct);
- dx1 = np.array( [[[[0., 0.], [0., 1.]], [[0., 0.], [0., 2.]], [[0., 0.], [0., 3.]]]] )
- np.testing.assert_array_almost_equal(tensor.to_numpy(_cTensor_to_pyTensor(dx0_ct)), dx1)
+ dx0_ct = singa_api.CpuPoolingBackward(hndl, dy0_ct, x0_ct, y0_ct)
+ dx1 = np.array([[[[0., 0.], [0., 1.]], [[0., 0.], [0., 2.]],
+ [[0., 0.], [0., 3.]]]])
+ np.testing.assert_array_almost_equal(
+ tensor.to_numpy(_cTensor_to_pyTensor(dx0_ct)), dx1)
def test_dnnl_pooling_avg(self):
- N=1; C=3; H=2; W=2
-
- data_shape = [N,C,H,W]
- param_shape = [1,C,1,1]
- data = [1,2,3,4,5,6,7,8,9,10,11,12]
-
- x0 = np.array( data, dtype=np.float32).reshape(data_shape)
- x0_ct = _np_to_pyTensor(x0).data
-
- dy0 = np.array([1,2,3], dtype=np.float32).reshape([1,3,1,1])
- dy0_ct = _np_to_pyTensor(dy0).data
-
- hndl = singa_api.PoolingHandle(x0_ct, [2, 2], [1,1], [0,0], False)
-
- y0_ct=singa_api.CpuPoolingForward(hndl, x0_ct);
-
- y1 = np.array([[[[ 2.5000]], [[ 6.5000]], [[10.5000]]]])
- np.testing.assert_array_almost_equal(tensor.to_numpy(_cTensor_to_pyTensor(y0_ct)), y1)
- dx0_ct = singa_api.CpuPoolingBackward(hndl, dy0_ct, x0_ct, y0_ct);
- dx1=np.array([[[[0.2500, 0.2500], [0.2500, 0.2500]], [[0.5000, 0.5000], [0.5000, 0.5000]], [[0.7500, 0.7500], [0.7500, 0.7500]]]])
- np.testing.assert_array_almost_equal(tensor.to_numpy(_cTensor_to_pyTensor(dx0_ct)), dx1)
+ dev = cpu_dev
+ N = 1
+ C = 3
+ H = 2
+ W = 2
+
+ data_shape = [N, C, H, W]
+ param_shape = [1, C, 1, 1]
+ data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+
+ x0 = np.array(data, dtype=np.float32).reshape(data_shape)
+ x0_ct = tensor.Tensor(device=dev, data=x0).data
+
+ dy0 = np.array([1, 2, 3], dtype=np.float32).reshape([1, 3, 1, 1])
+ dy0_ct = tensor.Tensor(device=dev, data=dy0).data
+
+ hndl = singa_api.PoolingHandle(x0_ct, [2, 2], [1, 1], [0, 0], False)
+
+ y0_ct = singa_api.CpuPoolingForward(hndl, x0_ct)
+
+ y1 = np.array([[[[2.5000]], [[6.5000]], [[10.5000]]]])
+ np.testing.assert_array_almost_equal(
+ tensor.to_numpy(_cTensor_to_pyTensor(y0_ct)), y1)
+ dx0_ct = singa_api.CpuPoolingBackward(hndl, dy0_ct, x0_ct, y0_ct)
+ dx1 = np.array([[[[0.2500, 0.2500], [0.2500, 0.2500]],
+ [[0.5000, 0.5000], [0.5000, 0.5000]],
+ [[0.7500, 0.7500], [0.7500, 0.7500]]]])
+ np.testing.assert_array_almost_equal(
+ tensor.to_numpy(_cTensor_to_pyTensor(dx0_ct)), dx1)
if __name__ == '__main__':