You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2016/06/08 03:27:37 UTC
incubator-singa git commit: SINGA-191 Add "autotune" for CudnnConvolution Layer

Repository: incubator-singa
Updated Branches:
  refs/heads/dev 04e23d1a6 -> 01aaf4900


SINGA-191 Add "autotune" for CudnnConvolution Layer

If users choose "autotune", the layer will choose algorithm preference
and algorithm automatically. The following CUDNN functions are used:
  cudnnFindConvolutionForwardAlgorithm,
  cudnnFindConvolutionBackwardFilterAlgorithm,
  cudnnFindConvolutionBackwardDataAlgorithm


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/01aaf490
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/01aaf490
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/01aaf490

Branch: refs/heads/dev
Commit: 01aaf49009d2b0c6c62a484b21a1d06bce575e81
Parents: 04e23d1
Author: XiangruiCAI <ca...@gmail.com>
Authored: Sat Jun 4 16:18:47 2016 +0800
Committer: XiangruiCAI <ca...@gmail.com>
Committed: Wed Jun 8 10:35:51 2016 +0800

----------------------------------------------------------------------
 src/model/layer/cudnn_convolution.cc | 180 +++++++++++++++++------------
 test/singa/test_cudnn_convolution.cc | 181 ++++++++++++++++++++++++++++++
 2 files changed, 287 insertions(+), 74 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01aaf490/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index 97aa256..b80c3bd 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -41,9 +41,9 @@ void CudnnConvolution::Setup(const LayerConf &conf) {
   workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
   prefer_ = ToLowerCase(conv_conf.prefer());
   CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-        prefer_ == "no_workspace")
-      << "CudnnConvolution only supports three algorithm preferences: fastest, "
-         "limited_workspace and no_workspace";
+        prefer_ == "no_workspace" || prefer_ == "autotune")
+      << "CudnnConvolution only supports four algorithm preferences: fastest, "
+         "limited_workspace, no_workspace and autotune";
 }
 
 void CudnnConvolution::ToDevice(Device *device) {
@@ -52,7 +52,7 @@ void CudnnConvolution::ToDevice(Device *device) {
   workspace_.ToDevice(device);
 }
 
-void CudnnConvolution::InitCudnn(const Tensor& input) {
+void CudnnConvolution::InitCudnn(const Tensor &input) {
   CHECK(!has_init_cudnn_);
   DataType dtype = input.data_type();
   Device *dev = input.device();
@@ -89,34 +89,54 @@ void CudnnConvolution::InitCudnn(const Tensor& input) {
   LOG(FATAL) << "Not supported CUDNN version = " << CUDNN_VERSION_MAJOR;
 #endif
 
-  cudnnConvolutionFwdPreference_t fwd_pref;
-  cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-  cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-  if (prefer_ == "fastest") {
-    fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-  } else if (prefer_ == "limited_workspace") {
-    fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-  } else if (prefer_ == "no_workspace") {
-    fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+  if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+      prefer_ == "no_workspace") {
+    cudnnConvolutionFwdPreference_t fwd_pref;
+    cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+    cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+    if (prefer_ == "fastest") {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+    } else if (prefer_ == "limited_workspace") {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+    } else {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+    }
+    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+        ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+        workspace_byte_limit_, &fp_alg_));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+        ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+        bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+        ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+        bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+  } else if (prefer_ == "autotune") {
+    const int topk = 1;
+    int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+    cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+    cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+    cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+    CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+        ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+        &num_fp_alg, fp_alg_perf));
+    fp_alg_ = fp_alg_perf[0].algo;
+    CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+        ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+        &num_bp_filt_alg, bp_filt_perf));
+    bp_filter_alg_ = bp_filt_perf[0].algo;
+    CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+        ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+        &num_bp_data_alg, bp_data_perf));
+    bp_data_alg_ = bp_data_perf[0].algo;
   } else {
     LOG(FATAL) << "Preferred algorithm is not available!";
   }
-  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-      ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-      workspace_byte_limit_, &fp_alg_));
-
-  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-      ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-      bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
-  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-      ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-      bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
 
   size_t fp_byte, bp_data_byte, bp_filter_byte;
   CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
@@ -147,25 +167,30 @@ const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
 
   Shape shape{batchsize, num_filters_, conv_height_, conv_width_};
   Tensor output(shape, dev, dtype);
-  output.device()->Exec([input, output, this](Context *ctx) {
-    Blob *inblob = input.blob(), *outblob = output.blob(),
-         *wblob = this->weight_.blob();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
-                            inblob->data(), this->filter_desc_, wblob->data(),
-                            this->conv_desc_, this->fp_alg_,
-                            this->workspace_.blob()->mutable_data(),
-                            this->workspace_count_ * sizeof(float), &beta,
-                            this->y_desc_, outblob->mutable_data());
-  }, {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
+  output.device()->Exec(
+      [input, output, this](Context *ctx) {
+        Blob *inblob = input.blob(), *outblob = output.blob(),
+             *wblob = this->weight_.blob();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
+                                inblob->data(), this->filter_desc_,
+                                wblob->data(), this->conv_desc_, this->fp_alg_,
+                                this->workspace_.blob()->mutable_data(),
+                                this->workspace_count_ * sizeof(float), &beta,
+                                this->y_desc_, outblob->mutable_data());
+      },
+      {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
 
   if (bias_term_) {
-    output.device()->Exec([output, this](Context *ctx) {
-      float beta = 1.f, alpha = 1.0f;
-      Blob *outblob = output.blob(), *bblob = this->bias_.blob();
-      cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_, bblob->data(),
-                     &beta, this->y_desc_, outblob->mutable_data());
-    }, {output.blob(), bias_.blob()}, {output.blob()});
+    output.device()->Exec(
+        [output, this](Context *ctx) {
+          float beta = 1.f, alpha = 1.0f;
+          Blob *outblob = output.blob(), *bblob = this->bias_.blob();
+          cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
+                         bblob->data(), &beta, this->y_desc_,
+                         outblob->mutable_data());
+        },
+        {output.blob(), bias_.blob()}, {output.blob()});
   }
   return output;
 }
@@ -187,38 +212,45 @@ const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
 
   // LOG(ERROR) << "backward bias";
   if (bias_term_) {
-    dx.device()->Exec([grad, db, this](Context *ctx) {
-      Blob *dyblob = grad.blob(), *dbblob = db.blob();
-      float alpha = 1.f, beta = 0.f;
-      cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
-                                   dyblob->data(), &beta, this->bias_desc_,
-                                   dbblob->mutable_data());
-    }, {grad.blob()}, {db.blob()});
+    dx.device()->Exec(
+        [grad, db, this](Context *ctx) {
+          Blob *dyblob = grad.blob(), *dbblob = db.blob();
+          float alpha = 1.f, beta = 0.f;
+          cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
+                                       dyblob->data(), &beta, this->bias_desc_,
+                                       dbblob->mutable_data());
+        },
+        {grad.blob()}, {db.blob()});
   }
   // LOG(ERROR) << "backward w";
-  dx.device()->Exec([grad, dw, src_data, this](Context *ctx) {
-    Blob *inblob = src_data.blob(), *dyblob = grad.blob(), *dwblob = dw.blob();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardFilter(
-        ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(), this->y_desc_,
-        dyblob->data(), this->conv_desc_, this->bp_filter_alg_,
-        this->workspace_.blob()->mutable_data(),
-        this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
-        dwblob->mutable_data());
-  }, {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
+  dx.device()->Exec(
+      [grad, dw, src_data, this](Context *ctx) {
+        Blob *inblob = src_data.blob(), *dyblob = grad.blob(),
+             *dwblob = dw.blob();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardFilter(
+            ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(),
+            this->y_desc_, dyblob->data(), this->conv_desc_,
+            this->bp_filter_alg_, this->workspace_.blob()->mutable_data(),
+            this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
+            dwblob->mutable_data());
+      },
+      {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
 
   // LOG(ERROR) << "backward src";
-  dx.device()->Exec([dx, grad, this](Context *ctx) {
-    Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
-         *dxblob = dx.blob();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, this->filter_desc_,
-                                 wblob->data(), this->y_desc_, dyblob->data(),
-                                 this->conv_desc_, this->bp_data_alg_,
-                                 this->workspace_.blob()->mutable_data(),
-                                 this->workspace_count_ * sizeof(float), &beta,
-                                 this->x_desc_, dxblob->mutable_data());
-  }, {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
+  dx.device()->Exec(
+      [dx, grad, this](Context *ctx) {
+        Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
+             *dxblob = dx.blob();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardData(
+            ctx->cudnn_handle, &alpha, this->filter_desc_, wblob->data(),
+            this->y_desc_, dyblob->data(), this->conv_desc_, this->bp_data_alg_,
+            this->workspace_.blob()->mutable_data(),
+            this->workspace_count_ * sizeof(float), &beta, this->x_desc_,
+            dxblob->mutable_data());
+      },
+      {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
   param_grad.push_back(dw);
   param_grad.push_back(db);
   return std::make_pair(dx, param_grad);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01aaf490/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_convolution.cc b/test/singa/test_cudnn_convolution.cc
index 73359b4..2a17da2 100644
--- a/test/singa/test_cudnn_convolution.cc
+++ b/test/singa/test_cudnn_convolution.cc
@@ -204,4 +204,185 @@ TEST(CudnnConvolution, Backward) {
   EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
   EXPECT_EQ(dy[0] * x[4], dwptr[8]);
 }
+// Tests for prefer=autotune
+TEST(CudnnConvolution_AT, Setup) {
+  CudnnConvolution conv;
+  EXPECT_EQ("CudnnConvolution", conv.layer_type());
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(2);
+  convconf->set_kernel_w(2);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(1);
+  convconf->set_stride_w(1);
+  convconf->set_num_output(2);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  EXPECT_EQ(2u, conv.kernel_h());
+  EXPECT_EQ(2u, conv.kernel_w());
+  EXPECT_EQ(1u, conv.pad_h());
+  EXPECT_EQ(1u, conv.pad_w());
+  EXPECT_EQ(1u, conv.stride_h());
+  EXPECT_EQ(1u, conv.stride_w());
+  EXPECT_EQ(2u, conv.num_filters());
+  EXPECT_EQ(true, conv.bias_term());
+  EXPECT_EQ(256u << 20, conv.workspace_byte_limit());
+  EXPECT_STREQ("autotune", conv.prefer().c_str());
+  EXPECT_EQ(1u, conv.channels());
+  EXPECT_EQ(3u, conv.height());
+  EXPECT_EQ(3u, conv.width());
+}
+
+TEST(CudnnConvolution_AT, Forward) {
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                          6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  // Set weight and bias manually
+  const size_t num_filters = 1;
+  const float we[num_filters * batchsize * h * w] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
+  weight.CopyDataFromHostPtr(we, batchsize * h * w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out1.ToDevice(&host);
+  const float *outptr1 = out1.data<const float *>();
+  // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
+  EXPECT_EQ(4u, out1.Size());
+
+  EXPECT_EQ(3.0f, outptr1[0]);
+  EXPECT_EQ(7.0f, outptr1[1]);
+  EXPECT_EQ(-3.0f, outptr1[2]);
+  EXPECT_EQ(12.0f, outptr1[3]);
+}
+
+TEST(CudnnConvolution_AT, Backward) {
+  // src_data
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                                  6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  // Set weight_ and bias_ manually
+  const size_t num_filters = 1;
+  const float we[num_filters * batchsize * src_h * src_w] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
+                       &cuda);
+  weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
+                                                               0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
+                     &cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
+
+  const auto ret = conv.Backward(singa::kTrain, grad);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToDevice(&host);
+  const float *dx = in_grad.data<const float *>();
+  const float *wptr = we;
+  EXPECT_EQ(9u, in_grad.Size());
+  EXPECT_EQ(dy[0] * wptr[4], dx[0]);
+  EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
+  EXPECT_EQ(dy[1] * wptr[4], dx[2]);
+  EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]);
+  EXPECT_EQ(
+      dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0],
+      dx[4]);
+  EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]);
+  EXPECT_EQ(dy[2] * wptr[4], dx[6]);
+  EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]);
+  EXPECT_EQ(dy[3] * wptr[4], dx[8]);
+
+  singa::Tensor dw = ret.second[0];
+  singa::Tensor db = ret.second[1];
+  dw.ToDevice(&host);
+  db.ToDevice(&host);
+  const float *dbptr = db.data<const float *>();
+  EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
+
+  const float *dwptr = dw.data<const float *>();
+  EXPECT_EQ(9u, dw.Size());
+  EXPECT_EQ(dy[3] * x[4], dwptr[0]);
+  EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
+  EXPECT_EQ(dy[2] * x[4], dwptr[2]);
+  EXPECT_EQ(dy[1] * x[1] + dy[3] * x[7], dwptr[3]);
+  EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[1] * x[2] + dy[2] * x[6] + dy[3] * x[8],
+                  dwptr[4]);
+  EXPECT_EQ(dy[0] * x[1] + dy[2] * x[7], dwptr[5]);
+  EXPECT_EQ(dy[1] * x[4], dwptr[6]);
+  EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
+  EXPECT_EQ(dy[0] * x[4], dwptr[8]);
+}
 #endif  // USE_CUDNN