Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/13 13:20:13 UTC

[20/50] [abbrv] incubator-singa git commit: SINGA-178 Add Convolution layer and Pooling layer

SINGA-178 Add Convolution layer and Pooling layer

Add CudnnConvolution layer and CudnnPooling layer.
Add tests for these two layers.
Passed the unit tests and the cpplint.py check; a minimal usage sketch follows.
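
Below is a minimal usage sketch of the new CudnnConvolution layer, condensed
from the unit test added in this commit (test/singa/test_cudnn_convolution.cc);
the helper function name and the sample values are illustrative only.

#include "../src/model/layer/cudnn_convolution.h"

// Configure, run, and differentiate a CudnnConvolution layer on one
// 1x1x3x3 (NCHW) input, mirroring the steps in the unit test.
void ExampleConvForwardBackward() {
  singa::CudaGPU cuda(0, 1);
  const float x[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  singa::Tensor in(singa::Shape{1, 1, 3, 3}, &cuda);
  in.CopyDataFromHostPtr(x, 9);

  // Params are set directly; weight shape is num_output x (channels * kernel_h * kernel_w).
  const float w[9] = {1, 1, 0, 0, 0, -1, 0, 1, 0};
  singa::Tensor weight(singa::Shape{1, 9}, &cuda);
  weight.CopyDataFromHostPtr(w, 9);
  const float b[1] = {1};
  singa::Tensor bias(singa::Shape{1}, &cuda);
  bias.CopyDataFromHostPtr(b, 1);
  singa::CudnnConvolution conv;
  conv.set_weight(weight);
  conv.set_bias(bias);

  // All hyper-parameters come in through LayerConf / ConvolutionConf.
  singa::LayerConf conf;
  singa::ConvolutionConf *cc = conf.mutable_convolution_conf();
  cc->set_kernel_h(3);  cc->set_kernel_w(3);
  cc->set_pad_h(1);     cc->set_pad_w(1);
  cc->set_stride_h(2);  cc->set_stride_w(2);
  cc->set_num_output(1);
  cc->set_bias_term(true);
  cc->set_workspace_byte_limit(256);  // MB
  cc->set_algo_pref("fastest");
  cc->set_channels(1);  cc->set_height(3);  cc->set_width(3);
  conv.Setup(conf);

  // Forward produces a 1x1x2x2 output; Backward returns <dx, {dw, db}>.
  singa::Tensor out = conv.Forward(singa::kTrain, in);
  const float g[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  singa::Tensor dy(singa::Shape{1, 1, 2, 2}, &cuda);
  dy.CopyDataFromHostPtr(g, 4);
  auto ret = conv.Backward(singa::kTrain, dy);
}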

(Added on May 30th)
Parameters and specs are handled as follows (a minimal sketch follows this list):
1. Each layer is responsible for pushing its own parameter tensors into the param_values_ vector and its own param specs into the param_specs_ vector.
2. Each layer is responsible for deleting the tensors it created, including param and buffer tensors.
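
A minimal sketch of these two conventions, modeled on the Convolution layer
added in this commit; ExampleLayer, its shapes, and the omitted Forward/Backward
are hypothetical placeholders; only the param/spec handling lines matter.

#include <vector>
#include "singa/model/layer.h"

namespace singa {
class ExampleLayer : public Layer {
 public:
  void Setup(const LayerConf &conf) override {
    Layer::Setup(conf);
    // 1. The subclass pushes its own specs and its own param tensors;
    //    the base Layer::Setup no longer does this for it.
    for (const auto &spec : conf.param()) param_specs_.push_back(spec);
    weight_.Reshape(Shape{8, 4});  // hypothetical shapes
    bias_.Reshape(Shape{8});
    param_values_.push_back(&weight_);
    param_values_.push_back(&bias_);
  }

  // 2. The subclass, not the base ~Layer(), frees any tensors it allocated;
  //    weight_ and bias_ are plain members here, so there is nothing to delete.
  ~ExampleLayer() {}

  // Forward/Backward omitted for brevity.

 protected:
  Tensor weight_, bias_;
};
}  // namespace singa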


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/152056d4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/152056d4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/152056d4

Branch: refs/heads/master
Commit: 152056d4896cc96dfa63ee5ca53a1df861e408de
Parents: c6ae786
Author: XiangruiCAI <ca...@gmail.com>
Authored: Thu May 26 17:40:25 2016 +0800
Committer: XiangruiCAI <ca...@gmail.com>
Committed: Mon May 30 15:42:01 2016 +0800

----------------------------------------------------------------------
 include/singa/model/layer.h          |  22 ++-
 src/model/layer/convolution.cc       |  94 ++++++++++++
 src/model/layer/convolution.h        |  78 ++++++++++
 src/model/layer/cudnn_convolution.cc | 232 ++++++++++++++++++++++++++++++
 src/model/layer/cudnn_convolution.h  |  70 +++++++++
 src/model/layer/cudnn_pooling.cc     | 129 +++++++++++++++++
 src/model/layer/cudnn_pooling.h      |  57 ++++++++
 src/model/layer/pooling.cc           |  79 ++++++++++
 src/model/layer/pooling.h            |  63 ++++++++
 src/proto/model.proto                |  16 +++
 test/singa/test_cudnn_convolution.cc | 205 ++++++++++++++++++++++++++
 test/singa/test_cudnn_pooling.cc     | 141 ++++++++++++++++++
 12 files changed, 1173 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 5803295..c6a3bd1 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -44,11 +44,7 @@ class Layer {
 
   // ============= Following Functions could be override =====================
   /// Destruct objects created by this layer.
-  virtual ~Layer() {
-    for (Tensor* t : param_values_) {
-      delete t;
-    }
-  }
+  virtual ~Layer() {}; 
 
   /// Each layer sub-class would optionally have a type name.
   /// Used for debugging and logging.
@@ -64,7 +60,7 @@ class Layer {
   /// batchsize is 1.
   virtual void Setup(const LayerConf& conf) {
     name_ = conf.name();
-    for (const auto& spec : conf.param()) param_specs_.push_back(spec);
+    // for (const auto& spec : conf.param()) param_specs_.push_back(spec);
     // TODO(wangwei) load param values from checkpoint blobs.
   }
 
@@ -130,21 +126,21 @@ class Layer {
   /// Move the layer (including its parameters and other internal Tensor) onto
   /// the given device
   virtual void ToDevice(Device* device) {
-    for (auto p : param_values_) p->ToDevice(device);
+    //for (auto p : param_values_) p->ToDevice(device);
   }
 
   /// Set the data type of Tensor in this layer.
   virtual void AsType(DataType dtype) {
-    for (auto p : param_values_) p->AsType(dtype);
+    //for (auto p : param_values_) p->AsType(dtype);
   }
 
   /// Serialize the layer info (including params) into a LayerConf proto message
   virtual void ToProto(LayerConf* conf) const {
-    conf->set_name(name_);
-    for (const auto& spec : param_specs_) {
-      ParamSpec* p = conf->add_param();
-      p->CopyFrom(spec);
-    }
+    //conf->set_name(name_);
+    //for (const auto& spec : param_specs_) {
+    //  ParamSpec* p = conf->add_param();
+    //  p->CopyFrom(spec);
+    //}
     // TODO(wangwei) add param values into conf;
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
new file mode 100644
index 0000000..6406a31
--- /dev/null
+++ b/src/model/layer/convolution.cc
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./convolution.h"
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+using std::vector;
+
+void Convolution::Setup(const LayerConf &conf) {
+  Layer::Setup(conf);
+  ConvolutionConf conv_conf = conf.convolution_conf();
+  // kernel_size, pad, and stride are repeated fields.
+  if (conv_conf.kernel_size_size() > 0) {
+    kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
+  } else {
+    kernel_w_ = conv_conf.kernel_w();
+    kernel_h_ = conv_conf.kernel_h();
+  }
+  CHECK_NE(kernel_w_, 0);
+  CHECK_NE(kernel_h_, 0);
+
+  if (conv_conf.pad_size() > 0) {
+    pad_w_ = pad_h_ = conv_conf.pad(0);
+  } else {
+    pad_w_ = conv_conf.pad_w();
+    pad_h_ = conv_conf.pad_h();
+  }
+
+  if (conv_conf.stride_size() > 0) {
+    stride_w_ = stride_h_ = conv_conf.stride(0);
+  } else {
+    stride_w_ = conv_conf.stride_w();
+    stride_h_ = conv_conf.stride_h();
+  }
+
+  num_filters_ = conv_conf.num_output();
+  bias_term_ = conv_conf.bias_term();
+
+  // Shape of src
+  channels_ = conv_conf.channels();
+  height_ = conv_conf.height();
+  width_ = conv_conf.width();
+
+  conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+  conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
+  col_height_ = channels_ * kernel_w_ * kernel_h_;
+  col_width_ = conv_height_ * conv_width_;
+
+  // Setup shape of weight_ and bias_
+  weight_.Reshape(Shape{num_filters_, col_height_});
+  bias_.Reshape(Shape{num_filters_});
+  // Push back params into param_values_
+  // Assume the order of param is: weight, bias
+  for (const auto& spec : conf.param()) param_specs_.push_back(spec);
+  param_values_.push_back(&weight_);
+  param_values_.push_back(&bias_);
+}
+
+/// \copydoc Layer::Forward(int flag, const Tensor&)
+const Tensor Convolution::Forward(int flag, const Tensor &input) {
+  Tensor output;
+  // will be used in cpp version later
+  Tensor col_data(Shape{col_height_, col_width_});
+  Tensor col_grad(Shape{col_height_, col_width_});
+  return output;
+}
+
+/// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+const std::pair<Tensor, vector<Tensor>> Convolution::Backward(
+    int flag, const Tensor &grad) {
+  vector<Tensor> param_grad;
+  Tensor input_grad;
+
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h
new file mode 100644
index 0000000..a9bf833
--- /dev/null
+++ b/src/model/layer/convolution.h
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_CONVOLUTION_H_
+#define SRC_MODEL_LAYER_CONVOLUTION_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include <stack>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Convolution : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "Convolution"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf &conf) override;
+
+  // void SetupParam(const Tensor &input);
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor &input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor &grad) override;
+
+  size_t kernel_w() const { return kernel_w_; }
+  size_t kernel_h() const { return kernel_h_; }
+  size_t pad_w() const { return pad_w_; }
+  size_t pad_h() const { return pad_h_; }
+  size_t stride_w() const { return stride_w_; }
+  size_t stride_h() const { return stride_h_; }
+  size_t num_filters() const { return num_filters_; }
+  size_t batchsize() const { return batchsize_; }
+  size_t channels() const { return channels_; }
+  size_t height() const { return height_; }
+  size_t width() const { return width_; }
+  bool bias_term() const { return bias_term_; }
+  const Tensor &weight() const { return weight_; }
+  const Tensor &bias() const { return bias_; }
+
+  void set_weight(Tensor w) {
+    weight_.ResetLike(w);
+    weight_.CopyData(w);
+  }
+  void set_bias(Tensor b) {
+    bias_.ResetLike(b);
+    bias_.CopyData(b);
+  }
+
+ protected:
+  size_t kernel_w_, pad_w_, stride_w_;
+  size_t kernel_h_, pad_h_, stride_h_;
+  size_t batchsize_, channels_, height_, width_;
+  size_t col_height_, col_width_, conv_height_, conv_width_, num_filters_;
+  Tensor weight_, bias_;
+  // store intermediate data, i.e., input tensor
+  std::stack<Tensor> buf_;
+  bool bias_term_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_CONVOLUTION_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
new file mode 100644
index 0000000..ec7cd6a
--- /dev/null
+++ b/src/model/layer/cudnn_convolution.cc
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./cudnn_convolution.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <chrono>
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+CudnnConvolution::~CudnnConvolution() {
+  if (bias_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
+  if (filter_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
+  if (conv_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+void CudnnConvolution::Setup(const LayerConf &conf) {
+  Convolution::Setup(conf);
+  ConvolutionConf conv_conf = conf.convolution_conf();
+  // convert MB to bytes
+  workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
+  pref_ = conv_conf.algo_pref();
+  CHECK(pref_ == "fastest" || pref_ == "limited_workspace" ||
+        pref_ == "no_workspace")
+      << "CudnnConvolution only supports three algorithm preferences: fastest, "
+         "limited_workspace and no_workspace";
+}
+
+void CudnnConvolution::ToDevice(Device *device) {
+  weight_.ToDevice(device);
+  bias_.ToDevice(device);
+  workspace_.ToDevice(device);
+}
+
+void CudnnConvolution::InitCudnn(DataType dtype, Device *dev, Context *ctx) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), batchsize_,
+                                         channels_, height_, width_));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize_,
+      num_filters_, conv_height_, conv_width_));
+  if (bias_term_)
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), 1, 1,
+                                           num_filters_, 1));
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
+                                              stride_h_, stride_w_, 1, 1,
+                                              CUDNN_CROSS_CORRELATION));
+#if CUDNN_VERSION_MAJOR == 5
+  CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                         CUDNN_TENSOR_NCHW, num_filters_,
+                                         channels_, kernel_h_, kernel_w_));
+#elif CUDNN_VERSION_MAJOR == 4
+  CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(
+      filter_desc_, GetCudnnDataType(dtype), CUDNN_TENSOR_NCHW, num_filters_,
+      channels_, kernel_h_, kernel_w_));
+#else
+  LOG(FATAL) << "Not supported CUDNN version = " << CUDNN_VERSION_MAJOR;
+#endif
+
+  cudnnConvolutionFwdPreference_t fwd_pref;
+  cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+  cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+  if (pref_ == "fastest") {
+    fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+  } else if (pref_ == "limited_workspace") {
+    fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+  } else if (pref_ == "no_workspace") {
+    fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+    bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+    bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+  } else {
+    LOG(FATAL) << "Algorithm preference is not implemented!";
+  }
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+      ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+      workspace_byte_limit_, &fp_alg_));
+
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+      ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+      bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+      ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+      bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+
+  size_t fp_byte, bp_data_byte, bp_filter_byte;
+  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+      ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+      &fp_byte));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+      ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+      bp_data_alg_, &bp_data_byte));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+      bp_filter_alg_, &bp_filter_byte));
+  workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                         sizeof(float) +
+                     1;
+  workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
+  CHECK_EQ(input.device()->lang(), kCuda);
+  CHECK_EQ(input.shape().size(), 4);
+  buf_.push(input);
+  batchsize_ = input.shape()[0];
+  DataType dtype = input.data_type();
+  Device *dev = input.device();
+
+  if (!has_init_cudnn_) InitCudnn(dtype, dev, dev->context(0));
+
+  Shape shape{batchsize_, num_filters_, conv_height_, conv_width_};
+  Tensor output(shape, dev, dtype);
+  float alpha = 1.f, beta = 0.f;
+  output.device()->Exec(
+      [input, output, alpha, beta, this](Context *ctx) {
+        Blob *inblob = input.blob(), *outblob = output.blob(),
+             *wblob = this->weight_.blob();
+        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
+                                inblob->data(), this->filter_desc_,
+                                wblob->data(), this->conv_desc_, this->fp_alg_,
+                                this->workspace_.blob()->mutable_data(),
+                                this->workspace_count_ * sizeof(float), &beta,
+                                this->y_desc_, outblob->mutable_data());
+      },
+      {input.blob(), weight_.blob()}, {output.blob()}, workspace_.blob());
+
+  if (bias_term_) {
+    beta = 1.f;
+    output.device()->Exec(
+        [output, alpha, beta, this](Context *ctx) {
+          Blob *outblob = output.blob(), *bblob = this->bias_.blob();
+          cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
+                         bblob->data(), &beta, this->y_desc_,
+                         outblob->mutable_data());
+        },
+        {output.blob(), bias_.blob()}, {output.blob()});
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
+    int flag, const Tensor &grad) {
+  CHECK_EQ(grad.device()->lang(), kCuda);
+  CHECK_EQ(grad.shape().size(), 4);
+  Tensor src_data = buf_.top();
+  buf_.pop();
+  float alpha = 1.f, beta = 0.f;
+  vector<Tensor> param_grad;
+  Tensor dx;
+  dx.ResetLike(src_data);
+  Tensor db, dw;
+  db.ResetLike(bias_);
+  dw.ResetLike(weight_);
+
+  // LOG(ERROR) << "backward bias";
+  if (bias_term_) {
+    dx.device()->Exec(
+        [grad, db, alpha, beta, this](Context *ctx) {
+          Blob *dyblob = grad.blob(), *dbblob = db.blob();
+          cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
+                                       dyblob->data(), &beta, this->bias_desc_,
+                                       dbblob->mutable_data());
+        },
+        {grad.blob()}, {db.blob()});
+  }
+  // LOG(ERROR) << "backward w";
+  dx.device()->Exec(
+      [grad, dw, src_data, alpha, beta, this](Context *ctx) {
+        Blob *inblob = src_data.blob(), *dyblob = grad.blob(),
+             *dwblob = dw.blob();
+        cudnnConvolutionBackwardFilter(
+            ctx->cudnn_handle, &alpha, this->x_desc_, inblob->data(),
+            this->y_desc_, dyblob->data(), this->conv_desc_,
+            this->bp_filter_alg_, this->workspace_.blob()->mutable_data(),
+            this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
+            dwblob->mutable_data());
+      },
+      {grad.blob(), src_data.blob()}, {dw.blob(), workspace_.blob()});
+
+  // LOG(ERROR) << "backward src";
+  dx.device()->Exec(
+      [dx, grad, alpha, beta, this](Context *ctx) {
+        Blob *wblob = this->weight_.blob(), *dyblob = grad.blob(),
+             *dxblob = dx.blob();
+        cudnnConvolutionBackwardData(
+            ctx->cudnn_handle, &alpha, this->filter_desc_, wblob->data(),
+            this->y_desc_, dyblob->data(), this->conv_desc_, this->bp_data_alg_,
+            this->workspace_.blob()->mutable_data(),
+            this->workspace_count_ * sizeof(float), &beta, this->x_desc_,
+            dxblob->mutable_data());
+      },
+      {grad.blob(), weight_.blob()}, {dx.blob(), workspace_.blob()});
+  param_grad.push_back(dw);
+  param_grad.push_back(db);
+  return std::make_pair(dx, param_grad);
+}
+
+}  // namespace singa
+#endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.h b/src/model/layer/cudnn_convolution.h
new file mode 100644
index 0000000..cf04be0
--- /dev/null
+++ b/src/model/layer/cudnn_convolution.h
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
+#define SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
+#include "singa_config.h"
+#ifdef USE_CUDNN
+#include <string>
+#include <utility>
+#include <vector>
+#include "./convolution.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnConvolution : public Convolution {
+ public:
+  ~CudnnConvolution();
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "CudnnConvolution"; }
+
+  const Tensor Forward(int flag, const Tensor &input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor &grad) override;
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf &conf) override;
+  /// Init cudnn related data structures.
+  void InitCudnn(DataType dtype, Device *dev, Context *ctx);
+
+  void ToDevice(Device *device) override;
+
+  size_t workspace_byte_limit() { return workspace_byte_limit_; }
+  string pref() { return pref_; }
+
+ protected:
+  bool has_init_cudnn_ = false;
+  cudnnTensorDescriptor_t x_desc_ = nullptr;
+  cudnnTensorDescriptor_t y_desc_ = nullptr;
+  cudnnTensorDescriptor_t bias_desc_ = nullptr;
+  cudnnFilterDescriptor_t filter_desc_ = nullptr;
+  cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
+  cudnnConvolutionFwdAlgo_t fp_alg_;
+  cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+  cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+  size_t workspace_byte_limit_, workspace_count_;
+  Tensor workspace_;
+  string pref_;
+};
+
+}  // namespace singa
+
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.cc b/src/model/layer/cudnn_pooling.cc
new file mode 100644
index 0000000..d68bcd2
--- /dev/null
+++ b/src/model/layer/cudnn_pooling.cc
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./cudnn_pooling.h"
+#ifdef USE_CUDNN
+
+#include <cudnn.h>
+#include <chrono>
+
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+CudnnPooling::~CudnnPooling() {
+  if (pool_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+void CudnnPooling::Setup(const LayerConf &conf) {
+  Pooling::Setup(conf);
+  PoolingConf pool_conf = conf.pooling_conf();
+  if (pool_conf.nan_prop())
+    nan_prop_ = CUDNN_PROPAGATE_NAN;
+  else
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+}
+
+void CudnnPooling::InitCudnn(DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  CUDNN_CHECK(cudnnCreatePoolingDescriptor(&pool_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), batchsize_,
+                                         channels_, height_, width_));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize_,
+      channels_, pooled_height_, pooled_width_));
+  auto pool_method = CUDNN_POOLING_MAX;
+  if (pool_ == PoolingConf_PoolMethod_MAX)
+    pool_method = CUDNN_POOLING_MAX;
+  else if (pool_ == PoolingConf_PoolMethod_AVE)
+    pool_method = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+  else
+    LOG(FATAL) << "Not implemented!";
+
+#if CUDNN_VERSION_MAJOR == 5
+  CUDNN_CHECK(cudnnSetPooling2dDescriptor(pool_desc_, pool_method, nan_prop_,
+                                          kernel_h_, kernel_w_, pad_h_, pad_w_,
+                                          stride_h_, stride_w_));
+#elif CUDNN_VERSION_MAJOR == 4
+  CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(pool_desc_, pool_method, nan_prop_,
+                                             kernel_h_, kernel_w_, pad_h_,
+                                             pad_w_, stride_h_, stride_w_));
+#else
+  LOG(FATAL) << "Not supported CUDNN version = " << CUDNN_VERSION_MAJOR;
+#endif
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
+  CHECK_EQ(input.device()->lang(), kCuda);
+  CHECK_EQ(input.shape().size(), 4);
+  buf_.push(input);
+  batchsize_ = input.shape()[0];
+  DataType dtype = input.data_type();
+  Device *dev = input.device();
+  float alpha = 1.0f, beta = 0.0f;
+  if (!has_init_cudnn_) InitCudnn(dtype);
+
+  Shape shape{batchsize_, channels_, pooled_height_, pooled_width_};
+  Tensor output = Tensor(shape, dev, dtype);
+  output.device()->Exec(
+      [input, output, alpha, beta, this](Context *ctx) {
+        Blob *inblob = input.blob(), *outblob = output.blob();
+        cudnnPoolingForward(ctx->cudnn_handle, this->pool_desc_, &alpha,
+                            this->x_desc_, inblob->data(), &beta, this->y_desc_,
+                            outblob->mutable_data());
+      },
+      {input.blob()}, {output.blob()});
+  buf_.push(output);
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnPooling::Backward(
+    int flag, const Tensor &grad) {
+  CHECK_EQ(grad.device()->lang(), kCuda);
+  CHECK_EQ(grad.shape().size(), 4);
+  vector<Tensor> param_grad;
+  Tensor dx;
+  Tensor data = buf_.top();
+  buf_.pop();
+  Tensor src_data = buf_.top();
+  buf_.pop();
+  dx.ResetLike(src_data);
+
+  float alpha = 1.0f, beta = 0.0f;
+  dx.device()->Exec(
+      [dx, grad, src_data, data, alpha, beta, this](Context *ctx) {
+        Blob *dyblob = grad.blob(), *dxblob = dx.blob(),
+             *yblob = data.blob(), *xblob = src_data.blob();
+        cudnnPoolingBackward(ctx->cudnn_handle, this->pool_desc_, &alpha,
+                             this->y_desc_, yblob->data(), this->y_desc_,
+                             dyblob->data(), this->x_desc_, xblob->data(),
+                             &beta, this->x_desc_, dxblob->mutable_data());
+      },
+      {grad.blob(), data.blob(), src_data.blob()}, {dx.blob()});
+
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+#endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.h b/src/model/layer/cudnn_pooling.h
new file mode 100644
index 0000000..14bdf40
--- /dev/null
+++ b/src/model/layer/cudnn_pooling.h
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LAYER_CUDNN_POOLING_H_
+#define SRC_MODEL_LAYER_CUDNN_POOLING_H_
+#include "singa_config.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "./pooling.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnPooling : public Pooling {
+ public:
+  ~CudnnPooling();
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "CudnnPooling"; }
+
+  void Setup(const LayerConf &conf) override;
+  const Tensor Forward(int flag, const Tensor &input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor &grad) override;
+
+  /// Init cudnn related data structures.
+  void InitCudnn(DataType dtype);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnTensorDescriptor_t x_desc_ = nullptr;
+  cudnnTensorDescriptor_t y_desc_ = nullptr;
+  cudnnPoolingDescriptor_t pool_desc_ = nullptr;
+  cudnnNanPropagation_t nan_prop_;
+};
+}  // namespace singa
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_POOLING_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling.cc b/src/model/layer/pooling.cc
new file mode 100644
index 0000000..05c6bc9
--- /dev/null
+++ b/src/model/layer/pooling.cc
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./pooling.h"
+#include "singa/model/layer.h"
+namespace singa {
+
+void Pooling::Setup(const LayerConf& conf) {
+  Layer::Setup(conf);
+
+  PoolingConf pool_conf = conf.pooling_conf();
+  if (pool_conf.has_kernel_size()) {
+    kernel_w_ = kernel_h_ = pool_conf.kernel_size();
+  } else {
+    kernel_w_ = pool_conf.kernel_w();
+    kernel_h_ = pool_conf.kernel_h();
+  }
+  CHECK_NE(kernel_w_, 0);
+  CHECK_NE(kernel_h_, 0);
+
+  if (pool_conf.has_pad()) {
+    pad_w_ = pad_h_ = pool_conf.pad();
+  } else {
+    pad_w_ = pool_conf.pad_w();
+    pad_h_ = pool_conf.pad_h();
+  }
+
+  if (pool_conf.has_stride()) {
+    stride_w_ = stride_h_ = pool_conf.stride();
+  } else {
+    stride_w_ = pool_conf.stride_w();
+    stride_h_ = pool_conf.stride_h();
+  }
+
+  pool_ = pool_conf.pool();
+  CHECK(pool_ == PoolingConf_PoolMethod_AVE ||
+        pool_ == PoolingConf_PoolMethod_MAX ||
+        pool_ == PoolingConf_PoolMethod_STOCHASTIC)
+      << "Padding implemented only for average and max pooling.";
+
+  channels_ = pool_conf.channels();
+  height_ = pool_conf.height();
+  width_ = pool_conf.width();
+  pooled_height_ =
+      static_cast<size_t>((height_ + 2 * pad_h_ - kernel_h_) / stride_h_) + 1;
+  pooled_width_ =
+      static_cast<size_t>((width_ + 2 * pad_w_ - kernel_w_) / stride_w_) + 1;
+}
+
+const Tensor Pooling::Forward(int flag, const Tensor& input) {
+  Tensor out;
+
+  return out;
+}
+
+const std::pair<Tensor, vector<Tensor>> Pooling::Backward(int flag,
+                                                          const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor input_grad;
+
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/model/layer/pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling.h b/src/model/layer/pooling.h
new file mode 100644
index 0000000..ce6670d
--- /dev/null
+++ b/src/model/layer/pooling.h
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_POOLING_H_
+#define SRC_MODEL_LAYER_POOLING_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include <stack>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Pooling : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  const std::string layer_type() const override { return "Pooling"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const LayerConf& conf) override;
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  size_t kernel_w() const { return kernel_w_; }
+  size_t kernel_h() const { return kernel_h_; }
+  size_t pad_w() const { return pad_w_; }
+  size_t pad_h() const { return pad_h_; }
+  size_t stride_w() const { return stride_w_; }
+  size_t stride_h() const { return stride_h_; }
+  PoolingConf_PoolMethod pool_method() const { return pool_; }
+  size_t batchsize() const { return batchsize_; }
+  size_t channels() const { return channels_; }
+  size_t height() const { return height_; }
+  size_t width() const { return width_; }
+
+ protected:
+  size_t kernel_w_, pad_w_, stride_w_;
+  size_t kernel_h_, pad_h_, stride_h_;
+  size_t batchsize_, channels_, height_, width_, pooled_height_, pooled_width_;
+  PoolingConf_PoolMethod pool_;
+  // To store the input and output(of forward) tensors
+  std::stack<Tensor> buf_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_POOLING_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 51225ee..03ad6ad 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -334,6 +334,16 @@ message ConvolutionConf {
   // implementation; for input blobs with num_axes != 2, this option is
   // ignored and the ND implementation will be used.)
   optional bool force_nd_im2col = 17 [default = false];
+  // add by xiangrui
+  // cudnn workspace size in MB
+  optional int32 workspace_byte_limit = 50 [default = 512];
+  // cudnn algorithm preference
+  // options: "fastest", "limited_workspace", "no_workspace"
+  optional string algo_pref = 51 [default = "fastest"];
+  // input shape
+  optional int32 channels = 52;
+  optional int32 height = 53;
+  optional int32 width = 54;
 }
 
 /*
@@ -595,6 +605,12 @@ message PoolingConf {
   // If global_pooling then it will pool over the size of the bottom by doing
   // kernel_h = bottom->height and kernel_w = bottom->width
   optional bool global_pooling = 12 [default = false];
+  // Shape of source
+  optional int32 channels = 50;
+  optional int32 height = 51;
+  optional int32 width = 52;
+  // whether to propagate nan
+  optional bool nan_prop = 53 [default = false];
 }
 
 message PowerConf {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_convolution.cc b/test/singa/test_cudnn_convolution.cc
new file mode 100644
index 0000000..0955c82
--- /dev/null
+++ b/test/singa/test_cudnn_convolution.cc
@@ -0,0 +1,205 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/cudnn_convolution.h"
+#ifdef USE_CUDNN
+
+#include "gtest/gtest.h"
+
+using singa::CudnnConvolution;
+TEST(CudnnConvolution, Setup) {
+  CudnnConvolution conv;
+  EXPECT_EQ("CudnnConvolution", conv.layer_type());
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(2);
+  convconf->set_kernel_w(2);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(1);
+  convconf->set_stride_w(1);
+  convconf->set_num_output(2);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_algo_pref("fastest");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  EXPECT_EQ(2, conv.kernel_h());
+  EXPECT_EQ(2, conv.kernel_w());
+  EXPECT_EQ(1, conv.pad_h());
+  EXPECT_EQ(1, conv.pad_w());
+  EXPECT_EQ(1, conv.stride_h());
+  EXPECT_EQ(1, conv.stride_w());
+  EXPECT_EQ(2, conv.num_filters());
+  EXPECT_EQ(true, conv.bias_term());
+  EXPECT_EQ(256 << 20, conv.workspace_byte_limit());
+  EXPECT_STREQ("fastest", conv.pref().c_str());
+  EXPECT_EQ(1, conv.channels());
+  EXPECT_EQ(3, conv.height());
+  EXPECT_EQ(3, conv.width());
+}
+
+TEST(CudnnConvolution, Forward) {
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                      6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  // Set weight and bias manually
+  const size_t num_filters = 1;
+  const float we[num_filters * batchsize * h * w] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
+  weight.CopyDataFromHostPtr(we, batchsize * h * w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_algo_pref("fastest");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out1.ToDevice(&host);
+  const float *outptr1 = out1.data<const float *>();
+  // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
+  EXPECT_EQ(4, out1.Size());
+
+  EXPECT_EQ(3.0f, outptr1[0]);
+  EXPECT_EQ(7.0f, outptr1[1]);
+  EXPECT_EQ(-3.0f, outptr1[2]);
+  EXPECT_EQ(12.0f, outptr1[3]);
+}
+
+TEST(CudnnConvolution, Backward) {
+  // src_data
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                              6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  // Set weight_ and bias_ manually
+  const size_t num_filters = 1;
+  const float we[num_filters * batchsize * src_h * src_w] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
+                       &cuda);
+  weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_algo_pref("fastest");
+  convconf->set_channels(1);
+  convconf->set_height(3);
+  convconf->set_width(3);
+  conv.Setup(conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w}, &cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
+
+  const auto ret = conv.Backward(singa::kTrain, grad);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToDevice(&host);
+  const float *dx = in_grad.data<const float *>();
+  const float *wptr = we;
+  EXPECT_EQ(9, in_grad.Size());
+  EXPECT_EQ(dy[0] * wptr[4], dx[0]);
+  EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
+  EXPECT_EQ(dy[1] * wptr[4], dx[2]);
+  EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]);
+  EXPECT_EQ(
+      dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0],
+      dx[4]);
+  EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]);
+  EXPECT_EQ(dy[2] * wptr[4], dx[6]);
+  EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]);
+  EXPECT_EQ(dy[3] * wptr[4], dx[8]);
+
+  singa::Tensor dw = ret.second[0];
+  singa::Tensor db = ret.second[1];
+  dw.ToDevice(&host);
+  db.ToDevice(&host);
+  const float *dbptr = db.data<const float *>();
+  EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
+
+  const float *dwptr = dw.data<const float *>();
+  EXPECT_EQ(9, dw.Size());
+  EXPECT_EQ(dy[3] * x[4], dwptr[0]);
+  EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
+  EXPECT_EQ(dy[2] * x[4], dwptr[2]);
+  EXPECT_EQ(dy[1] * x[1] + dy[3] * x[7], dwptr[3]);
+  EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[1] * x[2] + dy[2] * x[6] + dy[3] * x[8],
+                  dwptr[4]);
+  EXPECT_EQ(dy[0] * x[1] + dy[2] * x[7], dwptr[5]);
+  EXPECT_EQ(dy[1] * x[4], dwptr[6]);
+  EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
+  EXPECT_EQ(dy[0] * x[4], dwptr[8]);
+}  // USE_CUDNN
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/152056d4/test/singa/test_cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_pooling.cc b/test/singa/test_cudnn_pooling.cc
new file mode 100644
index 0000000..0bfd620
--- /dev/null
+++ b/test/singa/test_cudnn_pooling.cc
@@ -0,0 +1,141 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/cudnn_pooling.h"
+#ifdef USE_CUDNN
+
+#include "gtest/gtest.h"
+
+using singa::CudnnPooling;
+TEST(CudnnPooling, Setup) {
+  CudnnPooling pool;
+  EXPECT_EQ("CudnnPooling", pool.layer_type());
+
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(1);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(1);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(2);
+  poolconf->set_stride_w(1);
+  poolconf->set_channels(1);
+  poolconf->set_height(3);
+  poolconf->set_width(3);
+  pool.Setup(conf);
+
+  EXPECT_EQ(singa::PoolingConf_PoolMethod_MAX, pool.pool_method());
+  EXPECT_EQ(1, pool.kernel_h());
+  EXPECT_EQ(2, pool.kernel_w());
+  EXPECT_EQ(1, pool.pad_h());
+  EXPECT_EQ(0, pool.pad_w());
+  EXPECT_EQ(2, pool.stride_h());
+  EXPECT_EQ(1, pool.stride_w());
+  EXPECT_EQ(1, pool.channels());
+  EXPECT_EQ(3, pool.height());
+  EXPECT_EQ(3, pool.width());
+}
+
+TEST(CudnnPooling, Forward) {
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                      6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c,  h, w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  CudnnPooling pool;
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(2);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(0);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(1);
+  poolconf->set_stride_w(1);
+  poolconf->set_channels(1);
+  poolconf->set_height(3);
+  poolconf->set_width(3);
+  pool.Setup(conf);
+
+  // Parameter "flag" does not influence pooling
+  singa::Tensor out1 = pool.Forward(singa::kTrain, in);
+  singa::CppCPU host(0, 1);
+  out1.ToDevice(&host);
+  const float *outptr1 = out1.data<const float *>();
+  // Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
+  EXPECT_EQ(4, out1.Size());
+  EXPECT_EQ(5.0f, outptr1[0]);
+  EXPECT_EQ(6.0f, outptr1[1]);
+  EXPECT_EQ(8.0f, outptr1[2]);
+  EXPECT_EQ(9.0f, outptr1[3]);
+}
+
+TEST(CudnnPooling, Backward) {
+  // src_data
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                              6.0f, 7.0f, 8.0f, 9.0f};
+  singa::CudaGPU cuda(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  CudnnPooling pool;
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(2);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(0);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(1);
+  poolconf->set_stride_w(1);
+  poolconf->set_channels(1);
+  poolconf->set_height(3);
+  poolconf->set_width(3);
+  pool.Setup(conf);
+
+  singa::Tensor out1 = pool.Forward(singa::kTrain, in);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, &cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
+
+  const auto ret = pool.Backward(singa::kTrain, grad);
+  singa::CppCPU host(0, 1);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToDevice(&host);
+  const float *dx = in_grad.data<const float *>();
+  EXPECT_EQ(9, in_grad.Size());
+  EXPECT_EQ(0.0f, dx[0]);
+  EXPECT_EQ(0.0f, dx[1]);
+  EXPECT_EQ(0.0f, dx[2]);
+  EXPECT_EQ(0.0f, dx[3]);
+  EXPECT_EQ(0.1f, dx[4]);
+  EXPECT_EQ(0.2f, dx[5]);
+  EXPECT_EQ(0.0f, dx[6]);
+  EXPECT_EQ(0.3f, dx[7]);
+  EXPECT_EQ(0.4f, dx[8]);
+}
+#endif  // USE_CUDNN