You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2018/07/05 03:09:56 UTC

[01/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Repository: incubator-singa
Updated Branches:
  refs/heads/master 7a19e63db -> 56292f1fb


SINGA-371 Implement functional operations in c++ for autograd

- fix some bugs in interface files.

- rename files.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/d48dea0f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/d48dea0f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/d48dea0f

Branch: refs/heads/master
Commit: d48dea0f3730cff17534f9af7e6b6ba767781670
Parents: af95cc1
Author: xuewanqi <xu...@u.nus.edu>
Authored: Thu Jun 14 08:08:27 2018 +0000
Committer: xuewanqi <xu...@u.nus.edu>
Committed: Wed Jun 20 14:47:05 2018 +0000

----------------------------------------------------------------------
 src/api/model_operation.i          |  59 ++---
 src/model/convolution_forward.cc   | 367 --------------------------------
 src/model/convolution_forward.h    |  59 -----
 src/model/convolution_functions.cc | 367 ++++++++++++++++++++++++++++++++
 src/model/convolution_functions.h  |  59 +++++
 5 files changed, 439 insertions(+), 472 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d48dea0f/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 64ecca1..79707eb 100644
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -1,59 +1,26 @@
-/* interface file for swig */
-
 %module model_operation
-%include "std_string.i"
 
 %{
 #include "../src/model/convolution_functions.h"
+using singa::Tensor;
+using singa::CudnnConvHandle;
 %}
-
 namespace singa{
-extern struct ConvHandle{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t workspace_byte_limit_;
-    std::string prefer_;
-};
-
-struct CudnnConvHandle{
-    cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-};
-
-extern ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf);
+
+struct ConvHandle{};
+
+struct CudnnConvHandle{};
+
+ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf);
 
 CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
 
-Tensor CudnnConvForward(const Tensor x, const Tensor W, const Tensor b,
+Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
                         const ConvHandle ch, const CudnnConvHandle cch);
 
-Tensor CudnnConvBackwardW(const Tensor dy, const Tensor x, const Tensor W, const CudnnConvHandle cch);
+Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch);
 
-Tensor CudnnConvBackwardb(const Tensor dy, const Tensor b, const CudnnConvHandle cch);
+Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch);
 
-Tensor CudnnConvBackwardx(const Tensor dy, const Tensor W, const Tensor x, const CudnnConvHandle cch);
+Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d48dea0f/src/model/convolution_forward.cc
----------------------------------------------------------------------
diff --git a/src/model/convolution_forward.cc b/src/model/convolution_forward.cc
deleted file mode 100644
index 52acf05..0000000
--- a/src/model/convolution_forward.cc
+++ /dev/null
@@ -1,367 +0,0 @@
-//#include <string>
-//#include <cudnn.h>
-//#include "./layer/cudnn_convolution.h"
-//#include "./layer/cudnn_utils.h"
-//#include "singa/utils/logging.h"
-#include "./convolution_forward.h"
-
-namespace singa{
-
-// Done in conv2d.__init__()
-ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf){
-
-    size_t kernel_w_, pad_w_, stride_w_;
-    size_t kernel_h_, pad_h_, stride_h_;
-
-    size_t channels_, num_filters_;
-
-    bool bias_term_;
-
-    size_t workspace_byte_limit_;
-    string prefer_;
-
-    ConvolutionConf conv_conf = conf.convolution_conf();
-
-    workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
-    prefer_ = ToLowerCase(conv_conf.prefer());
-    CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-          prefer_ == "no_workspace" || prefer_ == "autotune")
-            << "CudnnConvolution only supports four algorithm preferences: fastest, "
-               "limited_workspace, no_workspace and autotune";
-
-
-    // kernel_size, pad, and stride are repeated fields.
-    if (conv_conf.kernel_size_size() > 0) {
-    if (conv_conf.kernel_size_size() == 1) {
-    kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
-    } else {
-    kernel_w_ = conv_conf.kernel_size(0);
-    kernel_h_ = conv_conf.kernel_size(1);
-    }
-    } else {
-    kernel_w_ = conv_conf.kernel_w();
-    kernel_h_ = conv_conf.kernel_h();
-    }
-    CHECK_GT(kernel_w_, 0u);
-    CHECK_GT(kernel_h_, 0u);
-
-    if (conv_conf.pad_size() > 0) {
-    if (conv_conf.pad_size() == 1) {
-    pad_w_ = pad_h_ = conv_conf.pad(0);
-    } else {
-    pad_w_ = conv_conf.pad(0);
-    pad_h_ = conv_conf.pad(1);
-    }
-    } else {
-    pad_w_ = conv_conf.pad_w();
-    pad_h_ = conv_conf.pad_h();
-    }
-    CHECK_GE(pad_w_, 0u);
-    CHECK_GE(pad_h_, 0u);
-
-    const int kStrideDefault = 1;
-    if (conv_conf.stride_size() > 0) {
-    if (conv_conf.stride_size() == 1) {
-    stride_w_ = stride_h_ = conv_conf.stride(0);
-    } else {
-    stride_w_ = conv_conf.stride(0);
-    stride_h_ = conv_conf.stride(1);
-    }
-    } else {
-    stride_w_ = kStrideDefault;
-    stride_h_ = kStrideDefault;
-    if (conv_conf.has_stride_w()) {
-    stride_w_ = conv_conf.stride_w();
-    }
-    if (conv_conf.has_stride_h()) {
-    stride_h_ = conv_conf.stride_h();
-    }
-    }
-    CHECK_GT(stride_w_, 0u);
-    CHECK_GE(stride_h_, 0u);  // 0 for 1D conv
-
-    channels_ = in_channels;
-    num_filters_ = conv_conf.num_output();
-    bias_term_ = conv_conf.bias_term();
-
-    return ConvHandle{
-            kernel_w_,
-            pad_w_,
-            stride_w_,
-            kernel_h_,
-            pad_h_,
-            stride_h_,
-
-            channels_,
-            num_filters_,
-
-            bias_term_,
-
-            workspace_byte_limit_,
-            prefer_,
-    };
-};
-
-
-
-// Done in conv2d.__call__():
-// if self.cudnnconvhandle is None:
-//     self.cudnnconvhandle= InitCudnn(...)
-// elif x.shape(0) != self.cudnnconvhandle.batchsize:
-//     self.cudnnconvhandle= InitCudnn(...)
-CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
-
-    cudnnTensorDescriptor_t x_desc_ = nullptr;
-    cudnnTensorDescriptor_t y_desc_ = nullptr;
-    cudnnTensorDescriptor_t bias_desc_ = nullptr;
-    cudnnFilterDescriptor_t filter_desc_ = nullptr;
-    cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-
-    DataType dtype = input.data_type();
-    auto dev = input.device();
-    Context *ctx = dev->context(0);
-
-    size_t batchsize, channels_;
-    batchsize = input.shape(0);
-    channels_ = input.shape(1);
-    height_ = input.shape(2);
-    width_ = input.shape(3);
-
-    CHECK(channels_ == ch.channels_)<<"the number of input channels mismatched.";
-
-    conv_height_ = 1;
-    if (ch.stride_h_ > 0)
-        conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
-    conv_width_ = (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
-
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
-    if (ch.bias_term_)
-        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
-
-
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                           GetCudnnDataType(dtype), batchsize,
-                                           ch.channels_, height_, width_));
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
-            ch.num_filters_, conv_height_, conv_width_));
-    if (ch.bias_term_)
-        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
-                                               GetCudnnDataType(dtype), 1,
-                                               ch.num_filters_, 1, 1));
-    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, ch.pad_h_, ch.pad_w_,
-                                                ch.stride_h_, ch.stride_w_, 1, 1,
-                                                CUDNN_CROSS_CORRELATION,
-                                                GetCudnnDataType(dtype)));
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
-                                           CUDNN_TENSOR_NCHW, ch.num_filters_,
-                                           channels_, ch.kernel_h_, ch.kernel_w_));
-    if (ch.prefer_ == "fastest" || ch.prefer_ == "limited_workspace" ||
-        ch.prefer_ == "no_workspace") {
-        cudnnConvolutionFwdPreference_t fwd_pref;
-        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-        if (ch.prefer_ == "fastest") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-        } else if (ch.prefer_ == "limited_workspace") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        } else {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        }
-        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-                ch.workspace_byte_limit_, &fp_alg_));
-        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                bwd_filt_pref, ch.workspace_byte_limit_, &bp_filter_alg_));
-        // deprecated in cudnn v7
-        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                bwd_data_pref, ch.workspace_byte_limit_, &bp_data_alg_));
-    } else if (ch.prefer_ == "autotune") {
-        const int topk = 1;
-        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
-        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
-        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
-        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
-        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
-                &num_fp_alg, fp_alg_perf));
-        fp_alg_ = fp_alg_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
-                &num_bp_filt_alg, bp_filt_perf));
-        bp_filter_alg_ = bp_filt_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
-                &num_bp_data_alg, bp_data_perf));
-        bp_data_alg_ = bp_data_perf[0].algo;
-    } else {
-        LOG(FATAL) << "Preferred algorithm is not available!";
-    }
-
-    size_t fp_byte, bp_data_byte, bp_filter_byte;
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
-            &fp_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-            bp_data_alg_, &bp_data_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-            bp_filter_alg_, &bp_filter_byte));
-    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
-                       sizeof(float) +
-                       1;
-    if (workspace_count_ * sizeof(float) > ch.workspace_byte_limit_)
-        LOG(WARNING) << "The required memory for workspace ("
-                     << workspace_count_ * sizeof(float)
-                     << ") is larger than the expected Bytes ("
-                     << ch.workspace_byte_limit_ << ")";
-    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
-
-    return CudnnConvHandle{
-            x_desc_,
-            y_desc_,
-            bias_desc_,
-            filter_desc_,
-            conv_desc_,
-            fp_alg_,
-            bp_filter_alg_,
-            bp_data_alg_,
-
-            workspace_count_,
-            workspace_,
-
-            height_,
-            width_,
-            conv_height_,
-            conv_width_,
-            batchsize,
-    };
-};
-
-Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
-                        const ConvHandle ch, const CudnnConvHandle cch){
-    CHECK_EQ(x.device()->lang(), kCuda);
-    CHECK_EQ(x.nDim(), 4u);
-    CHECK_EQ(x.shape()[0],cch.batchsize);
-    CHECK_EQ(x.shape()[1],ch.channels_);
-    CHECK_EQ(x.shape()[2],cch.height_);
-    CHECK_EQ(x.shape()[3],cch.width_);
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-
-    Shape shape{cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    output.device()->Exec([output, x, W, cch](Context *ctx) {
-        Block *inblock = x.block(), *outblock = output.block(),
-                *wblock = W.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
-                                inblock->data(), cch.filter_desc_, wblock->data(),
-                                cch.conv_desc_, cch.fp_alg_,
-                                cch.workspace_.block()->mutable_data(),
-                                cch.workspace_count_ * sizeof(float), &beta,
-                                cch.y_desc_, outblock->mutable_data());
-    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
-
-    if (ch.bias_term_) {
-        output.device()->Exec([output, b, cch](Context *ctx) {
-            float beta = 1.f, alpha = 1.0f;
-            Block *outblock = output.block(), *bblock = b.block();
-            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
-                           bblock->data(), &beta, cch.y_desc_,
-                           outblock->mutable_data());
-        }, {output.block(), b.block()}, {output.block()});
-    }
-    return output;
-};
-
-// input Tensor W for Reset dW purpose, can avoid this later.
-Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor dW;
-    dW.ResetLike(W);
-
-    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
-    Block *inblock = x.block(), *dyblock = dy.block(),
-            *dwblock = dW.block();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
-            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
-            cch.workspace_.block()->mutable_data(),
-            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
-            dwblock->mutable_data());
-    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
-
-    return dW;
-};
-
-// input Tensor b for Reset db purpose, can avoid this later.
-Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor db;
-    db.ResetLike(b);
-
-    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
-        Block *dyblock = dy.block(), *dbblock = db.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
-                                     dyblock->data(), &beta, cch.bias_desc_,
-                                     dbblock->mutable_data());
-    }, {dy.block()}, {db.block()});
-    return db;
-};
-
-Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor dx;
-    dx.ResetLike(x);
-
-    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
-        Block *wblock = W.block(), *dyblock = dy.block(),
-                *dxblock = dx.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
-                                     wblock->data(), cch.y_desc_, dyblock->data(),
-                                     cch.conv_desc_, cch.bp_data_alg_,
-                                     cch.workspace_.block()->mutable_data(),
-                                     cch.workspace_count_ * sizeof(float), &beta,
-                                     cch.x_desc_, dxblock->mutable_data());
-    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
-
-    return dx;
-};
-
-} //namespace_singa
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d48dea0f/src/model/convolution_forward.h
----------------------------------------------------------------------
diff --git a/src/model/convolution_forward.h b/src/model/convolution_forward.h
deleted file mode 100644
index eba0e50..0000000
--- a/src/model/convolution_forward.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <string>
-#include <cudnn.h>
-#include "./layer/cudnn_convolution.h"
-#include "./layer/cudnn_utils.h"
-#include "singa/utils/logging.h"
-
-namespace singa{
-
-struct ConvHandle{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t workspace_byte_limit_;
-    string prefer_;
-};
-
-struct CudnnConvHandle{
-    cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-};
-
-ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf);
-
-CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
-
-Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
-                        const ConvHandle ch, const CudnnConvHandle cch);
-
-Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch);
-
-Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch);
-
-Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d48dea0f/src/model/convolution_functions.cc
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.cc b/src/model/convolution_functions.cc
new file mode 100644
index 0000000..0fc8e65
--- /dev/null
+++ b/src/model/convolution_functions.cc
@@ -0,0 +1,367 @@
+//#include <string>
+//#include <cudnn.h>
+//#include "./layer/cudnn_convolution.h"
+//#include "./layer/cudnn_utils.h"
+//#include "singa/utils/logging.h"
+#include "./convolution_functions.h"
+
+namespace singa{
+
+// Done in conv2d.__init__()
+ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf){
+
+    size_t kernel_w_, pad_w_, stride_w_;
+    size_t kernel_h_, pad_h_, stride_h_;
+
+    size_t channels_, num_filters_;
+
+    bool bias_term_;
+
+    size_t workspace_byte_limit_;
+    string prefer_;
+
+    ConvolutionConf conv_conf = conf.convolution_conf();
+
+    workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
+    prefer_ = ToLowerCase(conv_conf.prefer());
+    CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+          prefer_ == "no_workspace" || prefer_ == "autotune")
+            << "CudnnConvolution only supports four algorithm preferences: fastest, "
+               "limited_workspace, no_workspace and autotune";
+
+
+    // kernel_size, pad, and stride are repeated fields.
+    if (conv_conf.kernel_size_size() > 0) {
+    if (conv_conf.kernel_size_size() == 1) {
+    kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
+    } else {
+    kernel_w_ = conv_conf.kernel_size(0);
+    kernel_h_ = conv_conf.kernel_size(1);
+    }
+    } else {
+    kernel_w_ = conv_conf.kernel_w();
+    kernel_h_ = conv_conf.kernel_h();
+    }
+    CHECK_GT(kernel_w_, 0u);
+    CHECK_GT(kernel_h_, 0u);
+
+    if (conv_conf.pad_size() > 0) {
+    if (conv_conf.pad_size() == 1) {
+    pad_w_ = pad_h_ = conv_conf.pad(0);
+    } else {
+    pad_w_ = conv_conf.pad(0);
+    pad_h_ = conv_conf.pad(1);
+    }
+    } else {
+    pad_w_ = conv_conf.pad_w();
+    pad_h_ = conv_conf.pad_h();
+    }
+    CHECK_GE(pad_w_, 0u);
+    CHECK_GE(pad_h_, 0u);
+
+    const int kStrideDefault = 1;
+    if (conv_conf.stride_size() > 0) {
+    if (conv_conf.stride_size() == 1) {
+    stride_w_ = stride_h_ = conv_conf.stride(0);
+    } else {
+    stride_w_ = conv_conf.stride(0);
+    stride_h_ = conv_conf.stride(1);
+    }
+    } else {
+    stride_w_ = kStrideDefault;
+    stride_h_ = kStrideDefault;
+    if (conv_conf.has_stride_w()) {
+    stride_w_ = conv_conf.stride_w();
+    }
+    if (conv_conf.has_stride_h()) {
+    stride_h_ = conv_conf.stride_h();
+    }
+    }
+    CHECK_GT(stride_w_, 0u);
+    CHECK_GE(stride_h_, 0u);  // 0 for 1D conv
+
+    channels_ = in_channels;
+    num_filters_ = conv_conf.num_output();
+    bias_term_ = conv_conf.bias_term();
+
+    return ConvHandle{
+            kernel_w_,
+            pad_w_,
+            stride_w_,
+            kernel_h_,
+            pad_h_,
+            stride_h_,
+
+            channels_,
+            num_filters_,
+
+            bias_term_,
+
+            workspace_byte_limit_,
+            prefer_,
+    };
+};
+
+
+
+// Done in conv2d.__call__():
+// if self.cudnnconvhandle is None:
+//     self.cudnnconvhandle= InitCudnn(...)
+// elif x.shape(0) != self.cudnnconvhandle.batchsize:
+//     self.cudnnconvhandle= InitCudnn(...)
+CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
+
+    cudnnTensorDescriptor_t x_desc_ = nullptr;
+    cudnnTensorDescriptor_t y_desc_ = nullptr;
+    cudnnTensorDescriptor_t bias_desc_ = nullptr;
+    cudnnFilterDescriptor_t filter_desc_ = nullptr;
+    cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+    size_t workspace_count_;
+    Tensor workspace_;
+
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+
+    DataType dtype = input.data_type();
+    auto dev = input.device();
+    Context *ctx = dev->context(0);
+
+    size_t batchsize, channels_;
+    batchsize = input.shape(0);
+    channels_ = input.shape(1);
+    height_ = input.shape(2);
+    width_ = input.shape(3);
+
+    CHECK(channels_ == ch.channels_)<<"the number of input channels mismatched.";
+
+    conv_height_ = 1;
+    if (ch.stride_h_ > 0)
+        conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
+    conv_width_ = (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
+
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+    if (ch.bias_term_)
+        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), batchsize,
+                                           ch.channels_, height_, width_));
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+            ch.num_filters_, conv_height_, conv_width_));
+    if (ch.bias_term_)
+        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                               GetCudnnDataType(dtype), 1,
+                                               ch.num_filters_, 1, 1));
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, ch.pad_h_, ch.pad_w_,
+                                                ch.stride_h_, ch.stride_w_, 1, 1,
+                                                CUDNN_CROSS_CORRELATION,
+                                                GetCudnnDataType(dtype)));
+    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                           CUDNN_TENSOR_NCHW, ch.num_filters_,
+                                           channels_, ch.kernel_h_, ch.kernel_w_));
+    if (ch.prefer_ == "fastest" || ch.prefer_ == "limited_workspace" ||
+        ch.prefer_ == "no_workspace") {
+        cudnnConvolutionFwdPreference_t fwd_pref;
+        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+        if (ch.prefer_ == "fastest") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+        } else if (ch.prefer_ == "limited_workspace") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        } else {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        }
+        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+                ch.workspace_byte_limit_, &fp_alg_));
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                bwd_filt_pref, ch.workspace_byte_limit_, &bp_filter_alg_));
+        // deprecated in cudnn v7
+        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                bwd_data_pref, ch.workspace_byte_limit_, &bp_data_alg_));
+    } else if (ch.prefer_ == "autotune") {
+        const int topk = 1;
+        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+                &num_fp_alg, fp_alg_perf));
+        fp_alg_ = fp_alg_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                &num_bp_filt_alg, bp_filt_perf));
+        bp_filter_alg_ = bp_filt_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                &num_bp_data_alg, bp_data_perf));
+        bp_data_alg_ = bp_data_perf[0].algo;
+    } else {
+        LOG(FATAL) << "Preferred algorithm is not available!";
+    }
+
+    size_t fp_byte, bp_data_byte, bp_filter_byte;
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+            &fp_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+            bp_data_alg_, &bp_data_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+            bp_filter_alg_, &bp_filter_byte));
+    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                       sizeof(float) +
+                       1;
+    if (workspace_count_ * sizeof(float) > ch.workspace_byte_limit_)
+        LOG(WARNING) << "The required memory for workspace ("
+                     << workspace_count_ * sizeof(float)
+                     << ") is larger than the expected Bytes ("
+                     << ch.workspace_byte_limit_ << ")";
+    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+
+    return CudnnConvHandle{
+            x_desc_,
+            y_desc_,
+            bias_desc_,
+            filter_desc_,
+            conv_desc_,
+            fp_alg_,
+            bp_filter_alg_,
+            bp_data_alg_,
+
+            workspace_count_,
+            workspace_,
+
+            height_,
+            width_,
+            conv_height_,
+            conv_width_,
+            batchsize,
+    };
+};
+
+// Forward pass of the convolution on a CUDA device using cuDNN.
+//
+// x   : input; checked to be a 4-D CUDA tensor of shape
+//       (cch.batchsize, ch.channels_, cch.height_, cch.width_).
+// W   : filters, read through cch.filter_desc_.
+// b   : bias, only read when ch.bias_term_ is set.
+// ch  : convolution hyper-parameters.
+// cch : cuDNN descriptors, selected algorithms and workspace.
+// Returns a new tensor of shape
+// (cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_)
+// created on x's device with x's data type.
+Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
+                        const ConvHandle ch, const CudnnConvHandle cch){
+    CHECK_EQ(x.device()->lang(), kCuda);
+    CHECK_EQ(x.nDim(), 4u);
+    CHECK_EQ(x.shape()[0],cch.batchsize);
+    CHECK_EQ(x.shape()[1],ch.channels_);
+    CHECK_EQ(x.shape()[2],cch.height_);
+    CHECK_EQ(x.shape()[3],cch.width_);
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+
+    Shape shape{cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    // The workspace is scratch memory that cuDNN writes to, so it is listed
+    // together with the output in the written-block list, exactly as
+    // CudnnConvBackwardW and CudnnConvBackwardx do. It was previously passed
+    // as a separate trailing argument of Exec.
+    output.device()->Exec([output, x, W, cch](Context *ctx) {
+        Block *inblock = x.block(), *outblock = output.block(),
+                *wblock = W.block();
+        // alpha = 1, beta = 0: overwrite the output with the convolution result.
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
+                                inblock->data(), cch.filter_desc_, wblock->data(),
+                                cch.conv_desc_, cch.fp_alg_,
+                                cch.workspace_.block()->mutable_data(),
+                                cch.workspace_count_ * sizeof(float), &beta,
+                                cch.y_desc_, outblock->mutable_data());
+    }, {x.block(), W.block()}, {output.block(), cch.workspace_.block()});
+
+    if (ch.bias_term_) {
+        output.device()->Exec([output, b, cch](Context *ctx) {
+            // alpha = beta = 1: accumulate the bias onto the existing output.
+            float beta = 1.f, alpha = 1.0f;
+            Block *outblock = output.block(), *bblock = b.block();
+            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
+                           bblock->data(), &beta, cch.y_desc_,
+                           outblock->mutable_data());
+        }, {output.block(), b.block()}, {output.block()});
+    }
+    return output;
+};
+
+// Gradient of the loss w.r.t. the filters, computed with cuDNN.
+// W is only passed so that dW can be set up with ResetLike(W); this
+// dependency could be removed later.
+Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor dW;
+    dW.ResetLike(W);
+
+    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
+        // alpha = 1, beta = 0: dW is overwritten, not accumulated into.
+        float alpha = 1.f;
+        float beta = 0.f;
+        Block *x_blk = x.block();
+        Block *dy_blk = dy.block();
+        Block *dw_blk = dW.block();
+        cudnnConvolutionBackwardFilter(
+                ctx->cudnn_handle, &alpha,
+                cch.x_desc_, x_blk->data(),
+                cch.y_desc_, dy_blk->data(),
+                cch.conv_desc_, cch.bp_filter_alg_,
+                cch.workspace_.block()->mutable_data(),
+                cch.workspace_count_ * sizeof(float),
+                &beta,
+                cch.filter_desc_, dw_blk->mutable_data());
+    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+
+    return dW;
+};
+
+// Gradient of the loss w.r.t. the bias, computed with cuDNN.
+// b is only passed so that db can be set up with ResetLike(b); this
+// dependency could be removed later.
+Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor db;
+    db.ResetLike(b);
+
+    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
+        // alpha = 1, beta = 0: db is overwritten, not accumulated into.
+        float alpha = 1.f;
+        float beta = 0.f;
+        Block *dy_blk = dy.block();
+        Block *db_blk = db.block();
+        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha,
+                                     cch.y_desc_, dy_blk->data(),
+                                     &beta,
+                                     cch.bias_desc_, db_blk->mutable_data());
+    }, {dy.block()}, {db.block()});
+    return db;
+};
+
+// Gradient of the loss w.r.t. the input, computed with cuDNN.
+// x is only passed so that dx can be set up with ResetLike(x).
+Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor dx;
+    dx.ResetLike(x);
+
+    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
+        // alpha = 1, beta = 0: dx is overwritten, not accumulated into.
+        float alpha = 1.f;
+        float beta = 0.f;
+        Block *w_blk = W.block();
+        Block *dy_blk = dy.block();
+        Block *dx_blk = dx.block();
+        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha,
+                                     cch.filter_desc_, w_blk->data(),
+                                     cch.y_desc_, dy_blk->data(),
+                                     cch.conv_desc_, cch.bp_data_alg_,
+                                     cch.workspace_.block()->mutable_data(),
+                                     cch.workspace_count_ * sizeof(float),
+                                     &beta,
+                                     cch.x_desc_, dx_blk->mutable_data());
+    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
+
+    return dx;
+};
+
+} //namespace_singa
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d48dea0f/src/model/convolution_functions.h
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.h b/src/model/convolution_functions.h
new file mode 100644
index 0000000..eba0e50
--- /dev/null
+++ b/src/model/convolution_functions.h
@@ -0,0 +1,59 @@
+// Stateless convolution functions (cuDNN implementation) and the handle
+// structs that carry their configuration.
+#ifndef SRC_MODEL_CONVOLUTION_FUNCTIONS_H_
+#define SRC_MODEL_CONVOLUTION_FUNCTIONS_H_
+
+#include <string>
+#include <cudnn.h>
+#include "./layer/cudnn_convolution.h"
+#include "./layer/cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa{
+
+// Device-independent hyper-parameters of a 2-D convolution.
+struct ConvHandle{
+    // Kernel size, zero padding and stride along the width.
+    size_t kernel_w_;
+    size_t pad_w_;
+    size_t stride_w_;
+    // Kernel size, zero padding and stride along the height.
+    size_t kernel_h_;
+    size_t pad_h_;
+    size_t stride_h_;
+
+    // Number of input channels; checked against the input's second dimension.
+    size_t channels_;
+    // Number of filters, i.e. the second dimension of the output.
+    size_t num_filters_;
+
+    // Whether a bias is added in the forward pass.
+    bool bias_term_;
+
+    // Expected upper bound (in bytes) for the cuDNN workspace; the handle
+    // setup code only logs a warning when the required size exceeds it.
+    size_t workspace_byte_limit_;
+    // cuDNN algorithm preference.
+    // NOTE(review): the accepted values are not visible in this header.
+    std::string prefer_;
+};
+
+// cuDNN descriptors, selected algorithms, workspace and input/output geometry
+// used by the CudnnConv* functions.
+struct CudnnConvHandle{
+    cudnnTensorDescriptor_t x_desc_ ;
+    cudnnTensorDescriptor_t y_desc_ ;
+    cudnnTensorDescriptor_t bias_desc_ ;
+    cudnnFilterDescriptor_t filter_desc_ ;
+    cudnnConvolutionDescriptor_t conv_desc_ ;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+
+    // Workspace size counted in floats (bytes = workspace_count_ * sizeof(float)).
+    size_t workspace_count_;
+    // Scratch tensor handed to cuDNN as its workspace.
+    Tensor workspace_;
+
+    // Input height/width and output (convolved) height/width.
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+    // Batch size the handle was created for.
+    size_t batchsize;
+};
+
+// Builds the convolution hyper-parameters from a layer configuration.
+ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf);
+
+// Creates the cuDNN descriptors/workspace for inputs shaped like `input`.
+CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
+
+// Forward convolution of x with filters W (plus bias b when enabled in ch).
+Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
+                        const ConvHandle ch, const CudnnConvHandle cch);
+
+// Gradient w.r.t. the filters; W is only used to set up dW.
+Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch);
+
+// Gradient w.r.t. the bias; b is only used to set up db.
+Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch);
+
+// Gradient w.r.t. the input; x is only used to set up dx.
+Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
+
+}  // namespace singa
+
+#endif  // SRC_MODEL_CONVOLUTION_FUNCTIONS_H_



[13/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- make convolution operation stateless

- implement a new conv2d layer and a new linear layer by calling the corresponding operations

- the newly developed layers have passed the test in a network (/example/autograd/mnist_cnn.py)


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/82ef4179
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/82ef4179
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/82ef4179

Branch: refs/heads/master
Commit: 82ef41799ead7c6fd13746f4b155c1d98c14e6ae
Parents: aa9c52a
Author: xuewanqi <xu...@outlook.com>
Authored: Sun Jul 1 15:55:40 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Mon Jul 2 06:09:07 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py | 222 ++++++++++++++++++++++++++++--------------
 1 file changed, 150 insertions(+), 72 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82ef4179/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index b1475bb..474fff4 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -468,41 +468,6 @@ class Conv2d(Operation):
         ret = self.PyLayer.layer.Backward(self.flag, dy)
         return (ret[0],) + ret[1]
 
-
-class Linear(Operation):
-
-    def __init__(self, in_features, out_features, bias=True):
-        self.in_features = in_features
-        self.out_features = out_features
-        self.w_shape = (in_features, out_features)
-        self.b_shape = (1, out_features)
-        self.bias = bias
-        self.init_value = False
-
-    def get_params(self):
-        assert self.init_value is True, 'must initialize before get_params()'
-        if self.bias:
-            return (self.w, self.b)
-        else:
-            return self.w
-
-    def __call__(self, x):
-        if self.init_value is False:
-            self.w = Tensor(shape=self.w_shape,
-                            requires_grad=True, stores_grad=True)
-            std = math.sqrt(2.0 / (self.in_features + self.out_features))
-            self.w.gaussian(0.0, std)
-            if self.bias:
-                self.b = Tensor(shape=self.b_shape,
-                                requires_grad=True, stores_grad=True)
-                self.b.set_value(0.0)
-            self.init_value = True
-        y = matmul(x, self.w)
-        if self.bias:
-            y = add_bias(y, self.b, axis=0)
-        return y
-
-
 class MaxPool2d(Operation):
 
     def __init__(self, kernel_size=3, stride=1, padding=0, dilation=1,
@@ -583,8 +548,8 @@ class Flatten(Operation):
 def flatten(x):
     return Flatten()(x)[0]
 
-class Conv2D(Operation):
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+class CONV2D(Operation):
+    '''def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True, **kwargs):
 
         self.in_channels = in_channels
@@ -654,60 +619,52 @@ class Conv2D(Operation):
 
     	xs = [x, self.W, self.b]
 
-    	return self._do_forward(*xs)[0]
+    	return self._do_forward(*xs)[0]'''
+    def __init__(self, handles):
+        self.handles = handles
 
-    def forward(self, *xs):
-        assert xs[0].nDim() == 4, 'The dimensions of input should be 4D.'
-        assert xs[0].shape()[1] == self.in_channels, 'in_channels dismatched.'
-        #assert (xs[0].shape()[2]+2*self.padding[0]-self.kernel_size[0]-1)%self.stride[0] == 0, 'invalid padding.'
-        assert 0==0, 'invalid padding'
+    def forward(self, x, W, b):
+        #assert x.nDim() == 4, 'The dimensions of input should be 4D.'
+        #assert x.shape()[1] == self.in_channels, 'in_channels dismatched.'
+        #assert (xs[0].shape()[2]+2*self.padding[0]-self.kernel_size[0])%self.stride[0] == 0, 'invalid padding.'
+        #assert (xs[0].shape()[3]+2*self.padding[1]-self.kernel_size[1])%self.stride[1] == 0, 'invalid padding'
+        #assert 0 == 0, 'invalid padding'
 
         if training:
-            self.x = xs[0]
+            self.inputs = (x,W,b)
 
-        if self.device_id == -1:
-            if not hasattr (self, 'handles'):
-                self.handles = singa.ConvHandles(xs[0], self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias)
-            elif xs[0].shape()[0] != self.handles.batchsize:
-                self.handles = singa.ConvHandles(xs[0], self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias)
-            return singa.CpuConvForward(xs[0], xs[1], xs[2], self.handles)
+        if self.handles.device_id == -1:
+            return singa.CpuConvForward(x, W, b, self.handles)
 
         else:
-            if not hasattr(self, 'handles'):
-                self.handles = singa.CudnnConvHandles(xs[0], self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias,
-                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
-            elif xs[0].shape()[0] != self.handles.batchsize:
-                self.handles = singa.CudnnConvHandles(xs[0], self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias,
-                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
-            return singa.GpuConvForward(xs[0], xs[1], xs[2], self.handles)
+            return singa.GpuConvForward(x, W, b, self.handles)
 
     def backward(self, dy):
-        assert training is True and hasattr(self, 'x'), 'Please set training as True before do BP. '
+        assert training is True and hasattr(self, 'inputs'), 'Please set training as True before do BP. '
 
-        if dy.device().id() != self.device_id:
+        if dy.device().id() != self.handles.device_id:
             dy.ToDevice(self.x.device())
 
-        if self.device_id == -1: 
-            dx = singa.CpuConvBackwardx(dy, self.W.data, self.x, self.handles)
-            dW = singa.CpuConvBackwardW(dy, self.x, self.W.data, self.handles)
-            if self.bias:
-                db = singa.CpuConvBackwardb(dy, self.b.data, self.handles)
+        if self.handles.device_id == -1: 
+            dx = singa.CpuConvBackwardx(dy, self.inputs[1], self.inputs[0], self.handles)
+            dW = singa.CpuConvBackwardW(dy, self.inputs[0], self.inputs[1], self.handles)
+            if self.handles.bias:
+                db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handles)
                 return dx, dW, db
             else:
                 return dx, dW
         else:
-            dx = singa.GpuConvBackwardx(dy, self.W.data, self.x, self.handles)
-            dW = singa.GpuConvBackwardW(dy, self.x, self.W.data, self.handles)
-            if self.bias:
-                db = singa.GpuConvBackwardb(dy, self.b.data, self.handles)
+            dx = singa.GpuConvBackwardx(dy, self.inputs[1], self.inputs[0], self.handles)
+            dW = singa.GpuConvBackwardW(dy, self.inputs[0], self.inputs[1], self.handles)
+            if self.handles.bias:
+                db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handles)
                 return dx, dW, db
             else:
                 return dx, dW
 
+def conv2d(x,W,b,handles):
+    '''Functional convolution: apply a fresh CONV2D operation built from
+    `handles` to (x, W, b) and return the first element of its result.'''
+    return CONV2D(handles)(x,W,b)[0]
+
 def infer_dependency(op):
     '''
     Infer the dependency of all operations with the
@@ -818,3 +775,124 @@ def backward(y, dy=None):
                     del not_ready[src_op]
 
     return gradients
+
+class newlayer(object):
+    def __init__(self):
+        pass
+
+    def device_check(*inputs):
+        pass
+
+
+class Linear(newlayer):
+    def __init__(self, in_features, out_features, bias=True):
+        #self.in_features = in_features
+        #self.out_features = out_features
+        w_shape = (in_features, out_features)
+        b_shape = (1, out_features)
+        self.bias = bias
+        
+        self.W = Tensor(shape=w_shape,
+                        requires_grad=True, stores_grad=True)
+        std = math.sqrt(2.0 / (in_features + out_features))
+        self.W.gaussian(0.0, std)
+        
+        if self.bias:
+            self.b = Tensor(shape=b_shape,
+                            requires_grad=True, stores_grad=True)
+            self.b.set_value(0.0)
+
+    def __call__(self, x):
+        if self.bias:
+            self.device_check(x, self.W, self.b)
+        else:
+            self.device_check(x, self.W)
+        y = matmul(x, self.W)
+        if self.bias:
+            y = add_bias(y, self.b, axis=0)
+        return y
+
+class Conv2D(newlayer):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True, **kwargs):
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        if isinstance(kernel_size, int):
+            self.kernel_size = (kernel_size, kernel_size)
+        elif isinstance(kernel_size, tuple):
+            self.kernel_size = kernel_size
+        else:
+            raise TypeError('Wrong kernel_size type.')
+        
+        if isinstance(stride, int):
+            self.stride = (stride,stride)
+        elif isinstance(stride, tuple):
+            self.stride = stride
+        else:
+            raise TypeError('Wrong stride type.')
+
+        if isinstance(padding, int):
+            self.padding = (padding,padding)
+        elif isinstance(padding, tuple):
+            self.padding = padding
+        else:
+            raise TypeError('Wrong padding type.')
+
+        if dilation != 1 or groups != 1:
+            raise ValueError('Not implemented yet')
+
+        self.bias = bias
+
+        self.inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024}
+        # TODO valid value of inner_params check
+
+        for kwarg in kwargs:
+            if kwarg not in self.inner_params:
+                raise TypeError('Keyword argument not understood:', kwarg)
+            else:
+                self.inner_params[kwarg] = kwargs[kwarg]
+        
+        w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1])
+        self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
+        std = math.sqrt(
+                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
+        self.W.gaussian(0.0, std)
+
+        if self.bias:
+            b_shape = (self.out_channels,)
+            self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
+            self.b.set_value(0.0)
+        else:
+            #to keep consistency when to do forward.
+            self.b = Tensor(data=CTensor([1]), requires_grad=False, stores_grad=False)
+            self.b.set_value(0.0)
+
+    def __call__(self, x):
+        self.device_check(x, self.W, self.b)
+
+        if x.device.id() == -1:
+            if not hasattr (self, 'handles'):
+                self.handles = singa.ConvHandles(x.data, self.kernel_size, self.stride,
+                               self.padding, self.in_channels, self.out_channels, self.bias)
+            elif x.shape[0] != self.handles.batchsize:
+                self.handles = singa.ConvHandles(x.data, self.kernel_size, self.stride,
+                               self.padding, self.in_channels, self.out_channels, self.bias)
+        else:
+            if not hasattr(self, 'handles'):
+                self.handles = singa.CudnnConvHandles(x.data, self.kernel_size, self.stride,
+                               self.padding, self.in_channels, self.out_channels, self.bias,
+                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
+            elif x.shape[0] != self.handles.batchsize:
+                self.handles = singa.CudnnConvHandles(x.data, self.kernel_size, self.stride,
+                               self.padding, self.in_channels, self.out_channels, self.bias,
+                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
+        self.handles.device_id= x.device.id()
+        self.handles.bias=self.bias # can simplified
+        y = conv2d(x, self.W, self.b, self.handles)
+        return y
+
+
+
+


[09/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- implement autograd convolution operation's support functions for CPU part.

- these functions have been tested and have passed almost all of the unit tests.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/dfe4478d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/dfe4478d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/dfe4478d

Branch: refs/heads/master
Commit: dfe4478d330a919107936968f85eea4dea91fc19
Parents: 5c8504a
Author: xuewanqi <xu...@outlook.com>
Authored: Tue Jun 26 01:52:48 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Tue Jun 26 08:24:40 2018 +0000

----------------------------------------------------------------------
 src/api/model_operation.i          |  16 ++++
 src/model/convolution_functions.cc | 160 ++++++++++++++++++++++++++++++++
 src/model/convolution_functions.h  |  28 ++++++
 3 files changed, 204 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dfe4478d/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index a74ec5e..20f112e 100644
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -9,6 +9,8 @@ struct ConvHandle{};
 
 struct CudnnConvHandle{size_t batchsize;};
 
+struct CpuConvHandle{};
+
 ConvHandle SetupConv(
     const size_t kernel_h_, const size_t kernel_w_,
     const size_t pad_h_, const size_t pad_w_,
@@ -28,4 +30,18 @@ Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHand
 
 Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
 
+
+CpuConvHandle InitCpuHandle(const Tensor &input, const ConvHandle ch);
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b,
+                        const ConvHandle ch, const CpuConvHandle cch);
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, 
+    const ConvHandle ch, const CpuConvHandle cch);
+
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, 
+    const ConvHandle ch, const CpuConvHandle cch);
+
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle ch, const CpuConvHandle cch);
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dfe4478d/src/model/convolution_functions.cc
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.cc b/src/model/convolution_functions.cc
index 7ff399a..6e4b195 100644
--- a/src/model/convolution_functions.cc
+++ b/src/model/convolution_functions.cc
@@ -4,6 +4,7 @@
 //#include "./layer/cudnn_utils.h"
 //#include "singa/utils/logging.h"
 #include "./convolution_functions.h"
+#include "./layer/convolution.h"
 #include<iostream>
 namespace singa{
 
@@ -292,6 +293,165 @@ Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, co
     return dx;
 };
 
+// Derives the geometry needed by the CPU convolution functions from the
+// shape of `input` (indexed as batch, channels, height, width) and the
+// hyper-parameters in `ch`.
+CpuConvHandle InitCpuHandle(const Tensor &input, const ConvHandle ch){
+    const size_t batchsize = input.shape(0);
+    const size_t channels_ = input.shape(1);
+    const size_t height_ = input.shape(2);
+    const size_t width_ = input.shape(3);
+
+    CHECK(channels_ == ch.channels_)<<"the number of input channels mismatched.";
+
+    // The output height stays 1 when no vertical stride is configured.
+    size_t conv_height_ = 1;
+    if (ch.stride_h_ > 0) {
+        conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
+    }
+    const size_t conv_width_ =
+        (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
+
+    // Dimensions of the im2col matrix built for one image.
+    const size_t col_height_ = ch.channels_ * ch.kernel_w_ * ch.kernel_h_;
+    const size_t col_width_ = conv_height_ * conv_width_;
+
+    return CpuConvHandle{
+        height_, width_,
+        conv_height_, conv_width_,
+        batchsize,
+        col_height_, col_width_
+    };
+};
+
+// Namespace-scope Convolution layer object; the CPU functions below only
+// call its Im2col / Col2im helpers.
+Convolution C;
+
+// Forward pass of the convolution on the host (im2col + matrix product).
+//
+// x   : input; checked to be a 4-D kCpp tensor of shape
+//       (cch.batchsize, ch.channels_, cch.height_, cch.width_).
+// W,b : filters and bias. They are taken by non-const reference because they
+//       are temporarily reshaped (W to {num_filters, col_height}, b to
+//       {num_filters}) and restored to their original shapes before return.
+// Returns a new tensor of shape
+// (cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_).
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b,
+                        const ConvHandle ch, const CpuConvHandle cch){
+    CHECK_EQ(x.device()->lang(), kCpp);
+    CHECK_EQ(x.nDim(), 4u);
+    CHECK_EQ(x.shape()[0],cch.batchsize);
+    CHECK_EQ(x.shape()[1],ch.channels_);
+    CHECK_EQ(x.shape()[2],cch.height_);
+    CHECK_EQ(x.shape()[3],cch.width_);
+
+    size_t imagesize = x.Size() / cch.batchsize;
+
+    Shape w_shape= W.shape();
+    Shape b_shape= b.shape();
+
+    W.Reshape(Shape{ch.num_filters_, cch.col_height_});
+    if (ch.bias_term_)
+        b.Reshape(Shape{ch.num_filters_});
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+    Shape shape{cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    Tensor col_data(Shape{cch.col_height_, cch.col_width_});//broadcasted image
+
+    // Host scratch buffer for the im2col matrix of one image.
+    float *data_col = new float[cch.col_height_ * cch.col_width_];
+    auto in_data = x.data<float>();
+    for (size_t num = 0; num < cch.batchsize; num++) {
+        C.Im2col(in_data + num * imagesize, ch.channels_, cch.height_, cch.width_, ch.kernel_h_,
+                 ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
+        col_data.CopyDataFromHostPtr(data_col, cch.col_height_ * cch.col_width_);
+        Tensor each = Mult(W, col_data);
+        if (ch.bias_term_) {
+            AddColumn(b, &each);
+        }
+        CopyDataToFrom(&output, each, each.Size(), num * each.Size());
+    }
+    // Release the scratch buffer (it used to be leaked on every call).
+    delete[] data_col;
+
+    W.Reshape(w_shape);
+    b.Reshape(b_shape);
+    return output;
+};
+
+// Gradient of the loss w.r.t. the input, computed on the host.
+//
+// dy : gradient w.r.t. the output; checked to be a 4-D kCpp tensor.
+// W  : filters; temporarily reshaped to {num_filters, col_height} and
+//      restored before returning, hence the non-const reference.
+// x  : forward input, used for its size and to set up dx via ResetLike.
+// Returns dx, filled image by image from Col2im(W^T * dy_image).
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, 
+    const ConvHandle ch, const CpuConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Shape w_shape= W.shape();
+    W.Reshape(Shape{ch.num_filters_, cch.col_height_});
+
+    Tensor dx;
+    dx.ResetLike(x);
+    
+    size_t imagesize = x.Size() / cch.batchsize;
+    // Host scratch buffer receiving the gradient of one input image.
+    float *dx_b = new float[imagesize];
+
+    for (size_t num = 0; num < cch.batchsize; num++) {
+        Tensor grad_b(Shape{ch.num_filters_, cch.conv_height_ * cch.conv_width_});
+        CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+        Tensor dcol_b = Mult(W.T(), grad_b);
+        auto dcol_data = dcol_b.data<float>();
+        C.Col2im(dcol_data, ch.channels_, cch.height_, cch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
+                 ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
+        dx.CopyDataFromHostPtr(dx_b, imagesize, num * imagesize);
+    }
+    // Release the scratch buffer (it used to be leaked on every call).
+    delete[] dx_b;
+
+    W.Reshape(w_shape); 
+    return dx;
+};
+
+// Gradient of the loss w.r.t. the filters, computed on the host.
+//
+// dy : gradient w.r.t. the output; checked to be a 4-D kCpp tensor.
+// x  : forward input; each image is unfolded with Im2col.
+// W  : filters, used only for the shape of the result.
+// Returns dW = sum over the batch of dy_image * im2col(x_image)^T, reshaped
+// back to W's shape.
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, 
+    const ConvHandle ch, const CpuConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    size_t imagesize = x.Size() / cch.batchsize;
+
+    Tensor dW;
+    dW.ResetLike(W);
+    dW.SetValue(0.0f);
+    
+    Shape w_shape= W.shape();
+    dW.Reshape(Shape{ch.num_filters_, cch.col_height_});
+
+    Tensor col_data(Shape{cch.col_height_, cch.col_width_});//broadcasted image
+
+    // Host scratch buffer for the im2col matrix of one image.
+    float *data_col = new float[cch.col_height_ * cch.col_width_];
+    // Im2col must unfold the forward input: imagesize and the
+    // channels/height/width arguments all describe x, not dy.
+    auto in_data = x.data<float>();
+    for (size_t num = 0; num < cch.batchsize; num++) {
+        C.Im2col(in_data + num * imagesize, ch.channels_, cch.height_, cch.width_, ch.kernel_h_,
+                 ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
+        col_data.CopyDataFromHostPtr(data_col, cch.col_height_ * cch.col_width_);
+        Tensor grad_b(Shape{ch.num_filters_, cch.conv_height_ * cch.conv_width_});
+        CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+        dW += Mult(grad_b, col_data.T());
+    }
+    // Release the scratch buffer (it used to be leaked on every call).
+    delete[] data_col;
+
+    dW.Reshape(w_shape);
+    return dW;
+};
+
+// Gradient of the loss w.r.t. the bias, computed on the host.
+// b is only used to set up db via ResetLike.
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle ch, const CpuConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor db;
+    db.ResetLike(b);
+
+    // View dy as a matrix with one row per (sample, filter) pair and reduce
+    // it with SumColumns into a vector holding one value per row.
+    Shape flat_shape{cch.batchsize * ch.num_filters_,
+                     dy.Size() / (cch.batchsize * ch.num_filters_)};
+    Tensor dy_flat = Reshape(dy, flat_shape);
+    Tensor per_map(Shape{cch.batchsize * ch.num_filters_});
+    SumColumns(dy_flat, &per_map);
+
+    // Regroup as (batchsize, num_filters) and reduce with SumRows into db.
+    Tensor per_sample = Reshape(per_map, Shape{cch.batchsize, ch.num_filters_});
+    SumRows(per_sample, &db);
+
+    return db;
+};
+
 } //namespace_singa
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dfe4478d/src/model/convolution_functions.h
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.h b/src/model/convolution_functions.h
index e34423f..1b90941 100644
--- a/src/model/convolution_functions.h
+++ b/src/model/convolution_functions.h
@@ -43,6 +43,20 @@ struct CudnnConvHandle{
     size_t batchsize;
 };
 
+// Geometry used by the CPU convolution functions; filled by InitCpuHandle
+// from the input tensor's shape and a ConvHandle.
+struct CpuConvHandle{
+    // Input height and width (dimensions 2 and 3 of the input tensor).
+    size_t height_;
+    size_t width_;
+    // Output height and width after padding, kernel and stride are applied.
+    size_t conv_height_;
+    size_t conv_width_;
+    // Batch size (dimension 0 of the input tensor).
+    size_t batchsize;
+
+    // im2col matrix size for one image:
+    // col_height_ = channels * kernel_w * kernel_h,
+    // col_width_  = conv_height_ * conv_width_.
+    size_t col_height_;
+    size_t col_width_;
+
+};
+
+    
+
 ConvHandle SetupConv(
     const size_t kernel_h_, const size_t kernel_w_,
     const size_t pad_h_, const size_t pad_w_,
@@ -64,4 +78,18 @@ Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHand
 
 Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
 
+
+CpuConvHandle InitCpuHandle(const Tensor &input, const ConvHandle ch);
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b,
+                        const ConvHandle ch, const CpuConvHandle cch);
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, 
+    const ConvHandle ch, const CpuConvHandle cch);
+
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, 
+    const ConvHandle ch, const CpuConvHandle cch);
+
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle ch, const CpuConvHandle cch);
+
 }


[14/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- tidy some files and fixed some bugs.

- add few shape checks and functions in new developed layer.

- rename some files, classes, variables


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/15c0230c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/15c0230c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/15c0230c

Branch: refs/heads/master
Commit: 15c0230cbc98c3662f5e2519bed4da4b26741a4f
Parents: 82ef417
Author: xuewanqi <xu...@outlook.com>
Authored: Mon Jul 2 05:53:13 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Tue Jul 3 03:37:48 2018 +0000

----------------------------------------------------------------------
 examples/autograd/mlp.py                     |   2 +-
 examples/autograd/mnist_cnn.py               |   2 +-
 python/singa/autograd.py                     | 313 +++++-------------
 src/api/core_device.i                        |   3 -
 src/api/model_operation.i                    |  28 +-
 src/model/operation/convolution.cc           | 371 ++++++++++++++++++++++
 src/model/operation/convolution.h            |  78 +++++
 src/model/operation/convolution_operation.cc | 366 ---------------------
 src/model/operation/convolution_operation.h  |  78 -----
 test/python/test_operation.py                |  27 +-
 10 files changed, 564 insertions(+), 704 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/examples/autograd/mlp.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mlp.py b/examples/autograd/mlp.py
old mode 100644
new mode 100755
index f7c4353..0447927
--- a/examples/autograd/mlp.py
+++ b/examples/autograd/mlp.py
@@ -62,7 +62,7 @@ if __name__ == '__main__':
     label = to_categorical(label, 2).astype(np.float32)
     print('train_data_shape:', data.shape)
     print('train_label_shape:', label.shape)
-    # 1
+    
     inputs = Tensor(data=data)
     target = Tensor(data=label)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/examples/autograd/mnist_cnn.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py
old mode 100644
new mode 100755
index cbb5650..a82f64c
--- a/examples/autograd/mnist_cnn.py
+++ b/examples/autograd/mnist_cnn.py
@@ -100,7 +100,7 @@ if __name__ == '__main__':
     print('the shape of testing label is', y_test.shape)
 
     # operations initialization
-    conv1 = autograd.Conv2D(1, 32, 3, padding=1)
+    conv1 = autograd.Conv2D(1, 32, 3, padding=1, bias=False)
     conv2 = autograd.Conv2D(32, 32, 3, padding=1)
     linear = autograd.Linear(32 * 28 * 28, 10)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
old mode 100644
new mode 100755
index 474fff4..2a10608
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -369,105 +369,6 @@ def ctensor2numpy(x):
     return np_array.reshape(x.shape())
 
 
-class Conv2d(Operation):
-
-    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True, **kwargs):
-
-        inner_params = {'name': 'Conv2d',
-                        'border_mode': 'same',
-                        'cudnn_prefer': 'fastest',
-                        'workspace_byte_limit': 1024,
-                        'data_format': 'NCHW',
-                        'W_specs': {'init': 'xavier'},
-                        'b_specs': {'init': 'constant'},
-                        'input_sample_shape': None}
-        # TODO valid value of inner_params check
-
-        for kwarg in kwargs:
-            if kwarg not in inner_params:
-                raise TypeError('Keyword argument not understood:', kwarg)
-            else:
-                inner_params[kwarg] = kwargs[kwarg]
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.W_specs = inner_params['W_specs']
-        self.b_specs = inner_params['b_specs']
-
-        if isinstance(kernel_size, int):
-            self.kernel_size = (kernel_size, kernel_size)
-        else:
-            self.kernel_size = kernel_size
-
-        if padding == 0:
-            pad = None
-        else:
-            pad = padding
-
-        if dilation != 1 or groups != 1:
-            raise ValueError('Not implemented yet')
-
-        self.PyLayer = layer.Conv2D(inner_params['name'],
-                                    nb_kernels=out_channels,
-                                    kernel=kernel_size,
-                                    stride=stride,
-                                    border_mode=inner_params['border_mode'],
-                                    cudnn_prefer=inner_params['cudnn_prefer'],
-                                    workspace_byte_limit=inner_params[
-                                        'workspace_byte_limit'],
-                                    data_format=inner_params['data_format'],
-                                    use_bias=bias,
-                                    W_specs=self.W_specs,
-                                    b_specs=self.b_specs,
-                                    pad=pad,
-                                    input_sample_shape=inner_params['input_sample_shape'])
-
-    def get_params(self):
-        assert self.init_value is True, 'must initialize before get_params()'
-        if self.bias:
-            return (self.w, self.b)
-        else:
-            return self.w
-
-    def __call__(self, x):
-        if training:
-            self.flag = model_pb2.kTrain
-        else:
-            self.flag = model_pb2.kEval
-
-        if not self.PyLayer.has_setup:
-            self.PyLayer.setup(x.shape[1:])
-
-        param_data = self.PyLayer.layer.param_values()
-
-        if not hasattr(self, 'w'):
-            self.w = Tensor(device=param_data[0].device(), data=param_data[
-                            0], requires_grad=True, stores_grad=True)
-            std = math.sqrt(
-                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
-            self.w.gaussian(0.0, std)
-
-        xs = [x, self.w]
-
-        if len(param_data) == 2:
-            if not hasattr(self, 'b'):
-                self.b = Tensor(device=param_data[1].device(), data=param_data[
-                                1], requires_grad=True, stores_grad=True)
-                self.b.set_value(0.0)
-
-            xs.append(self.b)
-
-        xs = tuple(xs)
-        return self._do_forward(*xs)[0]
-
-    def forward(self, *xs):
-        return self.PyLayer.layer.Forward(self.flag, xs[0])
-
-    def backward(self, dy):
-        ret = self.PyLayer.layer.Backward(self.flag, dy)
-        return (ret[0],) + ret[1]
-
 class MaxPool2d(Operation):
 
     def __init__(self, kernel_size=3, stride=1, padding=0, dilation=1,
@@ -548,80 +449,11 @@ class Flatten(Operation):
 def flatten(x):
     return Flatten()(x)[0]
 
-class CONV2D(Operation):
-    '''def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True, **kwargs):
 
-        self.in_channels = in_channels
-        self.out_channels = out_channels
+class _Conv2D(Operation):
 
-        if isinstance(kernel_size, int):
-            self.kernel_size = (kernel_size, kernel_size)
-        elif isinstance(kernel_size, tuple):
-            self.kernel_size = kernel_size
-        else:
-            raise TypeError('Wrong kernel_size type.')
-        
-        if isinstance(stride, int):
-            self.stride = (stride,stride)
-        elif isinstance(stride, tuple):
-            self.stride = stride
-        else:
-            raise TypeError('Wrong stride type.')
-
-        if isinstance(padding, int):
-            self.padding = (padding,padding)
-        elif isinstance(padding, tuple):
-            self.padding = padding
-        else:
-            raise TypeError('Wrong padding type.')
-
-        if dilation != 1 or groups != 1:
-            raise ValueError('Not implemented yet')
-
-        self.bias = bias
-
-        self.inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024}
-        # TODO valid value of inner_params check
-
-        for kwarg in kwargs:
-            if kwarg not in self.inner_params:
-                raise TypeError('Keyword argument not understood:', kwarg)
-            else:
-                self.inner_params[kwarg] = kwargs[kwarg]
-        
-        w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1])
-        self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
-        std = math.sqrt(
-                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
-        self.W.gaussian(0.0, std)
-
-        if self.bias:
-            b_shape = (self.out_channels,)
-            self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
-            self.b.set_value(0.0)
-        else:
-            #to keep consistency when to do forward.
-            self.b = Tensor(data=CTensor([]), requires_grad=False, stores_grad=False)
-
-    def __call__(self, x): 
-        if not hasattr(self, 'device_id'):
-            self.device_id = x.device.id()
-        else:
-            assert self.device_id == x.device.id(),'Not the same device.'
-
-        if self.W.device.id() != self.device_id:
-            self.W.to_device(x.device)
-
-        if self.bias:
-            if self.b.device.id() != self.device_id:
-                self.b.to_device(x.device)
-
-    	xs = [x, self.W, self.b]
-
-    	return self._do_forward(*xs)[0]'''
-    def __init__(self, handles):
-        self.handles = handles
+    def __init__(self, handle):
+        self.handle = handle
 
     def forward(self, x, W, b):
         #assert x.nDim() == 4, 'The dimensions of input should be 4D.'
@@ -631,39 +463,46 @@ class CONV2D(Operation):
         #assert 0 == 0, 'invalid padding'
 
         if training:
-            self.inputs = (x,W,b)
+            self.inputs = (x, W, b)
 
-        if self.handles.device_id == -1:
-            return singa.CpuConvForward(x, W, b, self.handles)
+        if self.handle.device_id == -1:
+            return singa.CpuConvForward(x, W, b, self.handle)
 
         else:
-            return singa.GpuConvForward(x, W, b, self.handles)
+            return singa.GpuConvForward(x, W, b, self.handle)
 
     def backward(self, dy):
-        assert training is True and hasattr(self, 'inputs'), 'Please set training as True before do BP. '
-
-        if dy.device().id() != self.handles.device_id:
-            dy.ToDevice(self.x.device())
-
-        if self.handles.device_id == -1: 
-            dx = singa.CpuConvBackwardx(dy, self.inputs[1], self.inputs[0], self.handles)
-            dW = singa.CpuConvBackwardW(dy, self.inputs[0], self.inputs[1], self.handles)
-            if self.handles.bias:
-                db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handles)
+        assert training is True and hasattr(
+            self, 'inputs'), 'Please set training as True before do BP. '
+
+        if dy.device().id() != self.handle.device_id:
+            dy.ToDevice(self.inputs[0].device())
+
+        if self.handle.device_id == -1:
+            dx = singa.CpuConvBackwardx(
+                dy, self.inputs[1], self.inputs[0], self.handle)
+            dW = singa.CpuConvBackwardW(
+                dy, self.inputs[0], self.inputs[1], self.handle)
+            if self.handle.bias_term_:
+                db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handle)
                 return dx, dW, db
             else:
-                return dx, dW
+                return dx, dW, None
         else:
-            dx = singa.GpuConvBackwardx(dy, self.inputs[1], self.inputs[0], self.handles)
-            dW = singa.GpuConvBackwardW(dy, self.inputs[0], self.inputs[1], self.handles)
-            if self.handles.bias:
-                db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handles)
+            dx = singa.GpuConvBackwardx(
+                dy, self.inputs[1], self.inputs[0], self.handle)
+            dW = singa.GpuConvBackwardW(
+                dy, self.inputs[0], self.inputs[1], self.handle)
+            if self.handle.bias_term_:
+                db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handle)
                 return dx, dW, db
             else:
-                return dx, dW
+                return dx, dW, None
+
+
+def conv2d(x, W, b, handle):
+    return _Conv2D(handle)(x, W, b)[0]
 
-def conv2d(x,W,b,handles):
-    return CONV2D(handles)(x,W,b)[0]
 
 def infer_dependency(op):
     '''
@@ -776,27 +615,33 @@ def backward(y, dy=None):
 
     return gradients
 
-class newlayer(object):
+
+class NewLayer(object):
+
     def __init__(self):
         pass
 
-    def device_check(*inputs):
-        pass
+    def device_check(self, *inputs):
+        x_device = inputs[0].device
+        for var in inputs:
+            if var.device.id() != x_device:
+                var.to_device(x_device)
+
 
+class Linear(NewLayer):
 
-class Linear(newlayer):
     def __init__(self, in_features, out_features, bias=True):
         #self.in_features = in_features
         #self.out_features = out_features
         w_shape = (in_features, out_features)
         b_shape = (1, out_features)
         self.bias = bias
-        
+
         self.W = Tensor(shape=w_shape,
                         requires_grad=True, stores_grad=True)
         std = math.sqrt(2.0 / (in_features + out_features))
         self.W.gaussian(0.0, std)
-        
+
         if self.bias:
             self.b = Tensor(shape=b_shape,
                             requires_grad=True, stores_grad=True)
@@ -812,7 +657,9 @@ class Linear(newlayer):
             y = add_bias(y, self.b, axis=0)
         return y
 
-class Conv2D(newlayer):
+
+class Conv2D(NewLayer):
+
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True, **kwargs):
 
@@ -825,16 +672,16 @@ class Conv2D(newlayer):
             self.kernel_size = kernel_size
         else:
             raise TypeError('Wrong kernel_size type.')
-        
+
         if isinstance(stride, int):
-            self.stride = (stride,stride)
+            self.stride = (stride, stride)
         elif isinstance(stride, tuple):
             self.stride = stride
         else:
             raise TypeError('Wrong stride type.')
 
         if isinstance(padding, int):
-            self.padding = (padding,padding)
+            self.padding = (padding, padding)
         elif isinstance(padding, tuple):
             self.padding = padding
         else:
@@ -845,7 +692,8 @@ class Conv2D(newlayer):
 
         self.bias = bias
 
-        self.inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024}
+        self.inner_params = {'cudnn_prefer': 'fastest',
+                             'workspace_MB_limit': 1024}
         # TODO valid value of inner_params check
 
         for kwarg in kwargs:
@@ -853,46 +701,49 @@ class Conv2D(newlayer):
                 raise TypeError('Keyword argument not understood:', kwarg)
             else:
                 self.inner_params[kwarg] = kwargs[kwarg]
-        
-        w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1])
+
+        w_shape = (self.out_channels, self.in_channels,
+                   self.kernel_size[0], self.kernel_size[1])
         self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
         std = math.sqrt(
-                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
+            2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
         self.W.gaussian(0.0, std)
 
         if self.bias:
             b_shape = (self.out_channels,)
-            self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
+            self.b = Tensor(shape=b_shape, requires_grad=True,
+                            stores_grad=True)
             self.b.set_value(0.0)
         else:
-            #to keep consistency when to do forward.
-            self.b = Tensor(data=CTensor([1]), requires_grad=False, stores_grad=False)
+            # to keep consistency when to do forward.
+            self.b = Tensor(data=CTensor(
+                [1]), requires_grad=False, stores_grad=False)
             self.b.set_value(0.0)
 
     def __call__(self, x):
+        assert x.shape[1] == self.in_channels,'in_channels dismatched'
+        assert (x.shape[2]+2*self.padding[0]-self.kernel_size[0])%self.stride[0] == 0, 'invalid padding or strides.'
+        assert (x.shape[3]+2*self.padding[1]-self.kernel_size[1])%self.stride[1] == 0, 'invalid padding or stride.'
+
         self.device_check(x, self.W, self.b)
 
         if x.device.id() == -1:
-            if not hasattr (self, 'handles'):
-                self.handles = singa.ConvHandles(x.data, self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias)
-            elif x.shape[0] != self.handles.batchsize:
-                self.handles = singa.ConvHandles(x.data, self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias)
+            if not hasattr(self, 'handle'):
+                self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
+                                                 self.padding, self.in_channels, self.out_channels, self.bias)
+            elif x.shape[0] != self.handle.batchsize:
+                self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
+                                                 self.padding, self.in_channels, self.out_channels, self.bias)
         else:
-            if not hasattr(self, 'handles'):
-                self.handles = singa.CudnnConvHandles(x.data, self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias,
-                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
-            elif x.shape[0] != self.handles.batchsize:
-                self.handles = singa.CudnnConvHandles(x.data, self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias,
-                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
-        self.handles.device_id= x.device.id()
-        self.handles.bias=self.bias # can simplified
-        y = conv2d(x, self.W, self.b, self.handles)
+            if not hasattr(self, 'handle'):
+                self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
+                                                      self.padding, self.in_channels, self.out_channels, self.bias,
+                                                      self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
+            elif x.shape[0] != self.handle.batchsize:
+                self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
+                                                      self.padding, self.in_channels, self.out_channels, self.bias,
+                                                      self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
+        self.handle.device_id = x.device.id()
+
+        y = conv2d(x, self.W, self.b, self.handle)
         return y
-
-
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/api/core_device.i
----------------------------------------------------------------------
diff --git a/src/api/core_device.i b/src/api/core_device.i
index 381f7c6..a5b7de6 100644
--- a/src/api/core_device.i
+++ b/src/api/core_device.i
@@ -43,14 +43,11 @@ namespace std{
 
 namespace singa{
 
-enum LangType {kCpp, kCuda, kOpencl,kNumDeviceType};
-
 class Device {
  public:
   virtual void SetRandSeed(unsigned seed) = 0;
   std::shared_ptr<Device> host();
   int id() const;
-  LangType lang() const;
 };
 
 class Platform {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
old mode 100644
new mode 100755
index 29f8f58..58e5270
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -1,46 +1,48 @@
 %module model_operation
 
 %{
-#include "../src/model/operation/convolution_operation.h"
+#include "../src/model/operation/convolution.h"
 %}
 namespace singa{
 
-struct ConvHandles{
+struct ConvHandle{
 
 		size_t batchsize;
+        const bool bias_term_;
 
-		ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+		ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
                     const std::vector<size_t> stride, const std::vector<size_t> padding,
                     const size_t in_channels, const size_t out_channels,
                     const bool bias_term_);
               	};
 
-struct CudnnConvHandles{
+struct CudnnConvHandle{
 
 		size_t batchsize;
+        const bool bias_term_;
 		
-		CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+		CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
                     const std::vector<size_t> stride, const std::vector<size_t> padding,
                     const size_t in_channels, const size_t out_channels,
                     const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024,
                     const std::string prefer_="fastest");
                 };
 
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch);
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);
 
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch);
 
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch);
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch);
 
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch);
 
 
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch);
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch);
 
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch);
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch);
 
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch);
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch);
 
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch);
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch);
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
new file mode 100755
index 0000000..8d60df4
--- /dev/null
+++ b/src/model/operation/convolution.cc
@@ -0,0 +1,371 @@
+#include "./convolution.h"
+#include "../layer/convolution.h"
+#include<iostream>
+
+namespace singa{
+
+ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
+	                const std::vector<size_t> stride, const std::vector<size_t> padding,
+	                const size_t in_channels, const size_t out_channels,
+	                const bool bias){
+    kernel_h_=kernel_size[0];
+    kernel_w_=kernel_size[1];
+
+    pad_h_=padding[0];
+    pad_w_=padding[1];
+
+    stride_h_=stride[0];
+    stride_w_=stride[1];
+
+    channels_=in_channels;
+    num_filters_=out_channels;
+
+    bias_term_ = bias;
+
+	batchsize = input.shape(0);
+	CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched.";
+    height_ = input.shape(2);
+    width_ = input.shape(3);
+
+    conv_height_ = 1;
+    if (stride_h_ > 0)
+        conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+    conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
+
+    col_height_ = in_channels * kernel_w_ * kernel_h_;
+    col_width_ = conv_height_ * conv_width_;
+    imagesize = input.Size() / batchsize;
+};	
+
+CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,const bool bias_term_, 
+                    const size_t workspace_byte_limit_,const std::string prefer_)
+                    :ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_){
+
+    DataType dtype = input.data_type();
+    auto dev = input.device();
+    Context *ctx = dev->context(0);
+
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+    if (bias_term_)
+        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), batchsize,
+                                           channels_, height_, width_));
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+            num_filters_, conv_height_, conv_width_));
+    if (bias_term_)
+        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                               GetCudnnDataType(dtype), 1,
+                                               num_filters_, 1, 1));
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
+                                                stride_h_, stride_w_, 1, 1,
+                                                CUDNN_CROSS_CORRELATION,
+                                                GetCudnnDataType(dtype)));
+    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                           CUDNN_TENSOR_NCHW, num_filters_,
+                                           channels_, kernel_h_, kernel_w_));
+    if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+        prefer_ == "no_workspace") {
+        cudnnConvolutionFwdPreference_t fwd_pref;
+        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+        if (prefer_ == "fastest") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+        } else if (prefer_ == "limited_workspace") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        } else {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        }
+        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+                workspace_byte_limit_, &fp_alg_));
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+        // deprecated in cudnn v7
+        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+        } else if (prefer_ == "autotune") {
+        const int topk = 1;
+        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+                &num_fp_alg, fp_alg_perf));
+        fp_alg_ = fp_alg_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                &num_bp_filt_alg, bp_filt_perf));
+        bp_filter_alg_ = bp_filt_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                &num_bp_data_alg, bp_data_perf));
+        bp_data_alg_ = bp_data_perf[0].algo;
+    } else {
+        LOG(FATAL) << "Preferred algorithm is not available!";
+    }
+
+    size_t fp_byte, bp_data_byte, bp_filter_byte;
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+            &fp_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+            bp_data_alg_, &bp_data_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+            bp_filter_alg_, &bp_filter_byte));
+    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                       sizeof(float) +
+                       1;
+    if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
+        LOG(WARNING) << "The required memory for workspace ("
+                     << workspace_count_ * sizeof(float)
+                     << ") is larger than the expected Bytes ("
+                     << workspace_byte_limit_ << ")";
+    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+};
+
+Convolution C;
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch){
+	CHECK_EQ(x.device()->lang(), kCpp);
+
+	CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
+    x.shape(3) == ch.width_) << "input sample shape should not change";
+
+    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
+    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+
+    Shape w_shape= W.shape();
+    Shape b_shape;
+    if (ch.bias_term_)
+      b_shape= b.shape();
+
+    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+    if (ch.bias_term_)
+      b.Reshape(Shape{ch.num_filters_});
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+    Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+
+    float *data_col = new float[ch.col_height_ * ch.col_width_];
+    auto in_data = x.data<float>();
+    for (size_t num = 0; num < ch.batchsize; num++) {
+      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);    
+
+      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
+      Tensor each = Mult(W, col_data);
+      if (ch.bias_term_) {
+          AddColumn(b, &each);
+        }
+      CopyDataToFrom(&output, each, each.Size(), num * each.Size());
+    };
+  W.Reshape(w_shape);
+  if (ch.bias_term_)
+    b.Reshape(b_shape);
+  return output;
+}; 
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    
+    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
+    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+
+    Shape w_shape= W.shape();
+    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+
+    Tensor dx;
+    dx.ResetLike(x);
+    
+    float *dx_b = new float[ch.imagesize];
+
+    for (size_t num = 0; num < ch.batchsize; num++) {
+      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+      Tensor dcol_b = Mult(W.T(), grad_b);
+      auto dcol_data = dcol_b.data<float>();
+      C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
+           ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
+      dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
+    }
+  W.Reshape(w_shape); 
+  return dx;
+};
+
+/**
+ * CPU backward pass of the convolution with respect to the filter weights.
+ *
+ * @param dy gradient w.r.t. the convolution output; must be on a kCpp device
+ *           and have dims 1..3 equal to (num_filters_, conv_height_,
+ *           conv_width_) of `ch`.
+ * @param x  forward input with dims 1..3 equal to (channels_, height_,
+ *           width_) of `ch`.
+ * @param W  filter weights; dW is created from it with ResetLike and is
+ *           returned with W's shape.
+ * @param ch handle holding the convolution geometry.
+ * @return dW, accumulated over all samples of the batch.
+ */
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch) {
+  CHECK_EQ(dy.device()->lang(), kCpp);
+
+  CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+        dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+  CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
+        x.shape(3) == ch.width_) << "input sample shape should not change";
+
+  Tensor dW;
+  dW.ResetLike(W);
+  dW.SetValue(0.0f);
+
+  // Accumulate in 2-D form and restore the original shape before returning.
+  const Shape w_shape = W.shape();
+  dW.Reshape(Shape{ch.num_filters_, ch.col_height_});
+
+  Tensor col_data(Shape{ch.col_height_, ch.col_width_});  // unfolded image
+
+  // Host buffer for the unfolded sample. It is owned by a std::vector so it
+  // is released on return; the raw new[] used previously was never deleted.
+  std::vector<float> data_col(ch.col_height_ * ch.col_width_);
+
+  // Im2col must unfold the forward input: the per-sample stride
+  // (ch.imagesize) and the channels_/height_/width_ arguments describe x,
+  // not dy. Reading from dy here, as before, fed the wrong data.
+  auto in_data = x.data<float>();
+  for (size_t num = 0; num < ch.batchsize; num++) {
+    C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+             ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col.data());
+    col_data.CopyDataFromHostPtr(data_col.data(), ch.col_height_ * ch.col_width_);
+    // Copy this sample's slice of dy into a 2-D matrix.
+    Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+    CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+    dW += Mult(grad_b, col_data.T());
+  }
+  dW.Reshape(w_shape);
+  return dW;
+}
+
+/**
+ * CPU backward pass of the convolution with respect to the bias.
+ *
+ * dy is viewed as a (batchsize * num_filters_, rest) matrix, reduced with
+ * SumColumns into a vector of batchsize * num_filters_ entries, viewed again
+ * as (batchsize, num_filters_) and reduced with SumRows into db.
+ *
+ * @param dy gradient w.r.t. the convolution output; must be on a kCpp device.
+ * @param b  bias; db is created from it with ResetLike.
+ * @param ch handle holding the convolution geometry.
+ * @return db, the gradient w.r.t. the bias.
+ */
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch) {
+  CHECK_EQ(dy.device()->lang(), kCpp);
+
+  CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+        dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+  CHECK(b.shape(0) == ch.num_filters_)<< "bias shape should not change";
+
+  Tensor db;
+  db.ResetLike(b);
+
+  // One row per (sample, filter) pair.
+  const size_t rows = ch.batchsize * ch.num_filters_;
+  Tensor dy_2d = Reshape(dy, Shape{rows, dy.Size() / rows});
+
+  Tensor per_row(Shape{rows});
+  SumColumns(dy_2d, &per_row);
+
+  Tensor per_sample = Reshape(per_row, Shape{ch.batchsize, ch.num_filters_});
+  SumRows(per_sample, &db);
+
+  return db;
+}
+
+/**
+ * cuDNN forward convolution.
+ *
+ * @param x   input tensor; must be on a kCuda device.
+ * @param W   filter weights.
+ * @param b   bias; only read when cch.bias_term_ is true.
+ * @param cch handle holding the cuDNN descriptors, the chosen forward
+ *            algorithm and the workspace tensor.
+ * @return output of shape (batchsize, num_filters_, conv_height_,
+ *         conv_width_) on x's device with x's data type.
+ */
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch) {
+  CHECK_EQ(x.device()->lang(), kCuda);
+
+  DataType dtype = x.data_type();
+  auto dev = x.device();
+
+  Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
+  Tensor output(shape, dev, dtype);
+
+  // The workspace block is written by cuDNN, so it belongs in the write list
+  // next to the output block, as GpuConvBackwardx/GpuConvBackwardW do.
+  // Previously it was passed as a stray fourth argument to Exec.
+  output.device()->Exec([output, x, W, cch](Context *ctx) {
+    Block *inblock = x.block(), *outblock = output.block(),
+          *wblock = W.block();
+    float alpha = 1.f, beta = 0.f;  // beta = 0: overwrite the output
+    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
+                            inblock->data(), cch.filter_desc_, wblock->data(),
+                            cch.conv_desc_, cch.fp_alg_,
+                            cch.workspace_.block()->mutable_data(),
+                            cch.workspace_count_ * sizeof(float), &beta,
+                            cch.y_desc_, outblock->mutable_data());
+  }, {x.block(), W.block()}, {output.block(), cch.workspace_.block()});
+
+  if (cch.bias_term_) {
+    output.device()->Exec([output, b, cch](Context *ctx) {
+      float beta = 1.f, alpha = 1.0f;  // beta = 1: add the bias onto the output
+      Block *outblock = output.block(), *bblock = b.block();
+      cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
+                     bblock->data(), &beta, cch.y_desc_,
+                     outblock->mutable_data());
+    }, {output.block(), b.block()}, {output.block()});
+  }
+
+  return output;
+}
+
+/**
+ * cuDNN backward convolution with respect to the input.
+ *
+ * @param dy  gradient w.r.t. the convolution output; must be on a kCuda device.
+ * @param W   filter weights.
+ * @param x   forward input; dx is created from it with ResetLike.
+ * @param cch handle holding the cuDNN descriptors, the chosen backward-data
+ *            algorithm and the workspace tensor.
+ * @return dx, the gradient w.r.t. the input.
+ */
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch) {
+  CHECK_EQ(dy.device()->lang(), kCuda);
+
+  Tensor dx;
+  dx.ResetLike(x);
+
+  dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
+    Block *filter_blk = W.block();
+    Block *grad_out_blk = dy.block();
+    Block *grad_in_blk = dx.block();
+    float alpha = 1.f;
+    float beta = 0.f;  // overwrite dx rather than accumulate into it
+    cudnnConvolutionBackwardData(
+        ctx->cudnn_handle, &alpha,
+        cch.filter_desc_, filter_blk->data(),
+        cch.y_desc_, grad_out_blk->data(),
+        cch.conv_desc_, cch.bp_data_alg_,
+        cch.workspace_.block()->mutable_data(),
+        cch.workspace_count_ * sizeof(float),
+        &beta,
+        cch.x_desc_, grad_in_blk->mutable_data());
+  }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
+
+  return dx;
+}
+
+/**
+ * cuDNN backward convolution with respect to the filter weights.
+ *
+ * @param dy  gradient w.r.t. the convolution output; must be on a kCuda device.
+ * @param x   forward input.
+ * @param W   filter weights; dW is created from it with ResetLike.
+ * @param cch handle holding the cuDNN descriptors, the chosen backward-filter
+ *            algorithm and the workspace tensor.
+ * @return dW, the gradient w.r.t. the weights.
+ */
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch) {
+  CHECK_EQ(dy.device()->lang(), kCuda);
+
+  Tensor dW;
+  dW.ResetLike(W);
+
+  dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
+    Block *input_blk = x.block();
+    Block *grad_out_blk = dy.block();
+    Block *grad_w_blk = dW.block();
+    float alpha = 1.f;
+    float beta = 0.f;  // overwrite dW rather than accumulate into it
+    cudnnConvolutionBackwardFilter(
+        ctx->cudnn_handle, &alpha,
+        cch.x_desc_, input_blk->data(),
+        cch.y_desc_, grad_out_blk->data(),
+        cch.conv_desc_, cch.bp_filter_alg_,
+        cch.workspace_.block()->mutable_data(),
+        cch.workspace_count_ * sizeof(float),
+        &beta,
+        cch.filter_desc_, grad_w_blk->mutable_data());
+  }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+
+  return dW;
+}
+
+/**
+ * cuDNN backward convolution with respect to the bias.
+ *
+ * @param dy  gradient w.r.t. the convolution output; must be on a kCuda device.
+ * @param b   bias; it is only used to create db via ResetLike (the original
+ *            author notes this parameter could be dropped later).
+ * @param cch handle holding the cuDNN tensor descriptors.
+ * @return db, the gradient w.r.t. the bias.
+ */
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch) {
+  CHECK_EQ(dy.device()->lang(), kCuda);
+
+  Tensor db;
+  db.ResetLike(b);
+
+  dy.device()->Exec([db, dy, b, cch](Context *ctx) {
+    Block *grad_out_blk = dy.block();
+    Block *grad_b_blk = db.block();
+    float alpha = 1.f;
+    float beta = 0.f;  // overwrite db rather than accumulate into it
+    cudnnConvolutionBackwardBias(
+        ctx->cudnn_handle, &alpha,
+        cch.y_desc_, grad_out_blk->data(),
+        &beta,
+        cch.bias_desc_, grad_b_blk->mutable_data());
+  }, {dy.block()}, {db.block()});
+
+  return db;
+}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h
new file mode 100755
index 0000000..96a6d60
--- /dev/null
+++ b/src/model/operation/convolution.h
@@ -0,0 +1,78 @@
+#include <string>
+#include <vector>
+#include <cudnn.h>
+#include "../layer/cudnn_convolution.h"
+#include "../layer/cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa{
+
+struct ConvHandle{  // Convolution geometry read by the CpuConv* functions; filled in by the constructor.
+    size_t kernel_w_;  // filter width; checked against W.shape(3)
+    size_t pad_w_;  // width padding passed to Im2col/Col2im
+    size_t stride_w_;  // width stride passed to Im2col/Col2im
+    size_t kernel_h_;  // filter height; checked against W.shape(2)
+    size_t pad_h_;  // height padding passed to Im2col/Col2im
+    size_t stride_h_;  // height stride passed to Im2col/Col2im
+
+    size_t channels_;  // input channels; checked against x.shape(1) and W.shape(1)
+    size_t num_filters_;  // output channels; checked against W.shape(0) and dy.shape(1)
+
+    bool bias_term_;  // when true the forward functions add the bias to the output
+
+    size_t height_;  // input height; checked against x.shape(2)
+    size_t width_;  // input width; checked against x.shape(3)
+    size_t conv_height_;  // output height; checked against dy.shape(2)
+    size_t conv_width_;  // output width; checked against dy.shape(3)
+    size_t batchsize;  // number of samples the per-sample loops iterate over
+
+    size_t col_height_;  // rows of the im2col matrix; also the 2-D width W is reshaped to
+    size_t col_width_;  // columns of the im2col matrix
+    size_t imagesize;  // element offset between consecutive samples of x
+
+    ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,
+                    const bool bias);  // NOTE(review): presumably kernel_size/stride/padding are {h, w} pairs - confirm against the definition
+
+};
+
+struct CudnnConvHandle:ConvHandle{  // ConvHandle plus the cuDNN state used by the GpuConv* functions.
+	cudnnTensorDescriptor_t x_desc_ ;  // descriptor used for the input x and for dx
+    cudnnTensorDescriptor_t y_desc_ ;  // descriptor used for the output and for dy
+    cudnnTensorDescriptor_t bias_desc_ ;  // descriptor used for the bias b and for db
+    cudnnFilterDescriptor_t filter_desc_ ;  // descriptor used for the weights W and for dW
+    cudnnConvolutionDescriptor_t conv_desc_ ;  // convolution descriptor passed to all three cuDNN convolution calls
+    cudnnConvolutionFwdAlgo_t fp_alg_;  // algorithm passed to cudnnConvolutionForward
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;  // algorithm passed to cudnnConvolutionBackwardFilter
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;  // algorithm passed to cudnnConvolutionBackwardData
+
+    size_t workspace_count_;  // workspace size in floats; callers multiply by sizeof(float) to get bytes
+    Tensor workspace_;  // tensor whose block is handed to cuDNN as scratch memory
+
+    CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,
+                    const bool bias, const size_t workspace_byte_limit_=1024*1024*1024,
+                    const std::string prefer_="fastest");  // defaults: 1 GiB workspace limit, "fastest" preference
+};  // NOTE(review): no destructor is declared and the Gpu* lambdas copy this handle by value - confirm where the cuDNN descriptors are destroyed
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch);  // CPU forward pass; W (and b when bias is enabled) are non-const because they are reshaped in place and restored
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch);  // CPU gradient w.r.t. the input; W is reshaped in place and restored
+
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch);  // CPU gradient w.r.t. the weights; result has W's shape
+
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch);  // CPU gradient w.r.t. the bias; b is only used to create the result
+
+
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);  // cuDNN forward pass; b is read only when cch.bias_term_ is true
+
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch);  // cuDNN gradient w.r.t. the input
+
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch);  // cuDNN gradient w.r.t. the weights
+
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch);  // cuDNN gradient w.r.t. the bias; b is only used to create the result
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution_operation.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_operation.cc b/src/model/operation/convolution_operation.cc
deleted file mode 100644
index 90b1b4a..0000000
--- a/src/model/operation/convolution_operation.cc
+++ /dev/null
@@ -1,366 +0,0 @@
-#include "./convolution_operation.h"
-#include "../layer/convolution.h"
-#include<iostream>
-
-namespace singa{
-
-ConvHandles::ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
-	                const std::vector<size_t> stride, const std::vector<size_t> padding,
-	                const size_t in_channels, const size_t out_channels,
-	                const bool bias_term_){
-    kernel_h_=kernel_size[0];
-    kernel_w_=kernel_size[1];
-
-    pad_h_=padding[0];
-    pad_w_=padding[1];
-
-    stride_h_=stride[0];
-    stride_w_=stride[1];
-
-    channels_=in_channels;
-    num_filters_=out_channels;
-
-	batchsize = input.shape(0);
-	CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched.";
-    height_ = input.shape(2);
-    width_ = input.shape(3);
-
-    conv_height_ = 1;
-    if (stride_h_ > 0)
-        conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
-    conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
-
-    col_height_ = in_channels * kernel_w_ * kernel_h_;
-    col_width_ = conv_height_ * conv_width_;
-    imagesize = input.Size() / batchsize;
-};	
-
-CudnnConvHandles::CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,const bool bias_term_, 
-                    const size_t workspace_byte_limit_,const std::string prefer_)
-                    :ConvHandles(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_){
-
-    DataType dtype = input.data_type();
-    auto dev = input.device();
-    Context *ctx = dev->context(0);
-
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
-    if (bias_term_)
-        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
-
-
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                           GetCudnnDataType(dtype), batchsize,
-                                           channels_, height_, width_));
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
-            num_filters_, conv_height_, conv_width_));
-    if (bias_term_)
-        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
-                                               GetCudnnDataType(dtype), 1,
-                                               num_filters_, 1, 1));
-    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
-                                                stride_h_, stride_w_, 1, 1,
-                                                CUDNN_CROSS_CORRELATION,
-                                                GetCudnnDataType(dtype)));
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
-                                           CUDNN_TENSOR_NCHW, num_filters_,
-                                           channels_, kernel_h_, kernel_w_));
-    if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-        prefer_ == "no_workspace") {
-        cudnnConvolutionFwdPreference_t fwd_pref;
-        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-        if (prefer_ == "fastest") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-        } else if (prefer_ == "limited_workspace") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        } else {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        }
-        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-                workspace_byte_limit_, &fp_alg_));
-        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
-        // deprecated in cudnn v7
-        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
-        } else if (prefer_ == "autotune") {
-        const int topk = 1;
-        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
-        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
-        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
-        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
-        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
-                &num_fp_alg, fp_alg_perf));
-        fp_alg_ = fp_alg_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
-                &num_bp_filt_alg, bp_filt_perf));
-        bp_filter_alg_ = bp_filt_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
-                &num_bp_data_alg, bp_data_perf));
-        bp_data_alg_ = bp_data_perf[0].algo;
-    } else {
-        LOG(FATAL) << "Preferred algorithm is not available!";
-    }
-
-    size_t fp_byte, bp_data_byte, bp_filter_byte;
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
-            &fp_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-            bp_data_alg_, &bp_data_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-            bp_filter_alg_, &bp_filter_byte));
-    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
-                       sizeof(float) +
-                       1;
-    if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
-        LOG(WARNING) << "The required memory for workspace ("
-                     << workspace_count_ * sizeof(float)
-                     << ") is larger than the expected Bytes ("
-                     << workspace_byte_limit_ << ")";
-    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
-};
-
-Convolution C;
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch){
-	CHECK_EQ(x.device()->lang(), kCpp);
-
-	CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
-    x.shape(3) == ch.width_) << "input sample shape should not change";
-
-    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
-    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
-
-    Shape w_shape= W.shape();
-    Shape b_shape= b.shape();
-
-    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
-    if (ch.bias_term_)
-      b.Reshape(Shape{ch.num_filters_});
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-    Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
-
-    float *data_col = new float[ch.col_height_ * ch.col_width_];
-    auto in_data = x.data<float>();
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
-            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);    
-
-      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
-      Tensor each = Mult(W, col_data);
-      if (ch.bias_term_) {
-          AddColumn(b, &each);
-        }
-      CopyDataToFrom(&output, each, each.Size(), num * each.Size());
-    };
-  W.Reshape(w_shape);
-  b.Reshape(b_shape);
-  return output;
-}; 
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-
-    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
-    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
-
-    Shape w_shape= W.shape();
-    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
-
-    Tensor dx;
-    dx.ResetLike(x);
-    
-    float *dx_b = new float[ch.imagesize];
-
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      Tensor dcol_b = Mult(W.T(), grad_b);
-      auto dcol_data = dcol_b.data<float>();
-      C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
-           ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
-      dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
-    }
-  W.Reshape(w_shape); 
-  return dx;
-};
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-
-    CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
-    x.shape(3) == ch.width_) << "input sample shape should not change";
-
-    Tensor dW;
-    dW.ResetLike(W);
-    dW.SetValue(0.0f);
-    
-    Shape w_shape= W.shape();
-    dW.Reshape(Shape{ch.num_filters_, ch.col_height_});
-
-    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
-
-    float *data_col = new float[ch.col_height_ * ch.col_width_];
-    auto in_data = dy.data<float>();
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
-            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
-      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
-      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      dW += Mult(grad_b, col_data.T());
-    }
-   dW.Reshape(w_shape);
-   return dW;
-};
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-	
-	CHECK(b.shape(0) == ch.num_filters_)<< "bias shape should not change";
-
-    Tensor db;
-    db.ResetLike(b);
-
-    auto tmpshp = Shape{ch.batchsize * ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)};
-    Tensor tmp1 = Reshape(dy, tmpshp);
-
-    Tensor tmp2(Shape{ch.batchsize * ch.num_filters_});
-    SumColumns(tmp1, &tmp2);
-    Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_});
-
-    SumRows(tmp3, &db);
-
-    return db;
-};
-
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch){
-	CHECK_EQ(x.device()->lang(), kCuda);
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-
-    Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    output.device()->Exec([output, x, W, cch](Context *ctx) {
-        Block *inblock = x.block(), *outblock = output.block(),
-                *wblock = W.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
-                                inblock->data(), cch.filter_desc_, wblock->data(),
-                                cch.conv_desc_, cch.fp_alg_,
-                                cch.workspace_.block()->mutable_data(),
-                                cch.workspace_count_ * sizeof(float), &beta,
-                                cch.y_desc_, outblock->mutable_data());
-    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
-
-    if (cch.bias_term_) {
-        output.device()->Exec([output, b, cch](Context *ctx) {
-            float beta = 1.f, alpha = 1.0f;
-            Block *outblock = output.block(), *bblock = b.block();
-            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
-                           bblock->data(), &beta, cch.y_desc_,
-                           outblock->mutable_data());
-        }, {output.block(), b.block()}, {output.block()});
-    }
-
-    return output;
-};
-
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor dx;
-    dx.ResetLike(x);
-
-    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
-        Block *wblock = W.block(), *dyblock = dy.block(),
-                *dxblock = dx.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
-                                     wblock->data(), cch.y_desc_, dyblock->data(),
-                                     cch.conv_desc_, cch.bp_data_alg_,
-                                     cch.workspace_.block()->mutable_data(),
-                                     cch.workspace_count_ * sizeof(float), &beta,
-                                     cch.x_desc_, dxblock->mutable_data());
-    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
-
-    return dx;
-};
-
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor dW;
-    dW.ResetLike(W);
-
-    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
-    Block *inblock = x.block(), *dyblock = dy.block(),
-            *dwblock = dW.block();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
-            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
-            cch.workspace_.block()->mutable_data(),
-            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
-            dwblock->mutable_data());
-    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
-
-    return dW;
-};
-
-// input Tensor b for Reset db purpose, can avoid this later.
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor db;
-    db.ResetLike(b);
-
-    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
-        Block *dyblock = dy.block(), *dbblock = db.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
-                                     dyblock->data(), &beta, cch.bias_desc_,
-                                     dbblock->mutable_data());
-    }, {dy.block()}, {db.block()});
-
-    return db;
-};
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution_operation.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_operation.h b/src/model/operation/convolution_operation.h
deleted file mode 100644
index 835581e..0000000
--- a/src/model/operation/convolution_operation.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#include <string>
-#include <vector>
-#include <cudnn.h>
-#include "../layer/cudnn_convolution.h"
-#include "../layer/cudnn_utils.h"
-#include "singa/utils/logging.h"
-
-namespace singa{
-
-struct ConvHandles{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-
-    size_t col_height_;
-    size_t col_width_;
-    size_t imagesize;
-
-    ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,
-                    const bool bias_term_);
-
-};
-
-struct CudnnConvHandles:ConvHandles{
-	cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,
-                    const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024,
-                    const std::string prefer_="fastest");
-};
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch);
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch);
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch);
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch);
-
-
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
-
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/test/python/test_operation.py
----------------------------------------------------------------------
diff --git a/test/python/test_operation.py b/test/python/test_operation.py
index ece537d..1bbc70c 100644
--- a/test/python/test_operation.py
+++ b/test/python/test_operation.py
@@ -16,9 +16,6 @@ cpu_dev = device.get_default_device()
 dy = CTensor([2, 1, 2, 2])
 singa.Gaussian(0.0, 1.0, dy)
 
-conv = autograd.Conv2D(3, 1, 2)  # (in_channels, out_channels, kernel_size)
-conv_without_bias = autograd.Conv2D(3,1,2,bias=False)
-
 
 def _tuple_to_string(t):
     lt = [str(x) for x in t]
@@ -34,35 +31,43 @@ class TestPythonOperation(unittest.TestCase):
                          )
 
     def test_conv2d_gpu(self):
+        # (in_channels, out_channels, kernel_size)
+        conv_0 = autograd.Conv2D(3, 1, 2)
+        conv_without_bias_0 = autograd.Conv2D(3, 1, 2, bias=False)
+
         gpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev)
         gpu_input_tensor.gaussian(0.0, 1.0)
 
-        y = conv(gpu_input_tensor)  # PyTensor
-        dx, dW, db = conv.backward(dy)  # CTensor
+        y = conv_0(gpu_input_tensor)  # PyTensor
+        dx, dW, db = y.creator.backward(dy)  # CTensor
 
         self.check_shape(y.shape, (2, 1, 2, 2))
         self.check_shape(dx.shape(), (2, 3, 3, 3))
         self.check_shape(dW.shape(), (1, 3, 2, 2))
         self.check_shape(db.shape(), (1,))
 
-        #forward without bias
-        y_without_bias=conv_without_bias(gpu_input_tensor)
+        # forward without bias
+        y_without_bias = conv_without_bias_0(gpu_input_tensor)
         self.check_shape(y.shape, (2, 1, 2, 2))
 
     def test_conv2d_cpu(self):
+        # (in_channels, out_channels, kernel_size)
+        conv_1 = autograd.Conv2D(3, 1, 2)
+        conv_without_bias_1 = autograd.Conv2D(3, 1, 2, bias=False)
+
         cpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=cpu_dev)
         cpu_input_tensor.gaussian(0.0, 1.0)
 
-        y = conv(cpu_input_tensor)  # PyTensor
-        dx, dW, db = conv.backward(dy)  # CTensor
+        y = conv_1(cpu_input_tensor)  # PyTensor
+        dx, dW, db = y.creator.backward(dy)  # CTensor
 
         self.check_shape(y.shape, (2, 1, 2, 2))
         self.check_shape(dx.shape(), (2, 3, 3, 3))
         self.check_shape(dW.shape(), (1, 3, 2, 2))
         self.check_shape(db.shape(), (1,))
 
-        #forward without bias
-        y_without_bias=conv_without_bias(cpu_input_tensor)
+        # forward without bias
+        y_without_bias = conv_without_bias_1(cpu_input_tensor)
         self.check_shape(y.shape, (2, 1, 2, 2))
 
 if __name__ == '__main__':


[12/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- merge definition of handles and their init functions

- modified conv2d operation in python part


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/aa9c52ae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/aa9c52ae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/aa9c52ae

Branch: refs/heads/master
Commit: aa9c52aeba2e71638c2d8905a8bc37fd8603a510
Parents: e68ea2e
Author: xuewanqi <xu...@outlook.com>
Authored: Sat Jun 30 09:09:30 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Mon Jul 2 06:09:07 2018 +0000

----------------------------------------------------------------------
 examples/autograd/mlp.py                     |   4 +-
 examples/autograd/mnist_cnn.py               |  11 +-
 python/singa/autograd.py                     | 106 +++---
 python/singa/tensor.py                       |   2 +-
 src/api/model_operation.i                    |  36 +-
 src/core/tensor/tensor_math_cpp.h            |  44 ++-
 src/model/operation/convolution_operation.cc | 366 ++++++++++++++++++
 src/model/operation/convolution_operation.h  |  78 ++++
 src/model/operation/convolution_related.cc   | 431 ----------------------
 src/model/operation/convolution_related.h    |  75 ----
 10 files changed, 567 insertions(+), 586 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/examples/autograd/mlp.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mlp.py b/examples/autograd/mlp.py
index 3910369..f7c4353 100644
--- a/examples/autograd/mlp.py
+++ b/examples/autograd/mlp.py
@@ -26,6 +26,8 @@ import numpy as np
 
 if __name__ == '__main__':
 
+    autograd.training = True
+
     # prepare training data in numpy array
 
     # generate the boundary
@@ -60,7 +62,7 @@ if __name__ == '__main__':
     label = to_categorical(label, 2).astype(np.float32)
     print('train_data_shape:', data.shape)
     print('train_label_shape:', label.shape)
-
+    # 1
     inputs = Tensor(data=data)
     target = Tensor(data=label)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/examples/autograd/mnist_cnn.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py
index 7b72c75..cbb5650 100644
--- a/examples/autograd/mnist_cnn.py
+++ b/examples/autograd/mnist_cnn.py
@@ -21,10 +21,13 @@ import numpy as np
 import argparse
 import os
 
+import singa
 from singa import tensor
 from singa import autograd
 from singa import optimizer
 
+singa.layer.engine = 'singacpp'
+
 
 def load_data(path):
     f = np.load(path)
@@ -97,8 +100,8 @@ if __name__ == '__main__':
     print('the shape of testing label is', y_test.shape)
 
     # operations initialization
-    conv1 = autograd.Conv2d(3, 32)
-    conv2 = autograd.Conv2d(32, 32)
+    conv1 = autograd.Conv2D(1, 32, 3, padding=1)
+    conv2 = autograd.Conv2D(32, 32, 3, padding=1)
     linear = autograd.Linear(32 * 28 * 28, 10)
 
     def forward(x, t):
@@ -121,8 +124,8 @@ if __name__ == '__main__':
 
             loss, y = forward(inputs, targets)
 
-            accuracy_rate = accuracy(autograd.ctensor2numpy(
-                y.data), autograd.ctensor2numpy(targets.data))
+            accuracy_rate = accuracy(autograd.ctensor2numpy(y.data),
+                                     autograd.ctensor2numpy(targets.data))
             if (i % 5 == 0):
                 print('accuracy is:', accuracy_rate, 'loss is:',
                       autograd.ctensor2numpy(loss.data)[0])

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index e301e51..b1475bb 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -88,7 +88,7 @@ class Operation(object):
             ys = (ys,)
         # create Tensor based on CTensor(data);
         # assume outputs are all Tensor instances
-        ys = tuple(Tensor(device=y.device,
+        ys = tuple(Tensor(device=y.device(),
                           data=y,
                           requires_grad=self.requires_grad,
                           creator=self) for y in ys)
@@ -442,7 +442,7 @@ class Conv2d(Operation):
         param_data = self.PyLayer.layer.param_values()
 
         if not hasattr(self, 'w'):
-            self.w = Tensor(device=param_data[0].device, data=param_data[
+            self.w = Tensor(device=param_data[0].device(), data=param_data[
                             0], requires_grad=True, stores_grad=True)
             std = math.sqrt(
                 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
@@ -452,7 +452,7 @@ class Conv2d(Operation):
 
         if len(param_data) == 2:
             if not hasattr(self, 'b'):
-                self.b = Tensor(device=param_data[1].device, data=param_data[
+                self.b = Tensor(device=param_data[1].device(), data=param_data[
                                 1], requires_grad=True, stores_grad=True)
                 self.b.set_value(0.0)
 
@@ -638,79 +638,75 @@ class Conv2D(Operation):
         else:
             #to keep consistency when to do forward.
             self.b = Tensor(data=CTensor([]), requires_grad=False, stores_grad=False)
-        
-        self.reset = False
 
-    def __call__(self, x):
-        assert x.ndim() == 4, 'The dimensions of input should be 4D.'
-        assert x.shape[1] == self.in_channels, 'in_channels dismatched.'
-        assert 0 == 0, 'invalid padding.'
-    	# TODO valid padding check.
-
-    	if not hasattr (self, 'recorder'):
-    	    self.recorder = singa.SetupRecorder(x.data, self.kernel_size, self.stride,
-                                self.padding, self.in_channels, self.out_channels, self.bias)
-    	elif x.shape[0] != self.recorder.batchsize:
-    	    self.recorder = singa.SetupRecorder(x.data, self.kernel_size, self.stride,
-                                self.padding, self.in_channels, self.out_channels, self.bias)
-            self.reset = True
-        
-        if training:
-            self.x = x
+    def __call__(self, x): 
+        if not hasattr(self, 'device_id'):
+            self.device_id = x.device.id()
+        else:
+            assert self.device_id == x.device.id(),'Not the same device.'
 
-    	self.dev = x.device
+        if self.W.device.id() != self.device_id:
+            self.W.to_device(x.device)
 
-    	self.W.to_device(self.dev)
-    	xs = [x, self.W]
-    	
         if self.bias:
-    	   self.b.to_device(self.dev)
-    	xs.append(self.b)
+            if self.b.device.id() != self.device_id:
+                self.b.to_device(x.device)
+
+    	xs = [x, self.W, self.b]
+
     	return self._do_forward(*xs)[0]
 
     def forward(self, *xs):
-        if self.dev.lang()==1: #kCuda = 1           
-            if not hasattr(self, 'cudnnconvhandles'):
-                self.cudnnconvhandles=singa.InitCudnnConvHandles(xs[0], self.recorder, 
-                    self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
-            elif self.reset:
-                self.cudnnconvhandles=singa.InitCudnnConvHandles(xs[0], self.recorder, 
-                    self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
+        assert xs[0].nDim() == 4, 'The dimensions of input should be 4D.'
+        assert xs[0].shape()[1] == self.in_channels, 'in_channels dismatched.'
+        #assert (xs[0].shape()[2]+2*self.padding[0]-self.kernel_size[0]-1)%self.stride[0] == 0, 'invalid padding.'
+        assert 0==0, 'invalid padding'
 
-            return singa.GpuConvForward(xs[0], xs[1], xs[2], self.recorder, self.cudnnconvhandles)
+        if training:
+            self.x = xs[0]
 
-        elif self.dev.lang()==0: #kCpp = 0
-            return singa.CpuConvForward(xs[0], xs[1], xs[2], self.recorder)
+        if self.device_id == -1:
+            if not hasattr (self, 'handles'):
+                self.handles = singa.ConvHandles(xs[0], self.kernel_size, self.stride,
+                               self.padding, self.in_channels, self.out_channels, self.bias)
+            elif xs[0].shape()[0] != self.handles.batchsize:
+                self.handles = singa.ConvHandles(xs[0], self.kernel_size, self.stride,
+                               self.padding, self.in_channels, self.out_channels, self.bias)
+            return singa.CpuConvForward(xs[0], xs[1], xs[2], self.handles)
 
         else:
-            TypeError('Not implemented yet')
-
+            if not hasattr(self, 'handles'):
+                self.handles = singa.CudnnConvHandles(xs[0], self.kernel_size, self.stride,
+                               self.padding, self.in_channels, self.out_channels, self.bias,
+                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
+            elif xs[0].shape()[0] != self.handles.batchsize:
+                self.handles = singa.CudnnConvHandles(xs[0], self.kernel_size, self.stride,
+                               self.padding, self.in_channels, self.out_channels, self.bias,
+                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
+            return singa.GpuConvForward(xs[0], xs[1], xs[2], self.handles)
 
     def backward(self, dy):
         assert training is True and hasattr(self, 'x'), 'Please set training as True before do BP. '
 
-        # todo check device?
-        dy.ToDevice(self.dev)
+        if dy.device().id() != self.device_id:
+            dy.ToDevice(self.x.device())
 
-        if self.dev.lang()==1: #kCuda = 1 
-            dx = singa.GpuConvBackwardx(dy, self.W.data, self.x.data, self.cudnnconvhandles)
-            dW = singa.GpuConvBackwardW(dy, self.x.data, self.W.data, self.cudnnconvhandles)
+        if self.device_id == -1: 
+            dx = singa.CpuConvBackwardx(dy, self.W.data, self.x, self.handles)
+            dW = singa.CpuConvBackwardW(dy, self.x, self.W.data, self.handles)
             if self.bias:
-        	    db = singa.GpuConvBackwardb(dy, self.b.data, self.cudnnconvhandles)
-        	    return dx, dW, db
+                db = singa.CpuConvBackwardb(dy, self.b.data, self.handles)
+                return dx, dW, db
             else:
-        	    return dx, dW
-
-        elif self.dev.lang()==0: #kCpp = 0
-            dx = singa.CpuConvBackwardx(dy, self.W.data, self.x.data, self.recorder)
-            dW = singa.CpuConvBackwardW(dy, self.x.data, self.W.data, self.recorder)
+                return dx, dW
+        else:
+            dx = singa.GpuConvBackwardx(dy, self.W.data, self.x, self.handles)
+            dW = singa.GpuConvBackwardW(dy, self.x, self.W.data, self.handles)
             if self.bias:
-                db = singa.CpuConvBackwardb(dy, self.b.data, self.recorder)
+                db = singa.GpuConvBackwardb(dy, self.b.data, self.handles)
                 return dx, dW, db
             else:
                 return dx, dW
-        else:
-            TypeError('Not implemented yet')
 
 def infer_dependency(op):
     '''
@@ -813,7 +809,7 @@ def backward(y, dy=None):
             if y_stores_grad:
                 # store the gradient for final return, e.g. if x is parameter
                 g = not_ready[src_op][y_idx]
-                gradients[y] = Tensor(device=g.device, data=g)
+                gradients[y] = Tensor(device=g.device(), data=g)
             dependency[src_op] -= 1
             if src_op.requires_grad is True:
                 if dependency[src_op] == 0:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/python/singa/tensor.py
----------------------------------------------------------------------
diff --git a/python/singa/tensor.py b/python/singa/tensor.py
index 8f36775..eddce28 100644
--- a/python/singa/tensor.py
+++ b/python/singa/tensor.py
@@ -98,7 +98,7 @@ class Tensor(object):
             copy_from_numpy(self.data, data)
         elif isinstance(data, CTensor):
             self.data = data
-            assert data.device == device, 'not the same device'
+            assert data.device().id() == device.id(), 'not the same device'
         else:
             self.data = CTensor(list(shape), device, dtype)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 1d31b9d..29f8f58 100644
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -1,24 +1,32 @@
 %module model_operation
 
 %{
-#include "../src/model/operation/convolution_related.h"
+#include "../src/model/operation/convolution_operation.h"
 %}
 namespace singa{
 
-struct Recorder{size_t batchsize;};
+struct ConvHandles{
 
-struct CudnnConvHandles{};
+		size_t batchsize;
 
+		ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,
+                    const bool bias_term_);
+              	};
 
-Recorder SetupRecorder(const Tensor &input, const std::vector<size_t> kernel_size, 
-	                const std::vector<size_t> stride, const std::vector<size_t> padding,
-	                const size_t in_channels, const size_t out_channels,
-	                const bool bias_term_);
+struct CudnnConvHandles{
 
-CudnnConvHandles InitCudnnConvHandles(const Tensor &input, const Recorder r, 
-     const size_t workspace_byte_limit_=1024*1024*1024, const std::string prefer_="fastest");
+		size_t batchsize;
+		
+		CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,
+                    const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024,
+                    const std::string prefer_="fastest");
+                };
 
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const Recorder r, const CudnnConvHandles cch);
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch);
 
 Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
 
@@ -27,12 +35,12 @@ Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, cons
 Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
 
 
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const Recorder r);
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch);
 
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Recorder r);
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch);
 
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const Recorder r);
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch);
 
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const Recorder r);
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch);
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index bfdd026..67f1f20 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -506,18 +506,52 @@ void Asum<float, lang::Cpp>(const Tensor& in, float *out,
   *out = cblas_sasum(in.Size(), inPtr, 1); //not using strided traversal
 }
 
+// template <>
+// void Axpy<float, lang::Cpp>(const float alpha,
+//                             const Tensor& in, Tensor *out, Context *ctx) {
+//   //check input tensor for strides first
+//   if (in.strides() == out->strides()) {
+//     const float *inPtr = static_cast<const float *>(in.block()->data());
+//     float *outPtr = static_cast<float *>(out->block()->mutable_data());
+//     cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
+//   } else {
+//     //LOG(FATAL) << "Axpy, input and output strides do not match." ;
+//     EltwiseMult<float, lang::Cpp>(in, alpha, out, ctx);
+//   }
+// }
+
 template <>
 void Axpy<float, lang::Cpp>(const float alpha,
                             const Tensor& in, Tensor *out, Context *ctx) {
   //check input tensor for strides first
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+
   if (in.strides() == out->strides()) {
-    const float *inPtr = static_cast<const float *>(in.block()->data());
-    float *outPtr = static_cast<float *>(out->block()->mutable_data());
     cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
   } else {
-    LOG(FATAL) << "Axpy, input and output strides do not match." ;
-  }
-}
+    //LOG(FATAL) << "Axpy, input and output strides do not match." ;
+    Tensor t(in.shape(), in.device(), in.data_type());
+    EltwiseMult<float, lang::Cpp>(in, alpha, &t, ctx);
+    float* tPtr = static_cast<float*>(t.block()->mutable_data());
+    cblas_saxpy(in.Size(), 1, tPtr, 1, outPtr, 1);
+  }
+}
+
+// template <>
+// void Axpy<float, lang::Cpp>(const float alpha,
+//                            const Tensor& in, Tensor *out, Context *ctx) {
+//  //check input tensor for strides first
+//  if (in.strides() == out->strides()) {
+//    const float *inPtr = static_cast<const float *>(in.block()->data());
+//    float *outPtr = static_cast<float *>(out->block()->mutable_data());
+//    cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
+//  } else if(out->transpose()) {
+//    LOG(FATAL) << "output is already transposed." ;
+//  } else {
+//    LOG(FATAL) << "Axpy, input and output strides do not match." ;
+//  }
+// }
 
 template <>
 void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/src/model/operation/convolution_operation.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_operation.cc b/src/model/operation/convolution_operation.cc
new file mode 100644
index 0000000..90b1b4a
--- /dev/null
+++ b/src/model/operation/convolution_operation.cc
@@ -0,0 +1,366 @@
+#include "./convolution_operation.h"
+#include "../layer/convolution.h"
+#include<iostream>
+
+namespace singa{
+
+ConvHandles::ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+	                const std::vector<size_t> stride, const std::vector<size_t> padding,
+	                const size_t in_channels, const size_t out_channels,
+	                const bool bias_term_){
+    kernel_h_=kernel_size[0];
+    kernel_w_=kernel_size[1];
+
+    pad_h_=padding[0];
+    pad_w_=padding[1];
+
+    stride_h_=stride[0];
+    stride_w_=stride[1];
+
+    channels_=in_channels;
+    num_filters_=out_channels;
+
+	batchsize = input.shape(0);
+	CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched.";
+    height_ = input.shape(2);
+    width_ = input.shape(3);
+
+    conv_height_ = 1;
+    if (stride_h_ > 0)
+        conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+    conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
+
+    col_height_ = in_channels * kernel_w_ * kernel_h_;
+    col_width_ = conv_height_ * conv_width_;
+    imagesize = input.Size() / batchsize;
+};	
+
+CudnnConvHandles::CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,const bool bias_term_, 
+                    const size_t workspace_byte_limit_,const std::string prefer_)
+                    :ConvHandles(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_){
+
+    DataType dtype = input.data_type();
+    auto dev = input.device();
+    Context *ctx = dev->context(0);
+
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+    if (bias_term_)
+        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), batchsize,
+                                           channels_, height_, width_));
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+            num_filters_, conv_height_, conv_width_));
+    if (bias_term_)
+        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                               GetCudnnDataType(dtype), 1,
+                                               num_filters_, 1, 1));
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
+                                                stride_h_, stride_w_, 1, 1,
+                                                CUDNN_CROSS_CORRELATION,
+                                                GetCudnnDataType(dtype)));
+    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                           CUDNN_TENSOR_NCHW, num_filters_,
+                                           channels_, kernel_h_, kernel_w_));
+    if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+        prefer_ == "no_workspace") {
+        cudnnConvolutionFwdPreference_t fwd_pref;
+        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+        if (prefer_ == "fastest") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+        } else if (prefer_ == "limited_workspace") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        } else {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        }
+        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+                workspace_byte_limit_, &fp_alg_));
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+        // deprecated in cudnn v7
+        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+        } else if (prefer_ == "autotune") {
+        const int topk = 1;
+        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+                &num_fp_alg, fp_alg_perf));
+        fp_alg_ = fp_alg_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                &num_bp_filt_alg, bp_filt_perf));
+        bp_filter_alg_ = bp_filt_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                &num_bp_data_alg, bp_data_perf));
+        bp_data_alg_ = bp_data_perf[0].algo;
+    } else {
+        LOG(FATAL) << "Preferred algorithm is not available!";
+    }
+
+    size_t fp_byte, bp_data_byte, bp_filter_byte;
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+            &fp_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+            bp_data_alg_, &bp_data_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+            bp_filter_alg_, &bp_filter_byte));
+    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                       sizeof(float) +
+                       1;
+    if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
+        LOG(WARNING) << "The required memory for workspace ("
+                     << workspace_count_ * sizeof(float)
+                     << ") is larger than the expected Bytes ("
+                     << workspace_byte_limit_ << ")";
+    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+};
+
+Convolution C;
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch){
+	CHECK_EQ(x.device()->lang(), kCpp);
+
+	CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
+    x.shape(3) == ch.width_) << "input sample shape should not change";
+
+    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
+    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+
+    Shape w_shape= W.shape();
+    Shape b_shape= b.shape();
+
+    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+    if (ch.bias_term_)
+      b.Reshape(Shape{ch.num_filters_});
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+    Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+
+    float *data_col = new float[ch.col_height_ * ch.col_width_];
+    auto in_data = x.data<float>();
+    for (size_t num = 0; num < ch.batchsize; num++) {
+      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);    
+
+      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
+      Tensor each = Mult(W, col_data);
+      if (ch.bias_term_) {
+          AddColumn(b, &each);
+        }
+      CopyDataToFrom(&output, each, each.Size(), num * each.Size());
+    };
+  W.Reshape(w_shape);
+  b.Reshape(b_shape);
+  return output;
+}; 
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    
+    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
+    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+
+    Shape w_shape= W.shape();
+    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+
+    Tensor dx;
+    dx.ResetLike(x);
+    
+    float *dx_b = new float[ch.imagesize];
+
+    for (size_t num = 0; num < ch.batchsize; num++) {
+      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+      Tensor dcol_b = Mult(W.T(), grad_b);
+      auto dcol_data = dcol_b.data<float>();
+      C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
+           ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
+      dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
+    }
+  W.Reshape(w_shape); 
+  return dx;
+};
+
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    
+    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+    CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
+    x.shape(3) == ch.width_) << "input sample shape should not change";
+
+    Tensor dW;
+    dW.ResetLike(W);
+    dW.SetValue(0.0f);
+    
+    Shape w_shape= W.shape();
+    dW.Reshape(Shape{ch.num_filters_, ch.col_height_});
+
+    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+
+    float *data_col = new float[ch.col_height_ * ch.col_width_];
+    auto in_data = dy.data<float>();
+    for (size_t num = 0; num < ch.batchsize; num++) {
+      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
+      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
+      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+      dW += Mult(grad_b, col_data.T());
+    }
+   dW.Reshape(w_shape);
+   return dW;
+};
+
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    
+    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+	
+	CHECK(b.shape(0) == ch.num_filters_)<< "bias shape should not change";
+
+    Tensor db;
+    db.ResetLike(b);
+
+    auto tmpshp = Shape{ch.batchsize * ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)};
+    Tensor tmp1 = Reshape(dy, tmpshp);
+
+    Tensor tmp2(Shape{ch.batchsize * ch.num_filters_});
+    SumColumns(tmp1, &tmp2);
+    Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_});
+
+    SumRows(tmp3, &db);
+
+    return db;
+};
+
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch){
+	CHECK_EQ(x.device()->lang(), kCuda);
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+
+    Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    output.device()->Exec([output, x, W, cch](Context *ctx) {
+        Block *inblock = x.block(), *outblock = output.block(),
+                *wblock = W.block();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
+                                inblock->data(), cch.filter_desc_, wblock->data(),
+                                cch.conv_desc_, cch.fp_alg_,
+                                cch.workspace_.block()->mutable_data(),
+                                cch.workspace_count_ * sizeof(float), &beta,
+                                cch.y_desc_, outblock->mutable_data());
+    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
+
+    if (cch.bias_term_) {
+        output.device()->Exec([output, b, cch](Context *ctx) {
+            float beta = 1.f, alpha = 1.0f;
+            Block *outblock = output.block(), *bblock = b.block();
+            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
+                           bblock->data(), &beta, cch.y_desc_,
+                           outblock->mutable_data());
+        }, {output.block(), b.block()}, {output.block()});
+    }
+
+    return output;
+};
+
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor dx;
+    dx.ResetLike(x);
+
+    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
+        Block *wblock = W.block(), *dyblock = dy.block(),
+                *dxblock = dx.block();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
+                                     wblock->data(), cch.y_desc_, dyblock->data(),
+                                     cch.conv_desc_, cch.bp_data_alg_,
+                                     cch.workspace_.block()->mutable_data(),
+                                     cch.workspace_count_ * sizeof(float), &beta,
+                                     cch.x_desc_, dxblock->mutable_data());
+    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
+
+    return dx;
+};
+
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor dW;
+    dW.ResetLike(W);
+
+    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
+    Block *inblock = x.block(), *dyblock = dy.block(),
+            *dwblock = dW.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardFilter(
+            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
+            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
+            cch.workspace_.block()->mutable_data(),
+            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
+            dwblock->mutable_data());
+    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+
+    return dW;
+};
+
+// input Tensor b for Reset db purpose, can avoid this later.
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor db;
+    db.ResetLike(b);
+
+    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
+        Block *dyblock = dy.block(), *dbblock = db.block();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
+                                     dyblock->data(), &beta, cch.bias_desc_,
+                                     dbblock->mutable_data());
+    }, {dy.block()}, {db.block()});
+
+    return db;
+};
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/src/model/operation/convolution_operation.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_operation.h b/src/model/operation/convolution_operation.h
new file mode 100644
index 0000000..835581e
--- /dev/null
+++ b/src/model/operation/convolution_operation.h
@@ -0,0 +1,78 @@
+#include <string>
+#include <vector>
+#include <cudnn.h>
+#include "../layer/cudnn_convolution.h"
+#include "../layer/cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa{
+
+// Size bookkeeping for one convolution. NOTE(review): the constructor body is not in
+// this header, so the field notes below are inferred from names — confirm in the .cc.
+struct ConvHandles{
+    size_t kernel_w_;
+    size_t pad_w_;
+    size_t stride_w_;
+    size_t kernel_h_;
+    size_t pad_h_;
+    size_t stride_h_;
+    // presumably in_channels / out_channels from the constructor arguments — TODO confirm
+    size_t channels_;
+    size_t num_filters_;
+
+    bool bias_term_;
+    // presumably input size, output size and batch size taken from `input` — TODO confirm
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+    size_t batchsize;
+    // NOTE(review): these look like im2col buffer dimensions for the CPU path — confirm
+    size_t col_height_;
+    size_t col_width_;
+    size_t imagesize;
+    ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,
+                    const bool bias_term_);
+};
+
+// cuDNN descriptors, chosen algorithms and workspace for one convolution setup.
+struct CudnnConvHandles:ConvHandles{
+    // Descriptors start as nullptr so one that was never created is not left indeterminate.
+    cudnnTensorDescriptor_t x_desc_ = nullptr;
+    cudnnTensorDescriptor_t y_desc_ = nullptr;
+    cudnnTensorDescriptor_t bias_desc_ = nullptr;
+    cudnnFilterDescriptor_t filter_desc_ = nullptr;
+    cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+    size_t workspace_count_ = 0;  // counted in floats; users pass workspace_count_ * sizeof(float)
+    Tensor workspace_;
+    CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size,
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,
+                    const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024,
+                    const std::string prefer_="fastest");
+};
+
+// CPU variants, parameterised by ConvHandles.
+// NOTE(review): W and b are non-const in CpuConvForward, and W in CpuConvBackwardx —
+// presumably they are reshaped in place during the call; confirm in the .cc.
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch);
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch);
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch);
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch);
+
+
+// cuDNN variants, parameterised by CudnnConvHandles. GpuConvBackwardW and
+// GpuConvBackwardb CHECK that dy is on a kCuda device, and use their W / b argument
+// only as the ResetLike() template for the returned gradient.
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch);
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch);
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/src/model/operation/convolution_related.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_related.cc b/src/model/operation/convolution_related.cc
deleted file mode 100644
index c828f90..0000000
--- a/src/model/operation/convolution_related.cc
+++ /dev/null
@@ -1,431 +0,0 @@
-#include "./convolution_related.h"
-#include "../layer/convolution.h"
-#include<iostream>
-
-namespace singa{
-
-Recorder SetupRecorder(const Tensor &input, const std::vector<size_t> kernel_size, 
-	                const std::vector<size_t> stride, const std::vector<size_t> padding,
-	                const size_t in_channels, const size_t out_channels,
-	                const bool bias_term_){
-	size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-
-    size_t col_height_;
-    size_t col_width_;
-    size_t imagesize;
-
-    kernel_h_=kernel_size[0];
-    kernel_w_=kernel_size[1];
-
-    pad_h_=padding[0];
-    pad_w_=padding[1];
-
-    stride_h_=stride[0];
-    stride_w_=stride[1];
-
-	batchsize = input.shape(0);
-	CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched.";
-    height_ = input.shape(2);
-    width_ = input.shape(3);
-
-    conv_height_ = 1;
-    if (stride_h_ > 0)
-        conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
-    conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
-
-    col_height_ = in_channels * kernel_w_ * kernel_h_;
-    col_width_ = conv_height_ * conv_width_;
-    imagesize = input.Size() / batchsize;
-
-    return Recorder{
-    	kernel_w_,
-        pad_w_,
-        stride_w_,
-        kernel_h_,
-        pad_h_,
-        stride_h_,
-
-        in_channels,
-        out_channels,
-
-        bias_term_,
-
-        height_,
-        width_,
-        conv_height_,
-        conv_width_,
-        batchsize,
-
-        col_height_,
-        col_width_,
-        imagesize
-    };
-};	
-
-Convolution C;
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const Recorder r){
-	CHECK_EQ(x.device()->lang(), kCpp);
-
-	CHECK(x.shape(1) == r.channels_ && x.shape(2) == r.height_ &&
-    x.shape(3) == r.width_) << "input sample shape should not change";
-
-    CHECK(W.shape(0) == r.num_filters_ && W.shape(1) == r.channels_ && 
-    W.shape(2) == r.kernel_h_ && W.shape(3) == r.kernel_w_) << "weights shape should not change";
-
-    Shape w_shape= W.shape();
-    Shape b_shape= b.shape();
-
-    W.Reshape(Shape{r.num_filters_, r.col_height_});
-    if (r.bias_term_)
-      b.Reshape(Shape{r.num_filters_});
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-    Shape shape{r.batchsize, r.num_filters_, r.conv_height_, r.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    Tensor col_data(Shape{r.col_height_, r.col_width_});//broadcasted image
-
-    float *data_col = new float[r.col_height_ * r.col_width_];
-    auto in_data = x.data<float>();
-    for (size_t num = 0; num < r.batchsize; num++) {
-      C.Im2col(in_data + num * r.imagesize, r.channels_, r.height_, r.width_, r.kernel_h_,
-            r.kernel_w_, r.pad_h_, r.pad_w_, r.stride_h_, r.stride_w_, data_col);    
-
-      col_data.CopyDataFromHostPtr(data_col, r.col_height_ * r.col_width_);
-      Tensor each = Mult(W, col_data);
-      if (r.bias_term_) {
-          AddColumn(b, &each);
-        }
-      CopyDataToFrom(&output, each, each.Size(), num * each.Size());
-    };
-  W.Reshape(w_shape);
-  b.Reshape(b_shape);
-  return output;
-}; 
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Recorder r){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == r.num_filters_ && dy.shape(2) == r.conv_height_ &&
-    dy.shape(3) == r.conv_width_) << "input gradients shape should not change";
-
-    CHECK(W.shape(0) == r.num_filters_ && W.shape(1) == r.channels_ && 
-    W.shape(2) == r.kernel_h_ && W.shape(3) == r.kernel_w_) << "weights shape should not change";
-
-    Shape w_shape= W.shape();
-    W.Reshape(Shape{r.num_filters_, r.col_height_});
-
-    Tensor dx;
-    dx.ResetLike(x);
-    
-    float *dx_b = new float[r.imagesize];
-
-    for (size_t num = 0; num < r.batchsize; num++) {
-      Tensor grad_b(Shape{r.num_filters_, r.conv_height_ * r.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      Tensor dcol_b = Mult(W.T(), grad_b);
-      auto dcol_data = dcol_b.data<float>();
-      C.Col2im(dcol_data, r.channels_, r.height_, r.width_, r.kernel_h_, r.kernel_w_, r.pad_h_,
-           r.pad_w_, r.stride_h_, r.stride_w_, dx_b);
-      dx.CopyDataFromHostPtr(dx_b, r.imagesize, num * r.imagesize);
-    }
-  W.Reshape(w_shape); 
-  return dx;
-};
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const Recorder r){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == r.num_filters_ && dy.shape(2) == r.conv_height_ &&
-    dy.shape(3) == r.conv_width_) << "input gradients shape should not change";
-
-    CHECK(x.shape(1) == r.channels_ && x.shape(2) == r.height_ &&
-    x.shape(3) == r.width_) << "input sample shape should not change";
-
-    Tensor dW;
-    dW.ResetLike(W);
-    dW.SetValue(0.0f);
-    
-    Shape w_shape= W.shape();
-    dW.Reshape(Shape{r.num_filters_, r.col_height_});
-
-    Tensor col_data(Shape{r.col_height_, r.col_width_});//broadcasted image
-
-    float *data_col = new float[r.col_height_ * r.col_width_];
-    auto in_data = dy.data<float>();
-    for (size_t num = 0; num < r.batchsize; num++) {
-      C.Im2col(in_data + num * r.imagesize, r.channels_, r.height_, r.width_, r.kernel_h_,
-            r.kernel_w_, r.pad_h_, r.pad_w_, r.stride_h_, r.stride_w_, data_col);
-      col_data.CopyDataFromHostPtr(data_col, r.col_height_ * r.col_width_);
-      Tensor grad_b(Shape{r.num_filters_, r.conv_height_ * r.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      dW += Mult(grad_b, col_data.T());
-    }
-   dW.Reshape(w_shape);
-   return dW;
-};
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const Recorder r){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == r.num_filters_ && dy.shape(2) == r.conv_height_ &&
-    dy.shape(3) == r.conv_width_) << "input gradients shape should not change";
-	
-	CHECK(b.shape(0) == r.num_filters_)<< "bias shape should not change";
-
-    Tensor db;
-    db.ResetLike(b);
-
-    auto tmpshp = Shape{r.batchsize * r.num_filters_, dy.Size() / (r.batchsize * r.num_filters_)};
-    Tensor tmp1 = Reshape(dy, tmpshp);
-
-    Tensor tmp2(Shape{r.batchsize * r.num_filters_});
-    SumColumns(tmp1, &tmp2);
-    Tensor tmp3 = Reshape(tmp2, Shape{r.batchsize, r.num_filters_});
-
-    SumRows(tmp3, &db);
-
-    return db;
-};
-
-CudnnConvHandles InitCudnnConvHandles(const Tensor &input, const Recorder r, const size_t workspace_byte_limit_,
-    				const std::string prefer_){
-
-	CHECK(input.shape(0) == r.batchsize && input.shape(1) == r.channels_ && input.shape(2) == r.height_ &&
-    input.shape(3) == r.width_) << "input sample shape dismatched";
-
-	cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_; 
-
-    DataType dtype = input.data_type();
-    auto dev = input.device();
-    Context *ctx = dev->context(0);
-
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
-    if (r.bias_term_)
-        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
-
-
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                           GetCudnnDataType(dtype), r.batchsize,
-                                           r.channels_, r.height_, r.width_));
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), r.batchsize,
-            r.num_filters_, r.conv_height_, r.conv_width_));
-    if (r.bias_term_)
-        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
-                                               GetCudnnDataType(dtype), 1,
-                                               r.num_filters_, 1, 1));
-    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, r.pad_h_, r.pad_w_,
-                                                r.stride_h_, r.stride_w_, 1, 1,
-                                                CUDNN_CROSS_CORRELATION,
-                                                GetCudnnDataType(dtype)));
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
-                                           CUDNN_TENSOR_NCHW, r.num_filters_,
-                                           r.channels_, r.kernel_h_, r.kernel_w_));
-    if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-        prefer_ == "no_workspace") {
-        cudnnConvolutionFwdPreference_t fwd_pref;
-        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-        if (prefer_ == "fastest") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-        } else if (prefer_ == "limited_workspace") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        } else {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        }
-        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-                workspace_byte_limit_, &fp_alg_));
-        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
-        // deprecated in cudnn v7
-        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
-        } else if (prefer_ == "autotune") {
-        const int topk = 1;
-        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
-        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
-        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
-        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
-        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
-                &num_fp_alg, fp_alg_perf));
-        fp_alg_ = fp_alg_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
-                &num_bp_filt_alg, bp_filt_perf));
-        bp_filter_alg_ = bp_filt_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
-                &num_bp_data_alg, bp_data_perf));
-        bp_data_alg_ = bp_data_perf[0].algo;
-    } else {
-        LOG(FATAL) << "Preferred algorithm is not available!";
-    }
-
-    size_t fp_byte, bp_data_byte, bp_filter_byte;
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
-            &fp_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-            bp_data_alg_, &bp_data_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-            bp_filter_alg_, &bp_filter_byte));
-    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
-                       sizeof(float) +
-                       1;
-    if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
-        LOG(WARNING) << "The required memory for workspace ("
-                     << workspace_count_ * sizeof(float)
-                     << ") is larger than the expected Bytes ("
-                     << workspace_byte_limit_ << ")";
-    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
-
-    return CudnnConvHandles{
-    	x_desc_,
-        y_desc_,
-        bias_desc_,
-        filter_desc_,
-        conv_desc_,
-        fp_alg_,
-        bp_filter_alg_,
-        bp_data_alg_,
-
-        workspace_count_,
-        workspace_,
-    };
-
-};
-
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const Recorder r, const CudnnConvHandles cch){
-	CHECK_EQ(x.device()->lang(), kCuda);
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-
-    Shape shape{r.batchsize, r.num_filters_, r.conv_height_, r.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    output.device()->Exec([output, x, W, cch](Context *ctx) {
-        Block *inblock = x.block(), *outblock = output.block(),
-                *wblock = W.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
-                                inblock->data(), cch.filter_desc_, wblock->data(),
-                                cch.conv_desc_, cch.fp_alg_,
-                                cch.workspace_.block()->mutable_data(),
-                                cch.workspace_count_ * sizeof(float), &beta,
-                                cch.y_desc_, outblock->mutable_data());
-    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
-
-    if (r.bias_term_) {
-        output.device()->Exec([output, b, cch](Context *ctx) {
-            float beta = 1.f, alpha = 1.0f;
-            Block *outblock = output.block(), *bblock = b.block();
-            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
-                           bblock->data(), &beta, cch.y_desc_,
-                           outblock->mutable_data());
-        }, {output.block(), b.block()}, {output.block()});
-    }
-
-    return output;
-};
-
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor dx;
-    dx.ResetLike(x);
-
-    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
-        Block *wblock = W.block(), *dyblock = dy.block(),
-                *dxblock = dx.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
-                                     wblock->data(), cch.y_desc_, dyblock->data(),
-                                     cch.conv_desc_, cch.bp_data_alg_,
-                                     cch.workspace_.block()->mutable_data(),
-                                     cch.workspace_count_ * sizeof(float), &beta,
-                                     cch.x_desc_, dxblock->mutable_data());
-    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
-
-    return dx;
-};
-
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor dW;
-    dW.ResetLike(W);
-
-    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
-    Block *inblock = x.block(), *dyblock = dy.block(),
-            *dwblock = dW.block();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
-            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
-            cch.workspace_.block()->mutable_data(),
-            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
-            dwblock->mutable_data());
-    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
-
-    return dW;
-};
-
-// input Tensor b for Reset db purpose, can avoid this later.
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor db;
-    db.ResetLike(b);
-
-    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
-        Block *dyblock = dy.block(), *dbblock = db.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
-                                     dyblock->data(), &beta, cch.bias_desc_,
-                                     dbblock->mutable_data());
-    }, {dy.block()}, {db.block()});
-
-    return db;
-};
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aa9c52ae/src/model/operation/convolution_related.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_related.h b/src/model/operation/convolution_related.h
deleted file mode 100644
index 49aab5b..0000000
--- a/src/model/operation/convolution_related.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <string>
-#include <vector>
-#include <cudnn.h>
-#include "../layer/cudnn_convolution.h"
-#include "../layer/cudnn_utils.h"
-#include "singa/utils/logging.h"
-
-namespace singa{
-
-struct Recorder{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-
-    size_t col_height_;
-    size_t col_width_;
-    size_t imagesize;
-};
-
-struct CudnnConvHandles{
-	cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;  
-};
-
-
-Recorder SetupRecorder(const Tensor &input, const std::vector<size_t> kernel_size, 
-	                const std::vector<size_t> stride, const std::vector<size_t> padding,
-	                const size_t in_channels, const size_t out_channels,
-	                const bool bias_term_);
-
-CudnnConvHandles InitCudnnConvHandles(const Tensor &input, const Recorder r, const size_t workspace_byte_limit_=1024*1024*1024,
-    				const std::string prefer_="fastest");
-
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const Recorder r, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
-
-
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const Recorder r);
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Recorder r);
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const Recorder r);
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const Recorder r);
-
-}
\ No newline at end of file


[18/18] incubator-singa git commit: Merge branch 'pr387'

Posted by wa...@apache.org.
Merge branch 'pr387'


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/56292f1f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/56292f1f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/56292f1f

Branch: refs/heads/master
Commit: 56292f1fb376b196b92e5c0fb81eaccc7ab5d5c6
Parents: 7a19e63 ac5f4eb
Author: Wang Wei <wa...@gmail.com>
Authored: Thu Jul 5 11:09:32 2018 +0800
Committer: Wang Wei <wa...@gmail.com>
Committed: Thu Jul 5 11:09:32 2018 +0800

----------------------------------------------------------------------
 examples/autograd/mlp.py           |   4 +-
 examples/autograd/mnist_cnn.py     |  11 +-
 python/singa/autograd.py           | 329 ++++++++++++++++-----------
 python/singa/tensor.py             |   2 +-
 src/CMakeLists.txt                 |   1 +
 src/api/model_operation.i          |  46 ++++
 src/api/singa.i                    |   1 +
 src/core/tensor/tensor_math_cpp.h  |  44 +++-
 src/model/layer/convolution.cc     |   4 +-
 src/model/layer/convolution.h      |  21 +-
 src/model/operation/convolution.cc | 384 ++++++++++++++++++++++++++++++++
 src/model/operation/convolution.h  |  93 ++++++++
 test/python/test_operation.py      |  74 ++++++
 13 files changed, 855 insertions(+), 159 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/56292f1f/python/singa/tensor.py
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/56292f1f/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------


[04/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

Functions for convolution operations

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/30ac41b6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/30ac41b6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/30ac41b6

Branch: refs/heads/master
Commit: 30ac41b6e13977363154457b5534c6fdcdcf9c8a
Parents: c343ff9
Author: xuewanqi <36...@users.noreply.github.com>
Authored: Thu May 31 20:56:27 2018 +0800
Committer: xuewanqi <xu...@u.nus.edu>
Committed: Wed Jun 20 14:47:05 2018 +0000

----------------------------------------------------------------------
 src/model/convolution functions.cpp | 398 +++++++++++++++++++++++++++++++
 1 file changed, 398 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/30ac41b6/src/model/convolution functions.cpp
----------------------------------------------------------------------
diff --git a/src/model/convolution functions.cpp b/src/model/convolution functions.cpp
new file mode 100644
index 0000000..d0aeb1a
--- /dev/null
+++ b/src/model/convolution functions.cpp	
@@ -0,0 +1,398 @@
+#include <iostream>
+#include <string>
+#include <cudnn.h>
+// Static convolution settings produced by SetupConv(). Member order matters:
+// SetupConv() fills the struct with positional brace-initialisation.
+struct ConvHandle{
+    size_t kernel_w_;
+    size_t pad_w_;
+    size_t stride_w_;
+    size_t kernel_h_;
+    size_t pad_h_;
+    size_t stride_h_;
+    size_t channels_;      // SetupConv's in_channels argument
+    size_t num_filters_;   // conv_conf.num_output()
+    bool bias_term_;
+    // cuDNN tuning: workspace limit (config value << 20) and lower-cased algorithm preference.
+    size_t workspace_byte_limit_;
+    std::string prefer_;
+};
+
+// cuDNN state returned by InitCudnn() for one input tensor.
+struct CudnnConvHandle{
+    cudnnTensorDescriptor_t x_desc_ ;
+    cudnnTensorDescriptor_t y_desc_ ;
+    cudnnTensorDescriptor_t bias_desc_ ;
+    cudnnFilterDescriptor_t filter_desc_ ;
+    cudnnConvolutionDescriptor_t conv_desc_ ;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+    size_t workspace_count_;
+    Tensor workspace_;
+    // InitCudnn() computes same-named locals from input.shape(0)/(2)/(3) and the conv output size — presumably stored here; confirm.
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+    size_t batchsize;
+};
+
+// Done in conv2d.__init__()
+// Collects the static convolution settings from conf.convolution_conf() into a
+// ConvHandle; in_channels is stored as channels_.
+//
+// kernel_size, pad and stride are repeated fields: a single entry is used for
+// both width and height; otherwise entry 0 is the width and entry 1 the height.
+// When a repeated field is empty the scalar *_w / *_h fields are read instead.
+//
+// Aborts via CHECK on an unsupported prefer value, a zero kernel side or a
+// zero stride_w.
+ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf){
+    ConvolutionConf conv_conf = conf.convolution_conf();
+
+    // Cast before shifting so the shift is done in size_t rather than in the
+    // (possibly 32-bit) type of the proto field, where a large limit could overflow.
+    const size_t workspace_byte_limit_ =
+            static_cast<size_t>(conv_conf.workspace_byte_limit()) << 20;
+    const std::string prefer_ = ToLowerCase(conv_conf.prefer());
+    CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+          prefer_ == "no_workspace" || prefer_ == "autotune")
+            << "CudnnConvolution only supports four algorithm preferences: fastest, "
+               "limited_workspace, no_workspace and autotune";
+
+    // Kernel size; both sides must be non-zero.
+    size_t kernel_w_, kernel_h_;
+    if (conv_conf.kernel_size_size() > 0) {
+        if (conv_conf.kernel_size_size() == 1) {
+            kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
+        } else {
+            kernel_w_ = conv_conf.kernel_size(0);
+            kernel_h_ = conv_conf.kernel_size(1);
+        }
+    } else {
+        kernel_w_ = conv_conf.kernel_w();
+        kernel_h_ = conv_conf.kernel_h();
+    }
+    CHECK_GT(kernel_w_, 0u);
+    CHECK_GT(kernel_h_, 0u);
+
+    // Padding. The values are unsigned, so there is no lower bound to check.
+    size_t pad_w_, pad_h_;
+    if (conv_conf.pad_size() > 0) {
+        if (conv_conf.pad_size() == 1) {
+            pad_w_ = pad_h_ = conv_conf.pad(0);
+        } else {
+            pad_w_ = conv_conf.pad(0);
+            pad_h_ = conv_conf.pad(1);
+        }
+    } else {
+        pad_w_ = conv_conf.pad_w();
+        pad_h_ = conv_conf.pad_h();
+    }
+
+    // Stride. Each side defaults to kStrideDefault when it is not configured.
+    const size_t kStrideDefault = 1;
+    size_t stride_w_, stride_h_;
+    if (conv_conf.stride_size() > 0) {
+        if (conv_conf.stride_size() == 1) {
+            stride_w_ = stride_h_ = conv_conf.stride(0);
+        } else {
+            stride_w_ = conv_conf.stride(0);
+            stride_h_ = conv_conf.stride(1);
+        }
+    } else {
+        stride_w_ = kStrideDefault;
+        stride_h_ = kStrideDefault;
+        if (conv_conf.has_stride_w()) {
+            stride_w_ = conv_conf.stride_w();
+        }
+        if (conv_conf.has_stride_h()) {
+            stride_h_ = conv_conf.stride_h();
+        }
+    }
+    CHECK_GT(stride_w_, 0u);
+    // stride_h_ may be 0 (1D conv) and is unsigned, so it needs no check.
+
+    const size_t num_filters_ = conv_conf.num_output();
+    const bool bias_term_ = conv_conf.bias_term();
+
+    // Positional initialisation: keep the order of ConvHandle's members.
+    return ConvHandle{
+            kernel_w_,
+            pad_w_,
+            stride_w_,
+            kernel_h_,
+            pad_h_,
+            stride_h_,
+
+            in_channels,
+            num_filters_,
+            bias_term_,
+
+            workspace_byte_limit_,
+            prefer_,
+    };
+}
+
+
+
+// Done in conv2d.__call__():
+// if self.cudnnconvhandle is None:
+//     self.cudnnconvhandle= InitCudnn(...)
+// elif x.shape(0) != self.cudnnconvhandle.batchsize:
+//     self.cudnnconvhandle= InitCudnn(...)
+//
+// Builds the cuDNN state needed to run one convolution configuration on the
+// device that holds `input`.
+//
+// input: 4-D tensor read as (batch, channels, height, width); only its shape,
+//        data type and device are used here.
+// ch:    static convolution settings (kernel, pad, stride, channel counts,
+//        bias flag, workspace byte limit and algorithm preference).
+//
+// Returns a CudnnConvHandle holding the tensor/filter/convolution
+// descriptors, the selected forward / backward-filter / backward-data
+// algorithms, one workspace tensor large enough for all three, and the
+// input/output spatial sizes plus the batch size the handle was built for.
+// Aborts (CHECK / LOG(FATAL)) on a channel mismatch, a non 4-D input, an
+// unknown ch.prefer_ value or any failing cuDNN call.
+CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
+    // input.shape(0..3) is read below, so the tensor has to be 4-D.
+    CHECK_EQ(input.nDim(), 4u);
+
+    cudnnTensorDescriptor_t x_desc_ = nullptr;
+    cudnnTensorDescriptor_t y_desc_ = nullptr;
+    cudnnTensorDescriptor_t bias_desc_ = nullptr;
+    cudnnFilterDescriptor_t filter_desc_ = nullptr;
+    cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
+    // Value-initialised so that no path can read an indeterminate algorithm.
+    cudnnConvolutionFwdAlgo_t fp_alg_{};
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_{};
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_{};
+    size_t workspace_count_;
+    Tensor workspace_;
+
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+
+    DataType dtype = input.data_type();
+    auto dev = input.device();
+    Context *ctx = dev->context(0);
+
+    size_t batchsize, channels_;
+    batchsize = input.shape(0);
+    channels_ = input.shape(1);
+    height_ = input.shape(2);
+    width_ = input.shape(3);
+
+    CHECK(channels_ == ch.channels_)<<"the number of input channels mismatched.";
+
+    // Output spatial size; the height stays 1 when no vertical stride is set.
+    conv_height_ = 1;
+    if (ch.stride_h_ > 0)
+        conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
+    conv_width_ = (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
+
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+    // The bias descriptor stays nullptr when the convolution has no bias.
+    if (ch.bias_term_) {
+        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+    }
+    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), batchsize,
+                                           ch.channels_, height_, width_));
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+            ch.num_filters_, conv_height_, conv_width_));
+    if (ch.bias_term_) {
+        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                               GetCudnnDataType(dtype), 1,
+                                               ch.num_filters_, 1, 1));
+    }
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, ch.pad_h_, ch.pad_w_,
+                                                ch.stride_h_, ch.stride_w_, 1, 1,
+                                                CUDNN_CROSS_CORRELATION,
+                                                GetCudnnDataType(dtype)));
+    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                           CUDNN_TENSOR_NCHW, ch.num_filters_,
+                                           channels_, ch.kernel_h_, ch.kernel_w_));
+    if (ch.prefer_ == "fastest" || ch.prefer_ == "limited_workspace" ||
+        ch.prefer_ == "no_workspace") {
+        // Heuristic selection: map the textual preference onto the three
+        // cuDNN preference enums and let cuDNN choose the algorithms.
+        cudnnConvolutionFwdPreference_t fwd_pref;
+        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+        if (ch.prefer_ == "fastest") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+        } else if (ch.prefer_ == "limited_workspace") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        } else {
+            // "no_workspace": all three passes must ask for a workspace-free
+            // algorithm (the backward-data one used to request
+            // SPECIFY_WORKSPACE_LIMIT by mistake).
+            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+        }
+        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+                ch.workspace_byte_limit_, &fp_alg_));
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                bwd_filt_pref, ch.workspace_byte_limit_, &bp_filter_alg_));
+        // deprecated in cudnn v7
+        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                bwd_data_pref, ch.workspace_byte_limit_, &bp_data_alg_));
+    } else if (ch.prefer_ == "autotune") {
+        // Benchmark-based selection: keep only the best (top-1) candidate
+        // reported by cuDNN for each of the three passes.
+        const int topk = 1;
+        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+                &num_fp_alg, fp_alg_perf));
+        fp_alg_ = fp_alg_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                &num_bp_filt_alg, bp_filt_perf));
+        bp_filter_alg_ = bp_filt_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                &num_bp_data_alg, bp_data_perf));
+        bp_data_alg_ = bp_data_perf[0].algo;
+    } else {
+        LOG(FATAL) << "Preferred algorithm is not available!";
+    }
+
+    // One workspace is shared by the three passes, so size it for the most
+    // demanding one. The count is expressed in float-sized elements, rounded
+    // up by one element.
+    size_t fp_byte, bp_data_byte, bp_filter_byte;
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+            &fp_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+            bp_data_alg_, &bp_data_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+            bp_filter_alg_, &bp_filter_byte));
+    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                       sizeof(float) +
+                       1;
+    // Exceeding the requested limit is only reported, not treated as an error.
+    if (workspace_count_ * sizeof(float) > ch.workspace_byte_limit_)
+        LOG(WARNING) << "The required memory for workspace ("
+                     << workspace_count_ * sizeof(float)
+                     << ") is larger than the expected Bytes ("
+                     << ch.workspace_byte_limit_ << ")";
+    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+
+    // The initialiser order must follow the member order of CudnnConvHandle.
+    return CudnnConvHandle{
+            x_desc_,
+            y_desc_,
+            bias_desc_,
+            filter_desc_,
+            conv_desc_,
+            fp_alg_,
+            bp_filter_alg_,
+            bp_data_alg_,
+
+            workspace_count_,
+            workspace_,
+
+            height_,
+            width_,
+            conv_height_,
+            conv_width_,
+            batchsize,
+    };
+
+}
+
+// Forward pass of the cuDNN convolution.
+//
+// x:   input tensor; must live on a CUDA device and match the batch size,
+//      channel count, height and width the handles were built for.
+// W:   filter weights.
+// b:   bias; only read when ch.bias_term_ is true.
+// ch:  static convolution settings.
+// cch: cuDNN descriptors, algorithms and workspace created by InitCudnn.
+//
+// Returns a new tensor of shape
+// (batchsize, num_filters, conv_height, conv_width) on x's device.
+Tensor CudnnConvForward(Tensor x, Tensor W, Tensor b, const ConvHandle ch, const CudnnConvHandle cch){
+    CHECK_EQ(x.device()->lang(), kCuda);
+    CHECK_EQ(x.nDim(), 4u);
+    CHECK_EQ(x.shape()[0],cch.batchsize);
+    CHECK_EQ(x.shape()[1],ch.channels_);
+    CHECK_EQ(x.shape()[2],cch.height_);
+    CHECK_EQ(x.shape()[3],cch.width_);
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+
+    Shape shape{cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    // Everything the closure touches (x, W, output, cch) is captured by
+    // value; W and cch were missing from the capture list before.
+    // The workspace is written by cuDNN, so it is listed with the write
+    // blocks, as the backward functions do.
+    output.device()->Exec([x, W, output, cch](Context *ctx) {
+        Block *inblock = x.block(), *outblock = output.block(),
+                *wblock = W.block();
+        // alpha = 1, beta = 0: output is overwritten with conv(x, W).
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
+                                inblock->data(), cch.filter_desc_, wblock->data(),
+                                cch.conv_desc_, cch.fp_alg_,
+                                cch.workspace_.block()->mutable_data(),
+                                cch.workspace_count_ * sizeof(float), &beta,
+                                cch.y_desc_, outblock->mutable_data());
+    }, {x.block(), W.block()}, {output.block(), cch.workspace_.block()});
+
+    if (ch.bias_term_) {
+        // b and cch were missing from the capture list before.
+        output.device()->Exec([output, b, cch](Context *ctx) {
+            // alpha = beta = 1: the bias is accumulated onto the output.
+            float beta = 1.f, alpha = 1.0f;
+            Block *outblock = output.block(), *bblock = b.block();
+            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
+                           bblock->data(), &beta, cch.y_desc_,
+                           outblock->mutable_data());
+        }, {output.block(), b.block()}, {output.block()});
+    }
+    return output;
+}
+
+// Computes the gradient of the loss with respect to the filter weights.
+//
+// dy:  gradient flowing back from the output; must be a 4-D CUDA tensor.
+// x:   the input that was fed to the forward pass.
+// W:   only used as the template for dW through ResetLike (can be avoided
+//      later).
+// cch: cuDNN descriptors, algorithms and workspace created by InitCudnn.
+//
+// Returns dW, a new tensor reset to be like W.
+Tensor CudnnConvBackwardW(Tensor dy, Tensor x, Tensor W, CudnnConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor dW;
+    dW.ResetLike(W);
+
+    // cch is used inside the closure and therefore has to be captured too.
+    dy.device()->Exec([dy, dW, x, cch](Context *ctx) {
+        Block *inblock = x.block(), *dyblock = dy.block(),
+                *dwblock = dW.block();
+        // alpha = 1, beta = 0: dW is overwritten, not accumulated.
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardFilter(
+                ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
+                cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
+                cch.workspace_.block()->mutable_data(),
+                cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
+                dwblock->mutable_data());
+    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+
+    return dW;
+}
+
+// Computes the gradient of the loss with respect to the bias.
+//
+// dy:  gradient flowing back from the output; must be a 4-D CUDA tensor.
+// b:   only used as the template for db through ResetLike (can be avoided
+//      later).
+// cch: cuDNN descriptors created by InitCudnn (y_desc_ and bias_desc_ are
+//      read here).
+//
+// Returns db, a new tensor reset to be like b.
+Tensor CudnnConvBackwardb(Tensor dy, Tensor b, CudnnConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor db;
+    db.ResetLike(b);
+
+    // cch is used inside the closure and therefore has to be captured too.
+    dy.device()->Exec([dy, db, cch](Context *ctx) {
+        Block *dyblock = dy.block(), *dbblock = db.block();
+        // alpha = 1, beta = 0: db is overwritten, not accumulated.
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
+                                     dyblock->data(), &beta, cch.bias_desc_,
+                                     dbblock->mutable_data());
+    }, {dy.block()}, {db.block()});
+    return db;
+}
+
+// Computes the gradient of the loss with respect to the convolution input.
+//
+// dy:  gradient flowing back from the output; must be a 4-D CUDA tensor.
+// W:   filter weights used in the forward pass.
+// x:   only used as the template for dx through ResetLike (can be avoided
+//      later).
+// cch: cuDNN descriptors, algorithms and workspace created by InitCudnn.
+//
+// Returns dx, a new tensor reset to be like x.
+Tensor CudnnConvBackwardx(Tensor dy, Tensor W, Tensor x, CudnnConvHandle cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor dx;
+    dx.ResetLike(x);
+
+    // W and cch are used inside the closure and therefore have to be
+    // captured as well.
+    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
+        Block *wblock = W.block(), *dyblock = dy.block(),
+                *dxblock = dx.block();
+        // alpha = 1, beta = 0: dx is overwritten, not accumulated.
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
+                                     wblock->data(), cch.y_desc_, dyblock->data(),
+                                     cch.conv_desc_, cch.bp_data_alg_,
+                                     cch.workspace_.block()->mutable_data(),
+                                     cch.workspace_count_ * sizeof(float), &beta,
+                                     cch.x_desc_, dxblock->mutable_data());
+    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
+
+    return dx;
+}
+


[05/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- integrate convolution functions into conv2d autograd operation (GPU part)

- export the field 'batchsize' of CudnnConvHandle to python as it is needed in
  Conv2d_GPU.__call__().

- set default 'workspace_byte_limit' as 1GB, which is consistent
  with the default setting in Conv2D Layer.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c57b87ae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c57b87ae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c57b87ae

Branch: refs/heads/master
Commit: c57b87ae7ffd051d818b048de3c20c69643cbd25
Parents: 2cac057
Author: xuewanqi <xu...@u.nus.edu>
Authored: Wed Jun 20 08:46:25 2018 +0000
Committer: xuewanqi <xu...@u.nus.edu>
Committed: Wed Jun 20 14:47:37 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py          | 99 ++++++++++++++++++++++++++++++++++
 src/api/model_operation.i         |  4 +-
 src/model/convolution_functions.h |  2 +-
 3 files changed, 102 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c57b87ae/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 83362e2..c7e0adb 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -583,6 +583,105 @@ class Flatten(Operation):
 def flatten(x):
     return Flatten()(x)[0]
 
+class Conv2d_GPU(Operation):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True, **kwargs):
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        if isinstance(kernel_size, int):
+            self.kernel_size = (kernel_size, kernel_size)
+        elif isinstance(kernel_size, tuple):
+            self.kernel_size = kernel_size
+        else:
+            raise TypeError('Wrong kernel_size type.')
+        
+        if isinstance(stride, int):
+            self.stride = (stride,stride)
+        elif isinstance(stride, tuple):
+            self.stride = stride
+        else:
+            raise TypeError('Wrong stride type.')
+
+        if isinstance(padding, int):
+            self.padding = (padding,padding)
+        elif isinstance(padding, tuple):
+            self.padding = padding
+        else:
+            raise TypeError('Wrong padding type.')
+
+        if dilation != 1 or groups != 1:
+            raise ValueError('Not implemented yet')
+
+        self.bias = bias
+
+        inner_params = {'cudnn_prefer': 'fastest', 'workspace_byte_limit': 1024}
+        # TODO valid value of inner_params check
+
+        for kwarg in kwargs:
+            if kwarg not in inner_params:
+                raise TypeError('Keyword argument not understood:', kwarg)
+            else:
+                inner_params[kwarg] = kwargs[kwarg]
+
+        self.convhandle = singa.SetupConv(self.kernel_size[0], self.kernel_size[1],
+        			self.padding[0], self.padding[1], self.stride[0], self.stride[1],
+        			self.bias, inner_params['workspace_byte_limit']*1024*1024,
+        			inner_params['cudnn_prefer'])
+        
+        w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1])
+        self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
+        std = math.sqrt(
+                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
+        self.W.gaussian(0.0, std)
+
+        if self.bias:
+            b_shape = (self.out_channels,)
+        else:
+            b_shape = (1,) #to keep consistency when to do forward.
+        self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
+        self.b.set_value(0.0)
+
+
+    def __call__(self, x):
+        assert x.ndim() == 4, 'The dimensions of input should be 4D.'
+        assert x.shape[1] == self.in_channels, 'in_channels dismatched.'
+        assert 0 == 0, 'invalid padding.'
+    	# TODO valid padding check.
+
+    	if not hasattr (self, cudnnconvhandle):
+    	    self.cudnnconvhandle = singa.InitCudnn(x.data, self.convhandle)
+    	elif x.shape[0] != self.cudnnconvhandle.batchsize:
+    	    self.cudnnconvhandle = singa.InitCudnn(x.data, self.convhandle)
+
+    	self.dev = x.device
+
+    	self.W.to_device(self.dev)
+    	xs = [x, self.W]
+    	
+    	self.b.to_device(self.dev)
+    	xs.append(self.b)
+    	return self._do_forward(*xs)[0]
+
+    def forward(self, *xs):
+        if training:
+    	    self.x = xs[0]
+        return singa.CudnnConvForward(xs[0], xs[1], xs[2], self.convhandle, self.cudnnconvhandle)
+
+    def backward(self, dy):
+        assert training is True and hasattr(self, x), 'Please set \'trainging\' as True before do BP. '
+
+        # todo check device?
+        dy.ToDevice(self.dev)
+
+        dx = singa.CudnnConvBackwardx(dy, self.W, self.x, self.cch)
+        dW = singa.CudnnConvBackwardW(dy, self.x, self.W, self.cch)
+        if self.bias:
+    	    db = singa.CudnnConvBackwardb(dy, self.b, self.cch)
+    	    return dx, dW, db
+        else:
+    	    return dx, dW
 
 def infer_dependency(op):
     '''

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c57b87ae/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 77ef6bb..a74ec5e 100644
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -7,14 +7,14 @@ namespace singa{
 
 struct ConvHandle{};
 
-struct CudnnConvHandle{};
+struct CudnnConvHandle{size_t batchsize;};
 
 ConvHandle SetupConv(
     const size_t kernel_h_, const size_t kernel_w_,
     const size_t pad_h_, const size_t pad_w_,
     const size_t stride_h_,const size_t stride_w_,
     const size_t channels_, const size_t num_filters_,
-    const bool bias_term_ = true, const size_t workspace_byte_limit_ =1024*1024,
+    const bool bias_term_ = true, const size_t workspace_byte_limit_ =1024*1024*1024,
     const std::string prefer_="fastest");
 
 CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c57b87ae/src/model/convolution_functions.h
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.h b/src/model/convolution_functions.h
index 9462805..e34423f 100644
--- a/src/model/convolution_functions.h
+++ b/src/model/convolution_functions.h
@@ -48,7 +48,7 @@ ConvHandle SetupConv(
     const size_t pad_h_, const size_t pad_w_,
     const size_t stride_h_,const size_t stride_w_,
     const size_t channels_, const size_t num_filters_,
-    const bool bias_term_ = true ,const size_t workspace_byte_limit_=1024*1024,
+    const bool bias_term_ = true ,const size_t workspace_byte_limit_=1024*1024*1024,
     const std::string prefer_="fastest");
 
 void testInitCudnn(const Tensor &input, const ConvHandle ch);


[16/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- fixed some bugs.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/4a45ee6f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/4a45ee6f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/4a45ee6f

Branch: refs/heads/master
Commit: 4a45ee6f080bb9eacdeb1294047a78a3dbd4635a
Parents: 5340b65
Author: xuewanqi <xu...@outlook.com>
Authored: Wed Jul 4 06:46:09 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Wed Jul 4 07:31:58 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py           | 28 ++++++++++++++++------------
 src/api/model_operation.i          | 18 +++++++++---------
 src/model/layer/convolution.cc     |  4 ++--
 src/model/layer/convolution.h      | 21 +++++++++++----------
 src/model/operation/convolution.cc | 28 ++++++++++++++--------------
 src/model/operation/convolution.h  | 18 +++++++++---------
 test/python/test_operation.py      |  4 ++--
 7 files changed, 63 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4a45ee6f/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 2a10608..b05f701 100755
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -463,7 +463,10 @@ class _Conv2D(Operation):
         #assert 0 == 0, 'invalid padding'
 
         if training:
-            self.inputs = (x, W, b)
+            if self.handle.bias_term_:
+                self.inputs = (x, W, b)
+            else:
+                self.inputs = (x, W)
 
         if self.handle.device_id == -1:
             return singa.CpuConvForward(x, W, b, self.handle)
@@ -717,32 +720,33 @@ class Conv2D(NewLayer):
         else:
             # to keep consistency when to do forward.
             self.b = Tensor(data=CTensor(
-                [1]), requires_grad=False, stores_grad=False)
-            self.b.set_value(0.0)
+                []), requires_grad=False, stores_grad=False)
 
     def __call__(self, x):
-        assert x.shape[1] == self.in_channels,'in_channels dismatched'
-        assert (x.shape[2]+2*self.padding[0]-self.kernel_size[0])%self.stride[0] == 0, 'invalid padding or strides.'
-        assert (x.shape[3]+2*self.padding[1]-self.kernel_size[1])%self.stride[1] == 0, 'invalid padding or stride.'
+        assert x.shape[1] == self.in_channels, 'in_channels dismatched'
+        assert (x.shape[2] + 2 * self.padding[0] - self.kernel_size[0]
+                ) % self.stride[0] == 0, 'invalid padding or strides.'
+        assert (x.shape[3] + 2 * self.padding[1] - self.kernel_size[1]
+                ) % self.stride[1] == 0, 'invalid padding or stride.'
 
         self.device_check(x, self.W, self.b)
 
         if x.device.id() == -1:
             if not hasattr(self, 'handle'):
                 self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
-                                                 self.padding, self.in_channels, self.out_channels, self.bias)
+                                               self.padding, self.in_channels, self.out_channels, self.bias)
             elif x.shape[0] != self.handle.batchsize:
                 self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
-                                                 self.padding, self.in_channels, self.out_channels, self.bias)
+                                               self.padding, self.in_channels, self.out_channels, self.bias)
         else:
             if not hasattr(self, 'handle'):
                 self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
-                                                      self.padding, self.in_channels, self.out_channels, self.bias,
-                                                      self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
+                                                    self.padding, self.in_channels, self.out_channels, self.bias,
+                                                    self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
             elif x.shape[0] != self.handle.batchsize:
                 self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
-                                                      self.padding, self.in_channels, self.out_channels, self.bias,
-                                                      self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
+                                                    self.padding, self.in_channels, self.out_channels, self.bias,
+                                                    self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
         self.handle.device_id = x.device.id()
 
         y = conv2d(x, self.W, self.b, self.handle)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4a45ee6f/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 58e5270..26f5c69 100755
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -10,10 +10,10 @@ struct ConvHandle{
 		size_t batchsize;
         const bool bias_term_;
 
-		ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,
-                    const bool bias_term_);
+		ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
+             const std::vector<size_t>& stride, const std::vector<size_t>& padding,
+             const size_t in_channels, const size_t out_channels,
+             const bool bias);
               	};
 
 struct CudnnConvHandle{
@@ -21,11 +21,11 @@ struct CudnnConvHandle{
 		size_t batchsize;
         const bool bias_term_;
 		
-		CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,
-                    const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024,
-                    const std::string prefer_="fastest");
+		CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
+                  const std::vector<size_t>& stride, const std::vector<size_t>& padding,
+                  const size_t in_channels, const size_t out_channels,
+                  const bool bias, const size_t workspace_byte_limit_ = 1024 * 1024 * 1024,
+                  const std::string& prefer_ = "fastest");
                 };
 
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4a45ee6f/src/model/layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
old mode 100644
new mode 100755
index 3fc7afb..cc77433
--- a/src/model/layer/convolution.cc
+++ b/src/model/layer/convolution.cc
@@ -194,7 +194,7 @@ void Convolution::ToDevice(std::shared_ptr<Device> device) {
   bias_.ToDevice(device);
 }
 
-void Convolution::Im2col(const float *data_im, const int channels,
+void Im2col(const float *data_im, const int channels,
                          const int height, const int width,
                          const int kernel_h, const int kernel_w,
                          const int pad_h, const int pad_w,
@@ -221,7 +221,7 @@ void Convolution::Im2col(const float *data_im, const int channels,
   }
 }
 
-void Convolution::Col2im(const float *data_col, const int channels,
+void Col2im(const float *data_col, const int channels,
                          const int height, const int width,
                          const int kernel_h, const int kernel_w,
                          const int pad_h, const int pad_w,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4a45ee6f/src/model/layer/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h
old mode 100644
new mode 100755
index 89b5319..d11cdeb
--- a/src/model/layer/convolution.h
+++ b/src/model/layer/convolution.h
@@ -46,16 +46,6 @@ class Convolution : public Layer {
 
   void ToDevice(std::shared_ptr<Device> device) override;
 
-  void Im2col(const float* data_im, const int channels, const int height,
-              const int width, const int kernel_h, const int kernel_w,
-              const int pad_h, const int pad_w, const int stride_h,
-              const int stride_w, float* data_col);
-
-  void Col2im(const float* data_col, const int channels, const int height,
-              const int width, const int kernel_h, const int kernel_w,
-              const int pad_h, const int pad_w, const int stride_h,
-              const int stride_w, float* data_im);
-
   const std::vector<Tensor> param_values() override {
     if (bias_term_)
       return std::vector<Tensor>{weight_, bias_};
@@ -97,5 +87,16 @@ class Convolution : public Layer {
   bool bias_term_;
   vector<size_t> out_sample_shape_;
 };
+
+void Im2col(const float* data_im, const int channels, const int height,
+            const int width, const int kernel_h, const int kernel_w,
+            const int pad_h, const int pad_w, const int stride_h,
+            const int stride_w, float* data_col);
+
+void Col2im(const float* data_col, const int channels, const int height,
+            const int width, const int kernel_h, const int kernel_w,
+            const int pad_h, const int pad_w, const int stride_h,
+            const int stride_w, float* data_im);
+            
 }  // namespace singa
 #endif  // SRC_MODEL_LAYER_CONVOLUTION_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4a45ee6f/src/model/operation/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
index d64fbc1..9a702fa 100755
--- a/src/model/operation/convolution.cc
+++ b/src/model/operation/convolution.cc
@@ -1,10 +1,10 @@
 #include "./convolution.h"
-// #include "../layer/convolution.h"
-#include<iostream>
+#include "../layer/convolution.h"
+
 
 namespace singa {
 
-ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size,
+ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
                        const std::vector<size_t>& stride, const std::vector<size_t>& padding,
                        const size_t in_channels, const size_t out_channels,
                        const bool bias) {
@@ -37,7 +37,7 @@ ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t> kernel_siz
   imagesize = input.Size() / batchsize;
 }
 
-// Convolution C;
+
 
 Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch) {
   CHECK_EQ(x.device()->lang(), kCpp);
@@ -67,7 +67,7 @@ Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &
   float *data_col = new float[ch.col_height_ * ch.col_width_];
   auto in_data = x.data<float>();
   for (size_t num = 0; num < ch.batchsize; num++) {
-    C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+    Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
              ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
 
     col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
@@ -105,7 +105,7 @@ Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Conv
     CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
     Tensor dcol_b = Mult(W.T(), grad_b);
     auto dcol_data = dcol_b.data<float>();
-    C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
+    Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
              ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
     dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
   }
@@ -134,7 +134,7 @@ Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, cons
   float *data_col = new float[ch.col_height_ * ch.col_width_];
   auto in_data = dy.data<float>();
   for (size_t num = 0; num < ch.batchsize; num++) {
-    C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+    Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
              ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
     col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
     Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
@@ -171,9 +171,9 @@ Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch)
 #ifdef USE_CUDNN
 CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
                                  const std::vector<size_t>& stride, const std::vector<size_t>& padding,
-                                 const size_t in_channels, const size_t out_channels, const bool bias_term_,
+                                 const size_t in_channels, const size_t out_channels, const bool bias,
                                  const size_t workspace_byte_limit_, const std::string& prefer_)
-  : ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_) {
+  : ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels, bias) {
 
   DataType dtype = input.data_type();
   auto dev = input.device();
@@ -295,7 +295,7 @@ Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const C
   Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
   Tensor output(shape, dev, dtype);
 
-  output.device()->Exec([output, x, W, cch](Context * ctx) {
+  output.device()->Exec([&output, &x, &W, &cch](Context * ctx) {
     Block *inblock = x.block(), *outblock = output.block(),
            *wblock = W.block();
     float alpha = 1.f, beta = 0.f;
@@ -308,7 +308,7 @@ Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const C
   }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
 
   if (cch.bias_term_) {
-    output.device()->Exec([output, b, cch](Context * ctx) {
+    output.device()->Exec([&output, &b, &cch](Context * ctx) {
       float beta = 1.f, alpha = 1.0f;
       Block *outblock = output.block(), *bblock = b.block();
       cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
@@ -326,7 +326,7 @@ Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, cons
   Tensor dx;
   dx.ResetLike(x);
 
-  dy.device()->Exec([dx, dy, W, cch](Context * ctx) {
+  dy.device()->Exec([&dx, &dy, &W, &cch](Context * ctx) {
     Block *wblock = W.block(), *dyblock = dy.block(),
            *dxblock = dx.block();
     float alpha = 1.f, beta = 0.f;
@@ -347,7 +347,7 @@ Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, cons
   Tensor dW;
   dW.ResetLike(W);
 
-  dy.device()->Exec([dW, dy, x, W, cch](Context * ctx) {
+  dy.device()->Exec([&dW, &dy, &x, &cch](Context * ctx) {
     Block *inblock = x.block(), *dyblock = dy.block(),
            *dwblock = dW.block();
     float alpha = 1.f, beta = 0.f;
@@ -369,7 +369,7 @@ Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle
   Tensor db;
   db.ResetLike(b);
 
-  dy.device()->Exec([db, dy, b, cch](Context * ctx) {
+  dy.device()->Exec([&db, &dy, &cch](Context * ctx) {
     Block *dyblock = dy.block(), *dbblock = db.block();
     float alpha = 1.f, beta = 0.f;
     cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4a45ee6f/src/model/operation/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h
index a114b47..93f7775 100755
--- a/src/model/operation/convolution.h
+++ b/src/model/operation/convolution.h
@@ -3,12 +3,12 @@
 
 #include <string>
 #include <vector>
+#include "singa/core/tensor.h"
 #include "singa/utils/logging.h"
 
 #ifdef USE_CUDNN
 #include <cudnn.h>
-// #include "../layer/cudnn_convolution.h"
-// #include "../layer/cudnn_utils.h"
+#include "../layer/cudnn_utils.h"
 #endif // USE_CUDNN
 
 
@@ -21,7 +21,7 @@ class ConvHandle {
              const std::vector<size_t>& stride, const std::vector<size_t>& padding,
              const size_t in_channels, const size_t out_channels,
              const bool bias);
- protected:
+ 
   size_t kernel_w_;
   size_t pad_w_;
   size_t stride_w_;
@@ -66,12 +66,12 @@ class CudnnConvHandle: public ConvHandle {
                   const std::string& prefer_ = "fastest");
   ~CudnnConvHandle();
   // TODO(wangwei) add the destructor
- protected:
-  cudnnTensorDescriptor_t x_desc_ ;
-  cudnnTensorDescriptor_t y_desc_ ;
-  cudnnTensorDescriptor_t bias_desc_ ;
-  cudnnFilterDescriptor_t filter_desc_ ;
-  cudnnConvolutionDescriptor_t conv_desc_ ;
+ 
+  cudnnTensorDescriptor_t x_desc_ = nullptr;
+  cudnnTensorDescriptor_t y_desc_ = nullptr;
+  cudnnTensorDescriptor_t bias_desc_ = nullptr;
+  cudnnFilterDescriptor_t filter_desc_ = nullptr;
+  cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
   cudnnConvolutionFwdAlgo_t fp_alg_;
   cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
   cudnnConvolutionBwdDataAlgo_t bp_data_alg_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4a45ee6f/test/python/test_operation.py
----------------------------------------------------------------------
diff --git a/test/python/test_operation.py b/test/python/test_operation.py
old mode 100644
new mode 100755
index 1bbc70c..315a992
--- a/test/python/test_operation.py
+++ b/test/python/test_operation.py
@@ -48,7 +48,7 @@ class TestPythonOperation(unittest.TestCase):
 
         # forward without bias
         y_without_bias = conv_without_bias_0(gpu_input_tensor)
-        self.check_shape(y.shape, (2, 1, 2, 2))
+        self.check_shape(y_without_bias.shape, (2, 1, 2, 2))
 
     def test_conv2d_cpu(self):
         # (in_channels, out_channels, kernel_size)
@@ -68,7 +68,7 @@ class TestPythonOperation(unittest.TestCase):
 
         # forward without bias
         y_without_bias = conv_without_bias_1(cpu_input_tensor)
-        self.check_shape(y.shape, (2, 1, 2, 2))
+        self.check_shape(y_without_bias.shape, (2, 1, 2, 2))
 
 if __name__ == '__main__':
     unittest.main()


[17/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- tidy up the code and rename some variables


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ac5f4eb2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ac5f4eb2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ac5f4eb2

Branch: refs/heads/master
Commit: ac5f4eb2a245f7515f01321b4fe259fc4f58146c
Parents: 4a45ee6
Author: xuewanqi <xu...@outlook.com>
Authored: Thu Jul 5 03:03:03 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Thu Jul 5 03:03:03 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py           |   6 +-
 src/api/model_operation.i          |  28 ++--
 src/model/operation/convolution.cc | 280 ++++++++++++++++----------------
 src/model/operation/convolution.h  |  62 +++----
 4 files changed, 187 insertions(+), 189 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ac5f4eb2/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index b05f701..80209ff 100755
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -463,7 +463,7 @@ class _Conv2D(Operation):
         #assert 0 == 0, 'invalid padding'
 
         if training:
-            if self.handle.bias_term_:
+            if self.handle.bias_term:
                 self.inputs = (x, W, b)
             else:
                 self.inputs = (x, W)
@@ -486,7 +486,7 @@ class _Conv2D(Operation):
                 dy, self.inputs[1], self.inputs[0], self.handle)
             dW = singa.CpuConvBackwardW(
                 dy, self.inputs[0], self.inputs[1], self.handle)
-            if self.handle.bias_term_:
+            if self.handle.bias_term:
                 db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handle)
                 return dx, dW, db
             else:
@@ -496,7 +496,7 @@ class _Conv2D(Operation):
                 dy, self.inputs[1], self.inputs[0], self.handle)
             dW = singa.GpuConvBackwardW(
                 dy, self.inputs[0], self.inputs[1], self.handle)
-            if self.handle.bias_term_:
+            if self.handle.bias_term:
                 db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handle)
                 return dx, dW, db
             else:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ac5f4eb2/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 26f5c69..2c13a3b 100755
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -5,28 +5,26 @@
 %}
 namespace singa{
 
-struct ConvHandle{
-
-		size_t batchsize;
-        const bool bias_term_;
-
-		ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
+class ConvHandle {
+ public:
+  ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
              const std::vector<size_t>& stride, const std::vector<size_t>& padding,
              const size_t in_channels, const size_t out_channels,
              const bool bias);
-              	};
+  bool bias_term;
+  size_t batchsize;
+};
 
 struct CudnnConvHandle{
-
-		size_t batchsize;
-        const bool bias_term_;
-		
-		CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
+ public:
+	CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
                   const std::vector<size_t>& stride, const std::vector<size_t>& padding,
                   const size_t in_channels, const size_t out_channels,
-                  const bool bias, const size_t workspace_byte_limit_ = 1024 * 1024 * 1024,
-                  const std::string& prefer_ = "fastest");
-                };
+                  const bool bias, const size_t workspace_byte_limit = 1024 * 1024 * 1024,
+                  const std::string& prefer = "fastest");
+  bool bias_term;
+  size_t batchsize; 
+};
 
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ac5f4eb2/src/model/operation/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
index 9a702fa..e36df43 100755
--- a/src/model/operation/convolution.cc
+++ b/src/model/operation/convolution.cc
@@ -8,32 +8,32 @@ ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_si
                        const std::vector<size_t>& stride, const std::vector<size_t>& padding,
                        const size_t in_channels, const size_t out_channels,
                        const bool bias) {
-  kernel_h_ = kernel_size[0];
-  kernel_w_ = kernel_size[1];
+  kernel_h = kernel_size[0];
+  kernel_w = kernel_size[1];
 
-  pad_h_ = padding[0];
-  pad_w_ = padding[1];
+  pad_h = padding[0];
+  pad_w = padding[1];
 
-  stride_h_ = stride[0];
-  stride_w_ = stride[1];
+  stride_h = stride[0];
+  stride_w = stride[1];
 
-  channels_ = in_channels;
-  num_filters_ = out_channels;
+  channels = in_channels;
+  num_filters = out_channels;
 
-  bias_term_ = bias;
+  bias_term = bias;
 
   batchsize = input.shape(0);
   CHECK(input.shape(1) == in_channels) << "the number of input channels mismatched.";
-  height_ = input.shape(2);
-  width_ = input.shape(3);
+  height = input.shape(2);
+  width = input.shape(3);
 
-  conv_height_ = 1;
-  if (stride_h_ > 0)
-    conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
-  conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
+  conv_height = 1;
+  if (stride_h > 0)
+    conv_height = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  conv_width = (width + 2 * pad_w - kernel_w) / stride_w + 1;
 
-  col_height_ = in_channels * kernel_w_ * kernel_h_;
-  col_width_ = conv_height_ * conv_width_;
+  col_height = in_channels * kernel_w * kernel_h;
+  col_width = conv_height * conv_width;
   imagesize = input.Size() / batchsize;
 }
 
@@ -42,43 +42,43 @@ ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_si
 Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch) {
   CHECK_EQ(x.device()->lang(), kCpp);
 
-  CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
-        x.shape(3) == ch.width_) << "input sample shape should not change";
+  CHECK(x.shape(1) == ch.channels && x.shape(2) == ch.height &&
+        x.shape(3) == ch.width) << "input sample shape should not change";
 
-  CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ &&
-        W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+  CHECK(W.shape(0) == ch.num_filters && W.shape(1) == ch.channels &&
+        W.shape(2) == ch.kernel_h && W.shape(3) == ch.kernel_w) << "weights shape should not change";
 
   Shape w_shape = W.shape();
   Shape b_shape;
-  if (ch.bias_term_)
+  if (ch.bias_term)
     b_shape = b.shape();
 
-  W.Reshape(Shape{ch.num_filters_, ch.col_height_});
-  if (ch.bias_term_)
-    b.Reshape(Shape{ch.num_filters_});
+  W.Reshape(Shape{ch.num_filters, ch.col_height});
+  if (ch.bias_term)
+    b.Reshape(Shape{ch.num_filters});
 
   DataType dtype = x.data_type();
   auto dev = x.device();
-  Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_};
+  Shape shape{ch.batchsize, ch.num_filters, ch.conv_height, ch.conv_width};
   Tensor output(shape, dev, dtype);
 
-  Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+  Tensor col_data(Shape{ch.col_height, ch.col_width});//broadcasted image
 
-  float *data_col = new float[ch.col_height_ * ch.col_width_];
+  float *data_col = new float[ch.col_height * ch.col_width];
   auto in_data = x.data<float>();
   for (size_t num = 0; num < ch.batchsize; num++) {
-    Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
-             ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
+    Im2col(in_data + num * ch.imagesize, ch.channels, ch.height, ch.width, ch.kernel_h,
+             ch.kernel_w, ch.pad_h, ch.pad_w, ch.stride_h, ch.stride_w, data_col);
 
-    col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
+    col_data.CopyDataFromHostPtr(data_col, ch.col_height * ch.col_width);
     Tensor each = Mult(W, col_data);
-    if (ch.bias_term_) {
+    if (ch.bias_term) {
       AddColumn(b, &each);
     }
     CopyDataToFrom(&output, each, each.Size(), num * each.Size());
   };
   W.Reshape(w_shape);
-  if (ch.bias_term_)
+  if (ch.bias_term)
     b.Reshape(b_shape);
   return output;
 }
@@ -86,14 +86,14 @@ Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &
 Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch) {
   CHECK_EQ(dy.device()->lang(), kCpp);
 
-  CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-        dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+  CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
+        dy.shape(3) == ch.conv_width) << "input gradients shape should not change";
 
-  CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ &&
-        W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+  CHECK(W.shape(0) == ch.num_filters && W.shape(1) == ch.channels &&
+        W.shape(2) == ch.kernel_h && W.shape(3) == ch.kernel_w) << "weights shape should not change";
 
   Shape w_shape = W.shape();
-  W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+  W.Reshape(Shape{ch.num_filters, ch.col_height});
 
   Tensor dx;
   dx.ResetLike(x);
@@ -101,12 +101,12 @@ Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Conv
   float *dx_b = new float[ch.imagesize];
 
   for (size_t num = 0; num < ch.batchsize; num++) {
-    Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+    Tensor grad_b(Shape{ch.num_filters, ch.conv_height * ch.conv_width});
     CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
     Tensor dcol_b = Mult(W.T(), grad_b);
     auto dcol_data = dcol_b.data<float>();
-    Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
-             ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
+    Col2im(dcol_data, ch.channels, ch.height, ch.width, ch.kernel_h, ch.kernel_w, ch.pad_h,
+             ch.pad_w, ch.stride_h, ch.stride_w, dx_b);
     dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
   }
   W.Reshape(w_shape);
@@ -116,28 +116,28 @@ Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Conv
 Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch) {
   CHECK_EQ(dy.device()->lang(), kCpp);
 
-  CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-        dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+  CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
+        dy.shape(3) == ch.conv_width) << "input gradients shape should not change";
 
-  CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
-        x.shape(3) == ch.width_) << "input sample shape should not change";
+  CHECK(x.shape(1) == ch.channels && x.shape(2) == ch.height &&
+        x.shape(3) == ch.width) << "input sample shape should not change";
 
   Tensor dW;
   dW.ResetLike(W);
   dW.SetValue(0.0f);
 
   Shape w_shape = W.shape();
-  dW.Reshape(Shape{ch.num_filters_, ch.col_height_});
+  dW.Reshape(Shape{ch.num_filters, ch.col_height});
 
-  Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+  Tensor col_data(Shape{ch.col_height, ch.col_width});//broadcasted image
 
-  float *data_col = new float[ch.col_height_ * ch.col_width_];
+  float *data_col = new float[ch.col_height * ch.col_width];
   auto in_data = dy.data<float>();
   for (size_t num = 0; num < ch.batchsize; num++) {
-    Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
-             ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
-    col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
-    Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+    Im2col(in_data + num * ch.imagesize, ch.channels, ch.height, ch.width, ch.kernel_h,
+             ch.kernel_w, ch.pad_h, ch.pad_w, ch.stride_h, ch.stride_w, data_col);
+    col_data.CopyDataFromHostPtr(data_col, ch.col_height * ch.col_width);
+    Tensor grad_b(Shape{ch.num_filters, ch.conv_height * ch.conv_width});
     CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
     dW += Mult(grad_b, col_data.T());
   }
@@ -148,20 +148,20 @@ Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, cons
 Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch) {
   CHECK_EQ(dy.device()->lang(), kCpp);
 
-  CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-        dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+  CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
+        dy.shape(3) == ch.conv_width) << "input gradients shape should not change";
 
-  CHECK(b.shape(0) == ch.num_filters_) << "bias shape should not change";
+  CHECK(b.shape(0) == ch.num_filters) << "bias shape should not change";
 
   Tensor db;
   db.ResetLike(b);
 
-  auto tmpshp = Shape{ch.batchsize * ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)};
+  auto tmpshp = Shape{ch.batchsize * ch.num_filters, dy.Size() / (ch.batchsize * ch.num_filters)};
   Tensor tmp1 = Reshape(dy, tmpshp);
 
-  Tensor tmp2(Shape{ch.batchsize * ch.num_filters_});
+  Tensor tmp2(Shape{ch.batchsize * ch.num_filters});
   SumColumns(tmp1, &tmp2);
-  Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_});
+  Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters});
 
   SumRows(tmp3, &db);
 
@@ -172,48 +172,48 @@ Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch)
 CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
                                  const std::vector<size_t>& stride, const std::vector<size_t>& padding,
                                  const size_t in_channels, const size_t out_channels, const bool bias,
-                                 const size_t workspace_byte_limit_, const std::string& prefer_)
+                                 const size_t workspace_byte_limit, const std::string& prefer)
   : ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels, bias) {
 
   DataType dtype = input.data_type();
   auto dev = input.device();
   Context *ctx = dev->context(0);
 
-  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
-  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
-  if (bias_term_)
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
-  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
-  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc));
+  if (bias_term)
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
+  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc));
+  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc));
 
 
-  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW,
                                          GetCudnnDataType(dtype), batchsize,
-                                         channels_, height_, width_));
+                                         channels, height, width));
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-                y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
-                num_filters_, conv_height_, conv_width_));
-  if (bias_term_)
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                y_desc, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+                num_filters, conv_height, conv_width));
+  if (bias_term)
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc, CUDNN_TENSOR_NCHW,
                                            GetCudnnDataType(dtype), 1,
-                                           num_filters_, 1, 1));
-  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
-              stride_h_, stride_w_, 1, 1,
+                                           num_filters, 1, 1));
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w,
+              stride_h, stride_w, 1, 1,
               CUDNN_CROSS_CORRELATION,
               GetCudnnDataType(dtype)));
-  CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
-                                         CUDNN_TENSOR_NCHW, num_filters_,
-                                         channels_, kernel_h_, kernel_w_));
-  if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-      prefer_ == "no_workspace") {
+  CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc, GetCudnnDataType(dtype),
+                                         CUDNN_TENSOR_NCHW, num_filters,
+                                         channels, kernel_h, kernel_w));
+  if (prefer == "fastest" || prefer == "limited_workspace" ||
+      prefer == "no_workspace") {
     cudnnConvolutionFwdPreference_t fwd_pref;
     cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
     cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-    if (prefer_ == "fastest") {
+    if (prefer == "fastest") {
       fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
       bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
       bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-    } else if (prefer_ == "limited_workspace") {
+    } else if (prefer == "limited_workspace") {
       fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
       bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
       bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
@@ -223,67 +223,67 @@ CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t>&
       bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
     }
     CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-                  ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-                  workspace_byte_limit_, &fp_alg_));
+                  ctx->cudnn_handle, x_desc, filter_desc, conv_desc, y_desc, fwd_pref,
+                  workspace_byte_limit, &fp_alg));
     CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-                  ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                  bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+                  ctx->cudnn_handle, x_desc, y_desc, conv_desc, filter_desc,
+                  bwd_filt_pref, workspace_byte_limit, &bp_filter_alg));
     // deprecated in cudnn v7
     CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-                  ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                  bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
-  } else if (prefer_ == "autotune") {
+                  ctx->cudnn_handle, filter_desc, y_desc, conv_desc, x_desc,
+                  bwd_data_pref, workspace_byte_limit, &bp_data_alg));
+  } else if (prefer == "autotune") {
     const int topk = 1;
     int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
-    cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+    cudnnConvolutionFwdAlgoPerf_t fp_algperf[topk];
     cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
     cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
     CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
-                  ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
-                  &num_fp_alg, fp_alg_perf));
-    fp_alg_ = fp_alg_perf[0].algo;
+                  ctx->cudnn_handle, x_desc, filter_desc, conv_desc, y_desc, topk,
+                  &num_fp_alg, fp_algperf));
+    fp_alg = fp_algperf[0].algo;
     CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
-                  ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                  ctx->cudnn_handle, x_desc, y_desc, conv_desc, filter_desc, topk,
                   &num_bp_filt_alg, bp_filt_perf));
-    bp_filter_alg_ = bp_filt_perf[0].algo;
+    bp_filter_alg = bp_filt_perf[0].algo;
     CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
-                  ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                  ctx->cudnn_handle, filter_desc, y_desc, conv_desc, x_desc, topk,
                   &num_bp_data_alg, bp_data_perf));
-    bp_data_alg_ = bp_data_perf[0].algo;
+    bp_data_alg = bp_data_perf[0].algo;
   } else {
     LOG(FATAL) << "Preferred algorithm is not available!";
   }
 
   size_t fp_byte, bp_data_byte, bp_filter_byte;
   CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+                ctx->cudnn_handle, x_desc, filter_desc, conv_desc, y_desc, fp_alg,
                 &fp_byte));
   CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                bp_data_alg_, &bp_data_byte));
+                ctx->cudnn_handle, filter_desc, y_desc, conv_desc, x_desc,
+                bp_data_alg, &bp_data_byte));
   CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                bp_filter_alg_, &bp_filter_byte));
-  workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                ctx->cudnn_handle, x_desc, y_desc, conv_desc, filter_desc,
+                bp_filter_alg, &bp_filter_byte));
+  workspace_count = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
                      sizeof(float) +
                      1;
-  if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
+  if (workspace_count * sizeof(float) > workspace_byte_limit)
     LOG(WARNING) << "The required memory for workspace ("
-                 << workspace_count_ * sizeof(float)
+                 << workspace_count * sizeof(float)
                  << ") is larger than the expected Bytes ("
-                 << workspace_byte_limit_ << ")";
-  workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+                 << workspace_byte_limit << ")";
+  workspace = Tensor(Shape{workspace_count}, dev, dtype);
 }
 
 CudnnConvHandle::~CudnnConvHandle() {
-  if (bias_desc_ != nullptr)
-    CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
-  if (filter_desc_ != nullptr)
-    CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
-  if (conv_desc_ != nullptr)
-    CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
-  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
-  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+  if (bias_desc != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc));
+  if (filter_desc != nullptr)
+    CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc));
+  if (conv_desc != nullptr)
+    CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc));
+  if (x_desc != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc));
+  if (y_desc != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc));
 }
 
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch) {
@@ -292,27 +292,27 @@ Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const C
   DataType dtype = x.data_type();
   auto dev = x.device();
 
-  Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
+  Shape shape{cch.batchsize, cch.num_filters, cch.conv_height, cch.conv_width};
   Tensor output(shape, dev, dtype);
 
   output.device()->Exec([&output, &x, &W, &cch](Context * ctx) {
     Block *inblock = x.block(), *outblock = output.block(),
            *wblock = W.block();
     float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
-                            inblock->data(), cch.filter_desc_, wblock->data(),
-                            cch.conv_desc_, cch.fp_alg_,
-                            cch.workspace_.block()->mutable_data(),
-                            cch.workspace_count_ * sizeof(float), &beta,
-                            cch.y_desc_, outblock->mutable_data());
-  }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
-
-  if (cch.bias_term_) {
+    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc,
+                            inblock->data(), cch.filter_desc, wblock->data(),
+                            cch.conv_desc, cch.fp_alg,
+                            cch.workspace.block()->mutable_data(),
+                            cch.workspace_count * sizeof(float), &beta,
+                            cch.y_desc, outblock->mutable_data());
+  }, {x.block(), W.block()}, {output.block()}, cch.workspace.block());
+
+  if (cch.bias_term) {
     output.device()->Exec([&output, &b, &cch](Context * ctx) {
       float beta = 1.f, alpha = 1.0f;
       Block *outblock = output.block(), *bblock = b.block();
-      cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
-                     bblock->data(), &beta, cch.y_desc_,
+      cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc,
+                     bblock->data(), &beta, cch.y_desc,
                      outblock->mutable_data());
     }, {output.block(), b.block()}, {output.block()});
   }
@@ -330,13 +330,13 @@ Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, cons
     Block *wblock = W.block(), *dyblock = dy.block(),
            *dxblock = dx.block();
     float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
-                                 wblock->data(), cch.y_desc_, dyblock->data(),
-                                 cch.conv_desc_, cch.bp_data_alg_,
-                                 cch.workspace_.block()->mutable_data(),
-                                 cch.workspace_count_ * sizeof(float), &beta,
-                                 cch.x_desc_, dxblock->mutable_data());
-  }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
+    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc,
+                                 wblock->data(), cch.y_desc, dyblock->data(),
+                                 cch.conv_desc, cch.bp_data_alg,
+                                 cch.workspace.block()->mutable_data(),
+                                 cch.workspace_count * sizeof(float), &beta,
+                                 cch.x_desc, dxblock->mutable_data());
+  }, {dy.block(), W.block()}, {dx.block(), cch.workspace.block()});
 
   return dx;
 }
@@ -352,12 +352,12 @@ Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, cons
            *dwblock = dW.block();
     float alpha = 1.f, beta = 0.f;
     cudnnConvolutionBackwardFilter(
-      ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
-      cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
-      cch.workspace_.block()->mutable_data(),
-      cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
+      ctx->cudnn_handle, &alpha, cch.x_desc, inblock->data(),
+      cch.y_desc, dyblock->data(), cch.conv_desc, cch.bp_filter_alg,
+      cch.workspace.block()->mutable_data(),
+      cch.workspace_count * sizeof(float), &beta, cch.filter_desc,
       dwblock->mutable_data());
-  }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+  }, {dy.block(), x.block()}, {dW.block(), cch.workspace.block()});
 
   return dW;
 }
@@ -372,8 +372,8 @@ Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle
   dy.device()->Exec([&db, &dy, &cch](Context * ctx) {
     Block *dyblock = dy.block(), *dbblock = db.block();
     float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
-                                 dyblock->data(), &beta, cch.bias_desc_,
+    cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc,
+                                 dyblock->data(), &beta, cch.bias_desc,
                                  dbblock->mutable_data());
   }, {dy.block()}, {db.block()});
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ac5f4eb2/src/model/operation/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h
index 93f7775..62ff254 100755
--- a/src/model/operation/convolution.h
+++ b/src/model/operation/convolution.h
@@ -22,26 +22,26 @@ class ConvHandle {
              const size_t in_channels, const size_t out_channels,
              const bool bias);
  
-  size_t kernel_w_;
-  size_t pad_w_;
-  size_t stride_w_;
-  size_t kernel_h_;
-  size_t pad_h_;
-  size_t stride_h_;
-
-  size_t channels_;
-  size_t num_filters_;
-
-  bool bias_term_;
-
-  size_t height_;
-  size_t width_;
-  size_t conv_height_;
-  size_t conv_width_;
+  size_t kernel_w;
+  size_t pad_w;
+  size_t stride_w;
+  size_t kernel_h;
+  size_t pad_h;
+  size_t stride_h;
+
+  size_t channels;
+  size_t num_filters;
+
+  bool bias_term;
+
+  size_t height;
+  size_t width;
+  size_t conv_height;
+  size_t conv_width;
   size_t batchsize;
 
-  size_t col_height_;
-  size_t col_width_;
+  size_t col_height;
+  size_t col_width;
   size_t imagesize;
 };
 
@@ -62,22 +62,22 @@ class CudnnConvHandle: public ConvHandle {
   CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
                   const std::vector<size_t>& stride, const std::vector<size_t>& padding,
                   const size_t in_channels, const size_t out_channels,
-                  const bool bias, const size_t workspace_byte_limit_ = 1024 * 1024 * 1024,
-                  const std::string& prefer_ = "fastest");
+                  const bool bias, const size_t workspace_byte_limit = 1024 * 1024 * 1024,
+                  const std::string& prefer = "fastest");
   ~CudnnConvHandle();
   // TODO(wangwei) add the destructor
  
-  cudnnTensorDescriptor_t x_desc_ = nullptr;
-  cudnnTensorDescriptor_t y_desc_ = nullptr;
-  cudnnTensorDescriptor_t bias_desc_ = nullptr;
-  cudnnFilterDescriptor_t filter_desc_ = nullptr;
-  cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
-  cudnnConvolutionFwdAlgo_t fp_alg_;
-  cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-  cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-  size_t workspace_count_;
-  Tensor workspace_;
+  cudnnTensorDescriptor_t x_desc = nullptr;
+  cudnnTensorDescriptor_t y_desc = nullptr;
+  cudnnTensorDescriptor_t bias_desc = nullptr;
+  cudnnFilterDescriptor_t filter_desc = nullptr;
+  cudnnConvolutionDescriptor_t conv_desc = nullptr;
+  cudnnConvolutionFwdAlgo_t fp_alg;
+  cudnnConvolutionBwdFilterAlgo_t bp_filter_alg;
+  cudnnConvolutionBwdDataAlgo_t bp_data_alg;
+
+  size_t workspace_count;
+  Tensor workspace;
 };
 
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);


[11/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- fixed some bugs in convolution_related.cc

- export device.lang() from C++ to Python to determine the device type (CPU or GPU)

- modified design of autograd.Conv2D

- modified the test file for Conv2D; this unit test has passed.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/e68ea2ee
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/e68ea2ee
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/e68ea2ee

Branch: refs/heads/master
Commit: e68ea2ee6640d4124e2f5f32ac16726fa84d10ac
Parents: 189958a
Author: xuewanqi <xu...@outlook.com>
Authored: Thu Jun 28 05:22:30 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Thu Jun 28 05:22:30 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py                   | 20 +++++++------
 src/api/core_device.i                      |  3 ++
 src/model/operation/convolution_related.cc | 14 ++++++++++
 test/python/test_operation.py              | 37 +++++++++++++++++++------
 4 files changed, 58 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e68ea2ee/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index e898312..e301e51 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -669,28 +669,30 @@ class Conv2D(Operation):
     	return self._do_forward(*xs)[0]
 
     def forward(self, *xs):
-        if gpu:
-            
+        if self.dev.lang()==1: #kCuda = 1           
             if not hasattr(self, 'cudnnconvhandles'):
-                self.cudnnconvhandles=InitCudnnConvHandles(xs[0], self.recorder, 
+                self.cudnnconvhandles=singa.InitCudnnConvHandles(xs[0], self.recorder, 
                     self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
             elif self.reset:
-                self.cudnnconvhandles=InitCudnnConvHandles(xs[0], self.recorder, 
+                self.cudnnconvhandles=singa.InitCudnnConvHandles(xs[0], self.recorder, 
                     self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
 
             return singa.GpuConvForward(xs[0], xs[1], xs[2], self.recorder, self.cudnnconvhandles)
 
-        if cpu:
-
+        elif self.dev.lang()==0: #kCpp = 0
             return singa.CpuConvForward(xs[0], xs[1], xs[2], self.recorder)
 
+        else:
+            TypeError('Not implemented yet')
+
+
     def backward(self, dy):
         assert training is True and hasattr(self, 'x'), 'Please set training as True before do BP. '
 
         # todo check device?
         dy.ToDevice(self.dev)
 
-        if gpu:
+        if self.dev.lang()==1: #kCuda = 1 
             dx = singa.GpuConvBackwardx(dy, self.W.data, self.x.data, self.cudnnconvhandles)
             dW = singa.GpuConvBackwardW(dy, self.x.data, self.W.data, self.cudnnconvhandles)
             if self.bias:
@@ -699,7 +701,7 @@ class Conv2D(Operation):
             else:
         	    return dx, dW
 
-        if cpu:
+        elif self.dev.lang()==0: #kCpp = 0
             dx = singa.CpuConvBackwardx(dy, self.W.data, self.x.data, self.recorder)
             dW = singa.CpuConvBackwardW(dy, self.x.data, self.W.data, self.recorder)
             if self.bias:
@@ -707,6 +709,8 @@ class Conv2D(Operation):
                 return dx, dW, db
             else:
                 return dx, dW
+        else:
+            TypeError('Not implemented yet')
 
 def infer_dependency(op):
     '''

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e68ea2ee/src/api/core_device.i
----------------------------------------------------------------------
diff --git a/src/api/core_device.i b/src/api/core_device.i
index a5b7de6..381f7c6 100644
--- a/src/api/core_device.i
+++ b/src/api/core_device.i
@@ -43,11 +43,14 @@ namespace std{
 
 namespace singa{
 
+enum LangType {kCpp, kCuda, kOpencl,kNumDeviceType};
+
 class Device {
  public:
   virtual void SetRandSeed(unsigned seed) = 0;
   std::shared_ptr<Device> host();
   int id() const;
+  LangType lang() const;
 };
 
 class Platform {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e68ea2ee/src/model/operation/convolution_related.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_related.cc b/src/model/operation/convolution_related.cc
index 1004074..c828f90 100644
--- a/src/model/operation/convolution_related.cc
+++ b/src/model/operation/convolution_related.cc
@@ -318,6 +318,20 @@ CudnnConvHandles InitCudnnConvHandles(const Tensor &input, const Recorder r, con
                      << workspace_byte_limit_ << ")";
     workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
 
+    return CudnnConvHandles{
+    	x_desc_,
+        y_desc_,
+        bias_desc_,
+        filter_desc_,
+        conv_desc_,
+        fp_alg_,
+        bp_filter_alg_,
+        bp_data_alg_,
+
+        workspace_count_,
+        workspace_,
+    };
+
 };
 
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const Recorder r, const CudnnConvHandles cch){

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e68ea2ee/test/python/test_operation.py
----------------------------------------------------------------------
diff --git a/test/python/test_operation.py b/test/python/test_operation.py
index 295b2d2..ece537d 100644
--- a/test/python/test_operation.py
+++ b/test/python/test_operation.py
@@ -10,16 +10,14 @@ autograd.training = True
 
 CTensor = singa.Tensor
 
-dev = device.create_cuda_gpu()
-
-gpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=dev)
-gpu_input_tensor.gaussian(0.0, 1.0)
+gpu_dev = device.create_cuda_gpu()
+cpu_dev = device.get_default_device()
 
 dy = CTensor([2, 1, 2, 2])
 singa.Gaussian(0.0, 1.0, dy)
-dy.ToDevice(dev)
 
-conv = autograd.Conv2d_GPU(3, 1, 2)  # (in_channels, out_channels, kernel_size)
+conv = autograd.Conv2D(3, 1, 2)  # (in_channels, out_channels, kernel_size)
+conv_without_bias = autograd.Conv2D(3,1,2,bias=False)
 
 
 def _tuple_to_string(t):
@@ -35,14 +33,37 @@ class TestPythonOperation(unittest.TestCase):
                                               _tuple_to_string(expect))
                          )
 
-    def test(self):
+    def test_conv2d_gpu(self):
+        gpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev)
+        gpu_input_tensor.gaussian(0.0, 1.0)
+
         y = conv(gpu_input_tensor)  # PyTensor
         dx, dW, db = conv.backward(dy)  # CTensor
-        
+
         self.check_shape(y.shape, (2, 1, 2, 2))
         self.check_shape(dx.shape(), (2, 3, 3, 3))
         self.check_shape(dW.shape(), (1, 3, 2, 2))
         self.check_shape(db.shape(), (1,))
 
+        #forward without bias
+        y_without_bias=conv_without_bias(gpu_input_tensor)
+        self.check_shape(y.shape, (2, 1, 2, 2))
+
+    def test_conv2d_cpu(self):
+        cpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=cpu_dev)
+        cpu_input_tensor.gaussian(0.0, 1.0)
+
+        y = conv(cpu_input_tensor)  # PyTensor
+        dx, dW, db = conv.backward(dy)  # CTensor
+
+        self.check_shape(y.shape, (2, 1, 2, 2))
+        self.check_shape(dx.shape(), (2, 3, 3, 3))
+        self.check_shape(dW.shape(), (1, 3, 2, 2))
+        self.check_shape(db.shape(), (1,))
+
+        #forward without bias
+        y_without_bias=conv_without_bias(cpu_input_tensor)
+        self.check_shape(y.shape, (2, 1, 2, 2))
+
 if __name__ == '__main__':
     unittest.main()


[10/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- merge cpu and gpu parts for conv2d operation in python part (not complete)

- redesign handles(recorder)

- redesign api

- parts of the code have passed unit tests


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/189958ab
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/189958ab
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/189958ab

Branch: refs/heads/master
Commit: 189958ab53bff686f39e599a36f21867cd4a265f
Parents: dfe4478
Author: xuewanqi <xu...@outlook.com>
Authored: Wed Jun 27 14:57:21 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Wed Jun 27 14:57:21 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py                   |  79 ++--
 src/CMakeLists.txt                         |   1 +
 src/api/model_operation.i                  |  43 +--
 src/model/convolution_functions.cc         | 457 ------------------------
 src/model/convolution_functions.h          |  95 -----
 src/model/operation/convolution_related.cc | 417 +++++++++++++++++++++
 src/model/operation/convolution_related.h  |  75 ++++
 7 files changed, 561 insertions(+), 606 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/189958ab/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 4f45bf1..e898312 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -583,7 +583,7 @@ class Flatten(Operation):
 def flatten(x):
     return Flatten()(x)[0]
 
-class Conv2d_GPU(Operation):
+class Conv2D(Operation):
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True, **kwargs):
 
@@ -616,20 +616,14 @@ class Conv2d_GPU(Operation):
 
         self.bias = bias
 
-        inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024}
+        self.inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024}
         # TODO valid value of inner_params check
 
         for kwarg in kwargs:
-            if kwarg not in inner_params:
+            if kwarg not in self.inner_params:
                 raise TypeError('Keyword argument not understood:', kwarg)
             else:
-                inner_params[kwarg] = kwargs[kwarg]
-
-        self.convhandle = singa.SetupConv(self.kernel_size[0], self.kernel_size[1],
-        			self.padding[0], self.padding[1], self.stride[0], self.stride[1],
-        			self.in_channels, self.out_channels, self.bias, 
-                                inner_params['workspace_MB_limit']*1024*1024,
-        			inner_params['cudnn_prefer'])
+                self.inner_params[kwarg] = kwargs[kwarg]
         
         w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1])
         self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
@@ -639,11 +633,13 @@ class Conv2d_GPU(Operation):
 
         if self.bias:
             b_shape = (self.out_channels,)
+            self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
+            self.b.set_value(0.0)
         else:
-            b_shape = (1,) #to keep consistency when to do forward.
-        self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
-        self.b.set_value(0.0)
-
+            #to keep consistency when to do forward.
+            self.b = Tensor(data=CTensor([]), requires_grad=False, stores_grad=False)
+        
+        self.reset = False
 
     def __call__(self, x):
         assert x.ndim() == 4, 'The dimensions of input should be 4D.'
@@ -651,10 +647,13 @@ class Conv2d_GPU(Operation):
         assert 0 == 0, 'invalid padding.'
     	# TODO valid padding check.
 
-    	if not hasattr (self, 'cudnnconvhandle'):
-    	    self.cudnnconvhandle = singa.InitCudnn(x.data, self.convhandle)
-    	elif x.shape[0] != self.cudnnconvhandle.batchsize:
-    	    self.cudnnconvhandle = singa.InitCudnn(x.data, self.convhandle)
+    	if not hasattr (self, 'recorder'):
+    	    self.recorder = singa.SetupRecorder(x.data, self.kernel_size, self.stride,
+                                self.padding, self.in_channels, self.out_channels, self.bias)
+    	elif x.shape[0] != self.recorder.batchsize:
+    	    self.recorder = singa.SetupRecorder(x.data, self.kernel_size, self.stride,
+                                self.padding, self.in_channels, self.out_channels, self.bias)
+            self.reset = True
         
         if training:
             self.x = x
@@ -664,26 +663,50 @@ class Conv2d_GPU(Operation):
     	self.W.to_device(self.dev)
     	xs = [x, self.W]
     	
-    	self.b.to_device(self.dev)
+        if self.bias:
+    	   self.b.to_device(self.dev)
     	xs.append(self.b)
     	return self._do_forward(*xs)[0]
 
     def forward(self, *xs):
-        return singa.CudnnConvForward(xs[0], xs[1], xs[2], self.convhandle, self.cudnnconvhandle)
+        if gpu:
+            
+            if not hasattr(self, 'cudnnconvhandles'):
+                self.cudnnconvhandles=InitCudnnConvHandles(xs[0], self.recorder, 
+                    self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
+            elif self.reset:
+                self.cudnnconvhandles=InitCudnnConvHandles(xs[0], self.recorder, 
+                    self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
+
+            return singa.GpuConvForward(xs[0], xs[1], xs[2], self.recorder, self.cudnnconvhandles)
+
+        if cpu:
+
+            return singa.CpuConvForward(xs[0], xs[1], xs[2], self.recorder)
 
     def backward(self, dy):
-        assert training is True and hasattr(self, 'x'), 'Please set \'training\' as True before do BP. '
+        assert training is True and hasattr(self, 'x'), 'Please set training as True before do BP. '
 
         # todo check device?
         dy.ToDevice(self.dev)
 
-        dx = singa.CudnnConvBackwardx(dy, self.W.data, self.x.data, self.cudnnconvhandle)
-        dW = singa.CudnnConvBackwardW(dy, self.x.data, self.W.data, self.cudnnconvhandle)
-        if self.bias:
-    	    db = singa.CudnnConvBackwardb(dy, self.b.data, self.cudnnconvhandle)
-    	    return dx, dW, db
-        else:
-    	    return dx, dW
+        if gpu:
+            dx = singa.GpuConvBackwardx(dy, self.W.data, self.x.data, self.cudnnconvhandles)
+            dW = singa.GpuConvBackwardW(dy, self.x.data, self.W.data, self.cudnnconvhandles)
+            if self.bias:
+        	    db = singa.GpuConvBackwardb(dy, self.b.data, self.cudnnconvhandles)
+        	    return dx, dW, db
+            else:
+        	    return dx, dW
+
+        if cpu:
+            dx = singa.CpuConvBackwardx(dy, self.W.data, self.x.data, self.recorder)
+            dW = singa.CpuConvBackwardW(dy, self.x.data, self.W.data, self.recorder)
+            if self.bias:
+                db = singa.CpuConvBackwardb(dy, self.b.data, self.recorder)
+                return dx, dW, db
+            else:
+                return dx, dW
 
 def infer_dependency(op):
     '''

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/189958ab/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 709894b..7dd9bf7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -58,6 +58,7 @@ AUX_SOURCE_DIRECTORY(model/optimizer model_source)
 AUX_SOURCE_DIRECTORY(model/loss model_source)
 AUX_SOURCE_DIRECTORY(model/metric model_source)
 AUX_SOURCE_DIRECTORY(model/updater model_source)
+AUX_SOURCE_DIRECTORY(model/operation model_source)
 LIST(APPEND singa_sources ${model_source})
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/189958ab/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 20f112e..1d31b9d 100644
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -1,47 +1,38 @@
 %module model_operation
 
 %{
-#include "../src/model/convolution_functions.h"
+#include "../src/model/operation/convolution_related.h"
 %}
 namespace singa{
 
-struct ConvHandle{};
+struct Recorder{size_t batchsize;};
 
-struct CudnnConvHandle{size_t batchsize;};
+struct CudnnConvHandles{};
 
-struct CpuConvHandle{};
 
-ConvHandle SetupConv(
-    const size_t kernel_h_, const size_t kernel_w_,
-    const size_t pad_h_, const size_t pad_w_,
-    const size_t stride_h_,const size_t stride_w_,
-    const size_t channels_, const size_t num_filters_,
-    const bool bias_term_ = true, const size_t workspace_byte_limit_ =1024*1024*1024,
-    const std::string prefer_="fastest");
+Recorder SetupRecorder(const Tensor &input, const std::vector<size_t> kernel_size, 
+	                const std::vector<size_t> stride, const std::vector<size_t> padding,
+	                const size_t in_channels, const size_t out_channels,
+	                const bool bias_term_);
 
-CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
+CudnnConvHandles InitCudnnConvHandles(const Tensor &input, const Recorder r, 
+     const size_t workspace_byte_limit_=1024*1024*1024, const std::string prefer_="fastest");
 
-Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
-                        const ConvHandle ch, const CudnnConvHandle cch);
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const Recorder r, const CudnnConvHandles cch);
 
-Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch);
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
 
-Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch);
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch);
 
-Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
 
 
-CpuConvHandle InitCpuHandle(const Tensor &input, const ConvHandle ch);
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const Recorder r);
 
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b,
-                        const ConvHandle ch, const CpuConvHandle cch);
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Recorder r);
 
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, 
-    const ConvHandle ch, const CpuConvHandle cch);
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const Recorder r);
 
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, 
-    const ConvHandle ch, const CpuConvHandle cch);
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle ch, const CpuConvHandle cch);
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const Recorder r);
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/189958ab/src/model/convolution_functions.cc
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.cc b/src/model/convolution_functions.cc
deleted file mode 100644
index 6e4b195..0000000
--- a/src/model/convolution_functions.cc
+++ /dev/null
@@ -1,457 +0,0 @@
-//#include <string>
-//#include <cudnn.h>
-//#include "./layer/cudnn_convolution.h"
-//#include "./layer/cudnn_utils.h"
-//#include "singa/utils/logging.h"
-#include "./convolution_functions.h"
-#include "./layer/convolution.h"
-#include<iostream>
-namespace singa{
-
-// Done in conv2d.__init__()
-ConvHandle SetupConv(
-    const size_t kernel_h_, const size_t kernel_w_,
-    const size_t pad_h_, const size_t pad_w_,
-    const size_t stride_h_,const size_t stride_w_,
-    const size_t channels_, const size_t num_filters_,
-    const bool bias_term_ , const size_t workspace_byte_limit_,
-    const std::string prefer_){
-	 return ConvHandle{
-            kernel_w_,
-            pad_w_,
-            stride_w_,
-            kernel_h_,
-            pad_h_,
-            stride_h_,
-
-            channels_,
-            num_filters_,
-
-            bias_term_,
-
-            workspace_byte_limit_,
-            prefer_,
-    };
-};
-
-
-// Done in conv2d.__call__():
-// if self.cudnnconvhandle is None:
-//     self.cudnnconvhandle= InitCudnn(...)
-// elif x.shape(0) != self.cudnnconvhandle.batchsize:
-//     self.cudnnconvhandle= InitCudnn(...)
-CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
-
-    cudnnTensorDescriptor_t x_desc_ = nullptr;
-    cudnnTensorDescriptor_t y_desc_ = nullptr;
-    cudnnTensorDescriptor_t bias_desc_ = nullptr;
-    cudnnFilterDescriptor_t filter_desc_ = nullptr;
-    cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    
-    DataType dtype = input.data_type();
-    auto dev = input.device();
-    Context *ctx = dev->context(0);
-    
-    size_t batchsize, channels_;
-    batchsize = input.shape(0);
-    channels_ = input.shape(1);
-    height_ = input.shape(2);
-    width_ = input.shape(3);
-
-    CHECK(channels_ == ch.channels_)<<"the number of input channels mismatched.";
-
-    conv_height_ = 1;
-    if (ch.stride_h_ > 0)
-        conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
-    conv_width_ = (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
-    
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
-    if (ch.bias_term_)
-        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
-
-
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                           GetCudnnDataType(dtype), batchsize,
-                                           ch.channels_, height_, width_));
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
-            ch.num_filters_, conv_height_, conv_width_));
-    if (ch.bias_term_)
-        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
-                                               GetCudnnDataType(dtype), 1,
-                                               ch.num_filters_, 1, 1));
-    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, ch.pad_h_, ch.pad_w_,
-                                                ch.stride_h_, ch.stride_w_, 1, 1,
-                                                CUDNN_CROSS_CORRELATION,
-                                                GetCudnnDataType(dtype)));
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
-                                           CUDNN_TENSOR_NCHW, ch.num_filters_,
-                                           channels_, ch.kernel_h_, ch.kernel_w_));
-    if (ch.prefer_ == "fastest" || ch.prefer_ == "limited_workspace" ||
-        ch.prefer_ == "no_workspace") {
-        cudnnConvolutionFwdPreference_t fwd_pref;
-        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-        if (ch.prefer_ == "fastest") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-        } else if (ch.prefer_ == "limited_workspace") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        } else {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        }
-        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-                ch.workspace_byte_limit_, &fp_alg_));
-        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                bwd_filt_pref, ch.workspace_byte_limit_, &bp_filter_alg_));
-        // deprecated in cudnn v7
-        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                bwd_data_pref, ch.workspace_byte_limit_, &bp_data_alg_));
-        } else if (ch.prefer_ == "autotune") {
-        const int topk = 1;
-        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
-        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
-        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
-        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
-        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
-                &num_fp_alg, fp_alg_perf));
-        fp_alg_ = fp_alg_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
-                &num_bp_filt_alg, bp_filt_perf));
-        bp_filter_alg_ = bp_filt_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
-                &num_bp_data_alg, bp_data_perf));
-        bp_data_alg_ = bp_data_perf[0].algo;
-    } else {
-        LOG(FATAL) << "Preferred algorithm is not available!";
-    }
-
-    size_t fp_byte, bp_data_byte, bp_filter_byte;
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
-            &fp_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-            bp_data_alg_, &bp_data_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-            bp_filter_alg_, &bp_filter_byte));
-    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
-                       sizeof(float) +
-                       1;
-    if (workspace_count_ * sizeof(float) > ch.workspace_byte_limit_)
-        LOG(WARNING) << "The required memory for workspace ("
-                     << workspace_count_ * sizeof(float)
-                     << ") is larger than the expected Bytes ("
-                     << ch.workspace_byte_limit_ << ")";
-    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
-
-    return CudnnConvHandle{
-            x_desc_,
-            y_desc_,
-            bias_desc_,
-            filter_desc_,
-            conv_desc_,
-            fp_alg_,
-            bp_filter_alg_,
-            bp_data_alg_,
-
-            workspace_count_,
-            workspace_,
-
-            height_,
-            width_,
-            conv_height_,
-            conv_width_,
-            batchsize,
-    };
-};
-
-Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
-                        const ConvHandle ch, const CudnnConvHandle cch){
-    CHECK_EQ(x.device()->lang(), kCuda);
-    CHECK_EQ(x.nDim(), 4u);
-    CHECK_EQ(x.shape()[0],cch.batchsize);
-    CHECK_EQ(x.shape()[1],ch.channels_);
-    CHECK_EQ(x.shape()[2],cch.height_);
-    CHECK_EQ(x.shape()[3],cch.width_);
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-
-    Shape shape{cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    output.device()->Exec([output, x, W, cch](Context *ctx) {
-        Block *inblock = x.block(), *outblock = output.block(),
-                *wblock = W.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
-                                inblock->data(), cch.filter_desc_, wblock->data(),
-                                cch.conv_desc_, cch.fp_alg_,
-                                cch.workspace_.block()->mutable_data(),
-                                cch.workspace_count_ * sizeof(float), &beta,
-                                cch.y_desc_, outblock->mutable_data());
-    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
-
-    if (ch.bias_term_) {
-        output.device()->Exec([output, b, cch](Context *ctx) {
-            float beta = 1.f, alpha = 1.0f;
-            Block *outblock = output.block(), *bblock = b.block();
-            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
-                           bblock->data(), &beta, cch.y_desc_,
-                           outblock->mutable_data());
-        }, {output.block(), b.block()}, {output.block()});
-    }
-    return output;
-};
-
-// input Tensor W for Reset dW purpose, can avoid this later.
-Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor dW;
-    dW.ResetLike(W);
-
-    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
-    Block *inblock = x.block(), *dyblock = dy.block(),
-            *dwblock = dW.block();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
-            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
-            cch.workspace_.block()->mutable_data(),
-            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
-            dwblock->mutable_data());
-    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
-
-    return dW;
-};
-
-// input Tensor b for Reset db purpose, can avoid this later.
-Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor db;
-    db.ResetLike(b);
-
-    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
-        Block *dyblock = dy.block(), *dbblock = db.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
-                                     dyblock->data(), &beta, cch.bias_desc_,
-                                     dbblock->mutable_data());
-    }, {dy.block()}, {db.block()});
-    return db;
-};
-
-Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor dx;
-    dx.ResetLike(x);
-
-    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
-        Block *wblock = W.block(), *dyblock = dy.block(),
-                *dxblock = dx.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
-                                     wblock->data(), cch.y_desc_, dyblock->data(),
-                                     cch.conv_desc_, cch.bp_data_alg_,
-                                     cch.workspace_.block()->mutable_data(),
-                                     cch.workspace_count_ * sizeof(float), &beta,
-                                     cch.x_desc_, dxblock->mutable_data());
-    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
-
-    return dx;
-};
-
-CpuConvHandle InitCpuHandle(const Tensor &input, const ConvHandle ch){
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;    
-    size_t batchsize;
-    size_t channels_;
-
-    size_t col_height_;
-    size_t col_width_;
-
-    batchsize = input.shape(0);
-    channels_ = input.shape(1);
-    height_ = input.shape(2);
-    width_ = input.shape(3);
-
-    CHECK(channels_ == ch.channels_)<<"the number of input channels mismatched.";
-
-    conv_height_ = 1;
-    if (ch.stride_h_ > 0)
-        conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
-    conv_width_ = (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
-
-    col_height_ = ch.channels_ * ch.kernel_w_ * ch.kernel_h_;
-    col_width_ = conv_height_ * conv_width_;
-
-    return CpuConvHandle{
-        height_,
-        width_,
-        conv_height_,
-        conv_width_,
-        batchsize,
-
-        col_height_,
-        col_width_
-    };
-};
-
-Convolution C;
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b,
-                        const ConvHandle ch, const CpuConvHandle cch){
-    CHECK_EQ(x.device()->lang(), kCpp);
-    CHECK_EQ(x.nDim(), 4u);
-    CHECK_EQ(x.shape()[0],cch.batchsize);
-    CHECK_EQ(x.shape()[1],ch.channels_);
-    CHECK_EQ(x.shape()[2],cch.height_);
-    CHECK_EQ(x.shape()[3],cch.width_);
-
-    size_t imagesize = x.Size() / cch.batchsize;
-
-    Shape w_shape= W.shape();
-    Shape b_shape= b.shape();
-
-    W.Reshape(Shape{ch.num_filters_, cch.col_height_});
-    if (ch.bias_term_)
-      b.Reshape(Shape{ch.num_filters_});
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-    Shape shape{cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    Tensor col_data(Shape{cch.col_height_, cch.col_width_});//broadcasted image
-
-    float *data_col = new float[cch.col_height_ * cch.col_width_];
-    auto in_data = x.data<float>();
-    for (size_t num = 0; num < cch.batchsize; num++) {
-      C.Im2col(in_data + num * imagesize, ch.channels_, cch.height_, cch.width_, ch.kernel_h_,
-            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
-      col_data.CopyDataFromHostPtr(data_col, cch.col_height_ * cch.col_width_);
-      Tensor each = Mult(W, col_data);
-      if (ch.bias_term_) {
-          AddColumn(b, &each);
-        }
-      CopyDataToFrom(&output, each, each.Size(), num * each.Size());
-  }
-  W.Reshape(w_shape);
-  b.Reshape(b_shape);
-  return output;
-}; 
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, 
-    const ConvHandle ch, const CpuConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Shape w_shape= W.shape();
-    W.Reshape(Shape{ch.num_filters_, cch.col_height_});
-
-    Tensor dx;
-    dx.ResetLike(x);
-    
-    size_t imagesize = x.Size() / cch.batchsize;
-    float *dx_b = new float[imagesize];
-
-    for (size_t num = 0; num < cch.batchsize; num++) {
-      Tensor grad_b(Shape{ch.num_filters_, cch.conv_height_ * cch.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      Tensor dcol_b = Mult(W.T(), grad_b);
-      auto dcol_data = dcol_b.data<float>();
-      C.Col2im(dcol_data, ch.channels_, cch.height_, cch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
-           ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
-      dx.CopyDataFromHostPtr(dx_b, imagesize, num * imagesize);
-    }
-  W.Reshape(w_shape); 
-  return dx;
-};
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, 
-    const ConvHandle ch, const CpuConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    size_t imagesize = x.Size() / cch.batchsize;
-
-    Tensor dW;
-    dW.ResetLike(W);
-    dW.SetValue(0.0f);
-    
-    Shape w_shape= W.shape();
-    dW.Reshape(Shape{ch.num_filters_, cch.col_height_});
-
-    Tensor col_data(Shape{cch.col_height_, cch.col_width_});//broadcasted image
-
-    float *data_col = new float[cch.col_height_ * cch.col_width_];
-    auto in_data = dy.data<float>();
-    for (size_t num = 0; num < cch.batchsize; num++) {
-      C.Im2col(in_data + num * imagesize, ch.channels_, cch.height_, cch.width_, ch.kernel_h_,
-            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
-      col_data.CopyDataFromHostPtr(data_col, cch.col_height_ * cch.col_width_);
-      Tensor grad_b(Shape{ch.num_filters_, cch.conv_height_ * cch.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      dW += Mult(grad_b, col_data.T());
-    }
-   dW.Reshape(w_shape);
-    //dW.Reshape(Shape{ch.num_filters_,ch.channels_ , ch.kernel_w_ , ch.kernel_h_});
-   return dW;
-};
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle ch, const CpuConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor db;
-    db.ResetLike(b);
-
-    auto tmpshp = Shape{cch.batchsize * ch.num_filters_, dy.Size() / (cch.batchsize * ch.num_filters_)};
-    Tensor tmp1 = Reshape(dy, tmpshp);
-
-    Tensor tmp2(Shape{cch.batchsize * ch.num_filters_});
-    SumColumns(tmp1, &tmp2);
-    Tensor tmp3 = Reshape(tmp2, Shape{cch.batchsize, ch.num_filters_});
-
-    SumRows(tmp3, &db);
-
-    return db;
-};
-
-} //namespace_singa
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/189958ab/src/model/convolution_functions.h
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.h b/src/model/convolution_functions.h
deleted file mode 100644
index 1b90941..0000000
--- a/src/model/convolution_functions.h
+++ /dev/null
@@ -1,95 +0,0 @@
-#include <string>
-#include <cudnn.h>
-#include "./layer/cudnn_convolution.h"
-#include "./layer/cudnn_utils.h"
-#include "singa/utils/logging.h"
-
-namespace singa{
-
-struct ConvHandle{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t workspace_byte_limit_;
-    string prefer_;
-};
-
-struct CudnnConvHandle{
-    cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-};
-
-struct CpuConvHandle{
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-
-    size_t col_height_;
-    size_t col_width_;
-
-};
-
-    
-
-ConvHandle SetupConv(
-    const size_t kernel_h_, const size_t kernel_w_,
-    const size_t pad_h_, const size_t pad_w_,
-    const size_t stride_h_,const size_t stride_w_,
-    const size_t channels_, const size_t num_filters_,
-    const bool bias_term_ = true ,const size_t workspace_byte_limit_=1024*1024*1024,
-    const std::string prefer_="fastest");
-
-void testInitCudnn(const Tensor &input, const ConvHandle ch);
-
-CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
-
-Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
-                        const ConvHandle ch, const CudnnConvHandle cch);
-
-Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch);
-
-Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch);
-
-Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
-
-
-CpuConvHandle InitCpuHandle(const Tensor &input, const ConvHandle ch);
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b,
-                        const ConvHandle ch, const CpuConvHandle cch);
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, 
-    const ConvHandle ch, const CpuConvHandle cch);
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, 
-    const ConvHandle ch, const CpuConvHandle cch);
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle ch, const CpuConvHandle cch);
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/189958ab/src/model/operation/convolution_related.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_related.cc b/src/model/operation/convolution_related.cc
new file mode 100644
index 0000000..1004074
--- /dev/null
+++ b/src/model/operation/convolution_related.cc
@@ -0,0 +1,417 @@
+#include "./convolution_related.h"
+#include "../layer/convolution.h"
+#include<iostream>
+
+namespace singa{
+
+// Builds the Recorder that describes one 2D convolution applied to `input`.
+//
+// `input` is read as a 4-D tensor whose dimensions are, in order, batch size,
+// channels, height and width. `kernel_size`, `stride` and `padding` each hold
+// {height, width} values. The function derives the output extent and the
+// im2col buffer dimensions and returns everything packed into a Recorder.
+//
+// A stride height of 0 keeps the output height at 1; the stride width must be
+// positive because the output width is always computed by dividing by it.
+//
+// Aborts through CHECK when a geometry vector does not have two entries, when
+// `input` is not 4-D or has an empty batch, when its channel count differs
+// from `in_channels`, or when the kernel does not fit in the padded input.
+Recorder SetupRecorder(const Tensor &input, const std::vector<size_t> kernel_size,
+                       const std::vector<size_t> stride, const std::vector<size_t> padding,
+                       const size_t in_channels, const size_t out_channels,
+                       const bool bias_term_){
+    // Entries 0 and 1 are read unconditionally below, so reject short vectors
+    // instead of reading out of bounds.
+    CHECK_EQ(kernel_size.size(), 2u) << "kernel_size must hold {height, width}";
+    CHECK_EQ(stride.size(), 2u) << "stride must hold {height, width}";
+    CHECK_EQ(padding.size(), 2u) << "padding must hold {height, width}";
+    CHECK_EQ(input.nDim(), 4u) << "input must be a 4-D tensor";
+
+    const size_t kernel_h_ = kernel_size[0];
+    const size_t kernel_w_ = kernel_size[1];
+
+    const size_t pad_h_ = padding[0];
+    const size_t pad_w_ = padding[1];
+
+    const size_t stride_h_ = stride[0];
+    const size_t stride_w_ = stride[1];
+
+    const size_t batchsize = input.shape(0);
+    CHECK_GT(batchsize, 0u) << "input must contain at least one sample";
+    CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched.";
+    const size_t height_ = input.shape(2);
+    const size_t width_ = input.shape(3);
+
+    // All quantities are unsigned: a kernel larger than the padded input would
+    // wrap around to a huge output size instead of going negative.
+    size_t conv_height_ = 1;
+    if (stride_h_ > 0) {
+        CHECK_GE(height_ + 2 * pad_h_, kernel_h_) << "kernel height exceeds padded input height";
+        conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+    }
+    CHECK_GT(stride_w_, 0u) << "stride width must be positive";
+    CHECK_GE(width_ + 2 * pad_w_, kernel_w_) << "kernel width exceeds padded input width";
+    const size_t conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
+
+    // Dimensions of the im2col matrix built for one sample.
+    const size_t col_height_ = in_channels * kernel_w_ * kernel_h_;
+    const size_t col_width_ = conv_height_ * conv_width_;
+    // Number of elements in one sample of the input.
+    const size_t imagesize = input.Size() / batchsize;
+
+    // The initializer order must match the member order of struct Recorder.
+    return Recorder{
+        kernel_w_,
+        pad_w_,
+        stride_w_,
+        kernel_h_,
+        pad_h_,
+        stride_h_,
+
+        in_channels,
+        out_channels,
+
+        bias_term_,
+
+        height_,
+        width_,
+        conv_height_,
+        conv_width_,
+        batchsize,
+
+        col_height_,
+        col_width_,
+        imagesize
+    };
+}
+
+Convolution C;  // File-scope Convolution instance; the Cpu* functions in this file call its Im2col/Col2im members.
+
+// Forward pass of a 2D convolution on the host (kCpp) device.
+//
+// Each sample of `x` is unfolded with Convolution::Im2col into a
+// (col_height_ x col_width_) matrix, multiplied by the filters viewed as
+// (num_filters_ x col_height_), optionally offset by the bias, and copied into
+// the matching slice of the output.
+//
+// `W` and `b` are reshaped temporarily and restored before returning, which is
+// why they are taken by non-const reference. Data is accessed as float.
+//
+// Returns a tensor of shape {batchsize, num_filters_, conv_height_,
+// conv_width_} on the device and with the data type of `x`.
+// NOTE(review): r.batchsize drives the loop but x.shape(0) is not checked
+// against it here -- confirm callers always pass a matching batch.
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const Recorder r){
+    CHECK_EQ(x.device()->lang(), kCpp);
+
+    CHECK(x.shape(1) == r.channels_ && x.shape(2) == r.height_ &&
+    x.shape(3) == r.width_) << "input sample shape should not change";
+
+    CHECK(W.shape(0) == r.num_filters_ && W.shape(1) == r.channels_ &&
+    W.shape(2) == r.kernel_h_ && W.shape(3) == r.kernel_w_) << "weights shape should not change";
+
+    // Remember the caller-visible shapes so they can be restored on exit.
+    const Shape w_shape = W.shape();
+    const Shape b_shape = b.shape();
+
+    W.Reshape(Shape{r.num_filters_, r.col_height_});
+    if (r.bias_term_)
+        b.Reshape(Shape{r.num_filters_});
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+    Shape shape{r.batchsize, r.num_filters_, r.conv_height_, r.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    Tensor col_data(Shape{r.col_height_, r.col_width_});  // unfolded image
+
+    // Host scratch buffer for Im2col; owned by the vector so it is released on
+    // every exit path (the raw new[] used previously was never freed).
+    std::vector<float> data_col(r.col_height_ * r.col_width_);
+    auto in_data = x.data<float>();
+    for (size_t num = 0; num < r.batchsize; num++) {
+        C.Im2col(in_data + num * r.imagesize, r.channels_, r.height_, r.width_, r.kernel_h_,
+                 r.kernel_w_, r.pad_h_, r.pad_w_, r.stride_h_, r.stride_w_, data_col.data());
+
+        col_data.CopyDataFromHostPtr(data_col.data(), data_col.size());
+        Tensor each = Mult(W, col_data);
+        if (r.bias_term_) {
+            AddColumn(b, &each);
+        }
+        CopyDataToFrom(&output, each, each.Size(), num * each.Size());
+    }
+    W.Reshape(w_shape);
+    if (r.bias_term_)
+        b.Reshape(b_shape);
+    return output;
+}
+
+// Computes the gradient of the loss w.r.t. the convolution input on the host.
+//
+// For every sample, the matching slice of `dy` is viewed as
+// (num_filters_ x conv_height_*conv_width_), multiplied by the transposed
+// filter matrix, and the resulting column matrix is passed through
+// Convolution::Col2im to produce that sample's slice of dx.
+//
+// `W` is reshaped temporarily and restored before returning. `x` is only used
+// to size the result. Data is accessed as float.
+//
+// Returns dx, allocated like `x`.
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Recorder r){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+
+    CHECK(dy.shape(1) == r.num_filters_ && dy.shape(2) == r.conv_height_ &&
+    dy.shape(3) == r.conv_width_) << "input gradients shape should not change";
+
+    CHECK(W.shape(0) == r.num_filters_ && W.shape(1) == r.channels_ &&
+    W.shape(2) == r.kernel_h_ && W.shape(3) == r.kernel_w_) << "weights shape should not change";
+
+    const Shape w_shape = W.shape();
+    W.Reshape(Shape{r.num_filters_, r.col_height_});
+
+    Tensor dx;
+    dx.ResetLike(x);
+
+    // Host scratch buffer for one sample of dx; owned by the vector so it is
+    // released on return (the raw new[] used previously was never freed).
+    std::vector<float> dx_b(r.imagesize);
+
+    for (size_t num = 0; num < r.batchsize; num++) {
+        Tensor grad_b(Shape{r.num_filters_, r.conv_height_ * r.conv_width_});
+        CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+        Tensor dcol_b = Mult(W.T(), grad_b);
+        auto dcol_data = dcol_b.data<float>();
+        C.Col2im(dcol_data, r.channels_, r.height_, r.width_, r.kernel_h_, r.kernel_w_, r.pad_h_,
+                 r.pad_w_, r.stride_h_, r.stride_w_, dx_b.data());
+        dx.CopyDataFromHostPtr(dx_b.data(), r.imagesize, num * r.imagesize);
+    }
+    W.Reshape(w_shape);
+    return dx;
+}
+
+// Computes the gradient of the loss w.r.t. the filter weights on the host.
+//
+// For every sample the input image from `x` is unfolded with
+// Convolution::Im2col into a (col_height_ x col_width_) matrix, and the
+// matching slice of `dy`, viewed as (num_filters_ x conv_height_*conv_width_),
+// is multiplied by the transpose of that matrix. The per-sample products are
+// accumulated into dW.
+//
+// `W` is only used for its shape. Data is accessed as float.
+//
+// Returns dW with the same shape as `W`.
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const Recorder r){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+
+    CHECK(dy.shape(1) == r.num_filters_ && dy.shape(2) == r.conv_height_ &&
+    dy.shape(3) == r.conv_width_) << "input gradients shape should not change";
+
+    CHECK(x.shape(1) == r.channels_ && x.shape(2) == r.height_ &&
+    x.shape(3) == r.width_) << "input sample shape should not change";
+
+    Tensor dW;
+    dW.ResetLike(W);
+    dW.SetValue(0.0f);
+
+    const Shape w_shape = W.shape();
+    dW.Reshape(Shape{r.num_filters_, r.col_height_});
+
+    Tensor col_data(Shape{r.col_height_, r.col_width_});  // unfolded image
+
+    // Host scratch buffer for Im2col; owned by the vector so it is released on
+    // return (the raw new[] used previously was never freed).
+    std::vector<float> data_col(r.col_height_ * r.col_width_);
+    // Im2col must unfold the forward input. The image geometry passed below
+    // (channels_, height_, width_, imagesize) describes `x`, not `dy`, so
+    // reading from `dy` here produced wrong gradients and could run past the
+    // end of its buffer.
+    auto in_data = x.data<float>();
+    for (size_t num = 0; num < r.batchsize; num++) {
+        C.Im2col(in_data + num * r.imagesize, r.channels_, r.height_, r.width_, r.kernel_h_,
+                 r.kernel_w_, r.pad_h_, r.pad_w_, r.stride_h_, r.stride_w_, data_col.data());
+        col_data.CopyDataFromHostPtr(data_col.data(), data_col.size());
+        Tensor grad_b(Shape{r.num_filters_, r.conv_height_ * r.conv_width_});
+        CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+        dW += Mult(grad_b, col_data.T());
+    }
+    dW.Reshape(w_shape);
+    return dW;
+}
+
+// Computes the gradient of the loss w.r.t. the bias on the host.
+//
+// `dy` is viewed as a (batchsize*num_filters_ x remaining-elements) matrix and
+// reduced with SumColumns to one value per (sample, filter) pair; that vector
+// is then viewed as (batchsize x num_filters_) and reduced with SumRows into
+// db, which is allocated like `b`.
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const Recorder r){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+
+    CHECK(dy.shape(1) == r.num_filters_ && dy.shape(2) == r.conv_height_ &&
+    dy.shape(3) == r.conv_width_) << "input gradients shape should not change";
+
+    CHECK(b.shape(0) == r.num_filters_)<< "bias shape should not change";
+
+    Tensor db;
+    db.ResetLike(b);
+
+    // One row per (sample, filter) pair.
+    const size_t num_rows = r.batchsize * r.num_filters_;
+    Tensor dy_matrix = Reshape(dy, Shape{num_rows, dy.Size() / num_rows});
+
+    Tensor per_pair_sum(Shape{num_rows});
+    SumColumns(dy_matrix, &per_pair_sum);
+
+    // Regroup the per-pair sums by sample, then reduce over the batch.
+    Tensor per_sample = Reshape(per_pair_sum, Shape{r.batchsize, r.num_filters_});
+    SumRows(per_sample, &db);
+
+    return db;
+}
+
+// Creates and configures the cuDNN descriptors, selects the convolution
+// algorithms and allocates the shared workspace for the convolution described
+// by `r`, applied to `input`.
+//
+// `prefer_` selects how algorithms are chosen: "fastest", "limited_workspace"
+// and "no_workspace" use the cudnnGetConvolution*Algorithm queries with
+// `workspace_byte_limit_`; "autotune" uses the cudnnFindConvolution*Algorithm
+// calls. Any other value aborts through LOG(FATAL).
+//
+// The workspace is sized for the largest of the forward, backward-data and
+// backward-filter requirements; a warning is logged when that exceeds
+// `workspace_byte_limit_`.
+//
+// Returns the populated CudnnConvHandles. When r.bias_term_ is false no bias
+// descriptor is created and bias_desc_ is returned as nullptr.
+CudnnConvHandles InitCudnnConvHandles(const Tensor &input, const Recorder r, const size_t workspace_byte_limit_,
+                                      const std::string prefer_){
+
+    CHECK(input.shape(0) == r.batchsize && input.shape(1) == r.channels_ && input.shape(2) == r.height_ &&
+    input.shape(3) == r.width_) << "input sample shape dismatched";
+
+    cudnnTensorDescriptor_t x_desc_ ;
+    cudnnTensorDescriptor_t y_desc_ ;
+    // Only created when a bias is used; start from nullptr so the returned
+    // struct never carries an indeterminate handle.
+    cudnnTensorDescriptor_t bias_desc_ = nullptr;
+    cudnnFilterDescriptor_t filter_desc_ ;
+    cudnnConvolutionDescriptor_t conv_desc_ ;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+
+    size_t workspace_count_;
+    Tensor workspace_;
+
+    DataType dtype = input.data_type();
+    auto dev = input.device();
+    Context *ctx = dev->context(0);
+
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+    if (r.bias_term_)
+        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), r.batchsize,
+                                           r.channels_, r.height_, r.width_));
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), r.batchsize,
+            r.num_filters_, r.conv_height_, r.conv_width_));
+    if (r.bias_term_)
+        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                               GetCudnnDataType(dtype), 1,
+                                               r.num_filters_, 1, 1));
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, r.pad_h_, r.pad_w_,
+                                                r.stride_h_, r.stride_w_, 1, 1,
+                                                CUDNN_CROSS_CORRELATION,
+                                                GetCudnnDataType(dtype)));
+    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                           CUDNN_TENSOR_NCHW, r.num_filters_,
+                                           r.channels_, r.kernel_h_, r.kernel_w_));
+    if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+        prefer_ == "no_workspace") {
+        cudnnConvolutionFwdPreference_t fwd_pref;
+        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+        if (prefer_ == "fastest") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+        } else if (prefer_ == "limited_workspace") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        } else {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+            // NOTE(review): the other two preferences in this branch are the
+            // *_NO_WORKSPACE values; confirm SPECIFY_WORKSPACE_LIMIT is
+            // intended for backward-data.
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        }
+        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+                workspace_byte_limit_, &fp_alg_));
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+        // deprecated in cudnn v7
+        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+    } else if (prefer_ == "autotune") {
+        // Ask cuDNN for its single best candidate per operation.
+        const int topk = 1;
+        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+                &num_fp_alg, fp_alg_perf));
+        fp_alg_ = fp_alg_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                &num_bp_filt_alg, bp_filt_perf));
+        bp_filter_alg_ = bp_filt_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                &num_bp_data_alg, bp_data_perf));
+        bp_data_alg_ = bp_data_perf[0].algo;
+    } else {
+        LOG(FATAL) << "Preferred algorithm is not available!";
+    }
+
+    size_t fp_byte, bp_data_byte, bp_filter_byte;
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+            &fp_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+            bp_data_alg_, &bp_data_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+            bp_filter_alg_, &bp_filter_byte));
+    // One workspace serves all three operations. The count is in float
+    // elements; the +1 covers the remainder lost by the integer division.
+    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                       sizeof(float) +
+                       1;
+    if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
+        LOG(WARNING) << "The required memory for workspace ("
+                     << workspace_count_ * sizeof(float)
+                     << ") is larger than the expected Bytes ("
+                     << workspace_byte_limit_ << ")";
+    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+
+    // The function previously fell off the end without returning a value,
+    // which is undefined behaviour for a non-void function. The initializer
+    // order must match the member order of struct CudnnConvHandles.
+    return CudnnConvHandles{
+        x_desc_,
+        y_desc_,
+        bias_desc_,
+        filter_desc_,
+        conv_desc_,
+        fp_alg_,
+        bp_filter_alg_,
+        bp_data_alg_,
+
+        workspace_count_,
+        workspace_
+    };
+}
+
+// Forward pass of a 2D convolution through cuDNN.
+//
+// Schedules cudnnConvolutionForward on the device of `x` using the
+// descriptors, algorithm and workspace held in `cch`, then, when
+// r.bias_term_ is set, schedules cudnnAddTensor to add `b` onto the result.
+//
+// Returns a tensor of shape {batchsize, num_filters_, conv_height_,
+// conv_width_} on the device and with the data type of `x`. A failing cuDNN
+// call aborts through CUDNN_CHECK.
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const Recorder r, const CudnnConvHandles cch){
+    CHECK_EQ(x.device()->lang(), kCuda);
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+
+    Shape shape{r.batchsize, r.num_filters_, r.conv_height_, r.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    // The workspace is written by cuDNN, so it is listed with the written
+    // blocks, matching the backward functions in this file. It was previously
+    // passed as a separate fourth argument to Exec.
+    output.device()->Exec([output, x, W, cch](Context *ctx) {
+        Block *inblock = x.block(), *outblock = output.block(),
+                *wblock = W.block();
+        float alpha = 1.f, beta = 0.f;
+        CUDNN_CHECK(cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
+                                inblock->data(), cch.filter_desc_, wblock->data(),
+                                cch.conv_desc_, cch.fp_alg_,
+                                cch.workspace_.block()->mutable_data(),
+                                cch.workspace_count_ * sizeof(float), &beta,
+                                cch.y_desc_, outblock->mutable_data()));
+    }, {x.block(), W.block()}, {output.block(), cch.workspace_.block()});
+
+    if (r.bias_term_) {
+        output.device()->Exec([output, b, cch](Context *ctx) {
+            // beta = 1 keeps the convolution result and adds the bias to it.
+            float beta = 1.f, alpha = 1.0f;
+            Block *outblock = output.block(), *bblock = b.block();
+            CUDNN_CHECK(cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
+                           bblock->data(), &beta, cch.y_desc_,
+                           outblock->mutable_data()));
+        }, {output.block(), b.block()}, {output.block()});
+    }
+
+    return output;
+}
+
+// Computes the gradient of the loss w.r.t. the convolution input via cuDNN.
+//
+// Schedules cudnnConvolutionBackwardData on the device of `dy` with the
+// descriptors, algorithm and workspace held in `cch`. `x` is only used to
+// size the result.
+//
+// Returns dx, allocated like `x`. A failing cuDNN call aborts through
+// CUDNN_CHECK instead of silently leaving dx unwritten.
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor dx;
+    dx.ResetLike(x);
+
+    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
+        Block *wblock = W.block(), *dyblock = dy.block(),
+                *dxblock = dx.block();
+        float alpha = 1.f, beta = 0.f;
+        CUDNN_CHECK(cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
+                                     wblock->data(), cch.y_desc_, dyblock->data(),
+                                     cch.conv_desc_, cch.bp_data_alg_,
+                                     cch.workspace_.block()->mutable_data(),
+                                     cch.workspace_count_ * sizeof(float), &beta,
+                                     cch.x_desc_, dxblock->mutable_data()));
+    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
+
+    return dx;
+}
+
+// Computes the gradient of the loss w.r.t. the filter weights via cuDNN.
+//
+// Schedules cudnnConvolutionBackwardFilter on the device of `dy` with the
+// descriptors, algorithm and workspace held in `cch`. `W` is only used to
+// size the result.
+//
+// Returns dW, allocated like `W`. A failing cuDNN call aborts through
+// CUDNN_CHECK instead of silently leaving dW unwritten.
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor dW;
+    dW.ResetLike(W);
+
+    // W is not read inside the lambda, so it is not captured.
+    dy.device()->Exec([dW, dy, x, cch](Context *ctx) {
+        Block *inblock = x.block(), *dyblock = dy.block(),
+                *dwblock = dW.block();
+        float alpha = 1.f, beta = 0.f;
+        CUDNN_CHECK(cudnnConvolutionBackwardFilter(
+                ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
+                cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
+                cch.workspace_.block()->mutable_data(),
+                cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
+                dwblock->mutable_data()));
+    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+
+    return dW;
+}
+
+// input Tensor b for Reset db purpose, can avoid this later.
+// Computes the gradient of the loss w.r.t. the bias via cuDNN.
+//
+// Schedules cudnnConvolutionBackwardBias on the device of `dy` with the
+// output and bias descriptors held in `cch`. `b` is only used to size the
+// result.
+//
+// Returns db, allocated like `b`. A failing cuDNN call aborts through
+// CUDNN_CHECK instead of silently leaving db unwritten.
+// NOTE(review): cch.bias_desc_ is only configured when the handles were built
+// with a bias term -- confirm callers never reach this function otherwise.
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor db;
+    db.ResetLike(b);
+
+    // b is not read inside the lambda, so it is not captured.
+    dy.device()->Exec([db, dy, cch](Context *ctx) {
+        Block *dyblock = dy.block(), *dbblock = db.block();
+        float alpha = 1.f, beta = 0.f;
+        CUDNN_CHECK(cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
+                                     dyblock->data(), &beta, cch.bias_desc_,
+                                     dbblock->mutable_data()));
+    }, {dy.block()}, {db.block()});
+
+    return db;
+}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/189958ab/src/model/operation/convolution_related.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_related.h b/src/model/operation/convolution_related.h
new file mode 100644
index 0000000..49aab5b
--- /dev/null
+++ b/src/model/operation/convolution_related.h
@@ -0,0 +1,75 @@
+#include <string>
+#include <vector>
+#include <cudnn.h>
+#include "../layer/cudnn_convolution.h"
+#include "../layer/cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa{
+
+struct Recorder{
+    size_t kernel_w_;
+    size_t pad_w_;
+    size_t stride_w_;
+    size_t kernel_h_;
+    size_t pad_h_;
+    size_t stride_h_;
+
+    size_t channels_;
+    size_t num_filters_;
+
+    bool bias_term_;
+
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+    size_t batchsize;
+
+    size_t col_height_;
+    size_t col_width_;
+    size_t imagesize;
+};
+
+struct CudnnConvHandles{
+	cudnnTensorDescriptor_t x_desc_ ;
+    cudnnTensorDescriptor_t y_desc_ ;
+    cudnnTensorDescriptor_t bias_desc_ ;
+    cudnnFilterDescriptor_t filter_desc_ ;
+    cudnnConvolutionDescriptor_t conv_desc_ ;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+
+    size_t workspace_count_;
+    Tensor workspace_;  
+};
+
+
+Recorder SetupRecorder(const Tensor &input, const std::vector<size_t> kernel_size, 
+	                const std::vector<size_t> stride, const std::vector<size_t> padding,
+	                const size_t in_channels, const size_t out_channels,
+	                const bool bias_term_);
+
+CudnnConvHandles InitCudnnConvHandles(const Tensor &input, const Recorder r, const size_t workspace_byte_limit_=1024*1024*1024,
+    				const std::string prefer_="fastest");
+
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const Recorder r, const CudnnConvHandles cch);
+
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
+
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch);
+
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
+
+
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const Recorder r);
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const Recorder r);
+
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const Recorder r);
+
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const Recorder r);
+
+}
\ No newline at end of file


[07/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- fix some bugs.
- the conv2d_gpu operation has passed its tests.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/78e1fc23
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/78e1fc23
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/78e1fc23

Branch: refs/heads/master
Commit: 78e1fc230a14510239728e585103e7c5791c3943
Parents: c57b87a
Author: xuewanqi <xu...@u.nus.edu>
Authored: Thu Jun 21 03:10:33 2018 +0000
Committer: xuewanqi <xu...@u.nus.edu>
Committed: Thu Jun 21 03:10:33 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/78e1fc23/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index c7e0adb..7ba68f5 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -616,7 +616,7 @@ class Conv2d_GPU(Operation):
 
         self.bias = bias
 
-        inner_params = {'cudnn_prefer': 'fastest', 'workspace_byte_limit': 1024}
+        inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024}
         # TODO valid value of inner_params check
 
         for kwarg in kwargs:
@@ -627,7 +627,8 @@ class Conv2d_GPU(Operation):
 
         self.convhandle = singa.SetupConv(self.kernel_size[0], self.kernel_size[1],
         			self.padding[0], self.padding[1], self.stride[0], self.stride[1],
-        			self.bias, inner_params['workspace_byte_limit']*1024*1024,
+        			self.in_channels, self.out_channels, self.bias, 
+                                inner_params['workspace_MB_limit']*1024*1024,
         			inner_params['cudnn_prefer'])
         
         w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1])
@@ -650,10 +651,13 @@ class Conv2d_GPU(Operation):
         assert 0 == 0, 'invalid padding.'
     	# TODO valid padding check.
 
-    	if not hasattr (self, cudnnconvhandle):
+    	if not hasattr (self, 'cudnnconvhandle'):
     	    self.cudnnconvhandle = singa.InitCudnn(x.data, self.convhandle)
     	elif x.shape[0] != self.cudnnconvhandle.batchsize:
     	    self.cudnnconvhandle = singa.InitCudnn(x.data, self.convhandle)
+        
+        if training:
+            self.x = x
 
     	self.dev = x.device
 
@@ -665,20 +669,18 @@ class Conv2d_GPU(Operation):
     	return self._do_forward(*xs)[0]
 
     def forward(self, *xs):
-        if training:
-    	    self.x = xs[0]
         return singa.CudnnConvForward(xs[0], xs[1], xs[2], self.convhandle, self.cudnnconvhandle)
 
     def backward(self, dy):
-        assert training is True and hasattr(self, x), 'Please set \'trainging\' as True before do BP. '
+        assert training is True and hasattr(self, 'x'), 'Please set \'trainging\' as True before do BP. '
 
         # todo check device?
         dy.ToDevice(self.dev)
 
-        dx = singa.CudnnConvBackwardx(dy, self.W, self.x, self.cch)
-        dW = singa.CudnnConvBackwardW(dy, self.x, self.W, self.cch)
+        dx = singa.CudnnConvBackwardx(dy, self.W.data, self.x.data, self.cudnnconvhandle)
+        dW = singa.CudnnConvBackwardW(dy, self.x.data, self.W.data, self.cudnnconvhandle)
         if self.bias:
-    	    db = singa.CudnnConvBackwardb(dy, self.b, self.cch)
+    	    db = singa.CudnnConvBackwardb(dy, self.b.data, self.cudnnconvhandle)
     	    return dx, dW, db
         else:
     	    return dx, dW


[02/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- separate the .cc and .h files
- write interface files for these functions (not completed)


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/af95cc1a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/af95cc1a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/af95cc1a

Branch: refs/heads/master
Commit: af95cc1a67f163bdef265f6bdc93aeaef05f848f
Parents: fc181cd
Author: xuewanqi <xu...@u.nus.edu>
Authored: Thu Jun 14 02:56:41 2018 +0000
Committer: xuewanqi <xu...@u.nus.edu>
Committed: Wed Jun 20 14:47:05 2018 +0000

----------------------------------------------------------------------
 src/api/model_operation.i        | 59 +++++++++++++++++++++++++++++++++++
 src/api/singa.i                  |  1 +
 src/model/convolution_forward.cc | 57 ++++++---------------------------
 src/model/convolution_forward.h  | 59 +++++++++++++++++++++++++++++++++++
 4 files changed, 129 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af95cc1a/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
new file mode 100644
index 0000000..64ecca1
--- /dev/null
+++ b/src/api/model_operation.i
@@ -0,0 +1,59 @@
+/* interface file for swig */
+
+%module model_operation
+%include "std_string.i"
+
+%{
+#include "../src/model/convolution_functions.h"
+%}
+
+namespace singa{
+extern struct ConvHandle{
+    size_t kernel_w_;
+    size_t pad_w_;
+    size_t stride_w_;
+    size_t kernel_h_;
+    size_t pad_h_;
+    size_t stride_h_;
+
+    size_t channels_;
+    size_t num_filters_;
+
+    bool bias_term_;
+
+    size_t workspace_byte_limit_;
+    std::string prefer_;
+};
+
+struct CudnnConvHandle{
+    cudnnTensorDescriptor_t x_desc_ ;
+    cudnnTensorDescriptor_t y_desc_ ;
+    cudnnTensorDescriptor_t bias_desc_ ;
+    cudnnFilterDescriptor_t filter_desc_ ;
+    cudnnConvolutionDescriptor_t conv_desc_ ;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+
+    size_t workspace_count_;
+    Tensor workspace_;
+
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+    size_t batchsize;
+};
+
+extern ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf);
+
+CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
+
+Tensor CudnnConvForward(const Tensor x, const Tensor W, const Tensor b,
+                        const ConvHandle ch, const CudnnConvHandle cch);
+
+Tensor CudnnConvBackwardW(const Tensor dy, const Tensor x, const Tensor W, const CudnnConvHandle cch);
+
+Tensor CudnnConvBackwardb(const Tensor dy, const Tensor b, const CudnnConvHandle cch);
+
+Tensor CudnnConvBackwardx(const Tensor dy, const Tensor W, const Tensor x, const CudnnConvHandle cch);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af95cc1a/src/api/singa.i
----------------------------------------------------------------------
diff --git a/src/api/singa.i b/src/api/singa.i
index 3fc3b47..b5abc6b 100644
--- a/src/api/singa.i
+++ b/src/api/singa.i
@@ -29,4 +29,5 @@
 %include "model_optimizer.i"
 %include "model_loss.i"
 %include "model_metric.i"
+%include "model_operation.i"
 %include "io_snapshot.i"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af95cc1a/src/model/convolution_forward.cc
----------------------------------------------------------------------
diff --git a/src/model/convolution_forward.cc b/src/model/convolution_forward.cc
index 8457e95..52acf05 100644
--- a/src/model/convolution_forward.cc
+++ b/src/model/convolution_forward.cc
@@ -1,48 +1,11 @@
-#include <string>
-#include <cudnn.h>
-#include "./layer/cudnn_convolution.h"
-#include "./layer/cudnn_utils.h"
-#include "singa/utils/logging.h"
+//#include <string>
+//#include <cudnn.h>
+//#include "./layer/cudnn_convolution.h"
+//#include "./layer/cudnn_utils.h"
+//#include "singa/utils/logging.h"
+#include "./convolution_forward.h"
 
 namespace singa{
-struct ConvHandle{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t workspace_byte_limit_;
-    std::string prefer_;
-};
-
-
-struct CudnnConvHandle{
-    cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-};
-
 
 // Done in conv2d.__init__()
 ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf){
@@ -297,7 +260,7 @@ CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
     };
 };
 
-Tensor CudnnConvForward(const Tensor x, const Tensor W, const Tensor b,
+Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
                         const ConvHandle ch, const CudnnConvHandle cch){
     CHECK_EQ(x.device()->lang(), kCuda);
     CHECK_EQ(x.nDim(), 4u);
@@ -337,7 +300,7 @@ Tensor CudnnConvForward(const Tensor x, const Tensor W, const Tensor b,
 };
 
 // input Tensor W for Reset dW purpose, can avoid this later.
-Tensor CudnnConvBackwardW(const Tensor dy, const Tensor x, const Tensor W, const CudnnConvHandle cch){
+Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch){
     CHECK_EQ(dy.device()->lang(), kCuda);
     CHECK_EQ(dy.nDim(), 4u);
 
@@ -360,7 +323,7 @@ Tensor CudnnConvBackwardW(const Tensor dy, const Tensor x, const Tensor W, const
 };
 
 // input Tensor b for Reset db purpose, can avoid this later.
-Tensor CudnnConvBackwardb(const Tensor dy, const Tensor b, const CudnnConvHandle cch){
+Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch){
     CHECK_EQ(dy.device()->lang(), kCuda);
     CHECK_EQ(dy.nDim(), 4u);
 
@@ -377,7 +340,7 @@ Tensor CudnnConvBackwardb(const Tensor dy, const Tensor b, const CudnnConvHandle
     return db;
 };
 
-Tensor CudnnConvBackwardx(const Tensor dy, const Tensor W, const Tensor x, const CudnnConvHandle cch){
+Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch){
     CHECK_EQ(dy.device()->lang(), kCuda);
     CHECK_EQ(dy.nDim(), 4u);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af95cc1a/src/model/convolution_forward.h
----------------------------------------------------------------------
diff --git a/src/model/convolution_forward.h b/src/model/convolution_forward.h
new file mode 100644
index 0000000..eba0e50
--- /dev/null
+++ b/src/model/convolution_forward.h
@@ -0,0 +1,59 @@
+#include <string>
+#include <cudnn.h>
+#include "./layer/cudnn_convolution.h"
+#include "./layer/cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa{
+
+struct ConvHandle{
+    size_t kernel_w_;
+    size_t pad_w_;
+    size_t stride_w_;
+    size_t kernel_h_;
+    size_t pad_h_;
+    size_t stride_h_;
+
+    size_t channels_;
+    size_t num_filters_;
+
+    bool bias_term_;
+
+    size_t workspace_byte_limit_;
+    string prefer_;
+};
+
+struct CudnnConvHandle{
+    cudnnTensorDescriptor_t x_desc_ ;
+    cudnnTensorDescriptor_t y_desc_ ;
+    cudnnTensorDescriptor_t bias_desc_ ;
+    cudnnFilterDescriptor_t filter_desc_ ;
+    cudnnConvolutionDescriptor_t conv_desc_ ;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+
+    size_t workspace_count_;
+    Tensor workspace_;
+
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+    size_t batchsize;
+};
+
+ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf);
+
+CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
+
+Tensor CudnnConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
+                        const ConvHandle ch, const CudnnConvHandle cch);
+
+Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle cch);
+
+Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch);
+
+Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
+
+}


[06/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- fix some bugs
- rewrite the function SetupConv(): LayerConf cannot be created in Python, so it is not possible to pass this argument to InitConv()
- these functions have been tested and are workable


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/2cac057d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/2cac057d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/2cac057d

Branch: refs/heads/master
Commit: 2cac057dffa77bbdeffdaa8aabdf37a03e82dd00
Parents: d48dea0
Author: xuewanqi <xu...@u.nus.edu>
Authored: Tue Jun 19 11:01:45 2018 +0000
Committer: xuewanqi <xu...@u.nus.edu>
Committed: Wed Jun 20 14:47:37 2018 +0000

----------------------------------------------------------------------
 src/api/model_operation.i          | 11 ++--
 src/model/convolution_functions.cc | 96 +++++----------------------------
 src/model/convolution_functions.h  | 10 +++-
 3 files changed, 30 insertions(+), 87 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2cac057d/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 79707eb..77ef6bb 100644
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -2,8 +2,6 @@
 
 %{
 #include "../src/model/convolution_functions.h"
-using singa::Tensor;
-using singa::CudnnConvHandle;
 %}
 namespace singa{
 
@@ -11,7 +9,13 @@ struct ConvHandle{};
 
 struct CudnnConvHandle{};
 
-ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf);
+ConvHandle SetupConv(
+    const size_t kernel_h_, const size_t kernel_w_,
+    const size_t pad_h_, const size_t pad_w_,
+    const size_t stride_h_,const size_t stride_w_,
+    const size_t channels_, const size_t num_filters_,
+    const bool bias_term_ = true, const size_t workspace_byte_limit_ =1024*1024,
+    const std::string prefer_="fastest");
 
 CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
 
@@ -23,4 +27,5 @@ Tensor CudnnConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, co
 Tensor CudnnConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle cch);
 
 Tensor CudnnConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle cch);
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2cac057d/src/model/convolution_functions.cc
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.cc b/src/model/convolution_functions.cc
index 0fc8e65..7ff399a 100644
--- a/src/model/convolution_functions.cc
+++ b/src/model/convolution_functions.cc
@@ -4,87 +4,18 @@
 //#include "./layer/cudnn_utils.h"
 //#include "singa/utils/logging.h"
 #include "./convolution_functions.h"
-
+#include<iostream>
 namespace singa{
 
 // Done in conv2d.__init__()
-ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf){
-
-    size_t kernel_w_, pad_w_, stride_w_;
-    size_t kernel_h_, pad_h_, stride_h_;
-
-    size_t channels_, num_filters_;
-
-    bool bias_term_;
-
-    size_t workspace_byte_limit_;
-    string prefer_;
-
-    ConvolutionConf conv_conf = conf.convolution_conf();
-
-    workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
-    prefer_ = ToLowerCase(conv_conf.prefer());
-    CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-          prefer_ == "no_workspace" || prefer_ == "autotune")
-            << "CudnnConvolution only supports four algorithm preferences: fastest, "
-               "limited_workspace, no_workspace and autotune";
-
-
-    // kernel_size, pad, and stride are repeated fields.
-    if (conv_conf.kernel_size_size() > 0) {
-    if (conv_conf.kernel_size_size() == 1) {
-    kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
-    } else {
-    kernel_w_ = conv_conf.kernel_size(0);
-    kernel_h_ = conv_conf.kernel_size(1);
-    }
-    } else {
-    kernel_w_ = conv_conf.kernel_w();
-    kernel_h_ = conv_conf.kernel_h();
-    }
-    CHECK_GT(kernel_w_, 0u);
-    CHECK_GT(kernel_h_, 0u);
-
-    if (conv_conf.pad_size() > 0) {
-    if (conv_conf.pad_size() == 1) {
-    pad_w_ = pad_h_ = conv_conf.pad(0);
-    } else {
-    pad_w_ = conv_conf.pad(0);
-    pad_h_ = conv_conf.pad(1);
-    }
-    } else {
-    pad_w_ = conv_conf.pad_w();
-    pad_h_ = conv_conf.pad_h();
-    }
-    CHECK_GE(pad_w_, 0u);
-    CHECK_GE(pad_h_, 0u);
-
-    const int kStrideDefault = 1;
-    if (conv_conf.stride_size() > 0) {
-    if (conv_conf.stride_size() == 1) {
-    stride_w_ = stride_h_ = conv_conf.stride(0);
-    } else {
-    stride_w_ = conv_conf.stride(0);
-    stride_h_ = conv_conf.stride(1);
-    }
-    } else {
-    stride_w_ = kStrideDefault;
-    stride_h_ = kStrideDefault;
-    if (conv_conf.has_stride_w()) {
-    stride_w_ = conv_conf.stride_w();
-    }
-    if (conv_conf.has_stride_h()) {
-    stride_h_ = conv_conf.stride_h();
-    }
-    }
-    CHECK_GT(stride_w_, 0u);
-    CHECK_GE(stride_h_, 0u);  // 0 for 1D conv
-
-    channels_ = in_channels;
-    num_filters_ = conv_conf.num_output();
-    bias_term_ = conv_conf.bias_term();
-
-    return ConvHandle{
+ConvHandle SetupConv(
+    const size_t kernel_h_, const size_t kernel_w_,
+    const size_t pad_h_, const size_t pad_w_,
+    const size_t stride_h_,const size_t stride_w_,
+    const size_t channels_, const size_t num_filters_,
+    const bool bias_term_ , const size_t workspace_byte_limit_,
+    const std::string prefer_){
+	 return ConvHandle{
             kernel_w_,
             pad_w_,
             stride_w_,
@@ -103,7 +34,6 @@ ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf){
 };
 
 
-
 // Done in conv2d.__call__():
 // if self.cudnnconvhandle is None:
 //     self.cudnnconvhandle= InitCudnn(...)
@@ -126,11 +56,11 @@ CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
     size_t width_;
     size_t conv_height_;
     size_t conv_width_;
-
+    
     DataType dtype = input.data_type();
     auto dev = input.device();
     Context *ctx = dev->context(0);
-
+    
     size_t batchsize, channels_;
     batchsize = input.shape(0);
     channels_ = input.shape(1);
@@ -143,7 +73,7 @@ CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
     if (ch.stride_h_ > 0)
         conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
     conv_width_ = (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
-
+    
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
     if (ch.bias_term_)
@@ -197,7 +127,7 @@ CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
         CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
                 ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
                 bwd_data_pref, ch.workspace_byte_limit_, &bp_data_alg_));
-    } else if (ch.prefer_ == "autotune") {
+        } else if (ch.prefer_ == "autotune") {
         const int topk = 1;
         int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
         cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2cac057d/src/model/convolution_functions.h
----------------------------------------------------------------------
diff --git a/src/model/convolution_functions.h b/src/model/convolution_functions.h
index eba0e50..9462805 100644
--- a/src/model/convolution_functions.h
+++ b/src/model/convolution_functions.h
@@ -43,7 +43,15 @@ struct CudnnConvHandle{
     size_t batchsize;
 };
 
-ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf);
+ConvHandle SetupConv(
+    const size_t kernel_h_, const size_t kernel_w_,
+    const size_t pad_h_, const size_t pad_w_,
+    const size_t stride_h_,const size_t stride_w_,
+    const size_t channels_, const size_t num_filters_,
+    const bool bias_term_ = true ,const size_t workspace_byte_limit_=1024*1024,
+    const std::string prefer_="fastest");
+
+void testInitCudnn(const Tensor &input, const ConvHandle ch);
 
 CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch);
 


[15/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

add destructor for CudnnConvHandle;
comment out unused code (and includes)


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5340b65d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5340b65d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5340b65d

Branch: refs/heads/master
Commit: 5340b65d508f50d20f8f99086c08ceaa1509e391
Parents: 15c0230
Author: Wang Wei <wa...@gmail.com>
Authored: Tue Jul 3 22:32:15 2018 +0800
Committer: Wang Wei <wa...@gmail.com>
Committed: Tue Jul 3 22:32:15 2018 +0800

----------------------------------------------------------------------
 src/model/operation/convolution.cc | 669 ++++++++++++++++----------------
 src/model/operation/convolution.h  | 123 +++---
 2 files changed, 410 insertions(+), 382 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5340b65d/src/model/operation/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
index 8d60df4..d64fbc1 100755
--- a/src/model/operation/convolution.cc
+++ b/src/model/operation/convolution.cc
@@ -1,371 +1,384 @@
 #include "./convolution.h"
-#include "../layer/convolution.h"
+// #include "../layer/convolution.h"
 #include<iostream>
 
-namespace singa{
-
-ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
-	                const std::vector<size_t> stride, const std::vector<size_t> padding,
-	                const size_t in_channels, const size_t out_channels,
-	                const bool bias){
-    kernel_h_=kernel_size[0];
-    kernel_w_=kernel_size[1];
-
-    pad_h_=padding[0];
-    pad_w_=padding[1];
-
-    stride_h_=stride[0];
-    stride_w_=stride[1];
-
-    channels_=in_channels;
-    num_filters_=out_channels;
-
-    bias_term_ = bias;
-
-	batchsize = input.shape(0);
-	CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched.";
-    height_ = input.shape(2);
-    width_ = input.shape(3);
-
-    conv_height_ = 1;
-    if (stride_h_ > 0)
-        conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
-    conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
-
-    col_height_ = in_channels * kernel_w_ * kernel_h_;
-    col_width_ = conv_height_ * conv_width_;
-    imagesize = input.Size() / batchsize;
-};	
-
-CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,const bool bias_term_, 
-                    const size_t workspace_byte_limit_,const std::string prefer_)
-                    :ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_){
-
-    DataType dtype = input.data_type();
-    auto dev = input.device();
-    Context *ctx = dev->context(0);
-
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
-    if (bias_term_)
-        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
-
-
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                           GetCudnnDataType(dtype), batchsize,
-                                           channels_, height_, width_));
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
-            num_filters_, conv_height_, conv_width_));
-    if (bias_term_)
-        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
-                                               GetCudnnDataType(dtype), 1,
-                                               num_filters_, 1, 1));
-    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
-                                                stride_h_, stride_w_, 1, 1,
-                                                CUDNN_CROSS_CORRELATION,
-                                                GetCudnnDataType(dtype)));
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
-                                           CUDNN_TENSOR_NCHW, num_filters_,
-                                           channels_, kernel_h_, kernel_w_));
-    if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-        prefer_ == "no_workspace") {
-        cudnnConvolutionFwdPreference_t fwd_pref;
-        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-        if (prefer_ == "fastest") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-        } else if (prefer_ == "limited_workspace") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        } else {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        }
-        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-                workspace_byte_limit_, &fp_alg_));
-        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
-        // deprecated in cudnn v7
-        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
-        } else if (prefer_ == "autotune") {
-        const int topk = 1;
-        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
-        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
-        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
-        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
-        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
-                &num_fp_alg, fp_alg_perf));
-        fp_alg_ = fp_alg_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
-                &num_bp_filt_alg, bp_filt_perf));
-        bp_filter_alg_ = bp_filt_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
-                &num_bp_data_alg, bp_data_perf));
-        bp_data_alg_ = bp_data_perf[0].algo;
-    } else {
-        LOG(FATAL) << "Preferred algorithm is not available!";
-    }
+namespace singa {
 
-    size_t fp_byte, bp_data_byte, bp_filter_byte;
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
-            &fp_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-            bp_data_alg_, &bp_data_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-            bp_filter_alg_, &bp_filter_byte));
-    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
-                       sizeof(float) +
-                       1;
-    if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
-        LOG(WARNING) << "The required memory for workspace ("
-                     << workspace_count_ * sizeof(float)
-                     << ") is larger than the expected Bytes ("
-                     << workspace_byte_limit_ << ")";
-    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
-};
+ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size,
+                       const std::vector<size_t>& stride, const std::vector<size_t>& padding,
+                       const size_t in_channels, const size_t out_channels,
+                       const bool bias) {
+  kernel_h_ = kernel_size[0];
+  kernel_w_ = kernel_size[1];
+
+  pad_h_ = padding[0];
+  pad_w_ = padding[1];
+
+  stride_h_ = stride[0];
+  stride_w_ = stride[1];
 
-Convolution C;
+  channels_ = in_channels;
+  num_filters_ = out_channels;
 
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch){
-	CHECK_EQ(x.device()->lang(), kCpp);
+  bias_term_ = bias;
 
-	CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
-    x.shape(3) == ch.width_) << "input sample shape should not change";
+  batchsize = input.shape(0);
+  CHECK(input.shape(1) == in_channels) << "the number of input channels mismatched.";
+  height_ = input.shape(2);
+  width_ = input.shape(3);
 
-    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
-    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+  conv_height_ = 1;
+  if (stride_h_ > 0)
+    conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+  conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
 
-    Shape w_shape= W.shape();
-    Shape b_shape;
-    if (ch.bias_term_)
-      b_shape= b.shape();
+  col_height_ = in_channels * kernel_w_ * kernel_h_;
+  col_width_ = conv_height_ * conv_width_;
+  imagesize = input.Size() / batchsize;
+}
+
+// Convolution C;
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch) {
+  CHECK_EQ(x.device()->lang(), kCpp);
+
+  CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
+        x.shape(3) == ch.width_) << "input sample shape should not change";
+
+  CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ &&
+        W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+
+  Shape w_shape = W.shape();
+  Shape b_shape;
+  if (ch.bias_term_)
+    b_shape = b.shape();
 
-    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
-    if (ch.bias_term_)
-      b.Reshape(Shape{ch.num_filters_});
+  W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+  if (ch.bias_term_)
+    b.Reshape(Shape{ch.num_filters_});
 
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-    Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_};
-    Tensor output(shape, dev, dtype);
+  DataType dtype = x.data_type();
+  auto dev = x.device();
+  Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_};
+  Tensor output(shape, dev, dtype);
 
-    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+  Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
 
-    float *data_col = new float[ch.col_height_ * ch.col_width_];
-    auto in_data = x.data<float>();
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
-            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);    
+  float *data_col = new float[ch.col_height_ * ch.col_width_];
+  auto in_data = x.data<float>();
+  for (size_t num = 0; num < ch.batchsize; num++) {
+    C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+             ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
 
-      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
-      Tensor each = Mult(W, col_data);
-      if (ch.bias_term_) {
-          AddColumn(b, &each);
-        }
-      CopyDataToFrom(&output, each, each.Size(), num * each.Size());
-    };
+    col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
+    Tensor each = Mult(W, col_data);
+    if (ch.bias_term_) {
+      AddColumn(b, &each);
+    }
+    CopyDataToFrom(&output, each, each.Size(), num * each.Size());
+  };
   W.Reshape(w_shape);
   if (ch.bias_term_)
     b.Reshape(b_shape);
   return output;
-}; 
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-
-    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
-    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
-
-    Shape w_shape= W.shape();
-    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
-
-    Tensor dx;
-    dx.ResetLike(x);
-    
-    float *dx_b = new float[ch.imagesize];
-
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      Tensor dcol_b = Mult(W.T(), grad_b);
-      auto dcol_data = dcol_b.data<float>();
-      C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
-           ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
-      dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
-    }
-  W.Reshape(w_shape); 
+}
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch) {
+  CHECK_EQ(dy.device()->lang(), kCpp);
+
+  CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+        dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+  CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ &&
+        W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+
+  Shape w_shape = W.shape();
+  W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+
+  Tensor dx;
+  dx.ResetLike(x);
+
+  float *dx_b = new float[ch.imagesize];
+
+  for (size_t num = 0; num < ch.batchsize; num++) {
+    Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+    CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+    Tensor dcol_b = Mult(W.T(), grad_b);
+    auto dcol_data = dcol_b.data<float>();
+    C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
+             ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
+    dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
+  }
+  W.Reshape(w_shape);
   return dx;
-};
+}
 
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-
-    CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
-    x.shape(3) == ch.width_) << "input sample shape should not change";
-
-    Tensor dW;
-    dW.ResetLike(W);
-    dW.SetValue(0.0f);
-    
-    Shape w_shape= W.shape();
-    dW.Reshape(Shape{ch.num_filters_, ch.col_height_});
-
-    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
-
-    float *data_col = new float[ch.col_height_ * ch.col_width_];
-    auto in_data = dy.data<float>();
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
-            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
-      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
-      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      dW += Mult(grad_b, col_data.T());
-    }
-   dW.Reshape(w_shape);
-   return dW;
-};
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch) {
+  CHECK_EQ(dy.device()->lang(), kCpp);
+
+  CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+        dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+  CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
+        x.shape(3) == ch.width_) << "input sample shape should not change";
+
+  Tensor dW;
+  dW.ResetLike(W);
+  dW.SetValue(0.0f);
+
+  Shape w_shape = W.shape();
+  dW.Reshape(Shape{ch.num_filters_, ch.col_height_});
+
+  Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+
+  float *data_col = new float[ch.col_height_ * ch.col_width_];
+  auto in_data = dy.data<float>();
+  for (size_t num = 0; num < ch.batchsize; num++) {
+    C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+             ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
+    col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
+    Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+    CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+    dW += Mult(grad_b, col_data.T());
+  }
+  dW.Reshape(w_shape);
+  return dW;
+}
 
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-	
-	CHECK(b.shape(0) == ch.num_filters_)<< "bias shape should not change";
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch) {
+  CHECK_EQ(dy.device()->lang(), kCpp);
 
-    Tensor db;
-    db.ResetLike(b);
+  CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+        dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
 
-    auto tmpshp = Shape{ch.batchsize * ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)};
-    Tensor tmp1 = Reshape(dy, tmpshp);
+  CHECK(b.shape(0) == ch.num_filters_) << "bias shape should not change";
 
-    Tensor tmp2(Shape{ch.batchsize * ch.num_filters_});
-    SumColumns(tmp1, &tmp2);
-    Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_});
+  Tensor db;
+  db.ResetLike(b);
 
-    SumRows(tmp3, &db);
+  auto tmpshp = Shape{ch.batchsize * ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)};
+  Tensor tmp1 = Reshape(dy, tmpshp);
 
-    return db;
+  Tensor tmp2(Shape{ch.batchsize * ch.num_filters_});
+  SumColumns(tmp1, &tmp2);
+  Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_});
+
+  SumRows(tmp3, &db);
+
+  return db;
 };
 
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch){
-	CHECK_EQ(x.device()->lang(), kCuda);
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-
-    Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    output.device()->Exec([output, x, W, cch](Context *ctx) {
-        Block *inblock = x.block(), *outblock = output.block(),
-                *wblock = W.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
-                                inblock->data(), cch.filter_desc_, wblock->data(),
-                                cch.conv_desc_, cch.fp_alg_,
-                                cch.workspace_.block()->mutable_data(),
-                                cch.workspace_count_ * sizeof(float), &beta,
-                                cch.y_desc_, outblock->mutable_data());
-    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
-
-    if (cch.bias_term_) {
-        output.device()->Exec([output, b, cch](Context *ctx) {
-            float beta = 1.f, alpha = 1.0f;
-            Block *outblock = output.block(), *bblock = b.block();
-            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
-                           bblock->data(), &beta, cch.y_desc_,
-                           outblock->mutable_data());
-        }, {output.block(), b.block()}, {output.block()});
+#ifdef USE_CUDNN
+CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
+                                 const std::vector<size_t>& stride, const std::vector<size_t>& padding,
+                                 const size_t in_channels, const size_t out_channels, const bool bias_term_,
+                                 const size_t workspace_byte_limit_, const std::string& prefer_)
+  : ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_) {
+
+  DataType dtype = input.data_type();
+  auto dev = input.device();
+  Context *ctx = dev->context(0);
+
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  if (bias_term_)
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), batchsize,
+                                         channels_, height_, width_));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+                y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+                num_filters_, conv_height_, conv_width_));
+  if (bias_term_)
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), 1,
+                                           num_filters_, 1, 1));
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
+              stride_h_, stride_w_, 1, 1,
+              CUDNN_CROSS_CORRELATION,
+              GetCudnnDataType(dtype)));
+  CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                         CUDNN_TENSOR_NCHW, num_filters_,
+                                         channels_, kernel_h_, kernel_w_));
+  if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+      prefer_ == "no_workspace") {
+    cudnnConvolutionFwdPreference_t fwd_pref;
+    cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+    cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+    if (prefer_ == "fastest") {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+    } else if (prefer_ == "limited_workspace") {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+    } else {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
     }
+    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+                  ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+                  workspace_byte_limit_, &fp_alg_));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+                  ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                  bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+    // deprecated in cudnn v7
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+                  ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                  bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+  } else if (prefer_ == "autotune") {
+    const int topk = 1;
+    int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+    cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+    cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+    cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+    CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+                  ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+                  &num_fp_alg, fp_alg_perf));
+    fp_alg_ = fp_alg_perf[0].algo;
+    CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+                  ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                  &num_bp_filt_alg, bp_filt_perf));
+    bp_filter_alg_ = bp_filt_perf[0].algo;
+    CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+                  ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                  &num_bp_data_alg, bp_data_perf));
+    bp_data_alg_ = bp_data_perf[0].algo;
+  } else {
+    LOG(FATAL) << "Preferred algorithm is not available!";
+  }
+
+  size_t fp_byte, bp_data_byte, bp_filter_byte;
+  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+                &fp_byte));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                bp_data_alg_, &bp_data_byte));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                bp_filter_alg_, &bp_filter_byte));
+  workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                     sizeof(float) +
+                     1;
+  if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
+    LOG(WARNING) << "The required memory for workspace ("
+                 << workspace_count_ * sizeof(float)
+                 << ") is larger than the expected Bytes ("
+                 << workspace_byte_limit_ << ")";
+  workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+}
+
+CudnnConvHandle::~CudnnConvHandle() {
+  if (bias_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
+  if (filter_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
+  if (conv_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch) {
+  CHECK_EQ(x.device()->lang(), kCuda);
+
+  DataType dtype = x.data_type();
+  auto dev = x.device();
+
+  Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
+  Tensor output(shape, dev, dtype);
+
+  output.device()->Exec([output, x, W, cch](Context * ctx) {
+    Block *inblock = x.block(), *outblock = output.block(),
+           *wblock = W.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
+                            inblock->data(), cch.filter_desc_, wblock->data(),
+                            cch.conv_desc_, cch.fp_alg_,
+                            cch.workspace_.block()->mutable_data(),
+                            cch.workspace_count_ * sizeof(float), &beta,
+                            cch.y_desc_, outblock->mutable_data());
+  }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
+
+  if (cch.bias_term_) {
+    output.device()->Exec([output, b, cch](Context * ctx) {
+      float beta = 1.f, alpha = 1.0f;
+      Block *outblock = output.block(), *bblock = b.block();
+      cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
+                     bblock->data(), &beta, cch.y_desc_,
+                     outblock->mutable_data());
+    }, {output.block(), b.block()}, {output.block()});
+  }
 
-    return output;
-};
+  return output;
+}
 
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor dx;
-    dx.ResetLike(x);
-
-    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
-        Block *wblock = W.block(), *dyblock = dy.block(),
-                *dxblock = dx.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
-                                     wblock->data(), cch.y_desc_, dyblock->data(),
-                                     cch.conv_desc_, cch.bp_data_alg_,
-                                     cch.workspace_.block()->mutable_data(),
-                                     cch.workspace_count_ * sizeof(float), &beta,
-                                     cch.x_desc_, dxblock->mutable_data());
-    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
-
-    return dx;
-};
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch) {
+  CHECK_EQ(dy.device()->lang(), kCuda);
+
+  Tensor dx;
+  dx.ResetLike(x);
+
+  dy.device()->Exec([dx, dy, W, cch](Context * ctx) {
+    Block *wblock = W.block(), *dyblock = dy.block(),
+           *dxblock = dx.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
+                                 wblock->data(), cch.y_desc_, dyblock->data(),
+                                 cch.conv_desc_, cch.bp_data_alg_,
+                                 cch.workspace_.block()->mutable_data(),
+                                 cch.workspace_count_ * sizeof(float), &beta,
+                                 cch.x_desc_, dxblock->mutable_data());
+  }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
 
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
+  return dx;
+}
+
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch) {
+  CHECK_EQ(dy.device()->lang(), kCuda);
 
-    Tensor dW;
-    dW.ResetLike(W);
+  Tensor dW;
+  dW.ResetLike(W);
 
-    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
+  dy.device()->Exec([dW, dy, x, W, cch](Context * ctx) {
     Block *inblock = x.block(), *dyblock = dy.block(),
-            *dwblock = dW.block();
+           *dwblock = dW.block();
     float alpha = 1.f, beta = 0.f;
     cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
-            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
-            cch.workspace_.block()->mutable_data(),
-            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
-            dwblock->mutable_data());
-    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
-
-    return dW;
-};
+      ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
+      cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
+      cch.workspace_.block()->mutable_data(),
+      cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
+      dwblock->mutable_data());
+  }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+
+  return dW;
+}
 
 // input Tensor b for Reset db purpose, can avoid this later.
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch) {
+  CHECK_EQ(dy.device()->lang(), kCuda);
 
-    Tensor db;
-    db.ResetLike(b);
+  Tensor db;
+  db.ResetLike(b);
 
-    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
-        Block *dyblock = dy.block(), *dbblock = db.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
-                                     dyblock->data(), &beta, cch.bias_desc_,
-                                     dbblock->mutable_data());
-    }, {dy.block()}, {db.block()});
+  dy.device()->Exec([db, dy, b, cch](Context * ctx) {
+    Block *dyblock = dy.block(), *dbblock = db.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
+                                 dyblock->data(), &beta, cch.bias_desc_,
+                                 dbblock->mutable_data());
+  }, {dy.block()}, {db.block()});
 
-    return db;
-};
+  return db;
+}
+#endif  // USE_CUDNN
 
-}
\ No newline at end of file
+}  // namespace singa
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5340b65d/src/model/operation/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h
index 96a6d60..a114b47 100755
--- a/src/model/operation/convolution.h
+++ b/src/model/operation/convolution.h
@@ -1,61 +1,50 @@
+#ifndef SINGA_MODEL_OPERATION_CONVOLUTION_H_
+#define SINGA_MODEL_OPERATION_CONVOLUTION_H_
+
 #include <string>
 #include <vector>
-#include <cudnn.h>
-#include "../layer/cudnn_convolution.h"
-#include "../layer/cudnn_utils.h"
 #include "singa/utils/logging.h"
 
-namespace singa{
-
-struct ConvHandle{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-
-    size_t col_height_;
-    size_t col_width_;
-    size_t imagesize;
-
-    ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,
-                    const bool bias);
-
+#ifdef USE_CUDNN
+#include <cudnn.h>
+// #include "../layer/cudnn_convolution.h"
+// #include "../layer/cudnn_utils.h"
+#endif // USE_CUDNN
+
+
+namespace singa {
+
+class ConvHandle {
+
+ public:
+  ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
+             const std::vector<size_t>& stride, const std::vector<size_t>& padding,
+             const size_t in_channels, const size_t out_channels,
+             const bool bias);
+ protected:
+  size_t kernel_w_;
+  size_t pad_w_;
+  size_t stride_w_;
+  size_t kernel_h_;
+  size_t pad_h_;
+  size_t stride_h_;
+
+  size_t channels_;
+  size_t num_filters_;
+
+  bool bias_term_;
+
+  size_t height_;
+  size_t width_;
+  size_t conv_height_;
+  size_t conv_width_;
+  size_t batchsize;
+
+  size_t col_height_;
+  size_t col_width_;
+  size_t imagesize;
 };
 
-struct CudnnConvHandle:ConvHandle{
-	cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,
-                    const bool bias, const size_t workspace_byte_limit_=1024*1024*1024,
-                    const std::string prefer_="fastest");
-};
 
 Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch);
 
@@ -66,6 +55,31 @@ Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, cons
 Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch);
 
 
+
+#ifdef USE_CUDNN
+class CudnnConvHandle: public ConvHandle {
+ public:
+  CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
+                  const std::vector<size_t>& stride, const std::vector<size_t>& padding,
+                  const size_t in_channels, const size_t out_channels,
+                  const bool bias, const size_t workspace_byte_limit_ = 1024 * 1024 * 1024,
+                  const std::string& prefer_ = "fastest");
+  ~CudnnConvHandle();
+  // TODO(wangwei) add the destructor
+ protected:
+  cudnnTensorDescriptor_t x_desc_ ;
+  cudnnTensorDescriptor_t y_desc_ ;
+  cudnnTensorDescriptor_t bias_desc_ ;
+  cudnnFilterDescriptor_t filter_desc_ ;
+  cudnnConvolutionDescriptor_t conv_desc_ ;
+  cudnnConvolutionFwdAlgo_t fp_alg_;
+  cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+  cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+
+  size_t workspace_count_;
+  Tensor workspace_;
+};
+
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);
 
 Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch);
@@ -73,6 +87,7 @@ Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, cons
 Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch);
 
 Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch);
+#endif  // USE_CUDNN
 
-
-}
\ No newline at end of file
+}  // namespace singa
+#endif  // SINGA_MODEL_OPERATION_CONVOLUTION_H_



[03/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- fix some bugs so that the file compiles without errors.
- rename the file


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/fc181cdc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/fc181cdc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/fc181cdc

Branch: refs/heads/master
Commit: fc181cdcd4cb5b1c913fb95df5e3e7fbfb6168dd
Parents: 30ac41b
Author: xuewanqi <xu...@u.nus.edu>
Authored: Tue Jun 12 12:26:23 2018 +0000
Committer: xuewanqi <xu...@u.nus.edu>
Committed: Wed Jun 20 14:47:05 2018 +0000

----------------------------------------------------------------------
 src/model/convolution functions.cpp | 398 ------------------------------
 src/model/convolution_forward.cc    | 404 +++++++++++++++++++++++++++++++
 2 files changed, 404 insertions(+), 398 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fc181cdc/src/model/convolution functions.cpp
----------------------------------------------------------------------
diff --git a/src/model/convolution functions.cpp b/src/model/convolution functions.cpp
deleted file mode 100644
index d0aeb1a..0000000
--- a/src/model/convolution functions.cpp	
+++ /dev/null
@@ -1,398 +0,0 @@
-#include <iostream>
-#include <cudnn.h>
-
-struct ConvHandle{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t workspace_byte_limit_;
-    string prefer_;
-};
-
-struct CudnnConvHandle{
-    cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-};
-
-// Done in conv2d.__init__()
-ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf){
-
-    size_t kernel_w_, pad_w_, stride_w_;
-    size_t kernel_h_, pad_h_, stride_h_;
-
-    size_t channels_, num_filters_;
-
-    bool bias_term_;
-
-    size_t workspace_byte_limit_;
-    string prefer_;
-
-    ConvolutionConf conv_conf = conf.convolution_conf();
-
-    workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
-    prefer_ = ToLowerCase(conv_conf.prefer());
-    CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-          prefer_ == "no_workspace" || prefer_ == "autotune")
-            << "CudnnConvolution only supports four algorithm preferences: fastest, "
-               "limited_workspace, no_workspace and autotune";
-
-    // store intermediate data, i.e., input tensor
-    //std::stack<Tensor> buf_;
-
-    // kernel_size, pad, and stride are repeated fields.
-    if (conv_conf.kernel_size_size() > 0) {
-    if (conv_conf.kernel_size_size() == 1) {
-    kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
-    } else {
-    kernel_w_ = conv_conf.kernel_size(0);
-    kernel_h_ = conv_conf.kernel_size(1);
-    }
-    } else {
-    kernel_w_ = conv_conf.kernel_w();
-    kernel_h_ = conv_conf.kernel_h();
-    }
-    CHECK_GT(kernel_w_, 0u);
-    CHECK_GT(kernel_h_, 0u);
-
-    if (conv_conf.pad_size() > 0) {
-    if (conv_conf.pad_size() == 1) {
-    pad_w_ = pad_h_ = conv_conf.pad(0);
-    } else {
-    pad_w_ = conv_conf.pad(0);
-    pad_h_ = conv_conf.pad(1);
-    }
-    } else {
-    pad_w_ = conv_conf.pad_w();
-    pad_h_ = conv_conf.pad_h();
-    }
-    CHECK_GE(pad_w_, 0u);
-    CHECK_GE(pad_h_, 0u);
-
-    const int kStrideDefault = 1;
-    if (conv_conf.stride_size() > 0) {
-    if (conv_conf.stride_size() == 1) {
-    stride_w_ = stride_h_ = conv_conf.stride(0);
-    } else {
-    stride_w_ = conv_conf.stride(0);
-    stride_h_ = conv_conf.stride(1);
-    }
-    } else {
-    stride_w_ = kStrideDefault;
-    stride_h_ = kStrideDefault;
-    if (conv_conf.has_stride_w()) {
-    stride_w_ = conv_conf.stride_w();
-    }
-    if (conv_conf.has_stride_h()) {
-    stride_h_ = conv_conf.stride_h();
-    }
-    }
-    CHECK_GT(stride_w_, 0u);
-    CHECK_GE(stride_h_, 0u);  // 0 for 1D conv
-
-    channels_ = in_channels;
-    num_filters_ = conv_conf.num_output();
-    bias_term_ = conv_conf.bias_term();
-
-    return ConvHandle{
-            kernel_w_,
-            pad_w_,
-            stride_w_,
-            kernel_h_,
-            pad_h_,
-            stride_h_,
-
-            channels_,
-            num_filters_,
-
-            bias_term_,
-
-            workspace_byte_limit_,
-            prefer_,
-    };
-}
-
-
-
-// Done in conv2d.__call__():
-// if self.cudnnconvhandle is None:
-//     self.cudnnconvhandle= InitCudnn(...)
-// elif x.shape(0) != self.cudnnconvhandle.batchsize:
-//     self.cudnnconvhandle= InitCudnn(...)
-CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
-
-    cudnnTensorDescriptor_t x_desc_ = nullptr;
-    cudnnTensorDescriptor_t y_desc_ = nullptr;
-    cudnnTensorDescriptor_t bias_desc_ = nullptr;
-    cudnnFilterDescriptor_t filter_desc_ = nullptr;
-    cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-
-    DataType dtype = input.data_type();
-    auto dev = input.device();
-    Context *ctx = dev->context(0);
-
-    size_t batchsize, channels_;
-    batchsize = input.shape(0);
-    channels_ = input.shape(1);
-    height_ = input.shape(2);
-    width_ = input.shape(3);
-
-    CHECK(channels_ == ch.channels_)<<"the number of input channels mismatched.";
-
-    conv_height_ = 1;
-    if (ch.stride_h_ > 0)
-        conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
-    conv_width_ = (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
-
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
-    if (ch.bias_term_)
-        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
-
-
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                           GetCudnnDataType(dtype), batchsize,
-                                           ch.channels_, height_, width_));
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
-            ch.num_filters_, conv_height_, conv_width_));
-    if (ch.bias_term_)
-        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
-                                               GetCudnnDataType(dtype), 1,
-                                               ch.num_filters_, 1, 1));
-    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, ch.pad_h_, ch.pad_w_,
-                                                ch.stride_h_, ch.stride_w_, 1, 1,
-                                                CUDNN_CROSS_CORRELATION,
-                                                GetCudnnDataType(dtype)));
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
-                                           CUDNN_TENSOR_NCHW, ch.num_filters_,
-                                           channels_, ch.kernel_h_, ch.kernel_w_));
-    if (ch.prefer_ == "fastest" || ch.prefer_ == "limited_workspace" ||
-        ch.prefer_ == "no_workspace") {
-        cudnnConvolutionFwdPreference_t fwd_pref;
-        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-        if (ch.prefer_ == "fastest") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-        } else if (ch.prefer_ == "limited_workspace") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        } else {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        }
-        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-                ch.workspace_byte_limit_, &fp_alg_));
-        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                bwd_filt_pref, ch.workspace_byte_limit_, &bp_filter_alg_));
-        // deprecated in cudnn v7
-        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                bwd_data_pref, ch.workspace_byte_limit_, &bp_data_alg_));
-    } else if (ch.prefer_ == "autotune") {
-        const int topk = 1;
-        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
-        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
-        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
-        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
-        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
-                &num_fp_alg, fp_alg_perf));
-        fp_alg_ = fp_alg_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
-                &num_bp_filt_alg, bp_filt_perf));
-        bp_filter_alg_ = bp_filt_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
-                &num_bp_data_alg, bp_data_perf));
-        bp_data_alg_ = bp_data_perf[0].algo;
-    } else {
-        LOG(FATAL) << "Preferred algorithm is not available!";
-    }
-
-    size_t fp_byte, bp_data_byte, bp_filter_byte;
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
-            &fp_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-            bp_data_alg_, &bp_data_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-            bp_filter_alg_, &bp_filter_byte));
-    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
-                       sizeof(float) +
-                       1;
-    if (workspace_count_ * sizeof(float) > ch.workspace_byte_limit_)
-        LOG(WARNING) << "The required memory for workspace ("
-                     << workspace_count_ * sizeof(float)
-                     << ") is larger than the expected Bytes ("
-                     << ch.workspace_byte_limit_ << ")";
-    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
-
-    return CudnnConvHandle{
-            x_desc_,
-            y_desc_,
-            bias_desc_,
-            filter_desc_,
-            conv_desc_,
-            fp_alg_,
-            bp_filter_alg_,
-            bp_data_alg_,
-
-            workspace_count_,
-            workspace_,
-
-            height_,
-            width_,
-            conv_height_,
-            conv_width_,
-            batchsize,
-    };
-
-}
-
-Tensor CudnnConvForward(Tensor x, Tensor W, Tensor b, const ConvHandle ch, const CudnnConvHandle cch){
-    CHECK_EQ(x.device()->lang(), kCuda);
-    CHECK_EQ(x.nDim(), 4u);
-    CHECK_EQ(x.shape()[0],cch.batchsize);
-    CHECK_EQ(x.shape()[1],ch.channels_);
-    CHECK_EQ(x.shape()[2],cch.height_);
-    CHECK_EQ(x.shape()[3],cch.width_);
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-
-    Shape shape{cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    output.device()->Exec([x, output](Context *ctx) {
-        Block *inblock = x.block(), *outblock = output.block(),
-                *wblock = W.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
-                                inblock->data(), cch.filter_desc_, wblock->data(),
-                                cch.conv_desc_, cch.fp_alg_,
-                                cch.workspace_.block()->mutable_data(),
-                                cch.workspace_count_ * sizeof(float), &beta,
-                                cch.y_desc_, outblock->mutable_data());
-    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
-
-    if (ch.bias_term_) {
-        output.device()->Exec([output](Context *ctx) {
-            float beta = 1.f, alpha = 1.0f;
-            Block *outblock = output.block(), *bblock = b.block();
-            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
-                           bblock->data(), &beta, cch.y_desc_,
-                           outblock->mutable_data());
-        }, {output.block(), b.block()}, {output.block()});
-    }
-    return output;
-}
-
-// input Tensor W for Reset dW purpose, can avoid this later.
-Tensor CudnnConvBackwardW(Tensor dy, Tensor x, Tensor W, CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor dW;
-    dW.ResetLike(W);
-
-    dy.device()->Exec([dy, dW, x](Context *ctx) {
-    Block *inblock = x.block(), *dyblock = dy.block(),
-            *dwblock = dW.block();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
-            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
-            cch.workspace_.block()->mutable_data(),
-            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
-            dwblock->mutable_data());
-    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
-
-    return dW;
-}
-
-// input Tensor b for Reset db purpose, can avoid this later.
-Tensor CudnnConvBackwardb(Tensor dy, Tensor b, CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor db;
-    db.ResetLike(b);
-
-    dy.device()->Exec([dy, db](Context *ctx) {
-        Block *dyblock = dy.block(), *dbblock = db.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
-                                     dyblock->data(), &beta, cch.bias_desc_,
-                                     dbblock->mutable_data());
-    }, {dy.block()}, {db.block()});
-    return db;
-}
-
-// input Tensor x for Reset dx purpose, can avoid this later.
-Tensor CudnnConvBackwardx(Tensor dy, Tensor W, Tensor x, CudnnConvHandle cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-    CHECK_EQ(dy.nDim(), 4u);
-
-    Tensor dx;
-    dx.ResetLike(x);
-
-    dy.device()->Exec([dx, dy](Context *ctx) {
-        Block *wblock = W.block(), *dyblock = dy.block(),
-                *dxblock = dx.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
-                                     wblock->data(), cch.y_desc_, dyblock->data(),
-                                     cch.conv_desc_, cch.bp_data_alg_,
-                                     cch.workspace_.block()->mutable_data(),
-                                     cch.workspace_count_ * sizeof(float), &beta,
-                                     cch.x_desc_, dxblock->mutable_data());
-    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
-
-    return dx;
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fc181cdc/src/model/convolution_forward.cc
----------------------------------------------------------------------
diff --git a/src/model/convolution_forward.cc b/src/model/convolution_forward.cc
new file mode 100644
index 0000000..8457e95
--- /dev/null
+++ b/src/model/convolution_forward.cc
@@ -0,0 +1,404 @@
+#include <string>
+#include <cudnn.h>
+#include "./layer/cudnn_convolution.h"
+#include "./layer/cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa{
+struct ConvHandle{
+    size_t kernel_w_;
+    size_t pad_w_;
+    size_t stride_w_;
+    size_t kernel_h_;
+    size_t pad_h_;
+    size_t stride_h_;
+
+    size_t channels_;
+    size_t num_filters_;
+
+    bool bias_term_;
+
+    size_t workspace_byte_limit_;
+    std::string prefer_;
+};
+
+
+struct CudnnConvHandle{
+    cudnnTensorDescriptor_t x_desc_ ;
+    cudnnTensorDescriptor_t y_desc_ ;
+    cudnnTensorDescriptor_t bias_desc_ ;
+    cudnnFilterDescriptor_t filter_desc_ ;
+    cudnnConvolutionDescriptor_t conv_desc_ ;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+
+    size_t workspace_count_;
+    Tensor workspace_;
+
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+    size_t batchsize;
+};
+
+
+// Done in conv2d.__init__()
+ConvHandle SetupConv(const size_t in_channels, const LayerConf &conf){
+
+    size_t kernel_w_, pad_w_, stride_w_;
+    size_t kernel_h_, pad_h_, stride_h_;
+
+    size_t channels_, num_filters_;
+
+    bool bias_term_;
+
+    size_t workspace_byte_limit_;
+    string prefer_;
+
+    ConvolutionConf conv_conf = conf.convolution_conf();
+
+    workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
+    prefer_ = ToLowerCase(conv_conf.prefer());
+    CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+          prefer_ == "no_workspace" || prefer_ == "autotune")
+            << "CudnnConvolution only supports four algorithm preferences: fastest, "
+               "limited_workspace, no_workspace and autotune";
+
+
+    // kernel_size, pad, and stride are repeated fields.
+    if (conv_conf.kernel_size_size() > 0) {
+    if (conv_conf.kernel_size_size() == 1) {
+    kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
+    } else {
+    kernel_w_ = conv_conf.kernel_size(0);
+    kernel_h_ = conv_conf.kernel_size(1);
+    }
+    } else {
+    kernel_w_ = conv_conf.kernel_w();
+    kernel_h_ = conv_conf.kernel_h();
+    }
+    CHECK_GT(kernel_w_, 0u);
+    CHECK_GT(kernel_h_, 0u);
+
+    if (conv_conf.pad_size() > 0) {
+    if (conv_conf.pad_size() == 1) {
+    pad_w_ = pad_h_ = conv_conf.pad(0);
+    } else {
+    pad_w_ = conv_conf.pad(0);
+    pad_h_ = conv_conf.pad(1);
+    }
+    } else {
+    pad_w_ = conv_conf.pad_w();
+    pad_h_ = conv_conf.pad_h();
+    }
+    CHECK_GE(pad_w_, 0u);
+    CHECK_GE(pad_h_, 0u);
+
+    const int kStrideDefault = 1;
+    if (conv_conf.stride_size() > 0) {
+    if (conv_conf.stride_size() == 1) {
+    stride_w_ = stride_h_ = conv_conf.stride(0);
+    } else {
+    stride_w_ = conv_conf.stride(0);
+    stride_h_ = conv_conf.stride(1);
+    }
+    } else {
+    stride_w_ = kStrideDefault;
+    stride_h_ = kStrideDefault;
+    if (conv_conf.has_stride_w()) {
+    stride_w_ = conv_conf.stride_w();
+    }
+    if (conv_conf.has_stride_h()) {
+    stride_h_ = conv_conf.stride_h();
+    }
+    }
+    CHECK_GT(stride_w_, 0u);
+    CHECK_GE(stride_h_, 0u);  // 0 for 1D conv
+
+    channels_ = in_channels;
+    num_filters_ = conv_conf.num_output();
+    bias_term_ = conv_conf.bias_term();
+
+    return ConvHandle{
+            kernel_w_,
+            pad_w_,
+            stride_w_,
+            kernel_h_,
+            pad_h_,
+            stride_h_,
+
+            channels_,
+            num_filters_,
+
+            bias_term_,
+
+            workspace_byte_limit_,
+            prefer_,
+    };
+};
+
+
+
+// Done in conv2d.__call__():
+// if self.cudnnconvhandle is None:
+//     self.cudnnconvhandle= InitCudnn(...)
+// elif x.shape(0) != self.cudnnconvhandle.batchsize:
+//     self.cudnnconvhandle= InitCudnn(...)
+CudnnConvHandle InitCudnn(const Tensor &input, const ConvHandle ch){
+
+    cudnnTensorDescriptor_t x_desc_ = nullptr;
+    cudnnTensorDescriptor_t y_desc_ = nullptr;
+    cudnnTensorDescriptor_t bias_desc_ = nullptr;
+    cudnnFilterDescriptor_t filter_desc_ = nullptr;
+    cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+    size_t workspace_count_;
+    Tensor workspace_;
+
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+
+    DataType dtype = input.data_type();
+    auto dev = input.device();
+    Context *ctx = dev->context(0);
+
+    size_t batchsize, channels_;
+    batchsize = input.shape(0);
+    channels_ = input.shape(1);
+    height_ = input.shape(2);
+    width_ = input.shape(3);
+
+    CHECK(channels_ == ch.channels_)<<"the number of input channels mismatched.";
+
+    conv_height_ = 1;
+    if (ch.stride_h_ > 0)
+        conv_height_ = (height_ + 2 * ch.pad_h_ - ch.kernel_h_) / ch.stride_h_ + 1;
+    conv_width_ = (width_ + 2 * ch.pad_w_ - ch.kernel_w_) / ch.stride_w_ + 1;
+
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+    if (ch.bias_term_)
+        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), batchsize,
+                                           ch.channels_, height_, width_));
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+            ch.num_filters_, conv_height_, conv_width_));
+    if (ch.bias_term_)
+        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                               GetCudnnDataType(dtype), 1,
+                                               ch.num_filters_, 1, 1));
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, ch.pad_h_, ch.pad_w_,
+                                                ch.stride_h_, ch.stride_w_, 1, 1,
+                                                CUDNN_CROSS_CORRELATION,
+                                                GetCudnnDataType(dtype)));
+    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                           CUDNN_TENSOR_NCHW, ch.num_filters_,
+                                           channels_, ch.kernel_h_, ch.kernel_w_));
+    if (ch.prefer_ == "fastest" || ch.prefer_ == "limited_workspace" ||
+        ch.prefer_ == "no_workspace") {
+        cudnnConvolutionFwdPreference_t fwd_pref;
+        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+        if (ch.prefer_ == "fastest") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+        } else if (ch.prefer_ == "limited_workspace") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        } else {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        }
+        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+                ch.workspace_byte_limit_, &fp_alg_));
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                bwd_filt_pref, ch.workspace_byte_limit_, &bp_filter_alg_));
+        // deprecated in cudnn v7
+        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                bwd_data_pref, ch.workspace_byte_limit_, &bp_data_alg_));
+    } else if (ch.prefer_ == "autotune") {
+        const int topk = 1;
+        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+                &num_fp_alg, fp_alg_perf));
+        fp_alg_ = fp_alg_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                &num_bp_filt_alg, bp_filt_perf));
+        bp_filter_alg_ = bp_filt_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                &num_bp_data_alg, bp_data_perf));
+        bp_data_alg_ = bp_data_perf[0].algo;
+    } else {
+        LOG(FATAL) << "Preferred algorithm is not available!";
+    }
+
+    size_t fp_byte, bp_data_byte, bp_filter_byte;
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+            &fp_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+            bp_data_alg_, &bp_data_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+            bp_filter_alg_, &bp_filter_byte));
+    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                       sizeof(float) +
+                       1;
+    if (workspace_count_ * sizeof(float) > ch.workspace_byte_limit_)
+        LOG(WARNING) << "The required memory for workspace ("
+                     << workspace_count_ * sizeof(float)
+                     << ") is larger than the expected Bytes ("
+                     << ch.workspace_byte_limit_ << ")";
+    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+
+    return CudnnConvHandle{
+            x_desc_,
+            y_desc_,
+            bias_desc_,
+            filter_desc_,
+            conv_desc_,
+            fp_alg_,
+            bp_filter_alg_,
+            bp_data_alg_,
+
+            workspace_count_,
+            workspace_,
+
+            height_,
+            width_,
+            conv_height_,
+            conv_width_,
+            batchsize,
+    };
+};
+
+/// Runs the cuDNN forward convolution of x with filter W and, when
+/// ch.bias_term_ is set, adds the bias b to the result.
+///
+/// @param x   input tensor; must be on a CUDA device and have the 4-D shape
+///            (cch.batchsize, ch.channels_, cch.height_, cch.width_).
+/// @param W   filter tensor, passed to cuDNN together with cch.filter_desc_.
+/// @param b   bias tensor, passed to cuDNN together with cch.bias_desc_;
+///            only read when ch.bias_term_ is set.
+/// @param ch  convolution settings (channels_, num_filters_, bias_term_).
+/// @param cch cuDNN descriptors, forward algorithm and workspace.
+/// @return a new tensor of shape (cch.batchsize, ch.num_filters_,
+///         cch.conv_height_, cch.conv_width_) on the device of x.
+Tensor CudnnConvForward(const Tensor x, const Tensor W, const Tensor b,
+                        const ConvHandle ch, const CudnnConvHandle cch) {
+    CHECK_EQ(x.device()->lang(), kCuda);
+    CHECK_EQ(x.nDim(), 4u);
+    CHECK_EQ(x.shape()[0], cch.batchsize);
+    CHECK_EQ(x.shape()[1], ch.channels_);
+    CHECK_EQ(x.shape()[2], cch.height_);
+    CHECK_EQ(x.shape()[3], cch.width_);
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+
+    Shape shape{cch.batchsize, ch.num_filters_, cch.conv_height_, cch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    // The workspace is scratch memory that cuDNN writes to, so it is declared
+    // as a write block (exactly as the backward functions do) instead of being
+    // passed as a separate trailing argument of Exec.
+    output.device()->Exec([output, x, W, cch](Context *ctx) {
+        Block *inblock = x.block(), *outblock = output.block(),
+                *wblock = W.block();
+        // beta = 0: overwrite the output instead of accumulating into it.
+        float alpha = 1.f, beta = 0.f;
+        CUDNN_CHECK(cudnnConvolutionForward(
+                ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
+                cch.filter_desc_, wblock->data(), cch.conv_desc_, cch.fp_alg_,
+                cch.workspace_.block()->mutable_data(),
+                cch.workspace_count_ * sizeof(float), &beta, cch.y_desc_,
+                outblock->mutable_data()));
+    }, {x.block(), W.block()}, {output.block(), cch.workspace_.block()});
+
+    if (ch.bias_term_) {
+        output.device()->Exec([output, b, cch](Context *ctx) {
+            // beta = 1: keep the convolution result and add the bias to it.
+            float beta = 1.f, alpha = 1.0f;
+            Block *outblock = output.block(), *bblock = b.block();
+            CUDNN_CHECK(cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
+                                       bblock->data(), &beta, cch.y_desc_,
+                                       outblock->mutable_data()));
+        }, {output.block(), b.block()}, {output.block()});
+    }
+    return output;
+}
+
+/// Computes the gradient with respect to the filter using
+/// cudnnConvolutionBackwardFilter.
+///
+/// @param dy  gradient of the convolution output; must be a 4-D tensor on a
+///            CUDA device. Passed to cuDNN together with cch.y_desc_.
+/// @param x   input tensor, passed to cuDNN together with cch.x_desc_.
+/// @param W   filter tensor; only used as the template for dW via ResetLike
+///            (this parameter could be avoided later).
+/// @param cch cuDNN descriptors, backward-filter algorithm and workspace.
+/// @return dW, a new tensor created with ResetLike(W).
+Tensor CudnnConvBackwardW(const Tensor dy, const Tensor x, const Tensor W, const CudnnConvHandle cch) {
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor dW;
+    dW.ResetLike(W);
+
+    // W itself is not needed inside the kernel, so it is not captured.
+    dy.device()->Exec([dW, dy, x, cch](Context *ctx) {
+        Block *inblock = x.block(), *dyblock = dy.block(),
+                *dwblock = dW.block();
+        // beta = 0: overwrite dW instead of accumulating into it.
+        float alpha = 1.f, beta = 0.f;
+        CUDNN_CHECK(cudnnConvolutionBackwardFilter(
+                ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
+                cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
+                cch.workspace_.block()->mutable_data(),
+                cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
+                dwblock->mutable_data()));
+    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+
+    return dW;
+}
+
+/// Computes the gradient with respect to the bias using
+/// cudnnConvolutionBackwardBias.
+///
+/// @param dy  gradient of the convolution output; must be a 4-D tensor on a
+///            CUDA device. Passed to cuDNN together with cch.y_desc_.
+/// @param b   bias tensor; only used as the template for db via ResetLike
+///            (this parameter could be avoided later).
+/// @param cch cuDNN descriptors (y_desc_ and bias_desc_ are used here).
+/// @return db, a new tensor created with ResetLike(b).
+Tensor CudnnConvBackwardb(const Tensor dy, const Tensor b, const CudnnConvHandle cch) {
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor db;
+    db.ResetLike(b);
+
+    // b itself is not needed inside the kernel, so it is not captured.
+    dy.device()->Exec([db, dy, cch](Context *ctx) {
+        Block *dyblock = dy.block(), *dbblock = db.block();
+        // beta = 0: overwrite db instead of accumulating into it.
+        float alpha = 1.f, beta = 0.f;
+        CUDNN_CHECK(cudnnConvolutionBackwardBias(
+                ctx->cudnn_handle, &alpha, cch.y_desc_, dyblock->data(), &beta,
+                cch.bias_desc_, dbblock->mutable_data()));
+    }, {dy.block()}, {db.block()});
+    return db;
+}
+
+/// Computes the gradient with respect to the input using
+/// cudnnConvolutionBackwardData.
+///
+/// @param dy  gradient of the convolution output; must be a 4-D tensor on a
+///            CUDA device. Passed to cuDNN together with cch.y_desc_.
+/// @param W   filter tensor, passed to cuDNN together with cch.filter_desc_.
+/// @param x   input tensor; only used as the template for dx via ResetLike.
+/// @param cch cuDNN descriptors, backward-data algorithm and workspace.
+/// @return dx, a new tensor created with ResetLike(x).
+Tensor CudnnConvBackwardx(const Tensor dy, const Tensor W, const Tensor x, const CudnnConvHandle cch) {
+    CHECK_EQ(dy.device()->lang(), kCuda);
+    CHECK_EQ(dy.nDim(), 4u);
+
+    Tensor dx;
+    dx.ResetLike(x);
+
+    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
+        Block *wblock = W.block(), *dyblock = dy.block(),
+                *dxblock = dx.block();
+        // beta = 0: overwrite dx instead of accumulating into it.
+        float alpha = 1.f, beta = 0.f;
+        CUDNN_CHECK(cudnnConvolutionBackwardData(
+                ctx->cudnn_handle, &alpha, cch.filter_desc_, wblock->data(),
+                cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_data_alg_,
+                cch.workspace_.block()->mutable_data(),
+                cch.workspace_count_ * sizeof(float), &beta, cch.x_desc_,
+                dxblock->mutable_data()));
+    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
+
+    return dx;
+}
+
+} //namespace_singa
+
+


[08/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

Posted by wa...@apache.org.
SINGA-371 Implement functional operations in c++ for autograd

- add test case for conv2d operation.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5c8504a9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5c8504a9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5c8504a9

Branch: refs/heads/master
Commit: 5c8504a94c66af3459a515f654fa01f5099dc790
Parents: 78e1fc2
Author: xuewanqi <xu...@outlook.com>
Authored: Thu Jun 21 15:36:49 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Fri Jun 22 02:37:17 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py      |  2 +-
 test/python/test_operation.py | 48 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5c8504a9/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 7ba68f5..4f45bf1 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -672,7 +672,7 @@ class Conv2d_GPU(Operation):
         return singa.CudnnConvForward(xs[0], xs[1], xs[2], self.convhandle, self.cudnnconvhandle)
 
     def backward(self, dy):
-        assert training is True and hasattr(self, 'x'), 'Please set \'trainging\' as True before do BP. '
+        assert training is True and hasattr(self, 'x'), 'Please set \'training\' as True before do BP. '
 
         # todo check device?
         dy.ToDevice(self.dev)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5c8504a9/test/python/test_operation.py
----------------------------------------------------------------------
diff --git a/test/python/test_operation.py b/test/python/test_operation.py
new file mode 100644
index 0000000..295b2d2
--- /dev/null
+++ b/test/python/test_operation.py
@@ -0,0 +1,48 @@
+import unittest
+from builtins import str
+
+from singa import tensor
+from singa import singa_wrap as singa
+from singa import device
+from singa import autograd
+
+# Conv2d_GPU.backward asserts that autograd.training is True.
+autograd.training = True
+
+CTensor = singa.Tensor
+
+# NOTE(review): the fixtures below are built at import time; presumably this
+# fails on machines without a CUDA GPU -- confirm.
+dev = device.create_cuda_gpu()
+
+# Input of shape (2, 3, 3, 3) on the GPU, filled by gaussian(0.0, 1.0).
+gpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=dev)
+gpu_input_tensor.gaussian(0.0, 1.0)
+
+# Gradient fed to backward(); its shape (2, 1, 2, 2) equals the output shape
+# the test expects from the convolution.
+dy = CTensor([2, 1, 2, 2])
+singa.Gaussian(0.0, 1.0, dy)
+dy.ToDevice(dev)
+
+conv = autograd.Conv2d_GPU(3, 1, 2)  # (in_channels, out_channels, kernel_size)
+
+
+def _tuple_to_string(t):
+    lt = [str(x) for x in t]
+    return '(' + ', '.join(lt) + ')'
+
+
+class TestPythonOperation(unittest.TestCase):
+
+    def check_shape(self, actual, expect):
+        self.assertEqual(actual, expect, 'shape mismatch, actual shape is %s'
+                         ' exepcted is %s' % (_tuple_to_string(actual),
+                                              _tuple_to_string(expect))
+                         )
+
+    def test(self):
+        y = conv(gpu_input_tensor)  # PyTensor
+        dx, dW, db = conv.backward(dy)  # CTensor
+        
+        self.check_shape(y.shape, (2, 1, 2, 2))
+        self.check_shape(dx.shape(), (2, 3, 3, 3))
+        self.check_shape(dW.shape(), (1, 3, 2, 2))
+        self.check_shape(db.shape(), (1,))
+
+# Run the tests through unittest when this file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()