You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2018/07/05 03:10:09 UTC

[14/18] incubator-singa git commit: SINGA-371 Implement functional operations in c++ for autograd

SINGA-371 Implement functional operations in c++ for autograd

- Tidied some files and fixed some bugs.

- Added a few shape checks and helper functions in the newly developed layer.

- Renamed some files, classes, and variables.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/15c0230c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/15c0230c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/15c0230c

Branch: refs/heads/master
Commit: 15c0230cbc98c3662f5e2519bed4da4b26741a4f
Parents: 82ef417
Author: xuewanqi <xu...@outlook.com>
Authored: Mon Jul 2 05:53:13 2018 +0000
Committer: xuewanqi <xu...@outlook.com>
Committed: Tue Jul 3 03:37:48 2018 +0000

----------------------------------------------------------------------
 examples/autograd/mlp.py                     |   2 +-
 examples/autograd/mnist_cnn.py               |   2 +-
 python/singa/autograd.py                     | 313 +++++-------------
 src/api/core_device.i                        |   3 -
 src/api/model_operation.i                    |  28 +-
 src/model/operation/convolution.cc           | 371 ++++++++++++++++++++++
 src/model/operation/convolution.h            |  78 +++++
 src/model/operation/convolution_operation.cc | 366 ---------------------
 src/model/operation/convolution_operation.h  |  78 -----
 test/python/test_operation.py                |  27 +-
 10 files changed, 564 insertions(+), 704 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/examples/autograd/mlp.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mlp.py b/examples/autograd/mlp.py
old mode 100644
new mode 100755
index f7c4353..0447927
--- a/examples/autograd/mlp.py
+++ b/examples/autograd/mlp.py
@@ -62,7 +62,7 @@ if __name__ == '__main__':
     label = to_categorical(label, 2).astype(np.float32)
     print('train_data_shape:', data.shape)
     print('train_label_shape:', label.shape)
-    # 1
+    
     inputs = Tensor(data=data)
     target = Tensor(data=label)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/examples/autograd/mnist_cnn.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py
old mode 100644
new mode 100755
index cbb5650..a82f64c
--- a/examples/autograd/mnist_cnn.py
+++ b/examples/autograd/mnist_cnn.py
@@ -100,7 +100,7 @@ if __name__ == '__main__':
     print('the shape of testing label is', y_test.shape)
 
     # operations initialization
-    conv1 = autograd.Conv2D(1, 32, 3, padding=1)
+    conv1 = autograd.Conv2D(1, 32, 3, padding=1, bias=False)
     conv2 = autograd.Conv2D(32, 32, 3, padding=1)
     linear = autograd.Linear(32 * 28 * 28, 10)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
old mode 100644
new mode 100755
index 474fff4..2a10608
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -369,105 +369,6 @@ def ctensor2numpy(x):
     return np_array.reshape(x.shape())
 
 
-class Conv2d(Operation):
-
-    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True, **kwargs):
-
-        inner_params = {'name': 'Conv2d',
-                        'border_mode': 'same',
-                        'cudnn_prefer': 'fastest',
-                        'workspace_byte_limit': 1024,
-                        'data_format': 'NCHW',
-                        'W_specs': {'init': 'xavier'},
-                        'b_specs': {'init': 'constant'},
-                        'input_sample_shape': None}
-        # TODO valid value of inner_params check
-
-        for kwarg in kwargs:
-            if kwarg not in inner_params:
-                raise TypeError('Keyword argument not understood:', kwarg)
-            else:
-                inner_params[kwarg] = kwargs[kwarg]
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.W_specs = inner_params['W_specs']
-        self.b_specs = inner_params['b_specs']
-
-        if isinstance(kernel_size, int):
-            self.kernel_size = (kernel_size, kernel_size)
-        else:
-            self.kernel_size = kernel_size
-
-        if padding == 0:
-            pad = None
-        else:
-            pad = padding
-
-        if dilation != 1 or groups != 1:
-            raise ValueError('Not implemented yet')
-
-        self.PyLayer = layer.Conv2D(inner_params['name'],
-                                    nb_kernels=out_channels,
-                                    kernel=kernel_size,
-                                    stride=stride,
-                                    border_mode=inner_params['border_mode'],
-                                    cudnn_prefer=inner_params['cudnn_prefer'],
-                                    workspace_byte_limit=inner_params[
-                                        'workspace_byte_limit'],
-                                    data_format=inner_params['data_format'],
-                                    use_bias=bias,
-                                    W_specs=self.W_specs,
-                                    b_specs=self.b_specs,
-                                    pad=pad,
-                                    input_sample_shape=inner_params['input_sample_shape'])
-
-    def get_params(self):
-        assert self.init_value is True, 'must initialize before get_params()'
-        if self.bias:
-            return (self.w, self.b)
-        else:
-            return self.w
-
-    def __call__(self, x):
-        if training:
-            self.flag = model_pb2.kTrain
-        else:
-            self.flag = model_pb2.kEval
-
-        if not self.PyLayer.has_setup:
-            self.PyLayer.setup(x.shape[1:])
-
-        param_data = self.PyLayer.layer.param_values()
-
-        if not hasattr(self, 'w'):
-            self.w = Tensor(device=param_data[0].device(), data=param_data[
-                            0], requires_grad=True, stores_grad=True)
-            std = math.sqrt(
-                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
-            self.w.gaussian(0.0, std)
-
-        xs = [x, self.w]
-
-        if len(param_data) == 2:
-            if not hasattr(self, 'b'):
-                self.b = Tensor(device=param_data[1].device(), data=param_data[
-                                1], requires_grad=True, stores_grad=True)
-                self.b.set_value(0.0)
-
-            xs.append(self.b)
-
-        xs = tuple(xs)
-        return self._do_forward(*xs)[0]
-
-    def forward(self, *xs):
-        return self.PyLayer.layer.Forward(self.flag, xs[0])
-
-    def backward(self, dy):
-        ret = self.PyLayer.layer.Backward(self.flag, dy)
-        return (ret[0],) + ret[1]
-
 class MaxPool2d(Operation):
 
     def __init__(self, kernel_size=3, stride=1, padding=0, dilation=1,
@@ -548,80 +449,11 @@ class Flatten(Operation):
 def flatten(x):
     return Flatten()(x)[0]
 
-class CONV2D(Operation):
-    '''def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True, **kwargs):
 
-        self.in_channels = in_channels
-        self.out_channels = out_channels
+class _Conv2D(Operation):
 
-        if isinstance(kernel_size, int):
-            self.kernel_size = (kernel_size, kernel_size)
-        elif isinstance(kernel_size, tuple):
-            self.kernel_size = kernel_size
-        else:
-            raise TypeError('Wrong kernel_size type.')
-        
-        if isinstance(stride, int):
-            self.stride = (stride,stride)
-        elif isinstance(stride, tuple):
-            self.stride = stride
-        else:
-            raise TypeError('Wrong stride type.')
-
-        if isinstance(padding, int):
-            self.padding = (padding,padding)
-        elif isinstance(padding, tuple):
-            self.padding = padding
-        else:
-            raise TypeError('Wrong padding type.')
-
-        if dilation != 1 or groups != 1:
-            raise ValueError('Not implemented yet')
-
-        self.bias = bias
-
-        self.inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024}
-        # TODO valid value of inner_params check
-
-        for kwarg in kwargs:
-            if kwarg not in self.inner_params:
-                raise TypeError('Keyword argument not understood:', kwarg)
-            else:
-                self.inner_params[kwarg] = kwargs[kwarg]
-        
-        w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1])
-        self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
-        std = math.sqrt(
-                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
-        self.W.gaussian(0.0, std)
-
-        if self.bias:
-            b_shape = (self.out_channels,)
-            self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
-            self.b.set_value(0.0)
-        else:
-            #to keep consistency when to do forward.
-            self.b = Tensor(data=CTensor([]), requires_grad=False, stores_grad=False)
-
-    def __call__(self, x): 
-        if not hasattr(self, 'device_id'):
-            self.device_id = x.device.id()
-        else:
-            assert self.device_id == x.device.id(),'Not the same device.'
-
-        if self.W.device.id() != self.device_id:
-            self.W.to_device(x.device)
-
-        if self.bias:
-            if self.b.device.id() != self.device_id:
-                self.b.to_device(x.device)
-
-    	xs = [x, self.W, self.b]
-
-    	return self._do_forward(*xs)[0]'''
-    def __init__(self, handles):
-        self.handles = handles
+    def __init__(self, handle):
+        self.handle = handle
 
     def forward(self, x, W, b):
         #assert x.nDim() == 4, 'The dimensions of input should be 4D.'
@@ -631,39 +463,46 @@ class CONV2D(Operation):
         #assert 0 == 0, 'invalid padding'
 
         if training:
-            self.inputs = (x,W,b)
+            self.inputs = (x, W, b)
 
-        if self.handles.device_id == -1:
-            return singa.CpuConvForward(x, W, b, self.handles)
+        if self.handle.device_id == -1:
+            return singa.CpuConvForward(x, W, b, self.handle)
 
         else:
-            return singa.GpuConvForward(x, W, b, self.handles)
+            return singa.GpuConvForward(x, W, b, self.handle)
 
     def backward(self, dy):
-        assert training is True and hasattr(self, 'inputs'), 'Please set training as True before do BP. '
-
-        if dy.device().id() != self.handles.device_id:
-            dy.ToDevice(self.x.device())
-
-        if self.handles.device_id == -1: 
-            dx = singa.CpuConvBackwardx(dy, self.inputs[1], self.inputs[0], self.handles)
-            dW = singa.CpuConvBackwardW(dy, self.inputs[0], self.inputs[1], self.handles)
-            if self.handles.bias:
-                db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handles)
+        assert training is True and hasattr(
+            self, 'inputs'), 'Please set training as True before do BP. '
+
+        if dy.device().id() != self.handle.device_id:
+            dy.ToDevice(self.inputs[0].device())
+
+        if self.handle.device_id == -1:
+            dx = singa.CpuConvBackwardx(
+                dy, self.inputs[1], self.inputs[0], self.handle)
+            dW = singa.CpuConvBackwardW(
+                dy, self.inputs[0], self.inputs[1], self.handle)
+            if self.handle.bias_term_:
+                db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handle)
                 return dx, dW, db
             else:
-                return dx, dW
+                return dx, dW, None
         else:
-            dx = singa.GpuConvBackwardx(dy, self.inputs[1], self.inputs[0], self.handles)
-            dW = singa.GpuConvBackwardW(dy, self.inputs[0], self.inputs[1], self.handles)
-            if self.handles.bias:
-                db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handles)
+            dx = singa.GpuConvBackwardx(
+                dy, self.inputs[1], self.inputs[0], self.handle)
+            dW = singa.GpuConvBackwardW(
+                dy, self.inputs[0], self.inputs[1], self.handle)
+            if self.handle.bias_term_:
+                db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handle)
                 return dx, dW, db
             else:
-                return dx, dW
+                return dx, dW, None
+
+
+def conv2d(x, W, b, handle):
+    return _Conv2D(handle)(x, W, b)[0]
 
-def conv2d(x,W,b,handles):
-    return CONV2D(handles)(x,W,b)[0]
 
 def infer_dependency(op):
     '''
@@ -776,27 +615,33 @@ def backward(y, dy=None):
 
     return gradients
 
-class newlayer(object):
+
+class NewLayer(object):
+
     def __init__(self):
         pass
 
-    def device_check(*inputs):
-        pass
+    def device_check(self, *inputs):
+        x_device = inputs[0].device
+        for var in inputs:
+            if var.device.id() != x_device:
+                var.to_device(x_device)
+
 
+class Linear(NewLayer):
 
-class Linear(newlayer):
     def __init__(self, in_features, out_features, bias=True):
         #self.in_features = in_features
         #self.out_features = out_features
         w_shape = (in_features, out_features)
         b_shape = (1, out_features)
         self.bias = bias
-        
+
         self.W = Tensor(shape=w_shape,
                         requires_grad=True, stores_grad=True)
         std = math.sqrt(2.0 / (in_features + out_features))
         self.W.gaussian(0.0, std)
-        
+
         if self.bias:
             self.b = Tensor(shape=b_shape,
                             requires_grad=True, stores_grad=True)
@@ -812,7 +657,9 @@ class Linear(newlayer):
             y = add_bias(y, self.b, axis=0)
         return y
 
-class Conv2D(newlayer):
+
+class Conv2D(NewLayer):
+
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True, **kwargs):
 
@@ -825,16 +672,16 @@ class Conv2D(newlayer):
             self.kernel_size = kernel_size
         else:
             raise TypeError('Wrong kernel_size type.')
-        
+
         if isinstance(stride, int):
-            self.stride = (stride,stride)
+            self.stride = (stride, stride)
         elif isinstance(stride, tuple):
             self.stride = stride
         else:
             raise TypeError('Wrong stride type.')
 
         if isinstance(padding, int):
-            self.padding = (padding,padding)
+            self.padding = (padding, padding)
         elif isinstance(padding, tuple):
             self.padding = padding
         else:
@@ -845,7 +692,8 @@ class Conv2D(newlayer):
 
         self.bias = bias
 
-        self.inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024}
+        self.inner_params = {'cudnn_prefer': 'fastest',
+                             'workspace_MB_limit': 1024}
         # TODO valid value of inner_params check
 
         for kwarg in kwargs:
@@ -853,46 +701,49 @@ class Conv2D(newlayer):
                 raise TypeError('Keyword argument not understood:', kwarg)
             else:
                 self.inner_params[kwarg] = kwargs[kwarg]
-        
-        w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1])
+
+        w_shape = (self.out_channels, self.in_channels,
+                   self.kernel_size[0], self.kernel_size[1])
         self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
         std = math.sqrt(
-                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
+            2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
         self.W.gaussian(0.0, std)
 
         if self.bias:
             b_shape = (self.out_channels,)
-            self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
+            self.b = Tensor(shape=b_shape, requires_grad=True,
+                            stores_grad=True)
             self.b.set_value(0.0)
         else:
-            #to keep consistency when to do forward.
-            self.b = Tensor(data=CTensor([1]), requires_grad=False, stores_grad=False)
+            # to keep consistency when to do forward.
+            self.b = Tensor(data=CTensor(
+                [1]), requires_grad=False, stores_grad=False)
             self.b.set_value(0.0)
 
     def __call__(self, x):
+        assert x.shape[1] == self.in_channels,'in_channels dismatched'
+        assert (x.shape[2]+2*self.padding[0]-self.kernel_size[0])%self.stride[0] == 0, 'invalid padding or strides.'
+        assert (x.shape[3]+2*self.padding[1]-self.kernel_size[1])%self.stride[1] == 0, 'invalid padding or stride.'
+
         self.device_check(x, self.W, self.b)
 
         if x.device.id() == -1:
-            if not hasattr (self, 'handles'):
-                self.handles = singa.ConvHandles(x.data, self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias)
-            elif x.shape[0] != self.handles.batchsize:
-                self.handles = singa.ConvHandles(x.data, self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias)
+            if not hasattr(self, 'handle'):
+                self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
+                                                 self.padding, self.in_channels, self.out_channels, self.bias)
+            elif x.shape[0] != self.handle.batchsize:
+                self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
+                                                 self.padding, self.in_channels, self.out_channels, self.bias)
         else:
-            if not hasattr(self, 'handles'):
-                self.handles = singa.CudnnConvHandles(x.data, self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias,
-                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
-            elif x.shape[0] != self.handles.batchsize:
-                self.handles = singa.CudnnConvHandles(x.data, self.kernel_size, self.stride,
-                               self.padding, self.in_channels, self.out_channels, self.bias,
-                               self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer'])
-        self.handles.device_id= x.device.id()
-        self.handles.bias=self.bias # can simplified
-        y = conv2d(x, self.W, self.b, self.handles)
+            if not hasattr(self, 'handle'):
+                self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
+                                                      self.padding, self.in_channels, self.out_channels, self.bias,
+                                                      self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
+            elif x.shape[0] != self.handle.batchsize:
+                self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
+                                                      self.padding, self.in_channels, self.out_channels, self.bias,
+                                                      self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
+        self.handle.device_id = x.device.id()
+
+        y = conv2d(x, self.W, self.b, self.handle)
         return y
-
-
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/api/core_device.i
----------------------------------------------------------------------
diff --git a/src/api/core_device.i b/src/api/core_device.i
index 381f7c6..a5b7de6 100644
--- a/src/api/core_device.i
+++ b/src/api/core_device.i
@@ -43,14 +43,11 @@ namespace std{
 
 namespace singa{
 
-enum LangType {kCpp, kCuda, kOpencl,kNumDeviceType};
-
 class Device {
  public:
   virtual void SetRandSeed(unsigned seed) = 0;
   std::shared_ptr<Device> host();
   int id() const;
-  LangType lang() const;
 };
 
 class Platform {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
old mode 100644
new mode 100755
index 29f8f58..58e5270
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -1,46 +1,48 @@
 %module model_operation
 
 %{
-#include "../src/model/operation/convolution_operation.h"
+#include "../src/model/operation/convolution.h"
 %}
 namespace singa{
 
-struct ConvHandles{
+struct ConvHandle{
 
 		size_t batchsize;
+        const bool bias_term_;
 
-		ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+		ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
                     const std::vector<size_t> stride, const std::vector<size_t> padding,
                     const size_t in_channels, const size_t out_channels,
                     const bool bias_term_);
               	};
 
-struct CudnnConvHandles{
+struct CudnnConvHandle{
 
 		size_t batchsize;
+        const bool bias_term_;
 		
-		CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
+		CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
                     const std::vector<size_t> stride, const std::vector<size_t> padding,
                     const size_t in_channels, const size_t out_channels,
                     const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024,
                     const std::string prefer_="fastest");
                 };
 
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch);
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);
 
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch);
 
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch);
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch);
 
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch);
 
 
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch);
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch);
 
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch);
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch);
 
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch);
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch);
 
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch);
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch);
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
new file mode 100755
index 0000000..8d60df4
--- /dev/null
+++ b/src/model/operation/convolution.cc
@@ -0,0 +1,371 @@
+#include "./convolution.h"
+#include "../layer/convolution.h"
+#include<iostream>
+
+namespace singa{
+
+ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
+	                const std::vector<size_t> stride, const std::vector<size_t> padding,
+	                const size_t in_channels, const size_t out_channels,
+	                const bool bias){
+    kernel_h_=kernel_size[0];
+    kernel_w_=kernel_size[1];
+
+    pad_h_=padding[0];
+    pad_w_=padding[1];
+
+    stride_h_=stride[0];
+    stride_w_=stride[1];
+
+    channels_=in_channels;
+    num_filters_=out_channels;
+
+    bias_term_ = bias;
+
+	batchsize = input.shape(0);
+	CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched.";
+    height_ = input.shape(2);
+    width_ = input.shape(3);
+
+    conv_height_ = 1;
+    if (stride_h_ > 0)
+        conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+    conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
+
+    col_height_ = in_channels * kernel_w_ * kernel_h_;
+    col_width_ = conv_height_ * conv_width_;
+    imagesize = input.Size() / batchsize;
+};	
+
+CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,const bool bias_term_, 
+                    const size_t workspace_byte_limit_,const std::string prefer_)
+                    :ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_){
+
+    DataType dtype = input.data_type();
+    auto dev = input.device();
+    Context *ctx = dev->context(0);
+
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+    if (bias_term_)
+        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), batchsize,
+                                           channels_, height_, width_));
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+            num_filters_, conv_height_, conv_width_));
+    if (bias_term_)
+        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                               GetCudnnDataType(dtype), 1,
+                                               num_filters_, 1, 1));
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
+                                                stride_h_, stride_w_, 1, 1,
+                                                CUDNN_CROSS_CORRELATION,
+                                                GetCudnnDataType(dtype)));
+    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                           CUDNN_TENSOR_NCHW, num_filters_,
+                                           channels_, kernel_h_, kernel_w_));
+    if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+        prefer_ == "no_workspace") {
+        cudnnConvolutionFwdPreference_t fwd_pref;
+        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+        if (prefer_ == "fastest") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+        } else if (prefer_ == "limited_workspace") {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        } else {
+            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+        }
+        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+                workspace_byte_limit_, &fp_alg_));
+        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+                bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+        // deprecated in cudnn v7
+        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+                bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+        } else if (prefer_ == "autotune") {
+        const int topk = 1;
+        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+                &num_fp_alg, fp_alg_perf));
+        fp_alg_ = fp_alg_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+                &num_bp_filt_alg, bp_filt_perf));
+        bp_filter_alg_ = bp_filt_perf[0].algo;
+        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+                &num_bp_data_alg, bp_data_perf));
+        bp_data_alg_ = bp_data_perf[0].algo;
+    } else {
+        LOG(FATAL) << "Preferred algorithm is not available!";
+    }
+
+    size_t fp_byte, bp_data_byte, bp_filter_byte;
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+            &fp_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+            bp_data_alg_, &bp_data_byte));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+            bp_filter_alg_, &bp_filter_byte));
+    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                       sizeof(float) +
+                       1;
+    if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
+        LOG(WARNING) << "The required memory for workspace ("
+                     << workspace_count_ * sizeof(float)
+                     << ") is larger than the expected Bytes ("
+                     << workspace_byte_limit_ << ")";
+    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+};
+
+Convolution C;
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch){
+	CHECK_EQ(x.device()->lang(), kCpp);
+
+	CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
+    x.shape(3) == ch.width_) << "input sample shape should not change";
+
+    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
+    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+
+    Shape w_shape= W.shape();
+    Shape b_shape;
+    if (ch.bias_term_)
+      b_shape= b.shape();
+
+    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+    if (ch.bias_term_)
+      b.Reshape(Shape{ch.num_filters_});
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+    Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+
+    float *data_col = new float[ch.col_height_ * ch.col_width_];
+    auto in_data = x.data<float>();
+    for (size_t num = 0; num < ch.batchsize; num++) {
+      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);    
+
+      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
+      Tensor each = Mult(W, col_data);
+      if (ch.bias_term_) {
+          AddColumn(b, &each);
+        }
+      CopyDataToFrom(&output, each, each.Size(), num * each.Size());
+    };
+  W.Reshape(w_shape);
+  if (ch.bias_term_)
+    b.Reshape(b_shape);
+  return output;
+}; 
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    
+    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
+    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
+
+    Shape w_shape= W.shape();
+    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
+
+    Tensor dx;
+    dx.ResetLike(x);
+    
+    float *dx_b = new float[ch.imagesize];
+
+    for (size_t num = 0; num < ch.batchsize; num++) {
+      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+      Tensor dcol_b = Mult(W.T(), grad_b);
+      auto dcol_data = dcol_b.data<float>();
+      C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
+           ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
+      dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
+    }
+  W.Reshape(w_shape); 
+  return dx;
+};
+
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    
+    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+
+    CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
+    x.shape(3) == ch.width_) << "input sample shape should not change";
+
+    Tensor dW;
+    dW.ResetLike(W);
+    dW.SetValue(0.0f);
+    
+    Shape w_shape= W.shape();
+    dW.Reshape(Shape{ch.num_filters_, ch.col_height_});
+
+    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
+
+    float *data_col = new float[ch.col_height_ * ch.col_width_];
+    auto in_data = dy.data<float>();
+    for (size_t num = 0; num < ch.batchsize; num++) {
+      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
+            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
+      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
+      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
+      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
+      dW += Mult(grad_b, col_data.T());
+    }
+   dW.Reshape(w_shape);
+   return dW;
+};
+
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch){
+    CHECK_EQ(dy.device()->lang(), kCpp);
+    
+    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
+    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
+	
+	CHECK(b.shape(0) == ch.num_filters_)<< "bias shape should not change";
+
+    Tensor db;
+    db.ResetLike(b);
+
+    auto tmpshp = Shape{ch.batchsize * ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)};
+    Tensor tmp1 = Reshape(dy, tmpshp);
+
+    Tensor tmp2(Shape{ch.batchsize * ch.num_filters_});
+    SumColumns(tmp1, &tmp2);
+    Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_});
+
+    SumRows(tmp3, &db);
+
+    return db;
+};
+
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch){
+	CHECK_EQ(x.device()->lang(), kCuda);
+
+    DataType dtype = x.data_type();
+    auto dev = x.device();
+
+    Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
+    Tensor output(shape, dev, dtype);
+
+    output.device()->Exec([output, x, W, cch](Context *ctx) {
+        Block *inblock = x.block(), *outblock = output.block(),
+                *wblock = W.block();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
+                                inblock->data(), cch.filter_desc_, wblock->data(),
+                                cch.conv_desc_, cch.fp_alg_,
+                                cch.workspace_.block()->mutable_data(),
+                                cch.workspace_count_ * sizeof(float), &beta,
+                                cch.y_desc_, outblock->mutable_data());
+    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
+
+    if (cch.bias_term_) {
+        output.device()->Exec([output, b, cch](Context *ctx) {
+            float beta = 1.f, alpha = 1.0f;
+            Block *outblock = output.block(), *bblock = b.block();
+            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
+                           bblock->data(), &beta, cch.y_desc_,
+                           outblock->mutable_data());
+        }, {output.block(), b.block()}, {output.block()});
+    }
+
+    return output;
+};
+
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor dx;
+    dx.ResetLike(x);
+
+    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
+        Block *wblock = W.block(), *dyblock = dy.block(),
+                *dxblock = dx.block();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
+                                     wblock->data(), cch.y_desc_, dyblock->data(),
+                                     cch.conv_desc_, cch.bp_data_alg_,
+                                     cch.workspace_.block()->mutable_data(),
+                                     cch.workspace_count_ * sizeof(float), &beta,
+                                     cch.x_desc_, dxblock->mutable_data());
+    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
+
+    return dx;
+};
+
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor dW;
+    dW.ResetLike(W);
+
+    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
+    Block *inblock = x.block(), *dyblock = dy.block(),
+            *dwblock = dW.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardFilter(
+            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
+            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
+            cch.workspace_.block()->mutable_data(),
+            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
+            dwblock->mutable_data());
+    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
+
+    return dW;
+};
+
+// input Tensor b for Reset db purpose, can avoid this later.
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch){
+    CHECK_EQ(dy.device()->lang(), kCuda);
+
+    Tensor db;
+    db.ResetLike(b);
+
+    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
+        Block *dyblock = dy.block(), *dbblock = db.block();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
+                                     dyblock->data(), &beta, cch.bias_desc_,
+                                     dbblock->mutable_data());
+    }, {dy.block()}, {db.block()});
+
+    return db;
+};
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h
new file mode 100755
index 0000000..96a6d60
--- /dev/null
+++ b/src/model/operation/convolution.h
@@ -0,0 +1,78 @@
+#include <string>
+#include <vector>
+#include <cudnn.h>
+#include "../layer/cudnn_convolution.h"
+#include "../layer/cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa{
+
+struct ConvHandle{
+    size_t kernel_w_;
+    size_t pad_w_;
+    size_t stride_w_;
+    size_t kernel_h_;
+    size_t pad_h_;
+    size_t stride_h_;
+
+    size_t channels_;
+    size_t num_filters_;
+
+    bool bias_term_;
+
+    size_t height_;
+    size_t width_;
+    size_t conv_height_;
+    size_t conv_width_;
+    size_t batchsize;
+
+    size_t col_height_;
+    size_t col_width_;
+    size_t imagesize;
+
+    ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,
+                    const bool bias);
+
+};
+
+struct CudnnConvHandle:ConvHandle{
+	cudnnTensorDescriptor_t x_desc_ ;
+    cudnnTensorDescriptor_t y_desc_ ;
+    cudnnTensorDescriptor_t bias_desc_ ;
+    cudnnFilterDescriptor_t filter_desc_ ;
+    cudnnConvolutionDescriptor_t conv_desc_ ;
+    cudnnConvolutionFwdAlgo_t fp_alg_;
+    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+
+    size_t workspace_count_;
+    Tensor workspace_;
+
+    CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, 
+                    const std::vector<size_t> stride, const std::vector<size_t> padding,
+                    const size_t in_channels, const size_t out_channels,
+                    const bool bias, const size_t workspace_byte_limit_=1024*1024*1024,
+                    const std::string prefer_="fastest");
+};
+
+Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch);
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch);
+
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch);
+
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch);
+
+
+Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);
+
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch);
+
+Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch);
+
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch);
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution_operation.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_operation.cc b/src/model/operation/convolution_operation.cc
deleted file mode 100644
index 90b1b4a..0000000
--- a/src/model/operation/convolution_operation.cc
+++ /dev/null
@@ -1,366 +0,0 @@
-#include "./convolution_operation.h"
-#include "../layer/convolution.h"
-#include<iostream>
-
-namespace singa{
-
-ConvHandles::ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
-	                const std::vector<size_t> stride, const std::vector<size_t> padding,
-	                const size_t in_channels, const size_t out_channels,
-	                const bool bias_term_){
-    kernel_h_=kernel_size[0];
-    kernel_w_=kernel_size[1];
-
-    pad_h_=padding[0];
-    pad_w_=padding[1];
-
-    stride_h_=stride[0];
-    stride_w_=stride[1];
-
-    channels_=in_channels;
-    num_filters_=out_channels;
-
-	batchsize = input.shape(0);
-	CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched.";
-    height_ = input.shape(2);
-    width_ = input.shape(3);
-
-    conv_height_ = 1;
-    if (stride_h_ > 0)
-        conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
-    conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
-
-    col_height_ = in_channels * kernel_w_ * kernel_h_;
-    col_width_ = conv_height_ * conv_width_;
-    imagesize = input.Size() / batchsize;
-};	
-
-CudnnConvHandles::CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,const bool bias_term_, 
-                    const size_t workspace_byte_limit_,const std::string prefer_)
-                    :ConvHandles(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_){
-
-    DataType dtype = input.data_type();
-    auto dev = input.device();
-    Context *ctx = dev->context(0);
-
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
-    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
-    if (bias_term_)
-        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
-
-
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
-                                           GetCudnnDataType(dtype), batchsize,
-                                           channels_, height_, width_));
-    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
-            y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
-            num_filters_, conv_height_, conv_width_));
-    if (bias_term_)
-        CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
-                                               GetCudnnDataType(dtype), 1,
-                                               num_filters_, 1, 1));
-    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
-                                                stride_h_, stride_w_, 1, 1,
-                                                CUDNN_CROSS_CORRELATION,
-                                                GetCudnnDataType(dtype)));
-    CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
-                                           CUDNN_TENSOR_NCHW, num_filters_,
-                                           channels_, kernel_h_, kernel_w_));
-    if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
-        prefer_ == "no_workspace") {
-        cudnnConvolutionFwdPreference_t fwd_pref;
-        cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
-        cudnnConvolutionBwdDataPreference_t bwd_data_pref;
-        if (prefer_ == "fastest") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
-        } else if (prefer_ == "limited_workspace") {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        } else {
-            fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-            bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-            bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
-        }
-        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
-                workspace_byte_limit_, &fp_alg_));
-        CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-                bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
-        // deprecated in cudnn v7
-        CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-                bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
-        } else if (prefer_ == "autotune") {
-        const int topk = 1;
-        int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
-        cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
-        cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
-        cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
-        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
-                ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
-                &num_fp_alg, fp_alg_perf));
-        fp_alg_ = fp_alg_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
-                ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
-                &num_bp_filt_alg, bp_filt_perf));
-        bp_filter_alg_ = bp_filt_perf[0].algo;
-        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
-                ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
-                &num_bp_data_alg, bp_data_perf));
-        bp_data_alg_ = bp_data_perf[0].algo;
-    } else {
-        LOG(FATAL) << "Preferred algorithm is not available!";
-    }
-
-    size_t fp_byte, bp_data_byte, bp_filter_byte;
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
-            &fp_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-            ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
-            bp_data_alg_, &bp_data_byte));
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-            ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
-            bp_filter_alg_, &bp_filter_byte));
-    workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
-                       sizeof(float) +
-                       1;
-    if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
-        LOG(WARNING) << "The required memory for workspace ("
-                     << workspace_count_ * sizeof(float)
-                     << ") is larger than the expected Bytes ("
-                     << workspace_byte_limit_ << ")";
-    workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
-};
-
-Convolution C;
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch){
-	CHECK_EQ(x.device()->lang(), kCpp);
-
-	CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
-    x.shape(3) == ch.width_) << "input sample shape should not change";
-
-    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
-    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
-
-    Shape w_shape= W.shape();
-    Shape b_shape= b.shape();
-
-    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
-    if (ch.bias_term_)
-      b.Reshape(Shape{ch.num_filters_});
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-    Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
-
-    float *data_col = new float[ch.col_height_ * ch.col_width_];
-    auto in_data = x.data<float>();
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
-            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);    
-
-      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
-      Tensor each = Mult(W, col_data);
-      if (ch.bias_term_) {
-          AddColumn(b, &each);
-        }
-      CopyDataToFrom(&output, each, each.Size(), num * each.Size());
-    };
-  W.Reshape(w_shape);
-  b.Reshape(b_shape);
-  return output;
-}; 
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-
-    CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && 
-    W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change";
-
-    Shape w_shape= W.shape();
-    W.Reshape(Shape{ch.num_filters_, ch.col_height_});
-
-    Tensor dx;
-    dx.ResetLike(x);
-    
-    float *dx_b = new float[ch.imagesize];
-
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      Tensor dcol_b = Mult(W.T(), grad_b);
-      auto dcol_data = dcol_b.data<float>();
-      C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_,
-           ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b);
-      dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
-    }
-  W.Reshape(w_shape); 
-  return dx;
-};
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-
-    CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ &&
-    x.shape(3) == ch.width_) << "input sample shape should not change";
-
-    Tensor dW;
-    dW.ResetLike(W);
-    dW.SetValue(0.0f);
-    
-    Shape w_shape= W.shape();
-    dW.Reshape(Shape{ch.num_filters_, ch.col_height_});
-
-    Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image
-
-    float *data_col = new float[ch.col_height_ * ch.col_width_];
-    auto in_data = dy.data<float>();
-    for (size_t num = 0; num < ch.batchsize; num++) {
-      C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_,
-            ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col);
-      col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_);
-      Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_});
-      CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
-      dW += Mult(grad_b, col_data.T());
-    }
-   dW.Reshape(w_shape);
-   return dW;
-};
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch){
-    CHECK_EQ(dy.device()->lang(), kCpp);
-    
-    CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ &&
-    dy.shape(3) == ch.conv_width_) << "input gradients shape should not change";
-	
-	CHECK(b.shape(0) == ch.num_filters_)<< "bias shape should not change";
-
-    Tensor db;
-    db.ResetLike(b);
-
-    auto tmpshp = Shape{ch.batchsize * ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)};
-    Tensor tmp1 = Reshape(dy, tmpshp);
-
-    Tensor tmp2(Shape{ch.batchsize * ch.num_filters_});
-    SumColumns(tmp1, &tmp2);
-    Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_});
-
-    SumRows(tmp3, &db);
-
-    return db;
-};
-
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch){
-	CHECK_EQ(x.device()->lang(), kCuda);
-
-    DataType dtype = x.data_type();
-    auto dev = x.device();
-
-    Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_};
-    Tensor output(shape, dev, dtype);
-
-    output.device()->Exec([output, x, W, cch](Context *ctx) {
-        Block *inblock = x.block(), *outblock = output.block(),
-                *wblock = W.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_,
-                                inblock->data(), cch.filter_desc_, wblock->data(),
-                                cch.conv_desc_, cch.fp_alg_,
-                                cch.workspace_.block()->mutable_data(),
-                                cch.workspace_count_ * sizeof(float), &beta,
-                                cch.y_desc_, outblock->mutable_data());
-    }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block());
-
-    if (cch.bias_term_) {
-        output.device()->Exec([output, b, cch](Context *ctx) {
-            float beta = 1.f, alpha = 1.0f;
-            Block *outblock = output.block(), *bblock = b.block();
-            cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_,
-                           bblock->data(), &beta, cch.y_desc_,
-                           outblock->mutable_data());
-        }, {output.block(), b.block()}, {output.block()});
-    }
-
-    return output;
-};
-
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor dx;
-    dx.ResetLike(x);
-
-    dy.device()->Exec([dx, dy, W, cch](Context *ctx) {
-        Block *wblock = W.block(), *dyblock = dy.block(),
-                *dxblock = dx.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_,
-                                     wblock->data(), cch.y_desc_, dyblock->data(),
-                                     cch.conv_desc_, cch.bp_data_alg_,
-                                     cch.workspace_.block()->mutable_data(),
-                                     cch.workspace_count_ * sizeof(float), &beta,
-                                     cch.x_desc_, dxblock->mutable_data());
-    }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()});
-
-    return dx;
-};
-
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor dW;
-    dW.ResetLike(W);
-
-    dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) {
-    Block *inblock = x.block(), *dyblock = dy.block(),
-            *dwblock = dW.block();
-    float alpha = 1.f, beta = 0.f;
-    cudnnConvolutionBackwardFilter(
-            ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(),
-            cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_,
-            cch.workspace_.block()->mutable_data(),
-            cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_,
-            dwblock->mutable_data());
-    }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()});
-
-    return dW;
-};
-
-// input Tensor b for Reset db purpose, can avoid this later.
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch){
-    CHECK_EQ(dy.device()->lang(), kCuda);
-
-    Tensor db;
-    db.ResetLike(b);
-
-    dy.device()->Exec([db, dy, b, cch](Context *ctx) {
-        Block *dyblock = dy.block(), *dbblock = db.block();
-        float alpha = 1.f, beta = 0.f;
-        cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_,
-                                     dyblock->data(), &beta, cch.bias_desc_,
-                                     dbblock->mutable_data());
-    }, {dy.block()}, {db.block()});
-
-    return db;
-};
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution_operation.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution_operation.h b/src/model/operation/convolution_operation.h
deleted file mode 100644
index 835581e..0000000
--- a/src/model/operation/convolution_operation.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#include <string>
-#include <vector>
-#include <cudnn.h>
-#include "../layer/cudnn_convolution.h"
-#include "../layer/cudnn_utils.h"
-#include "singa/utils/logging.h"
-
-namespace singa{
-
-struct ConvHandles{
-    size_t kernel_w_;
-    size_t pad_w_;
-    size_t stride_w_;
-    size_t kernel_h_;
-    size_t pad_h_;
-    size_t stride_h_;
-
-    size_t channels_;
-    size_t num_filters_;
-
-    bool bias_term_;
-
-    size_t height_;
-    size_t width_;
-    size_t conv_height_;
-    size_t conv_width_;
-    size_t batchsize;
-
-    size_t col_height_;
-    size_t col_width_;
-    size_t imagesize;
-
-    ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,
-                    const bool bias_term_);
-
-};
-
-struct CudnnConvHandles:ConvHandles{
-	cudnnTensorDescriptor_t x_desc_ ;
-    cudnnTensorDescriptor_t y_desc_ ;
-    cudnnTensorDescriptor_t bias_desc_ ;
-    cudnnFilterDescriptor_t filter_desc_ ;
-    cudnnConvolutionDescriptor_t conv_desc_ ;
-    cudnnConvolutionFwdAlgo_t fp_alg_;
-    cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-    cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-
-    size_t workspace_count_;
-    Tensor workspace_;
-
-    CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, 
-                    const std::vector<size_t> stride, const std::vector<size_t> padding,
-                    const size_t in_channels, const size_t out_channels,
-                    const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024,
-                    const std::string prefer_="fastest");
-};
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandles ch);
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch);
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch);
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch);
-
-
-Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch);
-
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch);
-
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/test/python/test_operation.py
----------------------------------------------------------------------
diff --git a/test/python/test_operation.py b/test/python/test_operation.py
index ece537d..1bbc70c 100644
--- a/test/python/test_operation.py
+++ b/test/python/test_operation.py
@@ -16,9 +16,6 @@ cpu_dev = device.get_default_device()
 dy = CTensor([2, 1, 2, 2])
 singa.Gaussian(0.0, 1.0, dy)
 
-conv = autograd.Conv2D(3, 1, 2)  # (in_channels, out_channels, kernel_size)
-conv_without_bias = autograd.Conv2D(3,1,2,bias=False)
-
 
 def _tuple_to_string(t):
     lt = [str(x) for x in t]
@@ -34,35 +31,43 @@ class TestPythonOperation(unittest.TestCase):
                          )
 
     def test_conv2d_gpu(self):
+        # (in_channels, out_channels, kernel_size)
+        conv_0 = autograd.Conv2D(3, 1, 2)
+        conv_without_bias_0 = autograd.Conv2D(3, 1, 2, bias=False)
+
         gpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev)
         gpu_input_tensor.gaussian(0.0, 1.0)
 
-        y = conv(gpu_input_tensor)  # PyTensor
-        dx, dW, db = conv.backward(dy)  # CTensor
+        y = conv_0(gpu_input_tensor)  # PyTensor
+        dx, dW, db = y.creator.backward(dy)  # CTensor
 
         self.check_shape(y.shape, (2, 1, 2, 2))
         self.check_shape(dx.shape(), (2, 3, 3, 3))
         self.check_shape(dW.shape(), (1, 3, 2, 2))
         self.check_shape(db.shape(), (1,))
 
-        #forward without bias
-        y_without_bias=conv_without_bias(gpu_input_tensor)
+        # forward without bias
+        y_without_bias = conv_without_bias_0(gpu_input_tensor)
         self.check_shape(y.shape, (2, 1, 2, 2))
 
     def test_conv2d_cpu(self):
+        # (in_channels, out_channels, kernel_size)
+        conv_1 = autograd.Conv2D(3, 1, 2)
+        conv_without_bias_1 = autograd.Conv2D(3, 1, 2, bias=False)
+
         cpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=cpu_dev)
         cpu_input_tensor.gaussian(0.0, 1.0)
 
-        y = conv(cpu_input_tensor)  # PyTensor
-        dx, dW, db = conv.backward(dy)  # CTensor
+        y = conv_1(cpu_input_tensor)  # PyTensor
+        dx, dW, db = y.creator.backward(dy)  # CTensor
 
         self.check_shape(y.shape, (2, 1, 2, 2))
         self.check_shape(dx.shape(), (2, 3, 3, 3))
         self.check_shape(dW.shape(), (1, 3, 2, 2))
         self.check_shape(db.shape(), (1,))
 
-        #forward without bias
-        y_without_bias=conv_without_bias(cpu_input_tensor)
+        # forward without bias
+        y_without_bias = conv_without_bias_1(cpu_input_tensor)
         self.check_shape(y.shape, (2, 1, 2, 2))
 
 if __name__ == '__main__':