Posted to dev@singa.apache.org by GitBox <gi...@apache.org> on 2020/06/01 15:06:20 UTC

[GitHub] [singa] nudles commented on a change in pull request #697: New Model Layer Operator API

nudles commented on a change in pull request #697:
URL: https://github.com/apache/singa/pull/697#discussion_r433284169



##########
File path: python/singa/model.py
##########
@@ -0,0 +1,308 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+'''
+This module provides the Model class, which lets Python users
+run their models with the computational graph.
+'''
+
+from functools import wraps
+import time
+import numpy as np
+import json
+import zipfile
+import os
+
+from singa import tensor
+from singa import autograd
+from singa import layer
+from . import singa_wrap as singa
+from .device import get_default_device
+
+import gc
+
+
+class ModelMeta(layer.LayerMeta):
+
+    def buffer_operation(func):
+
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if self.graph_mode and self.training:
+                if not self._buffered:
+                    # buffer operations
+                    self._device.EnableGraph(True)
+                    self._results = func(self, *args, **kwargs)
+                    self._device.Sync()
+                    self._device.EnableGraph(False)
+                    self._buffered = True
+
+                    # deconstruct Operations before running the entire graph
+                    if self._results:
+                        if isinstance(self._results, list):
+                            for _matrix in self._results:
+                                if isinstance(_matrix, tensor.Tensor):
+                                    _matrix.creator = None
+                        elif isinstance(self._results, tensor.Tensor):
+                            self._results.creator = None
+
+                    # make sure all Operations are deallocated
+                    gc.collect()
+
+                # run graph
+                self._device.RunGraph(self.sequential)
+                return self._results
+            else:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    def __new__(cls, name, bases, attr):
+        if 'train_one_batch' in attr:
+            attr['train_one_batch'] = ModelMeta.buffer_operation(
+                attr['train_one_batch'])
+
+        return super(ModelMeta, cls).__new__(cls, name, bases, attr)
+
+
+class Model(layer.Layer, metaclass=ModelMeta):
+    """ Base class for your neural network models.
+
+    Example usage::
+
+        import numpy as np
+        from singa import opt
+        from singa import tensor
+        from singa import device
+        from singa import autograd
+        from singa import layer
+        from singa import model
+
+        class MyModel(model.Model):
+            def __init__(self):
+                super(MyModel, self).__init__()
+
+                self.conv1 = layer.Conv2d(1, 20, 5, padding=0)
+                self.conv2 = layer.Conv2d(20, 50, 5, padding=0)
+
+                self.sgd = opt.SGD(lr=0.01)
+
+            def forward(self, x):
+                y = self.conv1(x)
+                y = self.conv2(y)
+                return y
+
+            def train_one_batch(self, x, y):
+                out = self.forward(x)
+                loss = autograd.softmax_cross_entropy(out, y)
+                self.sgd.backward_and_update(loss)
+                return out, loss
+
+    """
+
+    # save load states constant
+    TENSOR_DICT_FILENAME = '/tensor_dict.npz'
+    STATES_ATTR_FILENAME = '/states_attr.json'
+    MODEL_STATE_TYPE = 0
+    AUX_STATE_TYPE = 1
+
+    def __init__(self):
+        """
+        Initializes internal Model state
+        """
+        super(Model, self).__init__()
+
+        self.training = True
+        self.graph_mode = True
+        self.sequential = False
+        self._buffered = False
+        self._device = get_default_device()
+
+        self._results = None
+
+    def compile(self, inputs, is_train=True, use_graph=False, sequential=False):
+        self._device.EnableGraph(True)
+        self.forward(*inputs)
+        self._device.EnableGraph(False)
+        self._device.ResetGraph()
+        autograd.training = is_train
+        self.training = is_train
+        self.graph_mode = use_graph
+        self.sequential = sequential
+
+    def forward(self, *input):
+        """Defines the computation performed at every call.
+
+        Should be overridden by all subclasses.
+
+        Args:
+            *input: the input training data for the model
+
+        Returns:
+            out: the outputs of the forward propagation.
+        """
+        raise NotImplementedError
+
+    def train_one_batch(self, *input):
+        raise NotImplementedError
+
+    def train(self, mode=True):
+        """Set the model in training mode.
+
+        Args:
+            mode(bool): when mode is True, this model will enter training mode
+        """
+        self.training = mode
+        autograd.training = mode
+
+    def eval(self):
+        """Sets the model in evaluation mode.
+        """
+        self.train(mode=False)
+
+    def graph(self, mode=True, sequential=False):
+        """ Turn on the computational graph. Specify execution mode.
+
+        Args:
+            mode(bool): when mode is True, model will use computational graph
+            sequential(bool): when sequential is True, the model will execute
+                ops in the graph in the order in which they joined the graph
+        """
+        self.graph_mode = mode
+        self.sequential = sequential
+
+    def on_device(self, device):
+        """Sets the target device.

Review comment:
       When will `_device` be used?
   Do we need to call this method explicitly, or should the device be taken from the inputs of the `compile()` function?
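
   For illustration only, a minimal sketch of how `compile()` could pick the device up from its `inputs` placeholders so that an explicit `on_device()` call is not required. The body mirrors the `compile()` shown above; the device-inference lines are the suggestion, not existing behaviour:

```python
def compile(self, inputs, is_train=True, use_graph=False, sequential=False):
    # take the target device from the first placeholder instead of
    # requiring a separate on_device() call (suggested change, not in the PR)
    assert len(inputs) > 0, "compile() needs at least one input placeholder"
    self._device = inputs[0].device

    self._device.EnableGraph(True)
    self.forward(*inputs)
    self._device.EnableGraph(False)
    self._device.ResetGraph()

    autograd.training = is_train
    self.training = is_train
    self.graph_mode = use_graph
    self.sequential = sequential
```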

##########
File path: examples/cnn/train.py
##########
@@ -185,8 +185,7 @@ def run(global_rank,
     # attached model to graph
     model.on_device(dev)

Review comment:
       Is it necessary to call `on_device()` if the `inputs` to `compile()` are created on the GPU device?
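
   For reference, the usage this would enable, assuming `compile()` infers the device from the placeholders as sketched in the previous comment (`MyModel` is the example class from the `model.py` docstring; shapes are arbitrary):

```python
from singa import device, tensor

dev = device.create_cuda_gpu()              # the training device
tx = tensor.Tensor((16, 1, 28, 28), dev)    # placeholders created on the GPU
ty = tensor.Tensor((16,), dev, tensor.int32)

m = MyModel()
m.compile([tx], is_train=True, use_graph=True, sequential=False)
# with device inference in compile(), model.on_device(dev) becomes redundant
```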

##########
File path: examples/cnn/model/resnet.py
##########
@@ -21,30 +21,31 @@
 # https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
 
 from singa import autograd
-from singa import module
+from singa import layer
+from singa import model
 
 
 def conv3x3(in_planes, out_planes, stride=1):
     """3x3 convolution with padding"""
-    return autograd.Conv2d(
+    return layer.Conv2d(
         in_planes,
         out_planes,
-        kernel_size=3,
+        3,
         stride=stride,
         padding=1,
         bias=False,
     )
 
 
-class BasicBlock(autograd.Layer):
+class BasicBlock(layer.Layer):
     expansion = 1
 
     def __init__(self, inplanes, planes, stride=1, downsample=None):
         super(BasicBlock, self).__init__()
         self.conv1 = conv3x3(inplanes, planes, stride)
-        self.bn1 = autograd.BatchNorm2d(planes)
+        self.bn1 = layer.BatchNorm2d(planes)

Review comment:
       Do we need to create a ReLU layer instance, or is the ReLU activation merged into the Conv2d layer?
   I think we should not mix layer instances and operator instances in the model definition, to avoid confusion.
   It would be better to let users define the model's forward function with layer instances only.
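
   A sketch of the layer-instances-only style for this block, assuming the new layer module provides (or will provide) `ReLU` and `Add` layer classes alongside `Conv2d` and `BatchNorm2d`; `conv3x3` is the helper defined earlier in this file:

```python
from singa import layer

class BasicBlock(layer.Layer):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = layer.BatchNorm2d(planes)
        self.relu1 = layer.ReLU()          # activation as a layer instance
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = layer.BatchNorm2d(planes)
        self.relu2 = layer.ReLU()
        self.add = layer.Add()             # residual add as a layer instance
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.relu1(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        if self.downsample is not None:
            residual = self.downsample(x)
        return self.relu2(self.add(out, residual))
```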

##########
File path: examples/mlp/module.py
##########
@@ -56,10 +56,9 @@ def forward(self, inputs):
         x = autograd.add_bias(x, self.b1)
         return x
 
-    def loss(self, out, ty):
-        return autograd.softmax_cross_entropy(out, ty)
-
-    def optim(self, loss, dist_option, spars):
+    def train_one_batch(self, x, y, dist_option, spars):
+        out = self.forward(x)
+        loss = autograd.softmax_cross_entropy(out, y)

Review comment:
       Shall we implement loss layers and call them here instead of calling the softmax_cross_entropy operator?
   If the loss is implemented using cuDNN, we need to create the cuDNN handle and store it in the layer class.
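
   One possible shape for such a loss layer, delegating to the existing operator for now but leaving room to create and cache a cuDNN handle inside the layer; `SoftMaxCrossEntropy` is a hypothetical class name, not something this PR defines:

```python
from singa import autograd, layer

class SoftMaxCrossEntropy(layer.Layer):
    """Loss as a Layer, so backend state (e.g. a cudnn handle) can live here."""

    def __init__(self):
        super(SoftMaxCrossEntropy, self).__init__()
        self.handle = None      # would hold the cudnn handle, if one is needed

    def initialize(self, out, ty):
        # create and cache any backend handle once, on the first call
        pass

    def forward(self, out, ty):
        # delegate to the existing operator for now
        return autograd.softmax_cross_entropy(out, ty)

# in the model: self.loss_fn = SoftMaxCrossEntropy() in __init__,
# then loss = self.loss_fn(out, y) in train_one_batch
```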

##########
File path: examples/rnn/train.py
##########
@@ -59,12 +60,12 @@ def forward(self, inputs):
         x = autograd.reshape(x, (-1, self.hidden_size))
         return self.dense(x)
 
-    def loss(self, out, ty):
-        ty = autograd.reshape(ty, (-1, 1))
-        return autograd.softmax_cross_entropy(out, ty)
-
-    def optim(self, loss):
+    def train_one_batch(self, x, y):
+        out = self.forward(x)

Review comment:
       We need to set the creator field of self.hx and self.cx to None before calling self.forward(x).
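
   That is, something along these lines in `train_one_batch` (the reshape comes from the removed `loss()` method above; the optimizer attribute name `self.sgd` is assumed for illustration):

```python
def train_one_batch(self, x, y):
    # detach the recurrent state tensors from the previous iteration's graph
    # before this iteration's operations are buffered
    self.hx.creator = None
    self.cx.creator = None

    out = self.forward(x)
    y = autograd.reshape(y, (-1, 1))
    loss = autograd.softmax_cross_entropy(out, y)
    self.sgd.backward_and_update(loss)
    return out, loss
```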

##########
File path: new_api.py
##########
@@ -0,0 +1,62 @@
+from singa import autograd
+from singa import layer

Review comment:
       Will this file be deleted?

##########
File path: python/singa/model.py
##########
@@ -0,0 +1,308 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+'''
+This module provides the Model class, which lets Python users
+run their models with the computational graph.
+'''
+
+from functools import wraps
+import time
+import numpy as np
+import json
+import zipfile
+import os
+
+from singa import tensor
+from singa import autograd
+from singa import layer
+from . import singa_wrap as singa
+from .device import get_default_device
+
+import gc
+
+
+class ModelMeta(layer.LayerMeta):
+
+    def buffer_operation(func):
+
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if self.graph_mode and self.training:
+                if not self._buffered:
+                    # buffer operations
+                    self._device.EnableGraph(True)
+                    self._results = func(self, *args, **kwargs)
+                    self._device.Sync()
+                    self._device.EnableGraph(False)
+                    self._buffered = True
+
+                    # deconstruct Operations before running the entire graph
+                    if self._results:
+                        if isinstance(self._results, list):
+                            for _matrix in self._results:
+                                if isinstance(_matrix, tensor.Tensor):
+                                    _matrix.creator = None
+                        elif isinstance(self._results, tensor.Tensor):
+                            self._results.creator = None
+
+                    # make sure all Operations are deallocated
+                    gc.collect()
+
+                # run graph
+                self._device.RunGraph(self.sequential)
+                return self._results
+            else:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    def __new__(cls, name, bases, attr):
+        if 'train_one_batch' in attr:
+            attr['train_one_batch'] = ModelMeta.buffer_operation(
+                attr['train_one_batch'])
+
+        return super(ModelMeta, cls).__new__(cls, name, bases, attr)
+
+
+class Model(layer.Layer, metaclass=ModelMeta):
+    """ Base class for your neural network models.
+
+    Example usage::
+
+        import numpy as np
+        from singa import opt
+        from singa import tensor
+        from singa import device
+        from singa import autograd
+        from singa import layer
+        from singa import model
+
+        class MyModel(model.Model):
+            def __init__(self):
+                super(MyModel, self).__init__()
+
+                self.conv1 = layer.Conv2d(1, 20, 5, padding=0)
+                self.conv2 = layer.Conv2d(20, 50, 5, padding=0)
+
+                self.sgd = opt.SGD(lr=0.01)
+
+            def forward(self, x):
+                y = self.conv1(x)
+                y = self.conv2(y)
+                return y
+
+            def train_one_batch(self, x, y):
+                out = self.forward(x)
+                loss = autograd.softmax_cross_entropy(out, y)
+                self.sgd.backward_and_update(loss)
+                return out, loss
+
+    """
+
+    # save load states constant
+    TENSOR_DICT_FILENAME = '/tensor_dict.npz'
+    STATES_ATTR_FILENAME = '/states_attr.json'
+    MODEL_STATE_TYPE = 0
+    AUX_STATE_TYPE = 1
+
+    def __init__(self):
+        """
+        Initializes internal Model state
+        """
+        super(Model, self).__init__()
+
+        self.training = True
+        self.graph_mode = True
+        self.sequential = False
+        self._buffered = False
+        self._device = get_default_device()
+
+        self._results = None
+
+    def compile(self, inputs, is_train=True, use_graph=False, sequential=False):
+        self._device.EnableGraph(True)
+        self.forward(*inputs)
+        self._device.EnableGraph(False)
+        self._device.ResetGraph()
+        autograd.training = is_train
+        self.training = is_train
+        self.graph_mode = use_graph
+        self.sequential = sequential
+
+    def forward(self, *input):
+        """Defines the computation performed at every call.
+
+        Should be overridden by all subclasses.
+
+        Args:
+            *input: the input training data for the model
+
+        Returns:
+            out: the outputs of the forward propagation.
+        """
+        raise NotImplementedError
+
+    def train_one_batch(self, *input):
+        raise NotImplementedError
+
+    def train(self, mode=True):
+        """Set the model in training mode.
+
+        Args:
+            mode(bool): when mode is True, this model will enter training mode
+        """
+        self.training = mode
+        autograd.training = mode
+
+    def eval(self):
+        """Sets the model in evaluation mode.
+        """
+        self.train(mode=False)
+
+    def graph(self, mode=True, sequential=False):
+        """ Turn on the computational graph. Specify execution mode.
+
+        Args:
+            mode(bool): when mode is True, model will use computational graph
+            sequential(bool): when sequential is True, the model will execute
+                ops in the graph in the order in which they joined the graph
+        """
+        self.graph_mode = mode
+        self.sequential = sequential
+
+    def on_device(self, device):
+        """Sets the target device.
+
+        The following training will be performed on that device.
+
+        Args:
+            device(Device): the target device
+        """
+        self._device = device
+
+    def __get_name__(self):
+        return self.__class__.__name__
+
+    def __call__(self, *input, **kwargs):
+        if self.training:
+            return self.train_one_batch(*input, **kwargs)
+        else:
+            return self.forward(*input, **kwargs)
+
+    def save_states(self, fpath, aux_states={}):
+        """Save states.
+
+        Args:
+            fpath: output file path (without the extension)
+            aux_states(dict): values are standard data types or Tensor,
+                              e.g., epoch ID, learning rate, optimizer states
+        """
+        assert not os.path.isfile(fpath), (
+            "Failed to save states, %s already exists." % fpath)
+
+        states = self.get_states()
+
+        # save states data and attr
+        tensor_dict = {}
+        states_attr = {}
+        for k, v in states.items():
+            assert isinstance(v, tensor.Tensor), "Only tensor state is allowed"

Review comment:
       We may have other non-tensor states, but we can assume they are standard Python data types.

##########
File path: python/singa/model.py
##########
@@ -0,0 +1,308 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+'''
+This module provides the Model class, which lets Python users
+run their models with the computational graph.
+'''
+
+from functools import wraps
+import time
+import numpy as np
+import json
+import zipfile
+import os
+
+from singa import tensor
+from singa import autograd
+from singa import layer
+from . import singa_wrap as singa
+from .device import get_default_device
+
+import gc
+
+
+class ModelMeta(layer.LayerMeta):
+
+    def buffer_operation(func):
+
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if self.graph_mode and self.training:
+                if not self._buffered:
+                    # buffer operations
+                    self._device.EnableGraph(True)
+                    self._results = func(self, *args, **kwargs)
+                    self._device.Sync()
+                    self._device.EnableGraph(False)
+                    self._buffered = True
+
+                    # deconstruct Operations before running the entire graph
+                    if self._results:
+                        if isinstance(self._results, list):
+                            for _matrix in self._results:
+                                if isinstance(_matrix, tensor.Tensor):
+                                    _matrix.creator = None
+                        elif isinstance(self._results, tensor.Tensor):
+                            self._results.creator = None
+
+                    # make sure all Operations are deallocated
+                    gc.collect()
+
+                # run graph
+                self._device.RunGraph(self.sequential)
+                return self._results
+            else:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    def __new__(cls, name, bases, attr):
+        if 'train_one_batch' in attr:
+            attr['train_one_batch'] = ModelMeta.buffer_operation(
+                attr['train_one_batch'])
+
+        return super(ModelMeta, cls).__new__(cls, name, bases, attr)
+
+
+class Model(layer.Layer, metaclass=ModelMeta):
+    """ Base class for your neural network models.
+
+    Example usage::
+
+        import numpy as np
+        from singa import opt
+        from singa import tensor
+        from singa import device
+        from singa import autograd
+        from singa import layer
+        from singa import model
+
+        class MyModel(model.Model):
+            def __init__(self):
+                super(MyModel, self).__init__()
+
+                self.conv1 = layer.Conv2d(1, 20, 5, padding=0)
+                self.conv2 = layer.Conv2d(20, 50, 5, padding=0)
+
+                self.sgd = opt.SGD(lr=0.01)
+
+            def forward(self, x):
+                y = self.conv1(x)
+                y = self.conv2(y)
+                return y
+
+            def train_one_batch(self, x, y):
+                out = self.forward(x)
+                loss = autograd.softmax_cross_entropy(out, y)
+                self.sgd.backward_and_update(loss)
+                return out, loss
+
+    """
+
+    # save load states constant
+    TENSOR_DICT_FILENAME = '/tensor_dict.npz'
+    STATES_ATTR_FILENAME = '/states_attr.json'
+    MODEL_STATE_TYPE = 0
+    AUX_STATE_TYPE = 1
+
+    def __init__(self):
+        """
+        Initializes internal Model state
+        """
+        super(Model, self).__init__()
+
+        self.training = True
+        self.graph_mode = True
+        self.sequential = False
+        self._buffered = False
+        self._device = get_default_device()
+
+        self._results = None
+
+    def compile(self, inputs, is_train=True, use_graph=False, sequential=False):
+        self._device.EnableGraph(True)
+        self.forward(*inputs)
+        self._device.EnableGraph(False)
+        self._device.ResetGraph()
+        autograd.training = is_train
+        self.training = is_train
+        self.graph_mode = use_graph
+        self.sequential = sequential
+
+    def forward(self, *input):
+        """Defines the computation performed at every call.
+
+        Should be overridden by all subclasses.
+
+        Args:
+            *input: the input training data for the model
+
+        Returns:
+            out: the outputs of the forward propagation.
+        """
+        raise NotImplementedError
+
+    def train_one_batch(self, *input):
+        raise NotImplementedError
+
+    def train(self, mode=True):
+        """Set the model in training mode.
+
+        Args:
+            mode(bool): when mode is True, this model will enter training mode
+        """
+        self.training = mode
+        autograd.training = mode
+
+    def eval(self):
+        """Sets the model in evaluation mode.
+        """
+        self.train(mode=False)
+
+    def graph(self, mode=True, sequential=False):
+        """ Turn on the computational graph. Specify execution mode.
+
+        Args:
+            mode(bool): when mode is True, model will use computational graph
+            sequential(bool): when sequential is True, the model will execute
+                ops in the graph in the order in which they joined the graph
+        """
+        self.graph_mode = mode
+        self.sequential = sequential
+
+    def on_device(self, device):
+        """Sets the target device.
+
+        The following training will be performed on that device.
+
+        Args:
+            device(Device): the target device
+        """
+        self._device = device
+
+    def __get_name__(self):
+        return self.__class__.__name__
+
+    def __call__(self, *input, **kwargs):
+        if self.training:
+            return self.train_one_batch(*input, **kwargs)
+        else:
+            return self.forward(*input, **kwargs)
+
+    def save_states(self, fpath, aux_states={}):
+        """Save states.
+
+        Args:
+            fpath: output file path (without the extension)
+            aux_states(dict): values are standard data types or Tensor,
+                              e.g., epoch ID, learning rate, optimizer states
+        """
+        assert not os.path.isfile(fpath), (
+            "Failed to save states, %s already exists." % fpath)
+
+        states = self.get_states()
+
+        # save states data and attr
+        tensor_dict = {}
+        states_attr = {}
+        for k, v in states.items():
+            assert isinstance(v, tensor.Tensor), "Only tensor state is allowed"

Review comment:
       e.g., the current training step, initial learning rate, etc.

##########
File path: python/singa/model.py
##########
@@ -0,0 +1,308 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+'''
+This module provides the Model class, which lets Python users
+run their models with the computational graph.
+'''
+
+from functools import wraps
+import time
+import numpy as np
+import json
+import zipfile
+import os
+
+from singa import tensor
+from singa import autograd
+from singa import layer
+from . import singa_wrap as singa
+from .device import get_default_device
+
+import gc
+
+
+class ModelMeta(layer.LayerMeta):
+
+    def buffer_operation(func):
+
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if self.graph_mode and self.training:
+                if not self._buffered:
+                    # buffer operations
+                    self._device.EnableGraph(True)
+                    self._results = func(self, *args, **kwargs)
+                    self._device.Sync()
+                    self._device.EnableGraph(False)
+                    self._buffered = True
+
+                    # deconstruct Operations before running the entire graph
+                    if self._results:
+                        if isinstance(self._results, list):
+                            for _matrix in self._results:
+                                if isinstance(_matrix, tensor.Tensor):
+                                    _matrix.creator = None
+                        elif isinstance(self._results, tensor.Tensor):
+                            self._results.creator = None
+
+                    # make sure all Operations are deallocated
+                    gc.collect()
+
+                # run graph
+                self._device.RunGraph(self.sequential)
+                return self._results
+            else:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    def __new__(cls, name, bases, attr):
+        if 'train_one_batch' in attr:
+            attr['train_one_batch'] = ModelMeta.buffer_operation(
+                attr['train_one_batch'])
+
+        return super(ModelMeta, cls).__new__(cls, name, bases, attr)
+
+
+class Model(layer.Layer, metaclass=ModelMeta):
+    """ Base class for your neural network models.
+
+    Example usage::
+
+        import numpy as np
+        from singa import opt
+        from singa import tensor
+        from singa import device
+        from singa import autograd
+        from singa import layer
+        from singa import model
+
+        class MyModel(model.Model):
+            def __init__(self):
+                super(MyModel, self).__init__()
+
+                self.conv1 = layer.Conv2d(1, 20, 5, padding=0)
+                self.conv2 = layer.Conv2d(20, 50, 5, padding=0)
+
+                self.sgd = opt.SGD(lr=0.01)
+
+            def forward(self, x):
+                y = self.conv1(x)
+                y = self.conv2(y)
+                return y
+
+            def train_one_batch(self, x, y):
+                out = self.forward(x)
+                loss = autograd.softmax_cross_entropy(out, y)
+                self.sgd.backward_and_update(loss)
+                return out, loss
+
+    """
+
+    # save load states constant
+    TENSOR_DICT_FILENAME = '/tensor_dict.npz'
+    STATES_ATTR_FILENAME = '/states_attr.json'
+    MODEL_STATE_TYPE = 0
+    AUX_STATE_TYPE = 1
+
+    def __init__(self):
+        """
+        Initializes internal Model state
+        """
+        super(Model, self).__init__()
+
+        self.training = True
+        self.graph_mode = True
+        self.sequential = False
+        self._buffered = False
+        self._device = get_default_device()
+
+        self._results = None
+
+    def compile(self, inputs, is_train=True, use_graph=False, sequential=False):
+        self._device.EnableGraph(True)
+        self.forward(*inputs)
+        self._device.EnableGraph(False)
+        self._device.ResetGraph()
+        autograd.training = is_train
+        self.training = is_train
+        self.graph_mode = use_graph
+        self.sequential = sequential
+
+    def forward(self, *input):
+        """Defines the computation performed at every call.
+
+        Should be overridden by all subclasses.
+
+        Args:
+            *input: the input training data for the model
+
+        Returns:
+            out: the outputs of the forward propagation.
+        """
+        raise NotImplementedError
+
+    def train_one_batch(self, *input):
+        raise NotImplementedError
+
+    def train(self, mode=True):
+        """Set the model in training mode.
+
+        Args:
+            mode(bool): when mode is True, this model will enter training mode
+        """
+        self.training = mode
+        autograd.training = mode
+
+    def eval(self):
+        """Sets the model in evaluation mode.
+        """
+        self.train(mode=False)
+
+    def graph(self, mode=True, sequential=False):
+        """ Turn on the computational graph. Specify execution mode.
+
+        Args:
+            mode(bool): when mode is True, model will use computational graph
+            sequential(bool): when sequential is True, the model will execute
+                ops in the graph in the order in which they joined the graph
+        """
+        self.graph_mode = mode
+        self.sequential = sequential
+
+    def on_device(self, device):
+        """Sets the target device.
+
+        The following training will be performed on that device.
+
+        Args:
+            device(Device): the target device
+        """
+        self._device = device
+
+    def __get_name__(self):
+        return self.__class__.__name__
+
+    def __call__(self, *input, **kwargs):
+        if self.training:
+            return self.train_one_batch(*input, **kwargs)
+        else:
+            return self.forward(*input, **kwargs)
+
+    def save_states(self, fpath, aux_states={}):
+        """Save states.
+
+        Args:
+            fpath: output file path (without the extension)
+            aux_states(dict): values are standard data types or Tensor,
+                              e.g., epoch ID, learning rate, optimizer states
+        """
+        assert not os.path.isfile(fpath), (
+            "Failed to save states, %s already exists." % fpath)
+
+        states = self.get_states()
+
+        # save states data and attr
+        tensor_dict = {}
+        states_attr = {}
+        for k, v in states.items():
+            assert isinstance(v, tensor.Tensor), "Only tensor state is allowed"

Review comment:
       For non-tensor states, we can just dump them into the JSON string.
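
   A rough sketch of that split, saving Tensor states into an .npz entry and the remaining standard-Python-type states as JSON inside the same zip archive; the entry names and layout here are illustrative, not the PR's exact format:

```python
import io
import json
import zipfile

import numpy as np
from singa import tensor

def save_states(self, fpath, aux_states={}):
    states = dict(self.get_states(), **aux_states)

    tensor_dict = {}   # Tensor states -> .npz entry
    states_attr = {}   # non-tensor states (standard python types) -> .json entry
    for k, v in states.items():
        if isinstance(v, tensor.Tensor):
            tensor_dict[k] = tensor.to_numpy(v)
        else:
            states_attr[k] = v      # e.g. current step, initial learning rate

    buf = io.BytesIO()
    np.savez(buf, **tensor_dict)

    with zipfile.ZipFile(fpath + '.zip', mode='w') as zf:
        zf.writestr('tensor_dict.npz', buf.getvalue())
        zf.writestr('states_attr.json', json.dumps(states_attr))
```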

##########
File path: python/singa/layer.py
##########
@@ -8,1443 +8,1279 @@
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 # =============================================================================
-""" Python layers wrap the C++ layers to provide simpler construction APIs.
-
-Example usages::
-
-    from singa import layer
-    from singa import tensor
-    from singa import device
-
-    layer.engine = 'cudnn'  # to use cudnn layers
-    dev = device.create_cuda_gpu()
-
-    # create a convolution layer
-    conv = layer.Conv2D('conv', 32, 3, 1, pad=1, input_sample_shape=(3, 32, 32))
-
-    # init param values
-    w, b = conv.param_values()
-    w.guassian(0, 0.01)
-    b.set_value(0)
-    conv.to_device(dev)  # move the layer data onto a CudaGPU device
-
-    x = tensor.Tensor((3, 32, 32), dev)
-    x.uniform(-1, 1)
-    y = conv.foward(True, x)
-
-    dy = tensor.Tensor()
-    dy.reset_like(y)
-    dy.set_value(0.1)
-    # dp is a list of tensors for parameter gradients
-    dx, dp = conv.backward(kTrain, dy)
-"""
-from __future__ import division
-from __future__ import absolute_import
-
-from builtins import str
-from builtins import range
-from builtins import object
-from builtins import set
-
-from . import singa_wrap
-from .proto import model_pb2
-from . import tensor
-
-engine = 'cudnn'
-'''engine is the prefix of layer identifier.
-
-The value could be one of [**'cudnn', 'singacpp', 'singacuda', 'singacl'**], for
-layers implemented using the cudnn library, Cpp, Cuda and OpenCL respectively.
-For example, CudnnConvolution layer is identified by 'cudnn_convolution';
-'singacpp_convolution' is for Convolution layer;
-Some layers' implementation use only Tensor functions, thererfore they are
-transparent to the underlying devices. For threse layers, they would have
-multiple identifiers, e.g., singacpp_dropout, singacuda_dropout and
-singacl_dropout are all for the Dropout layer. In addition, it has an extra
-identifier 'singa', i.e. 'singa_dropout' also stands for the Dropout layer.
-
-engine is case insensitive. Each python layer would create the correct specific
-layer using the engine attribute.
-'''
-
-if singa_wrap.USE_CUDNN:
-    cudnn_version = singa_wrap.CUDNN_VERSION
-else:
-    cudnn_version = 0
-
-
-class Layer(object):
-    '''Base Python layer class.
-
-    Typically, the life cycle of a layer instance includes:
-        1. construct layer without input_sample_shapes, goto 2;
-           construct layer with input_sample_shapes, goto 3;
-        2. call setup to create the parameters and setup other meta fields
-        3. call forward or access layer members
-        4. call backward and get parameters for update
-
-    Args:
-        name (str): layer name
-    '''
-
-    def __init__(self, name, conf=None, **kwargs):
-        if conf is None:
-            self.layer = None  # layer converted by swig
-            self.name = name  # TODO(wangwei) duplicate with self.conf.name
-            self.conf = model_pb2.LayerConf()
-            self.conf.name = name
-            self.param_specs = []
-        else:
-            self.conf = conf
-            self.name = conf.name
-            self.caffe_layer()
-            self.param_specs = []
-
-            # convert caffe proto into singa proto format
-            #   case1: parameters of conv and dense layers
-            #   case2: type of activation layers
-            if (conf.type == 'Convolution' or conf.type == 4) or \
-                    (conf.type == 'InnerProduct' or conf.type == 14):
-                w, b = _construct_param_specs_from_caffe_proto(conf)
-                del conf.param[:]
-                conf.param.extend([w, b])
-                self.param_specs.append(w)
-                self.param_specs.append(b)
-                # print 'conf:\n', conf
-            if conf.type == 'Pooling':
-                conf.pooling_conf.ceil = True
-                # print 'conf:\n', conf
-            elif (conf.type == 'ReLU' or conf.type == 18 or
-                  conf.type == 'Sigmoid' or conf.type == 19 or
-                  conf.type == 'TanH' or conf.type == 23):
-                conf.type = (engine + '_' + conf.type).lower()
-            self.conf = conf
-
-        self.has_setup = False
-
-    def setup(self, in_shapes):
-        '''Call the C++ setup function to create params and set some meta data.
-
-        Args:
-            in_shapes: if the layer accepts a single input Tensor, in_shapes is
-                a single tuple specifying the inpute Tensor shape; if the layer
-                accepts multiple input Tensor (e.g., the concatenation layer),
-                in_shapes is a tuple of tuples, each for one input Tensor
-        '''
-        if self.has_setup:
-            return
-        if type(in_shapes[0]) is tuple:
-            self.layer.SetupWithMultInputs([list(s) for s in in_shapes],
-                                           self.conf.SerializeToString())
-        else:
-            self.layer.Setup(list(in_shapes), self.conf.SerializeToString())
-        self.has_setup = True
-
-    def caffe_layer(self):
-        '''
-        Create a singa layer based on caffe layer configuration.
-        '''
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
-        if self.conf.type == 'InnerProduct' or self.conf.type == 14:
-            self.layer = _create_layer(engine, 'Dense')
-        else:
-            self.layer = _create_layer(engine, self.conf.type)
-
-    def get_output_sample_shape(self):
-        '''Called after setup to get the shape of the output sample(s).
-
-        Returns:
-            a tuple for a single output Tensor or a list of tuples if this layer
-            has multiple outputs
-        '''
-        assert self.has_setup, \
-            'Must call setup() before get_output_sample_shape()'
-        return self.layer.GetOutputSampleShape()
-
-    def param_names(self):
-        '''
-        Returns:
-            a list of strings, one for the name of one parameter Tensor
-        '''
-        names = []
-        for x in self.param_specs:
-            names.append(x.name)
-        return names
-
-    def param_values(self):
-        '''Return param value tensors.
-
-        Parameter tensors are not stored as layer members because cpp Tensor
-        could be moved onto diff devices due to the change of layer device,
-        which would result in inconsistency.
-
-        Returns:
-            a list of tensors, one for each paramter
-        '''
-        if self.layer is None:
-            return []
-        else:
-            return tensor.from_raw_tensors(self.layer.param_values())
 
-    def forward(self, flag, x):
-        '''Forward propagate through this layer.
+import math
+import numpy as np
+from functools import wraps
 
-        Args:
-            flag: True (kTrain) for training (kEval); False for evaluating;
-                other values for furture use.
-            x (Tensor or list<Tensor>): an input tensor if the layer is
-                connected from a single layer; a list of tensors if the layer
-                is connected from multiple layers.
-
-        Return:
-            a tensor if the layer is connected to a single layer; a list of
-            tensors if the layer is connected to multiple layers;
-        '''
-        assert self.has_setup, 'Must call setup() before forward()'
-        if type(flag) is bool:
-            if flag:
-                flag = model_pb2.kTrain
-            else:
-                flag = model_pb2.kEval
-        if type(x) is list:
-            xs = [t.data for t in x]
-            y = self.layer.ForwardWithMultInputs(flag, xs)
-        else:
-            assert isinstance(x, tensor.Tensor), \
-                'input of %s (type:%s) must be a Tensor or Tensor list'\
-                % (self.name, type(x).__name__)
-            y = self.layer.Forward(flag, x.data)
-        if type(y) is tuple:
-            return tensor.from_raw_tensors(y)
-        else:
-            return tensor.from_raw_tensor(y)
+from singa import utils
+from .tensor import Tensor
+from . import singa_wrap as singa
 
-    def backward(self, flag, dy):
-        '''Backward propagate gradients through this layer.
 
-        Args:
-            flag (int): for future use.
-            dy (Tensor or list<Tensor>): the gradient tensor(s) y w.r.t the
-                objective loss
-        Return:
-            <dx, <dp1, dp2..>>, dx is a (set of) tensor(s) for the gradient of x
-            , dpi is the gradient of the i-th parameter
-        '''
-        if type(flag) is bool:
-            if flag:
-                flag = model_pb2.kTrain
-            else:
-                flag = model_pb2.kEval
+class LayerMeta(type):
 
-        if type(dy) == list:
-            dys = [t.data for t in dy]
-            ret = self.layer.BackwardWithMultInputs(flag, dys)
-        else:
-            assert isinstance(dy, tensor.Tensor), \
-                'input of %s (type:%s) must be a Tensor or Tensor list'\
-                % (self.name, type(dy).__name__)
-            dys = dy.data
-            ret = self.layer.Backward(flag, dys)
-        if type(ret[0]) is tuple:
-            dxs = tensor.from_raw_tensors(ret[0])
-        else:
-            dxs = tensor.from_raw_tensor(ret[0])
-        return dxs, tensor.from_raw_tensors(ret[1])
+    def init_wrapper(func):
 
-    def to_device(self, device):
-        '''Move layer state tensors onto the given device.
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if len(args) == 0:
+                return
 
-        Args:
-            device: swig converted device, created using singa.device
-        '''
-        if self.layer is not None:
-            self.layer.ToDevice(device)
+            if isinstance(args[0], list):
+                assert len(args[0]) > 0 and isinstance(args[0][0], Tensor), (
+                    'initialize function expects PlaceHolders or Tensors')
+                dev = args[0][0].device
+            else:
+                assert len(args) > 0 and isinstance(args[0], Tensor), (
+                    'initialize function expects PlaceHolders or Tensors')
+                dev = args[0].device
 
-    def as_type(self, dtype):
-        pass
+            self._get_unique_name()
+            prev_state = dev.graph_enabled()
+            dev.EnableGraph(False)
+            func(self, *args, **kwargs)
+            self._initialized = True
+            dev.EnableGraph(prev_state)
 
-    def __copy__(self):
-        pass
+        return wrapper
 
-    def __deepcopy__(self, memo):
-        pass
+    def __new__(cls, name, bases, attr):
+        if 'initialize' in attr:
+            attr['initialize'] = LayerMeta.init_wrapper(attr['initialize'])
 
+        return super(LayerMeta, cls).__new__(cls, name, bases, attr)
 
-class Dummy(Layer):
-    '''A dummy layer that does nothing but just forwards/backwards the data
-    (the input/output is a single tensor).
-    '''
-
-    def __init__(self, name, input_sample_shape=None):
-        super(Dummy, self).__init__(name)
-        self.output_sample_shape = input_sample_shape
-
-    def get_output_sample_shape(self):
-        return self.output_sample_shape
-
-    def setup(self, input_sample_shape):
-        self.output_sample_shape = input_sample_shape
-        self.has_setup = True
-
-    def forward(self, flag, x):
-        '''Return the input x'''
-        return x
-
-    def backward(self, falg, dy):
-        '''Return dy, []'''
-        return dy, []
-
-
-class Conv2D(Layer):
-    """Construct a layer for 2D convolution.
-
-    Args:
-        nb_kernels (int): num of the channels (kernels) of the input Tensor
-        kernel: an integer or a pair of integers for kernel height and width
-        stride: an integer or a pair of integers for stride height and width
-        border_mode (string): padding mode, case in-sensitive,
-            'valid' -> padding is 0 for height and width
-            'same' -> padding is half of the kernel (floor), the kernel must be
-            odd number.
-        cudnn_prefer (string): the preferred algorithm for cudnn convolution
-            which could be 'fastest', 'autotune', 'limited_workspace' and
-            'no_workspace'
-        workspace_byte_limit(int): max workspace size in MB (default is 512MB)
-        data_format (string): either 'NCHW' or 'NHWC'
-        use_bias (bool): True or False
-        pad: an integer or a pair of integers for padding height and width
-        W_specs (dict): used to specify the weight matrix specs, fields
-            include,
-            'name' for parameter name
-            'lr_mult' for learning rate multiplier
-            'decay_mult' for weight decay multiplier
-            'init' for init method, which could be 'gaussian', 'uniform',
-            'xavier' and ''
-            'std', 'mean', 'high', 'low' for corresponding init methods
-            TODO(wangwei) 'clamp' for gradient constraint, value is scalar
-            'regularizer' for regularization, currently support 'l2'
-        b_specs (dict): hyper-parameters for bias vector, similar as W_specs
-        name (string): layer name.
-        input_sample_shape: 3d tuple for the shape of the input Tensor
-            without the batchsize, e.g., (channel, height, width) or
-            (height, width, channel)
-    """
 
-    def __init__(self,
-                 name,
-                 nb_kernels,
-                 kernel=3,
-                 stride=1,
-                 border_mode='same',
-                 cudnn_prefer='fastest',
-                 workspace_byte_limit=1024,
-                 data_format='NCHW',
-                 use_bias=True,
-                 W_specs=None,
-                 b_specs=None,
-                 pad=None,
-                 input_sample_shape=None):
-        super(Conv2D, self).__init__(name)
-        assert data_format == 'NCHW', 'Not supported data format: %s ' \
-            'only "NCHW" is enabled currently' % (data_format)
-        conf = self.conf.convolution_conf
-        conf.num_output = nb_kernels
-        conf.prefer = cudnn_prefer
-        conf.workspace_byte_limit = workspace_byte_limit
-        self.kernel = kernel
-        self.stride = stride
-        self.pad = pad
-        self.border_mode = border_mode
-        conf.bias_term = use_bias
-        # TODO(wangwei) enable data format for cpp code
-        # conf.data_format = data_format
-        if W_specs is None:
-            W_specs = {'init': 'xavier'}
-        if 'name' not in W_specs:
-            W_specs['name'] = name + '/weight'
-        wspecs = _construct_param_specs_from_dict(W_specs)
-        self.conf.param.extend([wspecs])
-        self.param_specs.append(wspecs)
-        if use_bias:
-            if b_specs is None:
-                b_specs = {'init': 'constant'}
-            if 'name' not in b_specs:
-                b_specs['name'] = name + '/bias'
-            bspecs = _construct_param_specs_from_dict(b_specs)
-            self.conf.param.extend([bspecs])
-            self.param_specs.append(bspecs)
-
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
-        self.layer = _create_layer(engine, 'Convolution')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
-
-    def setup(self, in_shape):
-        '''Set up the kernel, stride and padding; then call the C++ setup
-        function to create params and set some meta data.
+class Layer(object, metaclass=LayerMeta):
 
-        Args:
-                in_shapes is a tuple of int for the input sample shape
-        '''
-        if self.has_setup:
-            return
-        _set_kernel_stride_pad(self.conf.convolution_conf, self.kernel,
-                               self.stride, self.border_mode, self.pad,
-                               in_shape)
-        self.layer.Setup(list(in_shape), self.conf.SerializeToString())
-        self.has_setup = True
-
-
-class Conv1D(Conv2D):
-    """Construct a layer for 1D convolution.
-
-    Most of the args are the same as those for Conv2D except the kernel,
-    stride, pad, which is a scalar instead of a tuple.
-    input_sample_shape is a tuple with a single value for the input feature
-    length
-    """
+    sep = '.'
 
-    def __init__(self,
-                 name,
-                 nb_kernels,
-                 kernel=3,
-                 stride=1,
-                 border_mode='same',
-                 cudnn_prefer='fastest',
-                 workspace_byte_limit=1024,
-                 use_bias=True,
-                 W_specs={'init': 'Xavier'},
-                 b_specs={
-                     'init': 'Constant',
-                     'value': 0
-                 },
-                 pad=None,
-                 input_sample_shape=None):
-        pad = None
-        if pad is not None:
-            pad = (0, pad)
-        if input_sample_shape is not None:
-            input_sample_shape = (1, 1, input_sample_shape[0])
-        super(Conv1D, self).__init__(name,
-                                     nb_kernels, (1, kernel), (0, stride),
-                                     border_mode,
-                                     cudnn_prefer,
-                                     workspace_byte_limit,
-                                     use_bias=use_bias,
-                                     pad=pad,
-                                     W_specs=W_specs,
-                                     b_specs=b_specs,
-                                     input_sample_shape=input_sample_shape)
-
-    def get_output_sample_shape(self):
-        shape = self.layer.GetOutputSampleShape()
-        assert len(shape) == 3, 'The output sample shape should be 3D.'\
-            'But the length is %d' % len(shape)
-        return (shape[0], shape[2])
-
-
-class Pooling2D(Layer):
-    '''2D pooling layer providing max/avg pooling.
-
-    All args are the same as those for Conv2D, except the following one
-
-    Args:
-        mode: pooling type, model_pb2.PoolingConf.MAX or
-            model_pb2.PoolingConf.AVE
-
-    '''
+    def __init__(self):
+        self.name = self.__class__.__name__
+        self._initialized = False
+        self._parent = None
+        self._layers = dict()
 
-    def __init__(self,
-                 name,
-                 mode,
-                 kernel=3,
-                 stride=2,
-                 border_mode='same',
-                 pad=None,
-                 data_format='NCHW',
-                 input_sample_shape=None):
-        super(Pooling2D, self).__init__(name)
-        assert data_format == 'NCHW', 'Not supported data format: %s ' \
-            'only "NCHW" is enabled currently' % (data_format)
-        conf = self.conf.pooling_conf
-        conf.pool = mode
-        self.kernel = kernel
-        self.stride = stride
-        self.pad = pad
-        self.border_mode = border_mode
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
-        self.layer = _create_layer(engine, 'Pooling')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
+    def initialize(self, *input):
+        pass
 
-    def setup(self, in_shape):
-        '''Set up the kernel, stride and padding; then call the C++ setup
-        function to create params and set some meta data.
+    def forward(self, *input):
+        pass
 
-        Args:
-            in_shapes is a tuple of int for the input sample shape
-        '''
-        if self.has_setup:
-            return
-        _set_kernel_stride_pad(self.conf.pooling_conf, self.kernel, self.stride,
-                               self.border_mode, self.pad, in_shape)
-        self.layer.Setup(list(in_shape), self.conf.SerializeToString())
-        self.has_setup = True
+    def __call__(self, *args, **kwargs):
+        if not self._initialized:
+            self.initialize(*args, **kwargs)
+            self._initialized = True
+
+        return self.forward(*args, **kwargs)
+
+    def get_params(self):
+        params = dict()
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            params.update(sublayer.get_params())
+        return params
+
+    def set_params(self, parameters):
+        # set parameters for Layer
+        # input should be either a PyTensor or numpy ndarray.
+        # examples: Layer.set_params(W=np.ones((in, out), dtype=np.float32)),
+        # Layer.set_params(**{'block1':{'linear1':{'W':np.ones((in, out),
+        # dtype=np.float32)}}})
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            sublayer.set_params(parameters)
+
+    def get_states(self):
+        states = dict()
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            states.update(sublayer.get_states())
+        states.update(self.get_params())
+        return states
+
+    def set_states(self, states):
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            sublayer.set_states(states)
+        self.set_params(states)
+
+    def device_check(self, *inputs):
+        x_device = inputs[0].device
+        x_dev_id = x_device.id()
+        for var in inputs:
+            if var.device.id() != x_dev_id:
+                var.to_device(x_device)
+
+    def set_attribute(self, attribute, attribute_value):

Review comment:
       Can Python call this function implicitly when we do
   `self.W = parameters[self.W.name]`?
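
   For that to happen implicitly, the assignment would need to be intercepted by overriding `__setattr__`; a rough sketch only, assuming Tensor exposes a `copy_data()`-style in-place copy, and independent of what the PR's `set_attribute()` actually does:

```python
from singa.tensor import Tensor

class Layer(object):

    def __setattr__(self, name, value):
        # if the attribute already holds a parameter Tensor and a Tensor is
        # assigned to it, copy the values in place instead of rebinding, so
        # `self.W = parameters[self.W.name]` behaves like set_attribute()
        current = self.__dict__.get(name)
        if isinstance(current, Tensor) and isinstance(value, Tensor):
            current.copy_data(value)
        else:
            object.__setattr__(self, name, value)
```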

##########
File path: python/singa/layer.py
##########
@@ -8,1443 +8,1279 @@
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 # =============================================================================
-""" Python layers wrap the C++ layers to provide simpler construction APIs.
-
-Example usages::
-
-    from singa import layer
-    from singa import tensor
-    from singa import device
-
-    layer.engine = 'cudnn'  # to use cudnn layers
-    dev = device.create_cuda_gpu()
-
-    # create a convolution layer
-    conv = layer.Conv2D('conv', 32, 3, 1, pad=1, input_sample_shape=(3, 32, 32))
-
-    # init param values
-    w, b = conv.param_values()
-    w.guassian(0, 0.01)
-    b.set_value(0)
-    conv.to_device(dev)  # move the layer data onto a CudaGPU device
-
-    x = tensor.Tensor((3, 32, 32), dev)
-    x.uniform(-1, 1)
-    y = conv.foward(True, x)
-
-    dy = tensor.Tensor()
-    dy.reset_like(y)
-    dy.set_value(0.1)
-    # dp is a list of tensors for parameter gradients
-    dx, dp = conv.backward(kTrain, dy)
-"""
-from __future__ import division
-from __future__ import absolute_import
-
-from builtins import str
-from builtins import range
-from builtins import object
-from builtins import set
-
-from . import singa_wrap
-from .proto import model_pb2
-from . import tensor
-
-engine = 'cudnn'
-'''engine is the prefix of layer identifier.
-
-The value could be one of [**'cudnn', 'singacpp', 'singacuda', 'singacl'**], for
-layers implemented using the cudnn library, Cpp, Cuda and OpenCL respectively.
-For example, CudnnConvolution layer is identified by 'cudnn_convolution';
-'singacpp_convolution' is for Convolution layer;
-Some layers' implementation use only Tensor functions, thererfore they are
-transparent to the underlying devices. For threse layers, they would have
-multiple identifiers, e.g., singacpp_dropout, singacuda_dropout and
-singacl_dropout are all for the Dropout layer. In addition, it has an extra
-identifier 'singa', i.e. 'singa_dropout' also stands for the Dropout layer.
-
-engine is case insensitive. Each python layer would create the correct specific
-layer using the engine attribute.
-'''
-
-if singa_wrap.USE_CUDNN:
-    cudnn_version = singa_wrap.CUDNN_VERSION
-else:
-    cudnn_version = 0
-
-
-class Layer(object):
-    '''Base Python layer class.
-
-    Typically, the life cycle of a layer instance includes:
-        1. construct layer without input_sample_shapes, goto 2;
-           construct layer with input_sample_shapes, goto 3;
-        2. call setup to create the parameters and setup other meta fields
-        3. call forward or access layer members
-        4. call backward and get parameters for update
-
-    Args:
-        name (str): layer name
-    '''
-
-    def __init__(self, name, conf=None, **kwargs):
-        if conf is None:
-            self.layer = None  # layer converted by swig
-            self.name = name  # TODO(wangwei) duplicate with self.conf.name
-            self.conf = model_pb2.LayerConf()
-            self.conf.name = name
-            self.param_specs = []
-        else:
-            self.conf = conf
-            self.name = conf.name
-            self.caffe_layer()
-            self.param_specs = []
-
-            # convert caffe proto into singa proto format
-            #   case1: parameters of conv and dense layers
-            #   case2: type of activation layers
-            if (conf.type == 'Convolution' or conf.type == 4) or \
-                    (conf.type == 'InnerProduct' or conf.type == 14):
-                w, b = _construct_param_specs_from_caffe_proto(conf)
-                del conf.param[:]
-                conf.param.extend([w, b])
-                self.param_specs.append(w)
-                self.param_specs.append(b)
-                # print 'conf:\n', conf
-            if conf.type == 'Pooling':
-                conf.pooling_conf.ceil = True
-                # print 'conf:\n', conf
-            elif (conf.type == 'ReLU' or conf.type == 18 or
-                  conf.type == 'Sigmoid' or conf.type == 19 or
-                  conf.type == 'TanH' or conf.type == 23):
-                conf.type = (engine + '_' + conf.type).lower()
-            self.conf = conf
-
-        self.has_setup = False
-
-    def setup(self, in_shapes):
-        '''Call the C++ setup function to create params and set some meta data.
-
-        Args:
-            in_shapes: if the layer accepts a single input Tensor, in_shapes is
-                a single tuple specifying the inpute Tensor shape; if the layer
-                accepts multiple input Tensor (e.g., the concatenation layer),
-                in_shapes is a tuple of tuples, each for one input Tensor
-        '''
-        if self.has_setup:
-            return
-        if type(in_shapes[0]) is tuple:
-            self.layer.SetupWithMultInputs([list(s) for s in in_shapes],
-                                           self.conf.SerializeToString())
-        else:
-            self.layer.Setup(list(in_shapes), self.conf.SerializeToString())
-        self.has_setup = True
-
-    def caffe_layer(self):
-        '''
-        Create a singa layer based on caffe layer configuration.
-        '''
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
-        if self.conf.type == 'InnerProduct' or self.conf.type == 14:
-            self.layer = _create_layer(engine, 'Dense')
-        else:
-            self.layer = _create_layer(engine, self.conf.type)
-
-    def get_output_sample_shape(self):
-        '''Called after setup to get the shape of the output sample(s).
-
-        Returns:
-            a tuple for a single output Tensor or a list of tuples if this layer
-            has multiple outputs
-        '''
-        assert self.has_setup, \
-            'Must call setup() before get_output_sample_shape()'
-        return self.layer.GetOutputSampleShape()
-
-    def param_names(self):
-        '''
-        Returns:
-            a list of strings, one for the name of one parameter Tensor
-        '''
-        names = []
-        for x in self.param_specs:
-            names.append(x.name)
-        return names
-
-    def param_values(self):
-        '''Return param value tensors.
-
-        Parameter tensors are not stored as layer members because cpp Tensor
-        could be moved onto diff devices due to the change of layer device,
-        which would result in inconsistency.
-
-        Returns:
-            a list of tensors, one for each paramter
-        '''
-        if self.layer is None:
-            return []
-        else:
-            return tensor.from_raw_tensors(self.layer.param_values())
 
-    def forward(self, flag, x):
-        '''Forward propagate through this layer.
+import math
+import numpy as np
+from functools import wraps
 
-        Args:
-            flag: True (kTrain) for training (kEval); False for evaluating;
-                other values for furture use.
-            x (Tensor or list<Tensor>): an input tensor if the layer is
-                connected from a single layer; a list of tensors if the layer
-                is connected from multiple layers.
-
-        Return:
-            a tensor if the layer is connected to a single layer; a list of
-            tensors if the layer is connected to multiple layers;
-        '''
-        assert self.has_setup, 'Must call setup() before forward()'
-        if type(flag) is bool:
-            if flag:
-                flag = model_pb2.kTrain
-            else:
-                flag = model_pb2.kEval
-        if type(x) is list:
-            xs = [t.data for t in x]
-            y = self.layer.ForwardWithMultInputs(flag, xs)
-        else:
-            assert isinstance(x, tensor.Tensor), \
-                'input of %s (type:%s) must be a Tensor or Tensor list'\
-                % (self.name, type(x).__name__)
-            y = self.layer.Forward(flag, x.data)
-        if type(y) is tuple:
-            return tensor.from_raw_tensors(y)
-        else:
-            return tensor.from_raw_tensor(y)
+from singa import utils
+from .tensor import Tensor
+from . import singa_wrap as singa
 
-    def backward(self, flag, dy):
-        '''Backward propagate gradients through this layer.
 
-        Args:
-            flag (int): for future use.
-            dy (Tensor or list<Tensor>): the gradient tensor(s) y w.r.t the
-                objective loss
-        Return:
-            <dx, <dp1, dp2..>>, dx is a (set of) tensor(s) for the gradient of x
-            , dpi is the gradient of the i-th parameter
-        '''
-        if type(flag) is bool:
-            if flag:
-                flag = model_pb2.kTrain
-            else:
-                flag = model_pb2.kEval
+class LayerMeta(type):
 
-        if type(dy) == list:
-            dys = [t.data for t in dy]
-            ret = self.layer.BackwardWithMultInputs(flag, dys)
-        else:
-            assert isinstance(dy, tensor.Tensor), \
-                'input of %s (type:%s) must be a Tensor or Tensor list'\
-                % (self.name, type(dy).__name__)
-            dys = dy.data
-            ret = self.layer.Backward(flag, dys)
-        if type(ret[0]) is tuple:
-            dxs = tensor.from_raw_tensors(ret[0])
-        else:
-            dxs = tensor.from_raw_tensor(ret[0])
-        return dxs, tensor.from_raw_tensors(ret[1])
+    def init_wrapper(func):
 
-    def to_device(self, device):
-        '''Move layer state tensors onto the given device.
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if len(args) == 0:
+                return
 
-        Args:
-            device: swig converted device, created using singa.device
-        '''
-        if self.layer is not None:
-            self.layer.ToDevice(device)
+            if isinstance(args[0], list):
+                assert len(args) > 0 and isinstance(args[0][0], Tensor), (
+                    'initialize function expects PlaceHolders or Tensors')
+                dev = args[0][0].device
+            else:
+                assert len(args) > 0 and isinstance(args[0], Tensor), (
+                    'initialize function expects PlaceHolders or Tensors')
+                dev = args[0].device
 
-    def as_type(self, dtype):
-        pass
+            self._get_unique_name()
+            prev_state = dev.graph_enabled()
+            dev.EnableGraph(False)
+            func(self, *args, **kwargs)
+            self._initialized = True
+            dev.EnableGraph(prev_state)
 
-    def __copy__(self):
-        pass
+        return wrapper
 
-    def __deepcopy__(self, memo):
-        pass
+    def __new__(cls, name, bases, attr):
+        if 'initialize' in attr:
+            attr['initialize'] = LayerMeta.init_wrapper(attr['initialize'])
 
+        return super(LayerMeta, cls).__new__(cls, name, bases, attr)
 
-class Dummy(Layer):
-    '''A dummy layer that does nothing but just forwards/backwards the data
-    (the input/output is a single tensor).
-    '''
-
-    def __init__(self, name, input_sample_shape=None):
-        super(Dummy, self).__init__(name)
-        self.output_sample_shape = input_sample_shape
-
-    def get_output_sample_shape(self):
-        return self.output_sample_shape
-
-    def setup(self, input_sample_shape):
-        self.output_sample_shape = input_sample_shape
-        self.has_setup = True
-
-    def forward(self, flag, x):
-        '''Return the input x'''
-        return x
-
-    def backward(self, falg, dy):
-        '''Return dy, []'''
-        return dy, []
-
-
-class Conv2D(Layer):
-    """Construct a layer for 2D convolution.
-
-    Args:
-        nb_kernels (int): num of the channels (kernels) of the input Tensor
-        kernel: an integer or a pair of integers for kernel height and width
-        stride: an integer or a pair of integers for stride height and width
-        border_mode (string): padding mode, case in-sensitive,
-            'valid' -> padding is 0 for height and width
-            'same' -> padding is half of the kernel (floor), the kernel must be
-            odd number.
-        cudnn_prefer (string): the preferred algorithm for cudnn convolution
-            which could be 'fastest', 'autotune', 'limited_workspace' and
-            'no_workspace'
-        workspace_byte_limit(int): max workspace size in MB (default is 512MB)
-        data_format (string): either 'NCHW' or 'NHWC'
-        use_bias (bool): True or False
-        pad: an integer or a pair of integers for padding height and width
-        W_specs (dict): used to specify the weight matrix specs, fields
-            include,
-            'name' for parameter name
-            'lr_mult' for learning rate multiplier
-            'decay_mult' for weight decay multiplier
-            'init' for init method, which could be 'gaussian', 'uniform',
-            'xavier' and ''
-            'std', 'mean', 'high', 'low' for corresponding init methods
-            TODO(wangwei) 'clamp' for gradient constraint, value is scalar
-            'regularizer' for regularization, currently support 'l2'
-        b_specs (dict): hyper-parameters for bias vector, similar as W_specs
-        name (string): layer name.
-        input_sample_shape: 3d tuple for the shape of the input Tensor
-            without the batchsize, e.g., (channel, height, width) or
-            (height, width, channel)
-    """
 
-    def __init__(self,
-                 name,
-                 nb_kernels,
-                 kernel=3,
-                 stride=1,
-                 border_mode='same',
-                 cudnn_prefer='fastest',
-                 workspace_byte_limit=1024,
-                 data_format='NCHW',
-                 use_bias=True,
-                 W_specs=None,
-                 b_specs=None,
-                 pad=None,
-                 input_sample_shape=None):
-        super(Conv2D, self).__init__(name)
-        assert data_format == 'NCHW', 'Not supported data format: %s ' \
-            'only "NCHW" is enabled currently' % (data_format)
-        conf = self.conf.convolution_conf
-        conf.num_output = nb_kernels
-        conf.prefer = cudnn_prefer
-        conf.workspace_byte_limit = workspace_byte_limit
-        self.kernel = kernel
-        self.stride = stride
-        self.pad = pad
-        self.border_mode = border_mode
-        conf.bias_term = use_bias
-        # TODO(wangwei) enable data format for cpp code
-        # conf.data_format = data_format
-        if W_specs is None:
-            W_specs = {'init': 'xavier'}
-        if 'name' not in W_specs:
-            W_specs['name'] = name + '/weight'
-        wspecs = _construct_param_specs_from_dict(W_specs)
-        self.conf.param.extend([wspecs])
-        self.param_specs.append(wspecs)
-        if use_bias:
-            if b_specs is None:
-                b_specs = {'init': 'constant'}
-            if 'name' not in b_specs:
-                b_specs['name'] = name + '/bias'
-            bspecs = _construct_param_specs_from_dict(b_specs)
-            self.conf.param.extend([bspecs])
-            self.param_specs.append(bspecs)
-
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
-        self.layer = _create_layer(engine, 'Convolution')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
-
-    def setup(self, in_shape):
-        '''Set up the kernel, stride and padding; then call the C++ setup
-        function to create params and set some meta data.
+class Layer(object, metaclass=LayerMeta):
 
-        Args:
-                in_shapes is a tuple of int for the input sample shape
-        '''
-        if self.has_setup:
-            return
-        _set_kernel_stride_pad(self.conf.convolution_conf, self.kernel,
-                               self.stride, self.border_mode, self.pad,
-                               in_shape)
-        self.layer.Setup(list(in_shape), self.conf.SerializeToString())
-        self.has_setup = True
-
-
-class Conv1D(Conv2D):
-    """Construct a layer for 1D convolution.
-
-    Most of the args are the same as those for Conv2D except the kernel,
-    stride, pad, which is a scalar instead of a tuple.
-    input_sample_shape is a tuple with a single value for the input feature
-    length
-    """
+    sep = '.'
 
-    def __init__(self,
-                 name,
-                 nb_kernels,
-                 kernel=3,
-                 stride=1,
-                 border_mode='same',
-                 cudnn_prefer='fastest',
-                 workspace_byte_limit=1024,
-                 use_bias=True,
-                 W_specs={'init': 'Xavier'},
-                 b_specs={
-                     'init': 'Constant',
-                     'value': 0
-                 },
-                 pad=None,
-                 input_sample_shape=None):
-        pad = None
-        if pad is not None:
-            pad = (0, pad)
-        if input_sample_shape is not None:
-            input_sample_shape = (1, 1, input_sample_shape[0])
-        super(Conv1D, self).__init__(name,
-                                     nb_kernels, (1, kernel), (0, stride),
-                                     border_mode,
-                                     cudnn_prefer,
-                                     workspace_byte_limit,
-                                     use_bias=use_bias,
-                                     pad=pad,
-                                     W_specs=W_specs,
-                                     b_specs=b_specs,
-                                     input_sample_shape=input_sample_shape)
-
-    def get_output_sample_shape(self):
-        shape = self.layer.GetOutputSampleShape()
-        assert len(shape) == 3, 'The output sample shape should be 3D.'\
-            'But the length is %d' % len(shape)
-        return (shape[0], shape[2])
-
-
-class Pooling2D(Layer):
-    '''2D pooling layer providing max/avg pooling.
-
-    All args are the same as those for Conv2D, except the following one
-
-    Args:
-        mode: pooling type, model_pb2.PoolingConf.MAX or
-            model_pb2.PoolingConf.AVE
-
-    '''
+    def __init__(self):
+        self.name = self.__class__.__name__
+        self._initialized = False
+        self._parent = None
+        self._layers = dict()
 
-    def __init__(self,
-                 name,
-                 mode,
-                 kernel=3,
-                 stride=2,
-                 border_mode='same',
-                 pad=None,
-                 data_format='NCHW',
-                 input_sample_shape=None):
-        super(Pooling2D, self).__init__(name)
-        assert data_format == 'NCHW', 'Not supported data format: %s ' \
-            'only "NCHW" is enabled currently' % (data_format)
-        conf = self.conf.pooling_conf
-        conf.pool = mode
-        self.kernel = kernel
-        self.stride = stride
-        self.pad = pad
-        self.border_mode = border_mode
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
-        self.layer = _create_layer(engine, 'Pooling')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
+    def initialize(self, *input):
+        pass
 
-    def setup(self, in_shape):
-        '''Set up the kernel, stride and padding; then call the C++ setup
-        function to create params and set some meta data.
+    def forward(self, *input):
+        pass
 
-        Args:
-            in_shapes is a tuple of int for the input sample shape
-        '''
-        if self.has_setup:
-            return
-        _set_kernel_stride_pad(self.conf.pooling_conf, self.kernel, self.stride,
-                               self.border_mode, self.pad, in_shape)
-        self.layer.Setup(list(in_shape), self.conf.SerializeToString())
-        self.has_setup = True
+    def __call__(self, *args, **kwargs):
+        if not self._initialized:
+            self.initialize(*args, **kwargs)
+            self._initialized = True
+
+        return self.forward(*args, **kwargs)
+
+    def get_params(self):
+        params = dict()
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            params.update(sublayer.get_params())
+        return params
+
+    def set_params(self, parameters):
+        # set parameters for Layer
+        # input should be either a PyTensor or numpy ndarray.
+        # examples: Layer.set_params(W=np.ones((in, out), dtype=np.float32)),
+        # Layer.set_params(**{'block1':{'linear1':{'W':np.ones((in, out),
+        # dtype=np.float32)}}})
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            sublayer.set_params(parameters)
+
+    def get_states(self):
+        states = dict()
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            states.update(sublayer.get_states())
+        states.update(self.get_params())
+        return states
+
+    def set_states(self, states):
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            sublayer.set_states(states)
+        self.set_params(states)
+
+    def device_check(self, *inputs):
+        x_device = inputs[0].device
+        x_dev_id = x_device.id()
+        for var in inputs:
+            if var.device.id() != x_dev_id:
+                var.to_device(x_device)
+
+    def set_attribute(self, attribute, attribute_value):
+        assert (attribute_value.shape == attribute.shape), "Shape mismatch."
+        if isinstance(attribute_value, Tensor):
+            attribute.reset_like(attribute_value)
+            attribute.copy_data(attribute_value)
+        elif isinstance(attribute_value, np.ndarray):
+            attribute.copy_from_numpy(attribute_value)
+        else:
+            raise ValueError("attributes should be Tensor or Numpy array.")
+
+    def _get_unique_name(self):
+        prefix = ''

Review comment:
       Is this function called recursively to set all sublayers' names?
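
Not an authoritative answer, but a minimal plain-Python sketch (hypothetical ToyLayer/Block/Model classes with a simplified __getattr__) of how the dotted names appear intended to compose: there is no recursion inside _get_unique_name itself; each layer only prepends its parent's name, so the full path emerges one level per call, assuming parents resolve their names before their children (e.g. the initialize wrapper triggering _get_unique_name top-down)::

    class ToyLayer:
        sep = '.'

        def __init__(self):
            self.name = self.__class__.__name__
            self._parent = None
            self._layers = {}

        def __setattr__(self, name, value):
            if isinstance(value, ToyLayer):
                # register sub-layers and rename them after the attribute
                self.__dict__['_layers'][name] = value
                value.__dict__['_parent'] = self
                value.__dict__['name'] = name
            else:
                object.__setattr__(self, name, value)

        def __getattr__(self, name):
            layers = self.__dict__.get('_layers', {})
            if name in layers:
                return layers[name]
            raise AttributeError(name)

        def _get_unique_name(self):
            # a single level of prefixing per call, no recursion
            prefix = self._parent.name + ToyLayer.sep if self._parent else ''
            self.name = prefix + self.name
            return self.name

    class Block(ToyLayer):

        def __init__(self):
            super().__init__()
            self.linear1 = ToyLayer()

    class Model(ToyLayer):

        def __init__(self):
            super().__init__()
            self.block1 = Block()

    m = Model()
    m._get_unique_name()                 # 'Model'
    m.block1._get_unique_name()          # 'Model.block1'
    m.block1.linear1._get_unique_name()  # 'Model.block1.linear1'
    print(m.block1.linear1.name)         # Model.block1.linear1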

##########
File path: python/singa/layer.py
##########
@@ -8,1443 +8,1279 @@
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 # =============================================================================
-""" Python layers wrap the C++ layers to provide simpler construction APIs.
-
-Example usages::
-
-    from singa import layer
-    from singa import tensor
-    from singa import device
-
-    layer.engine = 'cudnn'  # to use cudnn layers
-    dev = device.create_cuda_gpu()
-
-    # create a convolution layer
-    conv = layer.Conv2D('conv', 32, 3, 1, pad=1, input_sample_shape=(3, 32, 32))
-
-    # init param values
-    w, b = conv.param_values()
-    w.guassian(0, 0.01)
-    b.set_value(0)
-    conv.to_device(dev)  # move the layer data onto a CudaGPU device
-
-    x = tensor.Tensor((3, 32, 32), dev)
-    x.uniform(-1, 1)
-    y = conv.foward(True, x)
-
-    dy = tensor.Tensor()
-    dy.reset_like(y)
-    dy.set_value(0.1)
-    # dp is a list of tensors for parameter gradients
-    dx, dp = conv.backward(kTrain, dy)
-"""
-from __future__ import division
-from __future__ import absolute_import
-
-from builtins import str
-from builtins import range
-from builtins import object
-from builtins import set
-
-from . import singa_wrap
-from .proto import model_pb2
-from . import tensor
-
-engine = 'cudnn'
-'''engine is the prefix of layer identifier.
-
-The value could be one of [**'cudnn', 'singacpp', 'singacuda', 'singacl'**], for
-layers implemented using the cudnn library, Cpp, Cuda and OpenCL respectively.
-For example, CudnnConvolution layer is identified by 'cudnn_convolution';
-'singacpp_convolution' is for Convolution layer;
-Some layers' implementation use only Tensor functions, thererfore they are
-transparent to the underlying devices. For threse layers, they would have
-multiple identifiers, e.g., singacpp_dropout, singacuda_dropout and
-singacl_dropout are all for the Dropout layer. In addition, it has an extra
-identifier 'singa', i.e. 'singa_dropout' also stands for the Dropout layer.
-
-engine is case insensitive. Each python layer would create the correct specific
-layer using the engine attribute.
-'''
-
-if singa_wrap.USE_CUDNN:
-    cudnn_version = singa_wrap.CUDNN_VERSION
-else:
-    cudnn_version = 0
-
-
-class Layer(object):
-    '''Base Python layer class.
-
-    Typically, the life cycle of a layer instance includes:
-        1. construct layer without input_sample_shapes, goto 2;
-           construct layer with input_sample_shapes, goto 3;
-        2. call setup to create the parameters and setup other meta fields
-        3. call forward or access layer members
-        4. call backward and get parameters for update
-
-    Args:
-        name (str): layer name
-    '''
-
-    def __init__(self, name, conf=None, **kwargs):
-        if conf is None:
-            self.layer = None  # layer converted by swig
-            self.name = name  # TODO(wangwei) duplicate with self.conf.name
-            self.conf = model_pb2.LayerConf()
-            self.conf.name = name
-            self.param_specs = []
-        else:
-            self.conf = conf
-            self.name = conf.name
-            self.caffe_layer()
-            self.param_specs = []
-
-            # convert caffe proto into singa proto format
-            #   case1: parameters of conv and dense layers
-            #   case2: type of activation layers
-            if (conf.type == 'Convolution' or conf.type == 4) or \
-                    (conf.type == 'InnerProduct' or conf.type == 14):
-                w, b = _construct_param_specs_from_caffe_proto(conf)
-                del conf.param[:]
-                conf.param.extend([w, b])
-                self.param_specs.append(w)
-                self.param_specs.append(b)
-                # print 'conf:\n', conf
-            if conf.type == 'Pooling':
-                conf.pooling_conf.ceil = True
-                # print 'conf:\n', conf
-            elif (conf.type == 'ReLU' or conf.type == 18 or
-                  conf.type == 'Sigmoid' or conf.type == 19 or
-                  conf.type == 'TanH' or conf.type == 23):
-                conf.type = (engine + '_' + conf.type).lower()
-            self.conf = conf
-
-        self.has_setup = False
-
-    def setup(self, in_shapes):
-        '''Call the C++ setup function to create params and set some meta data.
-
-        Args:
-            in_shapes: if the layer accepts a single input Tensor, in_shapes is
-                a single tuple specifying the inpute Tensor shape; if the layer
-                accepts multiple input Tensor (e.g., the concatenation layer),
-                in_shapes is a tuple of tuples, each for one input Tensor
-        '''
-        if self.has_setup:
-            return
-        if type(in_shapes[0]) is tuple:
-            self.layer.SetupWithMultInputs([list(s) for s in in_shapes],
-                                           self.conf.SerializeToString())
-        else:
-            self.layer.Setup(list(in_shapes), self.conf.SerializeToString())
-        self.has_setup = True
-
-    def caffe_layer(self):
-        '''
-        Create a singa layer based on caffe layer configuration.
-        '''
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
-        if self.conf.type == 'InnerProduct' or self.conf.type == 14:
-            self.layer = _create_layer(engine, 'Dense')
-        else:
-            self.layer = _create_layer(engine, self.conf.type)
-
-    def get_output_sample_shape(self):
-        '''Called after setup to get the shape of the output sample(s).
-
-        Returns:
-            a tuple for a single output Tensor or a list of tuples if this layer
-            has multiple outputs
-        '''
-        assert self.has_setup, \
-            'Must call setup() before get_output_sample_shape()'
-        return self.layer.GetOutputSampleShape()
-
-    def param_names(self):
-        '''
-        Returns:
-            a list of strings, one for the name of one parameter Tensor
-        '''
-        names = []
-        for x in self.param_specs:
-            names.append(x.name)
-        return names
-
-    def param_values(self):
-        '''Return param value tensors.
-
-        Parameter tensors are not stored as layer members because cpp Tensor
-        could be moved onto diff devices due to the change of layer device,
-        which would result in inconsistency.
-
-        Returns:
-            a list of tensors, one for each paramter
-        '''
-        if self.layer is None:
-            return []
-        else:
-            return tensor.from_raw_tensors(self.layer.param_values())
 
-    def forward(self, flag, x):
-        '''Forward propagate through this layer.
+import math
+import numpy as np
+from functools import wraps
 
-        Args:
-            flag: True (kTrain) for training (kEval); False for evaluating;
-                other values for furture use.
-            x (Tensor or list<Tensor>): an input tensor if the layer is
-                connected from a single layer; a list of tensors if the layer
-                is connected from multiple layers.
-
-        Return:
-            a tensor if the layer is connected to a single layer; a list of
-            tensors if the layer is connected to multiple layers;
-        '''
-        assert self.has_setup, 'Must call setup() before forward()'
-        if type(flag) is bool:
-            if flag:
-                flag = model_pb2.kTrain
-            else:
-                flag = model_pb2.kEval
-        if type(x) is list:
-            xs = [t.data for t in x]
-            y = self.layer.ForwardWithMultInputs(flag, xs)
-        else:
-            assert isinstance(x, tensor.Tensor), \
-                'input of %s (type:%s) must be a Tensor or Tensor list'\
-                % (self.name, type(x).__name__)
-            y = self.layer.Forward(flag, x.data)
-        if type(y) is tuple:
-            return tensor.from_raw_tensors(y)
-        else:
-            return tensor.from_raw_tensor(y)
+from singa import utils
+from .tensor import Tensor
+from . import singa_wrap as singa
 
-    def backward(self, flag, dy):
-        '''Backward propagate gradients through this layer.
 
-        Args:
-            flag (int): for future use.
-            dy (Tensor or list<Tensor>): the gradient tensor(s) y w.r.t the
-                objective loss
-        Return:
-            <dx, <dp1, dp2..>>, dx is a (set of) tensor(s) for the gradient of x
-            , dpi is the gradient of the i-th parameter
-        '''
-        if type(flag) is bool:
-            if flag:
-                flag = model_pb2.kTrain
-            else:
-                flag = model_pb2.kEval
+class LayerMeta(type):
 
-        if type(dy) == list:
-            dys = [t.data for t in dy]
-            ret = self.layer.BackwardWithMultInputs(flag, dys)
-        else:
-            assert isinstance(dy, tensor.Tensor), \
-                'input of %s (type:%s) must be a Tensor or Tensor list'\
-                % (self.name, type(dy).__name__)
-            dys = dy.data
-            ret = self.layer.Backward(flag, dys)
-        if type(ret[0]) is tuple:
-            dxs = tensor.from_raw_tensors(ret[0])
-        else:
-            dxs = tensor.from_raw_tensor(ret[0])
-        return dxs, tensor.from_raw_tensors(ret[1])
+    def init_wrapper(func):
 
-    def to_device(self, device):
-        '''Move layer state tensors onto the given device.
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if len(args) == 0:
+                return
 
-        Args:
-            device: swig converted device, created using singa.device
-        '''
-        if self.layer is not None:
-            self.layer.ToDevice(device)
+            if isinstance(args[0], list):
+                assert len(args) > 0 and isinstance(args[0][0], Tensor), (
+                    'initialize function expects PlaceHolders or Tensors')
+                dev = args[0][0].device
+            else:
+                assert len(args) > 0 and isinstance(args[0], Tensor), (
+                    'initialize function expects PlaceHolders or Tensors')
+                dev = args[0].device
 
-    def as_type(self, dtype):
-        pass
+            self._get_unique_name()
+            prev_state = dev.graph_enabled()
+            dev.EnableGraph(False)
+            func(self, *args, **kwargs)
+            self._initialized = True
+            dev.EnableGraph(prev_state)
 
-    def __copy__(self):
-        pass
+        return wrapper
 
-    def __deepcopy__(self, memo):
-        pass
+    def __new__(cls, name, bases, attr):
+        if 'initialize' in attr:
+            attr['initialize'] = LayerMeta.init_wrapper(attr['initialize'])
 
+        return super(LayerMeta, cls).__new__(cls, name, bases, attr)
 
-class Dummy(Layer):
-    '''A dummy layer that does nothing but just forwards/backwards the data
-    (the input/output is a single tensor).
-    '''
-
-    def __init__(self, name, input_sample_shape=None):
-        super(Dummy, self).__init__(name)
-        self.output_sample_shape = input_sample_shape
-
-    def get_output_sample_shape(self):
-        return self.output_sample_shape
-
-    def setup(self, input_sample_shape):
-        self.output_sample_shape = input_sample_shape
-        self.has_setup = True
-
-    def forward(self, flag, x):
-        '''Return the input x'''
-        return x
-
-    def backward(self, falg, dy):
-        '''Return dy, []'''
-        return dy, []
-
-
-class Conv2D(Layer):
-    """Construct a layer for 2D convolution.
-
-    Args:
-        nb_kernels (int): num of the channels (kernels) of the input Tensor
-        kernel: an integer or a pair of integers for kernel height and width
-        stride: an integer or a pair of integers for stride height and width
-        border_mode (string): padding mode, case in-sensitive,
-            'valid' -> padding is 0 for height and width
-            'same' -> padding is half of the kernel (floor), the kernel must be
-            odd number.
-        cudnn_prefer (string): the preferred algorithm for cudnn convolution
-            which could be 'fastest', 'autotune', 'limited_workspace' and
-            'no_workspace'
-        workspace_byte_limit(int): max workspace size in MB (default is 512MB)
-        data_format (string): either 'NCHW' or 'NHWC'
-        use_bias (bool): True or False
-        pad: an integer or a pair of integers for padding height and width
-        W_specs (dict): used to specify the weight matrix specs, fields
-            include,
-            'name' for parameter name
-            'lr_mult' for learning rate multiplier
-            'decay_mult' for weight decay multiplier
-            'init' for init method, which could be 'gaussian', 'uniform',
-            'xavier' and ''
-            'std', 'mean', 'high', 'low' for corresponding init methods
-            TODO(wangwei) 'clamp' for gradient constraint, value is scalar
-            'regularizer' for regularization, currently support 'l2'
-        b_specs (dict): hyper-parameters for bias vector, similar as W_specs
-        name (string): layer name.
-        input_sample_shape: 3d tuple for the shape of the input Tensor
-            without the batchsize, e.g., (channel, height, width) or
-            (height, width, channel)
-    """
 
-    def __init__(self,
-                 name,
-                 nb_kernels,
-                 kernel=3,
-                 stride=1,
-                 border_mode='same',
-                 cudnn_prefer='fastest',
-                 workspace_byte_limit=1024,
-                 data_format='NCHW',
-                 use_bias=True,
-                 W_specs=None,
-                 b_specs=None,
-                 pad=None,
-                 input_sample_shape=None):
-        super(Conv2D, self).__init__(name)
-        assert data_format == 'NCHW', 'Not supported data format: %s ' \
-            'only "NCHW" is enabled currently' % (data_format)
-        conf = self.conf.convolution_conf
-        conf.num_output = nb_kernels
-        conf.prefer = cudnn_prefer
-        conf.workspace_byte_limit = workspace_byte_limit
-        self.kernel = kernel
-        self.stride = stride
-        self.pad = pad
-        self.border_mode = border_mode
-        conf.bias_term = use_bias
-        # TODO(wangwei) enable data format for cpp code
-        # conf.data_format = data_format
-        if W_specs is None:
-            W_specs = {'init': 'xavier'}
-        if 'name' not in W_specs:
-            W_specs['name'] = name + '/weight'
-        wspecs = _construct_param_specs_from_dict(W_specs)
-        self.conf.param.extend([wspecs])
-        self.param_specs.append(wspecs)
-        if use_bias:
-            if b_specs is None:
-                b_specs = {'init': 'constant'}
-            if 'name' not in b_specs:
-                b_specs['name'] = name + '/bias'
-            bspecs = _construct_param_specs_from_dict(b_specs)
-            self.conf.param.extend([bspecs])
-            self.param_specs.append(bspecs)
-
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
-        self.layer = _create_layer(engine, 'Convolution')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
-
-    def setup(self, in_shape):
-        '''Set up the kernel, stride and padding; then call the C++ setup
-        function to create params and set some meta data.
+class Layer(object, metaclass=LayerMeta):
 
-        Args:
-                in_shapes is a tuple of int for the input sample shape
-        '''
-        if self.has_setup:
-            return
-        _set_kernel_stride_pad(self.conf.convolution_conf, self.kernel,
-                               self.stride, self.border_mode, self.pad,
-                               in_shape)
-        self.layer.Setup(list(in_shape), self.conf.SerializeToString())
-        self.has_setup = True
-
-
-class Conv1D(Conv2D):
-    """Construct a layer for 1D convolution.
-
-    Most of the args are the same as those for Conv2D except the kernel,
-    stride, pad, which is a scalar instead of a tuple.
-    input_sample_shape is a tuple with a single value for the input feature
-    length
-    """
+    sep = '.'
 
-    def __init__(self,
-                 name,
-                 nb_kernels,
-                 kernel=3,
-                 stride=1,
-                 border_mode='same',
-                 cudnn_prefer='fastest',
-                 workspace_byte_limit=1024,
-                 use_bias=True,
-                 W_specs={'init': 'Xavier'},
-                 b_specs={
-                     'init': 'Constant',
-                     'value': 0
-                 },
-                 pad=None,
-                 input_sample_shape=None):
-        pad = None
-        if pad is not None:
-            pad = (0, pad)
-        if input_sample_shape is not None:
-            input_sample_shape = (1, 1, input_sample_shape[0])
-        super(Conv1D, self).__init__(name,
-                                     nb_kernels, (1, kernel), (0, stride),
-                                     border_mode,
-                                     cudnn_prefer,
-                                     workspace_byte_limit,
-                                     use_bias=use_bias,
-                                     pad=pad,
-                                     W_specs=W_specs,
-                                     b_specs=b_specs,
-                                     input_sample_shape=input_sample_shape)
-
-    def get_output_sample_shape(self):
-        shape = self.layer.GetOutputSampleShape()
-        assert len(shape) == 3, 'The output sample shape should be 3D.'\
-            'But the length is %d' % len(shape)
-        return (shape[0], shape[2])
-
-
-class Pooling2D(Layer):
-    '''2D pooling layer providing max/avg pooling.
-
-    All args are the same as those for Conv2D, except the following one
-
-    Args:
-        mode: pooling type, model_pb2.PoolingConf.MAX or
-            model_pb2.PoolingConf.AVE
-
-    '''
+    def __init__(self):
+        self.name = self.__class__.__name__
+        self._initialized = False
+        self._parent = None
+        self._layers = dict()
 
-    def __init__(self,
-                 name,
-                 mode,
-                 kernel=3,
-                 stride=2,
-                 border_mode='same',
-                 pad=None,
-                 data_format='NCHW',
-                 input_sample_shape=None):
-        super(Pooling2D, self).__init__(name)
-        assert data_format == 'NCHW', 'Not supported data format: %s ' \
-            'only "NCHW" is enabled currently' % (data_format)
-        conf = self.conf.pooling_conf
-        conf.pool = mode
-        self.kernel = kernel
-        self.stride = stride
-        self.pad = pad
-        self.border_mode = border_mode
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
-        self.layer = _create_layer(engine, 'Pooling')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
+    def initialize(self, *input):
+        pass
 
-    def setup(self, in_shape):
-        '''Set up the kernel, stride and padding; then call the C++ setup
-        function to create params and set some meta data.
+    def forward(self, *input):
+        pass
 
-        Args:
-            in_shapes is a tuple of int for the input sample shape
-        '''
-        if self.has_setup:
-            return
-        _set_kernel_stride_pad(self.conf.pooling_conf, self.kernel, self.stride,
-                               self.border_mode, self.pad, in_shape)
-        self.layer.Setup(list(in_shape), self.conf.SerializeToString())
-        self.has_setup = True
+    def __call__(self, *args, **kwargs):
+        if not self._initialized:
+            self.initialize(*args, **kwargs)
+            self._initialized = True
+
+        return self.forward(*args, **kwargs)
+
+    def get_params(self):
+        params = dict()
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            params.update(sublayer.get_params())
+        return params
+
+    def set_params(self, parameters):
+        # set parameters for Layer
+        # input should be either a PyTensor or numpy ndarray.
+        # examples: Layer.set_params(W=np.ones((in, out), dtype=np.float32)),
+        # Layer.set_params(**{'block1':{'linear1':{'W':np.ones((in, out),
+        # dtype=np.float32)}}})
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            sublayer.set_params(parameters)
+
+    def get_states(self):
+        states = dict()
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            states.update(sublayer.get_states())
+        states.update(self.get_params())
+        return states
+
+    def set_states(self, states):
+        sublayers = self._layers
+        for name, sublayer in sublayers.items():
+            sublayer.set_states(states)
+        self.set_params(states)
+
+    def device_check(self, *inputs):
+        x_device = inputs[0].device
+        x_dev_id = x_device.id()
+        for var in inputs:
+            if var.device.id() != x_dev_id:
+                var.to_device(x_device)
+
+    def set_attribute(self, attribute, attribute_value):
+        assert (attribute_value.shape == attribute.shape), "Shape mismatch."
+        if isinstance(attribute_value, Tensor):
+            attribute.reset_like(attribute_value)
+            attribute.copy_data(attribute_value)
+        elif isinstance(attribute_value, np.ndarray):
+            attribute.copy_from_numpy(attribute_value)
+        else:
+            raise ValueError("attributes should be Tensor or Numpy array.")
+
+    def _get_unique_name(self):
+        prefix = ''
+        if self._parent:
+            prefix = self._parent.name
+            if prefix:
+                prefix += Layer.sep
+
+        self.name = prefix + self.name
+        return self.name
+
+    def __getattr__(self, name):
+        if '_layers' in self.__dict__:
+            layers = self.__dict__['_layers']
+            if name in layers:
+                return layers[name]
+        return object.__getattribute__(self, name)
+
+    def __setattr__(self, name, value):
+        if isinstance(value, Layer):
+            # TODO: remove the attr from dict first
+            self.__dict__['_layers'][name] = value
+            value.__dict__['_parent'] = self
+            value.__dict__['name'] = name
+        else:
+            object.__setattr__(self, name, value)
 
+    def __delattr__(self, name):
+        if name in self._layers:
+            del self._layers[name]
+        else:
+            object.__delattr__(self, name)
 
-class MaxPooling2D(Pooling2D):
 
-    def __init__(self,
-                 name,
-                 kernel=3,
-                 stride=2,
-                 border_mode='same',
-                 pad=None,
-                 data_format='NCHW',
-                 input_sample_shape=None):
-        super(MaxPooling2D,
-              self).__init__(name, model_pb2.PoolingConf.MAX, kernel, stride,
-                             border_mode, pad, data_format, input_sample_shape)
+class Linear(Layer):
+    """
+    Generate a Linear operator
+    """
 
+    # TODO: replace current with
+    #   def __init__(self, out_features, bias=True):
+    def __init__(self, out_features, *args, bias=True, **kwargs):
+        """
+        Args:
+            out_features (int): the number of output features
+            bias: bool
+        """
+        super(Linear, self).__init__()
 
-class AvgPooling2D(Pooling2D):
+        self.out_features = out_features
 
-    def __init__(self,
-                 name,
-                 kernel=3,
-                 stride=2,
-                 border_mode='same',
-                 pad=None,
-                 data_format='NCHW',
-                 input_sample_shape=None):
-        super(AvgPooling2D,
-              self).__init__(name, model_pb2.PoolingConf.AVE, kernel, stride,
-                             border_mode, pad, data_format, input_sample_shape)
+        # TODO: for backward compatibility, to remove
+        if len(args) > 0:
+            self.in_features = out_features
+            self.out_features = args[0]
+        if len(args) > 1:
+            self.bias = args[1]
+        else:
+            self.bias = bias
+
+    def initialize(self, x):
+        self.in_features = x.shape[1]
+        w_shape = (self.in_features, self.out_features)
+        b_shape = (self.out_features,)
+        w_name = self.name + Layer.sep + 'W'
+        b_name = self.name + Layer.sep + 'b'
+
+        self.W = Tensor(shape=w_shape,
+                        requires_grad=True,
+                        stores_grad=True,
+                        name=w_name)
+        std = math.sqrt(2.0 / (self.in_features + self.out_features))
+        self.W.gaussian(0.0, std)
+
+        if self.bias:
+            self.b = Tensor(shape=b_shape,
+                            requires_grad=True,
+                            stores_grad=True,
+                            name=b_name)
+            self.b.set_value(0.0)
+        else:
+            self.b = None
 
+    def forward(self, x):
+        if self.b:
+            self.device_check(x, self.W, self.b)
+        else:
+            self.device_check(x, self.W)
 
-class MaxPooling1D(MaxPooling2D):
+        assert x.shape[1] == self.W.shape[0], (
+            "Linear layer expects input features size %d received %d" %
+            (self.W.shape[0], x.shape[1]))
 
-    def __init__(self,
-                 name,
-                 kernel=3,
-                 stride=2,
-                 border_mode='same',
-                 pad=None,
-                 data_format='NCHW',
-                 input_sample_shape=None):
-        """Max pooling for 1D feature.
+        y = autograd.matmul(x, self.W)
+        if self.bias:
+            y = autograd.add_bias(y, self.b, axis=0)
+        return y
 
-        Args:
-            input_sample_shape (tuple): 1D tuple for input feature length
-        """
-        pad = None
-        if pad is not None:
-            pad = (0, pad)
-        if input_sample_shape is not None:
-            assert len(input_sample_shape) == 1, \
-                'AvgPooling1D expects input sample to be 1D'
-            input_sample_shape = (1, 1, input_sample_shape[0])
+    def get_params(self):
+        if self.bias:
+            return {self.W.name: self.W, self.b.name: self.b}
         else:
-            input_sample_shape = None
-        super(MaxPooling1D,
-              self).__init__(name, (1, kernel), (0, stride), border_mode, pad,
-                             data_format, input_sample_shape)
-
-    def get_output_sample_shape(self):
-        shape = self.layer.GetOutputSampleShape()
-        return (shape[2],)
+            return {self.W.name: self.W}
 
+    def set_params(self, parameters):
+        self.set_attribute(self.W, parameters[self.W.name])
+        if self.b:
+            self.set_attribute(self.b, parameters[self.b.name])
 
-class AvgPooling1D(AvgPooling2D):
-
-    def __init__(self,
-                 name,
-                 kernel=3,
-                 stride=2,
-                 border_mode='same',
-                 pad=None,
-                 data_format='NCHW',
-                 input_sample_shape=None):
-        """input_feature_length is a scalar value"""
-        pad2 = None
-        if pad is not None:
-            pad2 = (pad, 0)
-        if input_sample_shape is not None:
-            assert len(input_sample_shape) == 1, \
-                'AvgPooling1D expects input sample to be 1D'
-            input_sample_shape = (1, 1, input_sample_shape[0])
-        else:
-            input_sample_shape = None
-
-        super(AvgPooling1D,
-              self).__init__(name, (kernel, 1), (0, stride), border_mode, pad2,
-                             data_format, input_sample_shape)
-
-    def get_output_sample_shape(self):
-        shape = self.layer.GetOutputSampleShape()
-        return (shape[2],)
-
-
-class BatchNormalization(Layer):
-    """Batch-normalization.
-
-    Args:
-        momentum (float): for running average mean and variance.
-        beta_specs (dict): dictionary includes the fields for the beta
-            param:
-            'name' for parameter name
-            'lr_mult' for learning rate multiplier
-            'decay_mult' for weight decay multiplier
-            'init' for init method, which could be 'gaussian', 'uniform',
-            'xavier' and ''
-            'std', 'mean', 'high', 'low' for corresponding init methods
-            'clamp' for gradient constraint, value is scalar
-            'regularizer' for regularization, currently support 'l2'
-        gamma_specs (dict): similar to beta_specs, but for the gamma param.
-        name (string): layer name
-        input_sample_shape (tuple): with at least one integer
-    """
 
-    def __init__(self,
-                 name,
-                 momentum=0.9,
-                 beta_specs=None,
-                 gamma_specs=None,
-                 input_sample_shape=None):
-        super(BatchNormalization, self).__init__(name)
-        conf = self.conf.batchnorm_conf
-        conf.factor = momentum
-        if beta_specs is None:
-            beta_specs = {'init': 'Xavier'}
-        if gamma_specs is None:
-            gamma_specs = {'init': 'Xavier'}
-        if 'name' not in beta_specs:
-            beta_specs['name'] = name + '/beta'
-        if 'name' not in gamma_specs:
-            gamma_specs['name'] = name + '/gamma'
-        mean_specs = {'init': 'constant', 'value': 0, 'name': name + '/mean'}
-        var_specs = {'init': 'constant', 'value': 1, 'name': name + '/var'}
-        self.conf.param.extend([_construct_param_specs_from_dict(gamma_specs)])
-        self.conf.param.extend([_construct_param_specs_from_dict(beta_specs)])
-        self.conf.param.extend([_construct_param_specs_from_dict(mean_specs)])
-        self.conf.param.extend([_construct_param_specs_from_dict(var_specs)])
-        self.param_specs.append(_construct_param_specs_from_dict(gamma_specs))
-        self.param_specs.append(_construct_param_specs_from_dict(beta_specs))
-        self.param_specs.append(_construct_param_specs_from_dict(mean_specs))
-        self.param_specs.append(_construct_param_specs_from_dict(var_specs))
-        _check_engine(engine,
-                      ['cudnn', 'singa', 'singacpp', 'singacuda', 'singacl'])
-        self.layer = _create_layer(engine, 'BatchNorm')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
-
-
-class L2Norm(Layer):
-    '''Normalize each sample to have L2 norm = 1'''
-
-    def __init__(self, name, input_sample_shape, epsilon=1e-8):
-        super(L2Norm, self).__init__(name)
-        self.y = None
-        self.norm = None
-        self.name = name
-        self.epsilon = epsilon
-        self.out_sample_shape = input_sample_shape
-
-    def get_output_sample_shape(self):
-        return self.out_sample_shape
-
-    def forward(self, is_train, x):
-        norm = tensor.sum_columns(tensor.square(x))
-        norm += self.epsilon
-        norm = tensor.sqrt(norm)
-        self.y = x.clone()
-        self.y.div_column(norm)
-
-        if is_train:
-            self.norm = norm
-        return self.y
-
-    def backward(self, is_train, dy):
-        # (dy - y * k) / norm, k = sum(dy * y)
-        k = tensor.sum_columns(tensor.eltwise_mult(dy, self.y))
-        self.y.mult_column(k)
-        dx = dy - self.y
-        dx.div_column(self.norm)
-        return dx, []
-
-
-class LRN(Layer):
-    """Local response normalization.
-
-    Args:
-        size (int): # of channels to be crossed
-            normalization.
-        mode (string): 'cross_channel'
-        input_sample_shape (tuple): 3d tuple, (channel, height, width)
+class Gemm(Layer):
     """
-
-    def __init__(self,
-                 name,
-                 size=5,
-                 alpha=1,
-                 beta=0.75,
-                 mode='cross_channel',
-                 k=1,
-                 input_sample_shape=None):
-        super(LRN, self).__init__(name)
-        conf = self.conf.lrn_conf
-        conf.local_size = size
-        conf.alpha = alpha
-        conf.beta = beta
-        conf.k = k
-        # TODO(wangwei) enable mode = 'within_channel'
-        assert mode == 'cross_channel', 'only support mode="across_channel"'
-        conf.norm_region = model_pb2.LRNConf.ACROSS_CHANNELS
-        _check_engine(engine,
-                      ['cudnn', 'singa', 'singacpp', 'singacuda', 'singacl'])
-        self.layer = _create_layer(engine, 'LRN')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
-
-
-class Dense(Layer):
-    """Apply linear/affine transformation, also called inner-product or
-    fully connected layer.
-
-    Args:
-        num_output (int): output feature length.
-        use_bias (bool): add a bias vector or not to the transformed feature
-        W_specs (dict): specs for the weight matrix
-            'name' for parameter name
-            'lr_mult' for learning rate multiplier
-            'decay_mult' for weight decay multiplier
-            'init' for init method, which could be 'gaussian', 'uniform',
-            'xavier' and ''
-            'std', 'mean', 'high', 'low' for corresponding init methods
-            'clamp' for gradient constraint, value is scalar
-            'regularizer' for regularization, currently support 'l2'
-        b_specs (dict): specs for the bias vector, same fields as W_specs.
-        W_transpose (bool): if true, output=x*W.T+b;
-        input_sample_shape (tuple): input feature length
+    Generate a Gemm operator
+    Y = alpha * A' * B' + beta * C
+    B is weight, C is bias
     """
 
     def __init__(self,
-                 name,
-                 num_output,
-                 use_bias=True,
-                 W_specs=None,
-                 b_specs=None,
-                 W_transpose=False,
-                 input_sample_shape=None):
-        """Apply linear/affine transformation, also called inner-product or
-        fully connected layer.
-
+                 nb_kernels,
+                 alpha=1.0,
+                 beta=1.0,
+                 transA=False,
+                 transB=True,
+                 bias=True,
+                 bias_shape=None):
+        """
         Args:
-            num_output (int): output feature length.
-            use_bias (bool): add a bias vector or not to the transformed feature
-            W_specs (dict): specs for the weight matrix
-                'name' for parameter name
-                'lr_mult' for learning rate multiplier
-                'decay_mult' for weight decay multiplier
-                'init' for init method, which could be 'gaussian', 'uniform',
-                'xavier' and ''
-                'std', 'mean', 'high', 'low' for corresponding init methods
-                'clamp' for gradient constraint, value is scalar
-                'regularizer' for regularization, currently support 'l2'
-            b_specs (dict): specs for the bias vector, same fields as W_specs.
-            W_transpose (bool): if true, output=x*W.T+b;
-            input_sample_shape (tuple): input feature length
+            nb_kernels (int): the number of output channels, i.e., the
+                number of filters
+            alpha (float): Scalar multiplier for the product of input tensors A * B.
+            beta (float): Scalar multiplier for input tensor C.
+            transA (bool): Whether A should be transposed
+            transB (bool): Whether B should be transposed
+            bias: bool
         """
-        super(Dense, self).__init__(name)
-        conf = self.conf.dense_conf
-        conf.num_output = num_output
-        conf.bias_term = use_bias
-        conf.transpose = W_transpose
-        if W_specs is None:
-            W_specs = {'init': 'xavier'}
-        if 'name' not in W_specs:
-            W_specs['name'] = name + '/weight'
-        wspecs = _construct_param_specs_from_dict(W_specs)
-        self.conf.param.extend([wspecs])
-        self.param_specs.append(wspecs)
-        if use_bias:
-            if b_specs is None:
-                b_specs = {'init': 'constant', 'value': 0}
-            if 'name' not in b_specs:
-                b_specs['name'] = name + '/bias'
-            bspecs = _construct_param_specs_from_dict(b_specs)
-            self.conf.param.extend([bspecs])
-            self.param_specs.append(bspecs)
-        # dense layer is transparent to engine.
-        if engine == 'cudnn':
-            self.layer = _create_layer('singacuda', 'Dense')
+        super(Gemm, self).__init__()
+        self.nb_kernels = nb_kernels
+        self.alpha = alpha
+        self.beta = beta
+        self.transA = 1 if transA else 0
+        self.transB = 1 if transB else 0
+        self.bias = bias
+        self.bias_shape = bias_shape
+
+    def initialize(self, x):
+        if self.transA == 0:
+            self.in_features = x.shape[1]
         else:
-            self.layer = _create_layer(engine, 'Dense')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
+            self.in_features = x.shape[0]
 
+        if self.transB == 0:
+            w_shape = (self.in_features, self.nb_kernels)
+        else:
+            w_shape = (self.nb_kernels, self.in_features)
 
-class Dropout(Layer):
-    """Droput layer.
+        if self.bias_shape:
+            b_shape = self.bias_shape
+        else:
+            b_shape = (1, self.nb_kernels)
+
+        w_name = self.name + Layer.sep + 'W'
+        b_name = self.name + Layer.sep + 'b'
+
+        self.W = Tensor(shape=w_shape,
+                        requires_grad=True,
+                        stores_grad=True,
+                        device=x.device,
+                        name=w_name)
+        std = math.sqrt(2.0 / (self.in_features + self.nb_kernels))
+        self.W.gaussian(0.0, std)
+
+        if self.bias:
+            self.b = Tensor(shape=b_shape,
+                            requires_grad=True,
+                            stores_grad=True,
+                            device=x.device,
+                            name=b_name)
+            self.b.set_value(0.0)
+        else:
+            self.b = None
 
-    Args:
-        p (float): probability for dropping out the element, i.e., set to 0
-        name (string): layer name
-    """
+    def forward(self, x):
+        if self.b:
+            self.device_check(x, self.W, self.b)
+        else:
+            self.device_check(x, self.W)
 
-    def __init__(self, name, p=0.5, input_sample_shape=None):
-        super(Dropout, self).__init__(name)
-        conf = self.conf.dropout_conf
-        conf.dropout_ratio = p
-        # dropout is support in cudnn since V5
-        if engine.lower() == 'cudnn' and cudnn_version < 5000:
-            myengine = 'singacuda'
+        if self.transA == 0:
+            in_features = x.shape[1]
         else:
-            myengine = engine
-        _check_engine(myengine,
-                      ['cudnn', 'singa', 'singacpp', 'singacuda', 'singacl'])
-        self.layer = _create_layer(myengine, 'Dropout')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
+            in_features = x.shape[0]
 
+        if self.transB == 0:
+            in_features_w = self.W.shape[0]
+        else:
+            in_features_w = self.W.shape[1]
 
-class Activation(Layer):
-    """Activation layers.
+        assert in_features == in_features_w, (
+            "Gemm layer expects input features size %d received %d" %
+            (in_features_w, in_features))
+        y = autograd.gemm(x, self.W, self.b, self.alpha, self.beta, self.transA,
+                          self.transB)
 
-    Args:
-        name (string): layer name
-        mode (string): 'relu', 'sigmoid', or 'tanh'
-        input_sample_shape (tuple): shape of a single sample
-    """
+        return y
 
-    def __init__(self, name, mode='relu', input_sample_shape=None):
-        super(Activation, self).__init__(name)
-        _check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
-        self.conf.type = (engine + '_' + mode).lower()
-        self.layer = _create_layer(engine, mode)
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
+    def get_params(self):
+        if self.bias:
+            return {self.W.name: self.W, self.b.name: self.b}
+        else:
+            return {self.W.name: self.W}
 
+    def set_params(self, parameters):
+        self.set_attribute(self.W, parameters[self.W.name])
+        if self.b:
+            self.set_attribute(self.b, parameters[self.b.name])
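
To make the Gemm layer above concrete, here is a minimal usage sketch (not part
of the patch; it assumes the existing singa.tensor and singa.device Python APIs
and a default host device):

    # Hypothetical sketch: the layer allocates W (and b) lazily on first call.
    from singa import layer, tensor, device

    dev = device.get_default_device()
    x = tensor.Tensor(shape=(4, 20), device=dev)
    x.gaussian(0.0, 1.0)

    fc = layer.Gemm(nb_kernels=10)   # W has shape (10, 20) since transB=True
    y = fc(x)                        # initialize() runs on the first call
    print(y.shape)                   # expected: (4, 10)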
 
-class Softmax(Layer):
-    """Apply softmax.
 
-    Args:
-        axis (int): reshape the input as a matrix with the dimension
-            [0,axis) as the row, the [axis, -1) as the column.
-        input_sample_shape (tuple): shape of a single sample
+class Conv2d(Layer):
     """
-
-    def __init__(self, name, axis=1, input_sample_shape=None):
-        super(Softmax, self).__init__(name)
-        # conf = self.conf.softmax_conf
-        # conf.axis = axis
-        _check_engine(engine,
-                      ['cudnn', 'singa', 'singacpp', 'singacl', 'singacuda'])
-        self.layer = _create_layer(engine, 'Softmax')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
-
-
-class Flatten(Layer):
-    """Reshape the input tensor into a matrix.
-
-    Args:
-        axis (int): reshape the input as a matrix with the dimension
-            [0,axis) as the row, the [axis, -1) as the column.
-        input_sample_shape (tuple): shape for a single sample
+    Generate a Conv 2d operator
     """
 
-    def __init__(self, name, axis=1, input_sample_shape=None):
-        super(Flatten, self).__init__(name)
-        conf = self.conf.flatten_conf
-        conf.axis = axis
-        # fltten layer is transparent to engine
-        if engine == 'cudnn':
-            self.layer = _create_layer('singacuda', 'Flatten')
+    def __init__(self,
+                 nb_kernels,
+                 kernel_size,
+                 *args,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 group=1,
+                 bias=True,
+                 pad_mode="NOTSET",
+                 **kwargs):
+        """
+        Args:
+            nb_kernels (int): the number of output channels, i.e., the number
+                of filters
+            kernel_size (int or tuple): kernel size for each spatial axis, e.g.,
+                (2, 3) for height 2 and width 3; if an int is given, the kernel
+                size becomes (int, int)
+            stride (int or tuple): stride; follows the same convention as
+                kernel_size.
+            padding (int, tuple, list or None): padding; follows the same
+                convention as kernel_size. If pad_mode is "SAME_UPPER" or
+                "SAME_LOWER", padding may be None and is computed
+                automatically.
+            dilation (int): only 1 is supported
+            group (int): number of groups for grouped convolution
+            bias (bool): whether to add a bias term
+            pad_mode (string): one of NOTSET, SAME_UPPER, or SAME_LOWER; the
+                default is NOTSET, which means explicit padding is used.
+                SAME_UPPER or SAME_LOWER pad the input so that the output
+                spatial size matches the input; if the required padding is odd,
+                the extra padding goes at the end for SAME_UPPER and at the
+                beginning for SAME_LOWER.
+        """
+        super(Conv2d, self).__init__()
+
+        # the old code created the layer like: Conv2d(8, 16, 3) or Conv2d(8, 16, 3, stride=1);
+        # the following code block keeps backward compatibility
+        if len(args) > 0:
+            nb_kernels = kernel_size
+            kernel_size = args[0]
+        if len(args) > 1:
+            stride = args[1]
+        if len(args) > 2:
+            padding = args[2]
+
+        self.nb_kernels = nb_kernels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.group = group
+        self.bias = bias
+        self.pad_mode = pad_mode
+
+        if isinstance(kernel_size, int):
+            self.kernel_size = (kernel_size, kernel_size)
+        elif isinstance(kernel_size, tuple):
+            self.kernel_size = kernel_size
         else:
-            self.layer = _create_layer(engine, 'Flatten')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
-
+            raise TypeError("Wrong kernel_size type.")
 
-class Merge(Layer):
-    '''Sum all input tensors.
+        if isinstance(stride, int):
+            self.stride = (stride, stride)
+        elif isinstance(stride, tuple):
+            self.stride = stride
+        else:
+            raise TypeError("Wrong stride type.")
+
+        self.odd_padding = (0, 0, 0, 0)
+        if isinstance(padding, int):
+            self.padding = (padding, padding)
+        elif isinstance(padding, tuple) or isinstance(padding, list):
+            if len(padding) == 2:
+                self.padding = padding
+            elif len(padding) == 4:
+                _h_mask = padding[0] - padding[1]
+                _w_mask = padding[2] - padding[3]
+                # the odd padding is the part that cannot be handled by the
+                # symmetric 2-value padding mode, so we first pad the input and
+                # then apply the normal padding method.
+                self.odd_padding = (max(_h_mask, 0), max(-_h_mask, 0),
+                                    max(_w_mask, 0), max(-_w_mask, 0))
+                self.padding = (
+                    padding[0] - self.odd_padding[0],
+                    padding[2] - self.odd_padding[2],
+                )
+            else:
+                raise TypeError("Wrong padding value.")
 
-    Args:
-        input_sample_shape: sample shape of the input. The sample shape of all
-            inputs should be the same.
-    '''
+        if dilation not in (1, (1, 1), [1, 1]):
+            raise ValueError("Not implemented yet")
 
-    def __init__(self, name, input_sample_shape=None):
-        self.in_shape = input_sample_shape
-        self.num_input = 1
-        super(Merge, self).__init__(name)
+        self.inner_params = {
+            "cudnn_prefer": "fastest",
+            "workspace_MB_limit": 1024,
+        }
+        # TODO valid value of inner_params check
 
-    def setup(self, in_shape):
-        self.in_shape = in_shape
-        self.has_setup = True
+        for kwarg in kwargs:
+            if kwarg not in self.inner_params:
+                raise TypeError("Keyword argument not understood:", kwarg)
+            else:
+                self.inner_params[kwarg] = kwargs[kwarg]
+
+    def initialize(self, x):
+        self.in_channels = x.shape[1]
+        w_shape = (
+            self.nb_kernels,
+            int(self.in_channels / self.group),
+            self.kernel_size[0],
+            self.kernel_size[1],
+        )
+        w_name = self.name + Layer.sep + 'W'
+
+        self.W = Tensor(shape=w_shape,
+                        requires_grad=True,
+                        stores_grad=True,
+                        name=w_name,
+                        device=x.device)
+        # std = math.sqrt(
+        # 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] +
+        # self.nb_kernels))
+        std = math.sqrt(
+            2.0 / (w_shape[1] * self.kernel_size[0] * self.kernel_size[1] +
+                   self.nb_kernels))
+        self.W.gaussian(0.0, std)
+
+        if self.bias:
+            b_shape = (self.nb_kernels,)
+            b_name = self.name + Layer.sep + 'b'
+            self.b = Tensor(shape=b_shape,
+                            requires_grad=True,
+                            stores_grad=True,
+                            name=b_name,
+                            device=x.device)
+            self.b.set_value(0.0)
+        else:
+            # to keep consistency when to do forward.
+            self.b = None
+            # Tensor(data=CTensor([]), requires_grad=False, stores_grad=False)
+
+        # if same pad mode, re-compute the padding
+        if self.pad_mode in ("SAME_UPPER", "SAME_LOWER"):
+            self.padding, self.odd_padding = utils.get_padding_shape(
+                self.pad_mode, x.shape[2:], self.kernel_size, self.stride)
+            self.padding = [self.padding[0], self.padding[2]]
+
+        _x = x
+        if self.odd_padding != (0, 0, 0, 0):
+            x_shape = list(x.data.shape())
+            x_shape[2] += (self.odd_padding[0] + self.odd_padding[1])
+            x_shape[3] += (self.odd_padding[2] + self.odd_padding[3])
+            _x = Tensor(shape=x_shape, device=x.device)
+            _x.set_value(0.0)
+
+        if _x.device.id() == -1:
+            if self.group != 1:
+                raise ValueError("Not implemented yet")
+            else:
+                if not hasattr(self, "handle"):
+                    self.handle = singa.ConvHandle(
+                        _x.data,
+                        self.kernel_size,
+                        self.stride,
+                        self.padding,
+                        self.in_channels,
+                        self.nb_kernels,
+                        self.bias,
+                        self.group,
+                    )
+        else:
+            if not hasattr(self, "handle"):
+                self.handle = singa.CudnnConvHandle(
+                    _x.data,
+                    self.kernel_size,
+                    self.stride,
+                    self.padding,
+                    self.in_channels,
+                    self.nb_kernels,
+                    self.bias,
+                    self.group,
+                )
+
+    def forward(self, x):
+        # sanitize the device of params/states, TODO: better to decorate forward()
+        self.device_check(x, *[s for k, s in self.get_states().items()])
+
+        assert (self.group >= 1 and self.in_channels %
+                self.group == 0), "group must be a positive divisor of in_channels."
+
+        assert (self.nb_kernels >= self.group and self.nb_kernels %
+                self.group == 0), "nb_kernels and group mismatched."
+
+        y = autograd.conv2d(self.handle, x, self.W, self.b, self.odd_padding)
+        return y
+
+    def get_params(self):
+        if self.bias:
+            return {self.W.name: self.W, self.b.name: self.b}
+        else:
+            return {self.W.name: self.W}
 
-    def get_output_sample_shape(self):
-        return self.in_shape
+    def set_params(self, parameters):
+        self.set_attribute(self.W, parameters[self.W.name])
+        if self.b:
+            self.set_attribute(self.b, parameters[self.b.name])
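
A short sketch of how the 4-value padding in Conv2d above is split into
odd_padding plus symmetric padding (illustrative only; the shapes follow the
usual convolution arithmetic and are assumptions, not outputs from the patch):

    # Hypothetical sketch: padding=(2, 1, 3, 3) gives _h_mask=1 and _w_mask=0,
    # so odd_padding=(1, 0, 0, 0) and the symmetric padding becomes (1, 3).
    from singa import layer, tensor, device

    dev = device.get_default_device()
    x = tensor.Tensor(shape=(2, 3, 32, 32), device=dev)   # NCHW input
    x.gaussian(0.0, 1.0)

    conv = layer.Conv2d(8, 3, stride=1, padding=(2, 1, 3, 3))
    y = conv(x)
    print(y.shape)   # expected: (2, 8, 33, 36)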
 
-    def forward(self, flag, inputs):
-        '''Merge all input tensors by summation.
 
-        TODO(wangwei) do element-wise merge operations, e.g., avg, count
-        Args:
-            flag: not used.
-            inputs (list): a list of tensors
-
-        Returns:
-            A single tensor as the sum of all input tensors
-        '''
-        assert len(inputs) > 1, 'There must be multiple input tensors'
-        self.num_input = len(inputs)
-        output = tensor.Tensor()
-        output.reset_like(inputs[0])
-        output.set_value(0)
-        for x in inputs:
-            output += x
-        return output
-
-    def backward(self, flag, grad):
-        '''Replicate the grad for each input source layer.
+class SeparableConv2d(Layer):
+    """
+    Generate a SeparableConv 2d operator (a depthwise Conv2d followed by a
+    1x1 pointwise Conv2d)
+    """
 
+    def __init__(self,
+                 nb_kernels,
+                 kernel_size,
+                 *args,
+                 stride=1,
+                 padding=0,
+                 bias=False):
+        """
         Args:
-            grad(Tensor), the gradient tensor of the merged result from forward
-
-        Returns:
-            A list of replicated grad, one per source layer
-        '''
-        assert isinstance(grad, tensor.Tensor), 'The input must be Tensor' \
-            ' instead of %s' % type(grad).__name__
-        return [grad] * self.num_input, []  # * self.num_input
-
+            nb_kernels (int): the number of output channels, i.e., the number
+                of filters
+            kernel_size (int or tuple): kernel size for each spatial axis, e.g.,
+                (2, 3) for height 2 and width 3; if an int is given, the kernel
+                size becomes (int, int)
+            stride (int or tuple): stride; follows the same convention as
+                kernel_size.
+            padding (int, tuple or list): padding; follows the same convention
+                as kernel_size.
+            bias (bool): whether to add a bias term
+        """
+        super(SeparableConv2d, self).__init__()
+
+        # the following code block is for backward compatibility
+        if len(args) > 0:
+            nb_kernels = kernel_size
+            kernel_size = args[0]
+        if len(args) > 1:
+            stride = args[1]
+        if len(args) > 2:
+            padding = args[2]
+
+        self.nb_kernels = nb_kernels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.bias = bias
 
-class Split(Layer):
-    '''Replicate the input tensor.
+    def initialize(self, x):
+        self.in_channels = x.shape[1]
+        self.depthwise_conv = Conv2d(
+            self.in_channels,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.padding,
+            group=self.in_channels,
+            bias=self.bias,
+        )
 
-    Args:
-        num_output (int): number of output tensors to generate.
-        input_sample_shape: includes a single integer for the input sample
-            feature size.
-    '''
+        self.point_conv = Conv2d(self.nb_kernels, 1, bias=self.bias)
 
-    def __init__(self, name, num_output, input_sample_shape=None):
-        self.num_output = num_output
-        self.in_shape = input_sample_shape
-        super(Split, self).__init__(name)
+    def forward(self, x):
+        y = self.depthwise_conv(x)
+        y = self.point_conv(y)
+        return y
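
A usage sketch of the depthwise + pointwise composition above (not part of the
patch; note that the grouped depthwise convolution is only implemented for the
CUDA backend in the Conv2d code above, so a CUDA device is assumed):

    # Hypothetical sketch: depthwise 3x3 conv (group == in_channels) followed
    # by a 1x1 pointwise conv; requires a CUDA-enabled build.
    from singa import layer, tensor, device

    dev = device.create_cuda_gpu()
    x = tensor.Tensor(shape=(1, 16, 28, 28), device=dev)
    x.gaussian(0.0, 1.0)

    sep = layer.SeparableConv2d(32, 3, padding=1)
    y = sep(x)
    print(y.shape)   # expected: (1, 32, 28, 28)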
 
-    def setup(self, in_shape):
-        self.in_shape = in_shape
-        self.has_setup = True
 
-    def get_output_sample_shape(self):
-        return [self.in_shape] * self.num_output
+class BatchNorm2d(Layer):
+    """
+    Generate a BatchNorm 2d operator
+    """
 
-    def forward(self, flag, input):
-        '''Replicate the input tensor into mutiple tensors.
+    def __init__(self, momentum=0.9):
+        """
+        Args:
+            momentum (float): Factor used in computing the running mean and
+                variance.
+        """
+        super(BatchNorm2d, self).__init__()
+
+        self.momentum = momentum
+
+    def initialize(self, x):
+        self.channels = x.shape[1]
+        param_shape = (self.channels,)
+        scale_name = self.name + Layer.sep + 'scale'
+        bias_name = self.name + Layer.sep + 'bias'
+        running_mean_name = self.name + Layer.sep + 'running_mean'
+        running_var_name = self.name + Layer.sep + 'running_var'
+
+        self.scale = Tensor(shape=param_shape,
+                            requires_grad=True,
+                            stores_grad=True,
+                            name=scale_name)
+        self.scale.set_value(1.0)
+
+        self.bias = Tensor(shape=param_shape,
+                           requires_grad=True,
+                           stores_grad=True,
+                           name=bias_name)
+        self.bias.set_value(0.0)
+
+        self.running_mean = Tensor(shape=param_shape,
+                                   requires_grad=False,
+                                   stores_grad=False,
+                                   name=running_mean_name)
+        self.running_mean.set_value(0.0)
+
+        self.running_var = Tensor(shape=param_shape,
+                                  requires_grad=False,
+                                  stores_grad=False,
+                                  name=running_var_name)
+        self.running_var.set_value(1.0)
+
+        if not hasattr(self, "handle"):
+            if x.device.id() == -1:
+                self.handle = singa.BatchNormHandle(self.momentum, x.data)
+            else:
+                self.handle = singa.CudnnBatchNormHandle(self.momentum, x.data)
+
+    def forward(self, x):
+        assert x.shape[1] == self.channels, (
+            "number of channels dismatched. %d vs %d" %
+            (x.shape[1], self.channels))
+
+        self.device_check(x, self.scale, self.bias, self.running_mean,
+                          self.running_var)
+
+        y = autograd.batchnorm_2d(
+            self.handle,
+            x,
+            self.scale,
+            self.bias,
+            self.running_mean,
+            self.running_var,
+        )
+        return y
+
+    def get_params(self):
+        return {self.scale.name: self.scale, self.bias.name: self.bias}
+
+    def set_params(self, parameters):
+        self.set_attribute(self.scale, parameters[self.scale.name])
+        self.set_attribute(self.bias, parameters[self.bias.name])
+
+    def get_states(self):
+        ret = self.get_params()
+        ret[self.running_mean.name] = self.running_mean
+        ret[self.running_var.name] = self.running_var
+        return ret
+
+    def set_states(self, states):
+        self.set_params(states)
+        self.set_attribute(self.running_mean, states[self.running_mean.name])
+        self.set_attribute(self.running_var, states[self.running_var.name])
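
The split between get_params() and get_states() above matters for
checkpointing: scale and bias are trainable parameters, while the running
statistics are extra states. A minimal sketch (assumptions: default device and
the existing singa.tensor API):

    # Hypothetical sketch: running_mean/running_var appear only in get_states().
    from singa import layer, tensor, device

    dev = device.get_default_device()
    x = tensor.Tensor(shape=(4, 3, 8, 8), device=dev)
    x.gaussian(0.0, 1.0)

    bn = layer.BatchNorm2d(momentum=0.9)
    y = bn(x)
    print(sorted(bn.get_params().keys()))   # scale and bias only
    print(sorted(bn.get_states().keys()))   # plus running_mean and running_var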
+
+
+class Pooling2d(Layer):
+    """
+    Generate a Pooling 2d operator
+    """
 
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 is_max=True,
+                 pad_mode="NOTSET"):
+        """
         Args:
-            flag: not used
-            input: a single input tensor
+            kernel_size (int or tuple): kernel size for each spatial axis, e.g.,
+                (2, 3) for height 2 and width 3; if an int is given, the kernel
+                size becomes (int, int)
+            stride (int or tuple): stride; follows the same convention as
+                kernel_size. If None, it defaults to kernel_size.
+            padding (int, tuple, list or None): padding; follows the same
+                convention as kernel_size. If pad_mode is "SAME_UPPER" or
+                "SAME_LOWER", padding may be None and is computed
+                automatically.
+            is_max (bool): True for max pooling, False for average pooling
+            pad_mode (string): one of NOTSET, SAME_UPPER, or SAME_LOWER; the
+                default is NOTSET, which means explicit padding is used.
+                SAME_UPPER or SAME_LOWER pad the input so that the output
+                spatial size matches the input; if the required padding is odd,
+                the extra padding goes at the end for SAME_UPPER and at the
+                beginning for SAME_LOWER.
+        """
+        super(Pooling2d, self).__init__()
 
-        Returns:
-            a list a output tensor (each one is a copy of the input)
-        '''
-        assert isinstance(input, tensor.Tensor), 'The input must be Tensor'
-        outputs = [input] * self.num_output
-        return outputs
+        if isinstance(kernel_size, int):
+            self.kernel_size = (kernel_size, kernel_size)
+        elif isinstance(kernel_size, tuple):
+            self.kernel_size = kernel_size
+        else:
+            raise TypeError("Wrong kernel_size type.")
+
+        if stride is None:
+            self.stride = self.kernel_size
+        elif isinstance(stride, int):
+            self.stride = (stride, stride)
+        elif isinstance(stride, tuple):
+            self.stride = stride
+            assert stride[0] > 0 or (kernel_size[0] == 1 and padding[0] == 0), (
+                "stride[0]=0, but kernel_size[0]=%d, padding[0]=%d" %
+                (kernel_size[0], padding[0]))
+        else:
+            raise TypeError("Wrong stride type.")
+
+        self.odd_padding = (0, 0, 0, 0)
+        if isinstance(padding, int):
+            self.padding = (padding, padding)
+        elif isinstance(padding, tuple) or isinstance(padding, list):
+            if len(padding) == 2:
+                self.padding = padding
+            elif len(padding) == 4:
+                _h_mask = padding[0] - padding[1]
+                _w_mask = padding[2] - padding[3]
+                # the odd padding is the part that cannot be handled by the
+                # symmetric 2-value padding mode, so we first pad the input and
+                # then apply the normal padding method.
+                self.odd_padding = (max(_h_mask, 0), max(-_h_mask, 0),
+                                    max(_w_mask, 0), max(-_w_mask, 0))
+                self.padding = (
+                    padding[0] - self.odd_padding[0],
+                    padding[2] - self.odd_padding[2],
+                )
+            else:
+                raise TypeError("Wrong padding value.")
+
+        self.is_max = is_max
+        self.pad_mode = pad_mode
+
+    def initialize(self, x):
+        # if same pad mode, re-compute the padding
+        if self.pad_mode in ("SAME_UPPER", "SAME_LOWER"):
+            self.padding, self.odd_padding = utils.get_padding_shape(
+                self.pad_mode, x.shape[2:], self.kernel_size, self.stride)
+            self.padding = [self.padding[0], self.padding[2]]
+
+        _x = x
+        if self.odd_padding != (0, 0, 0, 0):
+            x_shape = list(x.data.shape())
+            x_shape[2] += (self.odd_padding[0] + self.odd_padding[1])
+            x_shape[3] += (self.odd_padding[2] + self.odd_padding[3])
+            _x = Tensor(shape=x_shape, device=x.device)
+            _x.set_value(0.0)
+
+        if _x.device.id() == -1:
+            self.handle = singa.PoolingHandle(
+                _x.data,
+                self.kernel_size,
+                self.stride,
+                self.padding,
+                self.is_max,
+            )
+        else:
+            self.handle = singa.CudnnPoolingHandle(
+                _x.data,
+                self.kernel_size,
+                self.stride,
+                self.padding,
+                self.is_max,
+            )
 
-    def backward(self, flag, grads):
-        '''Sum all grad tensors to generate a single output tensor.
+    def forward(self, x):
+        y = autograd.pooling_2d(self.handle, x, self.odd_padding)
+        return y
 
-        Args:
-            grads(list of Tensor), one per dest layer
-
-        Returns:
-            a single tensor as the sum of all grads
-        '''
-        assert len(grads) > 1, 'There must be multiple gradients'
-        dx = tensor.Tensor()
-        dx.reset_like(grads[0])
-        dx.set_value(0)
-        for g in grads:
-            dx += g
-        return dx, []
-
-
-class Concat(Layer):
-    '''Concatenate tensors vertically (axis = 0) or horizontally (axis = 1).
-
-    Currently, only support tensors with 2 dimensions.
-
-    Args:
-        axis(int): 0 for concat row; 1 for concat columns;
-        input_sample_shapes: a list of sample shape tuples, one per input tensor
-    '''
-
-    def __init__(self, name, axis, input_sample_shapes=None):
-        super(Concat, self).__init__(name)
-        self.in_shapes = input_sample_shapes
-        self.axis = axis
-        self.conf.concat_conf.axis = axis
-        if engine == "cudnn":
-            self.layer = _create_layer('singacuda', 'Concat')
-        else:
-            self.layer = _create_layer(engine, 'Concat')
-        if input_sample_shapes is not None:
-            self.setup(input_sample_shapes)
 
-    def forward(self, flag, inputs):
-        '''Concatenate all input tensors.
+class MaxPool2d(Pooling2d):
+    """
+    Generate a Max Pooling 2d operator
+    """
 
+    def __init__(self, kernel_size, stride=None, padding=0, pad_mode="NOTSET"):
+        """
         Args:
-            flag: same as Layer::forward()
-            input: a list of tensors
+            kernel_size (int or tuple): kernel size for each spatial axis, e.g.,
+                (2, 3) for height 2 and width 3; if an int is given, the kernel
+                size becomes (int, int)
+            stride (int or tuple): stride; follows the same convention as
+                kernel_size. If None, it defaults to kernel_size.
+            padding (int, tuple, list or None): padding; follows the same
+                convention as kernel_size. If pad_mode is "SAME_UPPER" or
+                "SAME_LOWER", padding may be None and is computed
+                automatically.
+            pad_mode (string): one of NOTSET, SAME_UPPER, or SAME_LOWER; the
+                default is NOTSET, which means explicit padding is used.
+                SAME_UPPER or SAME_LOWER pad the input so that the output
+                spatial size matches the input; if the required padding is odd,
+                the extra padding goes at the end for SAME_UPPER and at the
+                beginning for SAME_LOWER.
+        """
+        super(MaxPool2d, self).__init__(kernel_size, stride, padding, True,
+                                        pad_mode)
 
-        Returns:
-            a single concatenated tensor
-        '''
-        assert type(inputs) is list, 'Must be a list of Tensors'
-        ys = super(Concat, self).forward(flag, inputs)
-        return ys[0]
 
-    def backward(self, flag, dy):
-        '''Backward propagate gradients through this layer.
+class AvgPool2d(Pooling2d):
 
+    def __init__(self, kernel_size, stride=None, padding=0, pad_mode="NOTSET"):
+        """
         Args:
-            flag: same as Layer::backward()
-            dy(Tensor): the gradient tensors of y w.r.t objective loss
-        Return:
-            <dx, []>, dx is a list tensors for the gradient of the inputs; []
-               is an empty list.
-        '''
-        if type(dy) is tensor.Tensor:
-            dy = [dy]
-        assert type(dy) is list, 'Must be a list(Tensor)'
-        return super(Concat, self).backward(flag, dy)
-
-
-class Slice(Layer):
-    '''Slice the input tensor into multiple sub-tensors vertially (axis=0) or
-    horizontally (axis=1).
-
-    Args:
-        axis (int): 0 for slice rows; 1 for slice columns;
-        slice_point(list): positions along the axis to do slice; there are n-1
-            points for n sub-tensors;
-        input_sample_shape: input tensor sample shape
-    '''
-
-    def __init__(self, name, axis, slice_point, input_sample_shape=None):
-        super(Slice, self).__init__(name)
-        self.in_shape = input_sample_shape
-        self.axis = axis
-        self.conf.slice_conf.axis = axis
-        self.conf.slice_conf.slice_point.extend(slice_point)
-        if engine == "cudnn":
-            self.layer = _create_layer('singacuda', 'Slice')
-        else:
-            self.layer = _create_layer(engine, 'Slice')
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
+            kernel_size (int or tuple): kernel size for each spatial axis, e.g.,
+                (2, 3) for height 2 and width 3; if an int is given, the kernel
+                size becomes (int, int)
+            stride (int or tuple): stride; follows the same convention as
+                kernel_size. If None, it defaults to kernel_size.
+            padding (int, tuple, list or None): padding; follows the same
+                convention as kernel_size. If pad_mode is "SAME_UPPER" or
+                "SAME_LOWER", padding may be None and is computed
+                automatically.
+            pad_mode (string): one of NOTSET, SAME_UPPER, or SAME_LOWER; the
+                default is NOTSET, which means explicit padding is used.
+                SAME_UPPER or SAME_LOWER pad the input so that the output
+                spatial size matches the input; if the required padding is odd,
+                the extra padding goes at the end for SAME_UPPER and at the
+                beginning for SAME_LOWER.
+        """
+        super(AvgPool2d, self).__init__(kernel_size, stride, padding, False,
+                                        pad_mode)
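
A quick sketch of the pooling wrappers above (not part of the patch; when
stride is None, Pooling2d defaults it to the kernel size, giving
non-overlapping windows):

    # Hypothetical sketch of MaxPool2d with the default stride.
    from singa import layer, tensor, device

    dev = device.get_default_device()
    x = tensor.Tensor(shape=(1, 3, 8, 8), device=dev)
    x.gaussian(0.0, 1.0)

    pool = layer.MaxPool2d(2)   # kernel (2, 2), stride (2, 2)
    y = pool(x)
    print(y.shape)              # expected: (1, 3, 4, 4)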
 
-    def get_output_sample_shape(self):
-        out = []
-        for i in range(len(self.conf.slice_conf.slice_point) + 1):
-            out.append(self.layer.GetOutputSampleShapeAt(i))
-        return out
 
-    def forward(self, flag, x):
-        '''Slice the input tensor on the given axis.
+class MaxPool1d(Pooling2d):
+    """
+    Generate a Max Pooling 1d operator
+    """
 
+    def __init__(self, kernel_size, stride=None, padding=0, pad_mode="NOTSET"):
+        """
         Args:
-            flag: same as Layer::forward()
-            x: a single input tensor
+            kernel_size (int): kernel size along the sequence axis
+            stride (int): stride along the sequence axis. If None, it defaults
+                to kernel_size.
+            padding (int): padding along the sequence axis. If pad_mode is
+                "SAME_UPPER" or "SAME_LOWER", padding may be None and is
+                computed automatically.
+            pad_mode (string): one of NOTSET, SAME_UPPER, or SAME_LOWER; the
+                default is NOTSET, which means explicit padding is used.
+                SAME_UPPER or SAME_LOWER pad the input so that the output
+                spatial size matches the input; if the required padding is odd,
+                the extra padding goes at the end for SAME_UPPER and at the
+                beginning for SAME_LOWER.
+        """
+        if stride is None:
+            stride = kernel_size
+        super(MaxPool1d, self).__init__((1, kernel_size), (1, stride),
+                                        (0, padding), True, pad_mode)
 
-        Returns:
-            a list a output tensor
-        '''
-        if type(x) is tensor.Tensor:
-            x = [x]
-        assert type(x) is list, 'Must be a list of Tensor'
-        return super(Slice, self).forward(flag, x)
 
-    def backward(self, flag, grads):
-        '''Concate all grad tensors to generate a single output tensor
+class AvgPool1d(Pooling2d):
+    """
+    Generate an Avg Pooling 1d operator
+    """
 
+    def __init__(self, kernel_size, stride=None, padding=0, pad_mode="NOTSET"):
+        """
         Args:
-            flag: same as Layer::backward()
-            grads: a list of tensors, one for the gradient of one sliced tensor
-
-        Returns:
-            a single tensor for the gradient of the original user, and an empty
-                list.
-        '''
-        assert len(grads) > 1, 'There must be multiple gradients'
-        dxs, _ = super(Slice, self).backward(flag, grads)
-        return dxs[0], []
-
-
-class RNN(Layer):
-    '''Recurrent layer with 4 types of units, namely lstm, gru, tanh and relu.
-
-    Args:
-        hidden_size: hidden feature size, the same for all stacks of layers.
-        rnn_mode: decides the rnn unit, which could be one of 'lstm', 'gru',
-            'tanh' and 'relu', refer to cudnn manual for each mode.
-        num_stacks: num of stacks of rnn layers. It is different to the
-            unrolling seqence length.
-        input_mode: 'linear' convert the input feature x by by a linear
-            transformation to get a feature vector of size hidden_size;
-            'skip' does nothing but requires the input feature size equals
-            hidden_size
-        bidirection: True for bidirectional RNN
-        param_specs: config for initializing the RNN parameters.
-        input_sample_shape: includes a single integer for the input sample
-            feature size.
-    '''
-
-    def __init__(self,
-                 name,
-                 hidden_size,
-                 rnn_mode='lstm',
-                 dropout=0.0,
-                 num_stacks=1,
-                 input_mode='linear',
-                 bidirectional=False,
-                 param_specs=None,
-                 input_sample_shape=None):
-        assert cudnn_version >= 5005, 'RNN is supported since CUDNN V5.0.5; '\
-            'This version is %d' % cudnn_version
-        super(RNN, self).__init__(name)
-        conf = self.conf.rnn_conf
-        assert hidden_size > 0, 'Hidden feature size must > 0'
-        conf.hidden_size = hidden_size
-        assert rnn_mode in set(['lstm', 'gru', 'tanh', 'relu']),  \
-            'rnn mode %s is not available' % (rnn_mode)
-        conf.rnn_mode = rnn_mode
-        conf.num_stacks = num_stacks
-        conf.dropout = dropout
-        conf.input_mode = input_mode
-        conf.direction = 'unidirectional'
-        if bidirectional:
-            conf.direction = 'bidirectional'
-        # currently only has rnn layer implemented using cudnn
-        _check_engine(engine, ['cudnn'])
-        if param_specs is None:
-            param_specs = {
-                'name': name + '/weight',
-                'init': 'uniform',
-                'low': 0,
-                'high': 1
-            }
-        self.conf.param.extend([_construct_param_specs_from_dict(param_specs)])
-        self.param_specs.append(_construct_param_specs_from_dict(param_specs))
-
-        self.layer = singa_wrap.CudnnRNN()
-        if input_sample_shape is not None:
-            self.setup(input_sample_shape)
-
-    def forward(self, flag, inputs):
-        '''Forward inputs through the RNN.
+            kernel_size (int): kernel size along the sequence axis
+            stride (int): stride along the sequence axis. If None, it defaults
+                to kernel_size.
+            padding (int): padding along the sequence axis. If pad_mode is
+                "SAME_UPPER" or "SAME_LOWER", padding may be None and is
+                computed automatically.
+            pad_mode (string): one of NOTSET, SAME_UPPER, or SAME_LOWER; the
+                default is NOTSET, which means explicit padding is used.
+                SAME_UPPER or SAME_LOWER pad the input so that the output
+                spatial size matches the input; if the required padding is odd,
+                the extra padding goes at the end for SAME_UPPER and at the
+                beginning for SAME_LOWER.
+        """
+        if stride is None:
+            stride = kernel_size
+        super(AvgPool1d, self).__init__((1, kernel_size), (1, stride),
+                                        (0, padding), False, pad_mode)
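
The 1d pools above reuse the 2d machinery by treating a length-L sequence as a
1 x L image; a minimal sketch (assumption: NCHW layout with the height fixed
to 1):

    # Hypothetical sketch: AvgPool1d pools over the last axis only.
    from singa import layer, tensor, device

    dev = device.get_default_device()
    x = tensor.Tensor(shape=(2, 4, 1, 10), device=dev)   # (N, C, 1, L)
    x.gaussian(0.0, 1.0)

    pool = layer.AvgPool1d(kernel_size=2)   # stride defaults to kernel_size
    y = pool(x)
    print(y.shape)                          # expected: (2, 4, 1, 5)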
 
-        Args:
-            flag: True(kTrain) for training; False(kEval) for evaluation;
-                others values for future use.
-            inputs, <x1, x2,...xn, hx, cx>, where xi is the input tensor for the
-                i-th position, its shape is (batch_size, input_feature_length);
-                the batch_size of xi must >= that of xi+1; hx is the initial
-                hidden state of shape (num_stacks * bidirection?2:1, batch_size,
-                hidden_size). cx is the initial cell state tensor of the same
-                shape as hy. cx is valid for only lstm. For other RNNs there is
-                no cx. Both hx and cx could be dummy tensors without shape and
-                data.
-
-        Returns:
-            <y1, y2, ... yn, hy, cy>, where yi is the output tensor for the i-th
-                position, its shape is (batch_size,
-                hidden_size * bidirection?2:1). hy is the final hidden state
-                tensor. cx is the final cell state tensor. cx is only used for
-                lstm.
-        '''
-        assert self.has_setup, 'Must call setup() before forward()'
-        assert len(inputs) > 1, 'The input to RNN must include at '\
-            'least one input tensor '\
-            'and one hidden state tensor (could be a dummy tensor)'
-        tensors = []
-        for t in inputs:
-            assert isinstance(t, tensor.Tensor), \
-                'input must be py Tensor %s' % (type(t))
-            tensors.append(t.data)
-        if type(flag) is bool:
-            if flag:
-                flag = model_pb2.kTrain
-            else:
-                flag = model_pb2.kEval
-        y = self.layer.ForwardWithMultInputs(flag, tensors)
-        return tensor.from_raw_tensors(y)
 
-    def backward(self, flag, grad):
-        '''Backward gradients through the RNN.
+class RNN_Base(Layer):
 
-        Args:
-            flag, for future use.
-            grad, <dy1, dy2,...dyn, dhy, dcy>, where dyi is the gradient for the
-            i-th output, its shape is (batch_size, hidden_size*bidirection?2:1);
-                dhy is the gradient for the final hidden state, its shape is
-                (num_stacks * bidirection?2:1, batch_size,
-                hidden_size). dcy is the gradient for the final cell state.
-                cx is valid only for lstm. For other RNNs there is
-                no cx. Both dhy and dcy could be dummy tensors without shape and
-                data.
-
-        Returns:
-            <dx1, dx2, ... dxn, dhx, dcx>, where dxi is the gradient tensor for
-                the i-th input, its shape is (batch_size,
-                input_feature_length). dhx is the gradient for the initial
-                hidden state. dcx is the gradient for the initial cell state,
-                which is valid only for lstm.
-        '''
-        if type(flag) is bool:
-            if flag:
-                flag = model_pb2.kTrain
-            else:
-                flag = model_pb2.kEval
+    def step_forward(self,
+                     x=None,
+                     h=None,
+                     c=None,
+                     Wx=None,
+                     Wh=None,
+                     Bx=None,
+                     Bh=None,
+                     b=None):
+        raise NotImplementedError
 
-        tensors = []
-        for t in grad:
-            assert isinstance(t, tensor.Tensor), 'grad must be py Tensor'
-            tensors.append(t.data)
-        ret = self.layer.BackwardWithMultInputs(flag, tensors)
-        return tensor.from_raw_tensors(ret[0]), tensor.from_raw_tensors(ret[1])
 
+class RNN(RNN_Base):
+    """
+    Generate an RNN operator
+    """
 
-class LSTM(RNN):
+    def __init__(
+        self,
+        input_size,
+        hidden_size,
+        num_layers=1,
+        nonlinearity="tanh",
+        bias=True,
+        batch_first=False,
+        dropout=0,
+        bidirectional=False,
+    ):
+        """
+        Args:
+            input_size (int):  The number of expected features in the input x
+            hidden_size (int): The number of features in the hidden state h
+            num_layers (int):  Number of recurrent layers. Default: 1
+            nonlinearity (string): The non-linearity to use. Default: 'tanh'
+            bias (bool):  If False, then the layer does not use bias weights.
+                Default: True
+            batch_first (bool):  If True, then the input and output tensors
+                are provided as (batch, seq, feature). Default: False
+            dropout (float): If non-zero, introduces a Dropout layer on the
+                outputs of each RNN layer except the last layer, with dropout
+                probability equal to dropout. Default: 0
+            bidirectional (bool): If True, becomes a bidirectional RNN.
+                Default: False
+        """
+        super(RNN, self).__init__()
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.nonlinearity = nonlinearity
+        self.bias = bias
+        self.batch_first = batch_first
+        self.dropout = dropout
+        self.bidirectional = bidirectional
+
+    def initialize(self, xs, h0):
+        Wx_name = self.name + Layer.sep + 'Wx'
+        Wx_shape = (self.input_size, self.hidden_size)
+        self.Wx = Tensor(shape=Wx_shape,
+                         requires_grad=True,
+                         stores_grad=True,
+                         name=Wx_name)
+        self.Wx.gaussian(0.0, 1.0)
+
+        Wh_name = self.name + Layer.sep + 'Wh'
+        Wh_shape = (self.hidden_size, self.hidden_size)
+        self.Wh = Tensor(shape=Wh_shape,
+                         requires_grad=True,
+                         stores_grad=True,
+                         name=Wh_name)
+        self.Wh.gaussian(0.0, 1.0)
+
+        b_name = self.name + Layer.sep + 'b'
+        b_shape = (self.hidden_size,)
+        self.b = Tensor(shape=b_shape,
+                        requires_grad=True,
+                        stores_grad=True,
+                        name=b_name)
+        self.b.set_value(0.0)
+
+    def forward(self, xs, h0):
+        # xs: a tuple or list of input tensors
+        if not isinstance(xs, tuple):
+            xs = tuple(xs)
+        inputs = xs + (h0,)
+        self.device_check(*inputs)
+        # self.device_check(inputs[0], *self.params)
+        self.device_check(inputs[0], self.Wx, self.Wh, self.b)
+        batchsize = xs[0].shape[0]
+        out = []
+        h = self.step_forward(xs[0], h0, self.Wx, self.Wh, self.b)
+        out.append(h)
+        for x in xs[1:]:
+            assert x.shape[0] == batchsize
+            h = self.step_forward(x, h, self.Wx, self.Wh, self.b)
+            out.append(h)
+        return out, h
+
+    def step_forward(self, x, h, Wx, Wh, b):
+        y2 = autograd.matmul(h, Wh)
+        y1 = autograd.matmul(x, Wx)
+        y = autograd.add(y2, y1)
+        y = autograd.add_bias(y, b, axis=0)
+        if self.nonlinearity == "tanh":
+            y = autograd.tanh(y)
+        elif self.nonlinearity == "relu":
+            y = autograd.relu(y)
+        else:
+            raise ValueError
+        return y
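
To make the recurrence in step_forward above concrete, a sketch of unrolling
the RNN over a short sequence (not part of the patch; it assumes the Layer base
class forwards both inputs to initialize()/forward(), and the shapes are chosen
only for illustration):

    # Hypothetical sketch: each step computes h_t = tanh(x_t*Wx + h_{t-1}*Wh + b).
    from singa import layer, tensor, device

    dev = device.get_default_device()
    seq_len, batch, in_dim, hidden = 3, 2, 5, 4

    xs = []
    for _ in range(seq_len):
        x = tensor.Tensor(shape=(batch, in_dim), device=dev)
        x.gaussian(0.0, 1.0)
        xs.append(x)
    h0 = tensor.Tensor(shape=(batch, hidden), device=dev)
    h0.set_value(0.0)

    rnn = layer.RNN(in_dim, hidden)
    out, h = rnn(xs, h0)        # out is a list with one hidden state per step
    print(len(out), h.shape)    # expected: 3 (2, 4)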
 
-    def __init__(self,
-                 name,
-                 hidden_size,
-                 dropout=0.0,
-                 num_stacks=1,
-                 input_mode='linear',
-                 bidirectional=False,
-                 param_specs=None,
-                 input_sample_shape=None):
-        super(LSTM, self).__init__(name, hidden_size, 'lstm', dropout,
-                                   num_stacks, input_mode, bidirectional,
-                                   param_specs, input_sample_shape)
+    def get_params(self):
+        return {
+            self.Wx.name: self.Wx,
+            self.Wh.name: self.Wh,
+            self.b.name: self.b
+        }
 
+    def set_params(self, parameters):
+        self.set_attribute(self.Wx, parameters[self.Wx.name])

Review comment:
       why not use `self.Wx = parameters[self.Wx.name]`




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org