Posted to commits@singa.apache.org by wa...@apache.org on 2016/08/15 16:15:20 UTC

[07/22] incubator-singa git commit: SINGA-237 New documentation files for SINGA v1.0

SINGA-237 New documentation files for SINGA v1.0

Updated the comments in the Python files so that Sphinx autodoc can generate the Python API docs.

Fixed a bug in the optimizer which ignored the momentum value.

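For context, a minimal sketch of the Python-level usage that exercises the fixed momentum path; it mirrors the example added to optimizer.py further down, so the names and signatures come from this commit::

    from singa import optimizer
    from singa import tensor

    # momentum is now kept by the Optimizer instead of being dropped
    sgd = optimizer.SGD(lr=0.01, momentum=0.9, weight_decay=1e-4)
    p = tensor.Tensor((3, 5))
    p.uniform(-1, 1)
    g = tensor.Tensor((3, 5))
    g.gaussian(0, 0.01)
    sgd.apply(1, g, p, 'param')  # update p with lr=0.01 and momentum=0.9
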

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/33992c90
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/33992c90
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/33992c90

Branch: refs/heads/dev
Commit: 33992c90191021451c9286ad28ad6140b80a9bd9
Parents: bc822cd
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Aug 12 14:45:41 2016 +0800
Committer: Wei Wang <wa...@gmail.com>
Committed: Sun Aug 14 23:46:17 2016 +0800

----------------------------------------------------------------------
 cmake/Dependencies.cmake        |   5 +-
 doc/conf.py                     |  14 +-
 doc/docs/device.rst             |  29 +-
 doc/docs/index.rst              |   6 +
 doc/docs/initializer.rst        |  12 +
 doc/docs/layer.rst              |  14 +
 doc/docs/loss.rst               |   7 +
 doc/docs/metric.rst             |   8 +
 doc/docs/optimizer.rst          |  11 +
 doc/docs/tensor.md              |   7 -
 doc/docs/tensor.rst             |  30 ++
 doc/docs/utils.rst              |   6 +
 doc/index.rst                   |  28 +-
 examples/index.rst              |   6 +
 src/python/singa/device.py      |  31 ++
 src/python/singa/initializer.py |  86 ++++-
 src/python/singa/layer.py       | 417 ++++++++++++++----------
 src/python/singa/loss.py        | 105 +++++-
 src/python/singa/metric.py      |  49 ++-
 src/python/singa/optimizer.py   | 286 ++++++++--------
 src/python/singa/tensor.py      | 608 ++++++++++++++++++++++++++++++-----
 21 files changed, 1332 insertions(+), 433 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index eb729db..c03c81e 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -61,12 +61,13 @@ IF(USE_OPENCV)
     MESSAGE(STATUS "Found OpenCV_${OpenCV_VERSION} at ${OpenCV_INCLUDE_DIRS}")
     INCLUDE_DIRECTORIES(SYSTEM ${OpenCV_INCLUDE_DIRS})
     LIST(APPEND SINGA_LINKER_LIBS ${OpenCV_LIBRARIES})
-ENDIF()    
+ENDIF()
 
 #LIST(APPEND SINGA_LINKER_LIBS "/home/wangwei/local/lib/libopenblas.so")
 #MESSAGE(STATUS "link lib : " ${SINGA_LINKER_LIBS})
 
 IF(USE_PYTHON)
-    FIND_PACKAGE(PythonLibs REQUIRED)
+    FIND_PACKAGE(PythonLibs 2.7 REQUIRED)
+    FIND_PACKAGE(PythonInterp 2.7 REQUIRED)
     FIND_PACKAGE(SWIG 3.0 REQUIRED)
 ENDIF()

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/conf.py
----------------------------------------------------------------------
diff --git a/doc/conf.py b/doc/conf.py
index 20ba51a..9f52d16 100755
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -19,7 +19,8 @@
 import os
 import sys
 sys.path.insert(0, os.path.abspath('.'))
-sys.path.insert(1, '../src/python/singa/')
+sys.path.insert(1, os.path.abspath('../build/python'))
+#autodoc_mock_imports = ['singa.device', 'singa.tensor', 'singa.layer']
 
 # -- General configuration ------------------------------------------------
 from recommonmark.parser import CommonMarkParser
@@ -35,9 +36,8 @@ source_parsers = {
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = [
-'sphinx.ext.autodoc'
-]
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon']
+napoleon_google_docstring = True
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -50,7 +50,7 @@ source_suffix = ['.rst', '.md']
 
 # The encoding of source files.
 #
-# source_encoding = 'utf-8-sig'
+source_encoding = 'utf-8-sig'
 
 # The master toctree document.
 master_doc = 'index'
@@ -150,7 +150,7 @@ html_theme = 'sphinx_rtd_theme'
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
 #
-html_logo = '/singa.png'
+html_logo = 'image/singa.png'
 
 # The name of an image file (relative to this directory) to use as a favicon of
 # the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
@@ -203,7 +203,7 @@ html_static_path = ['_static']
 
 # If true, links to the reST sources are added to the pages.
 #
-html_show_sourcelink = False
+# html_show_sourcelink = True
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
 #

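The napoleon extension enabled above lets autodoc render Google-style docstrings with Args/Returns sections, which is the format adopted throughout this commit. A minimal, purely illustrative sketch of that format (the function below is hypothetical)::

    def scale_learning_rate(lr, factor=0.1):
        '''Scale the learning rate by a constant factor.

        Args:
            lr (float): the current learning rate.
            factor (float): the multiplicative factor.

        Returns:
            the scaled learning rate.
        '''
        return lr * factor
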
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/device.rst
----------------------------------------------------------------------
diff --git a/doc/docs/device.rst b/doc/docs/device.rst
index aa5defb..53faf48 100644
--- a/doc/docs/device.rst
+++ b/doc/docs/device.rst
@@ -2,7 +2,10 @@ Device
 =======
 
 
-The Device abstract represent a hardware device with memory and compuation units.
+The Device abstraction represents any hardware device with memory and computation units.
+All `Tensor operations <tensor.html>`_ are scheduled by the resident device for execution.
+Tensor memory is also managed by the device's memory manager. Therefore, optimizations
+of memory and execution are implemented in the Device class.
 
 Specific devices
 ----------------
@@ -13,24 +16,14 @@ Currently, SINGA has three Device implmentations,
 3. OpenclGPU for a GPU card which runs OpenCL code
 
 
-Create devices
----------------
-
 Python API
-~~~~~~~~~~
-
-.. autofunction:: device.create_cuda_gpus
-
-.. autofunction:: device.create_cuda_gpus_on
-
-.. autofunction:: device.create_cuda_gpu_on
-
-.. autofunction:: device.get_default_device
+----------
 
+.. automodule:: singa.device
+   :members: create_cuda_gpus, create_cuda_gpus_on, get_default_device
 
-The following code shows how to create devices,
 
-.. code:: python
+The following code provides examples of creating devices::
 
    from singa import device
    cuda = device.create_cuda_gpu_on(0)  # use GPU card of ID 0
@@ -39,9 +32,5 @@ The following code shows how to create devices,
    ary2 = device.create_cuda_gpus([0,2])  # create 2 devices on ID 0 and 2
 
 
-
 CPP API
-~~~~~~~
-
-
-
+---------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/index.rst
----------------------------------------------------------------------
diff --git a/doc/docs/index.rst b/doc/docs/index.rst
index 8a74976..2294054 100644
--- a/doc/docs/index.rst
+++ b/doc/docs/index.rst
@@ -6,4 +6,10 @@ English
    installation
    software_stack
    device
+   tensor
+   layer
+   initializer
+   loss
+   metric
+   optimizer
    examples

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/initializer.rst
----------------------------------------------------------------------
diff --git a/doc/docs/initializer.rst b/doc/docs/initializer.rst
new file mode 100644
index 0000000..a190702
--- /dev/null
+++ b/doc/docs/initializer.rst
@@ -0,0 +1,12 @@
+Initializer
+===========
+
+Python API
+----------
+
+.. automodule:: singa.initializer
+   :members:
+   :member-order: bysource
+
+CPP API
+--------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/layer.rst
----------------------------------------------------------------------
diff --git a/doc/docs/layer.rst b/doc/docs/layer.rst
new file mode 100644
index 0000000..62ef3c3
--- /dev/null
+++ b/doc/docs/layer.rst
@@ -0,0 +1,14 @@
+Layer
+======
+
+Python API
+-----------
+.. automodule:: singa.layer
+   :members:
+   :member-order: bysource
+   :show-inheritance:
+   :undoc-members:
+
+
+CPP API
+--------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/loss.rst
----------------------------------------------------------------------
diff --git a/doc/docs/loss.rst b/doc/docs/loss.rst
new file mode 100644
index 0000000..27872dd
--- /dev/null
+++ b/doc/docs/loss.rst
@@ -0,0 +1,7 @@
+Loss
+=========
+
+
+.. automodule:: singa.loss
+   :members:
+   :show-inheritance:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/metric.rst
----------------------------------------------------------------------
diff --git a/doc/docs/metric.rst b/doc/docs/metric.rst
new file mode 100644
index 0000000..35fa24e
--- /dev/null
+++ b/doc/docs/metric.rst
@@ -0,0 +1,8 @@
+Metric
+=========
+
+
+.. automodule:: singa.metric
+   :members:
+   :show-inheritance:
+   :member-order: bysource

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/optimizer.rst
----------------------------------------------------------------------
diff --git a/doc/docs/optimizer.rst b/doc/docs/optimizer.rst
new file mode 100644
index 0000000..486c01e
--- /dev/null
+++ b/doc/docs/optimizer.rst
@@ -0,0 +1,11 @@
+Optimizer
+=========
+
+
+.. automodule:: singa.optimizer
+   :members:
+   :member-order: bysource
+   :show-inheritance:
+   :undoc-members:
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/tensor.md
----------------------------------------------------------------------
diff --git a/doc/docs/tensor.md b/doc/docs/tensor.md
deleted file mode 100644
index eaf8362..0000000
--- a/doc/docs/tensor.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# Tensor
-
-
-##
-
-
-##

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/tensor.rst
----------------------------------------------------------------------
diff --git a/doc/docs/tensor.rst b/doc/docs/tensor.rst
new file mode 100644
index 0000000..ff6142e
--- /dev/null
+++ b/doc/docs/tensor.rst
@@ -0,0 +1,30 @@
+Tensor
+========
+
+Each Tensor instance is a multi-dimensional array allocated on a specific
+Device instance. Tensor instances store variables and provide
+linear algebra operations over different types of hardware devices,
+transparently to users. Note that users need to make sure the tensor operands
+are allocated on the same device, except for copy functions.
+
+
+Tensor implementation
+---------------------
+
+SINGA has three different sets of implementations of Tensor functions, one for each
+type of Device.
+
+* 'tensor_math_cpp.h' implements operations using Cpp (with CBLAS) for CppCPU devices.
+* 'tensor_math_cuda.h' implements operations using Cuda (with cuBLAS) for CudaGPU devices.
+* 'tensor_math_opencl.h' implements operations using OpenCL for OpenclGPU devices.
+
+Python API
+----------
+
+
+.. automodule:: singa.tensor
+   :members:
+
+
+CPP API
+---------

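To illustrate the same-device requirement above, a small sketch using only calls that appear elsewhere in this commit; the shapes are illustrative::

    from singa import device
    from singa import tensor

    cuda = device.create_cuda_gpu()
    x = tensor.Tensor((3, 5), cuda)  # allocated on the CudaGPU device
    x.uniform(0, 1)
    y = tensor.Tensor((3, 5), cuda)  # operands must reside on the same device
    y.gaussian(0, 0.01)
    z = x - y                        # executed by the resident CudaGPU device
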
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/docs/utils.rst
----------------------------------------------------------------------
diff --git a/doc/docs/utils.rst b/doc/docs/utils.rst
new file mode 100644
index 0000000..5306719
--- /dev/null
+++ b/doc/docs/utils.rst
@@ -0,0 +1,6 @@
+Misc.
+=========
+
+
+.. automodule:: singa.utils
+   :members:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/doc/index.rst
----------------------------------------------------------------------
diff --git a/doc/index.rst b/doc/index.rst
index ec727b1..50c65d7 100755
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -7,9 +7,9 @@ Welcome to Apache Singa
 =======================
 
 Recent News
-===========
+-----------
 
-* The **third release** is now available, 20 April, 2016. `Download SINGA v0.3.0 <downloads.html>`_ 
+* The **third release** is now available, 20 April, 2016. `Download SINGA v0.3.0 <downloads.html>`_
 
 * The **second release** is now available, 14 Jan, 2016. `Download SINGA v0.2.0 <downloads.html>`_.
 
@@ -34,7 +34,7 @@ Recent News
 * SINGA has been accepted by `Apache Incubator <http://incubator.apache.org/>`_, 17 March, 2015.
 
 Getting Started
-===============
+---------------
 * The `Introduction <docs/overview.html>`_ page gives an overview of SINGA.
 
 * The `Installation <docs/installation.html>`_ guide describes details on downloading and installing SINGA.
@@ -42,7 +42,7 @@ Getting Started
 * Please follow the `Quick Start <docs/quick-start.html>`_ guide to run simple applications on SINGA.
 
 Documentation
-=============
+-------------
 
 * Documentations are listed `here <docs.html>`_.
 
@@ -51,8 +51,8 @@ Documentation
 * Research publication list is available `here <http://www.comp.nus.edu.sg/~dbsystem/singa/research/publication/>`_.
 
 How to contribute
-=================
-  
+----------------------
+
 * Please subscribe to our development mailing list dev-subscribe@singa.incubator.apache.org.
 
 * If you find any issues using SINGA, please report it to the `Issue Tracker <https://issues.apache.org/jira/browse/singa>`_.
@@ -62,17 +62,17 @@ How to contribute
 More details on contributing to SINGA is described `here <develop/how-contribute.html>`_ .
 
 Citing SINGA
-============
+------------
 
 Please cite the following two papers if you use SINGA in your research:
 
 * B. C. Ooi, K.-L. Tan, S. Wang, W. Wang, Q. Cai, G. Chen, J. Gao, Z. Luo, A. K. H. Tung, Y. Wang, Z. Xie, M. Zhang, and K. Zheng. `SINGA: A distributed deep learning platform <http://www.comp.nus.edu.sg/~ooibc/singaopen-mm15.pdf>`_. ACM Multimedia (Open Source Software Competition) 2015 (`BibTex <http://www.comp.nus.edu.sg/~dbsystem/singa//assets/file/bib-oss.txt>`_).
 
-* W. Wang, G. Chen, T. T. A. Dinh, B. C. Ooi, K.-L.Tan, J. Gao, and S. Wang. `SINGA: putting deep learning in the hands of multimedia users <http://www.comp.nus.edu.sg/~ooibc/singa-mm15.pdf>`_. ACM Multimedia 2015 (`BibTex <http://www.comp.nus.edu.sg/~dbsystem/singa//assets/file/bib-singa.txt>`_, `Slides <files/mm2015.ppt>`_). 
+* W. Wang, G. Chen, T. T. A. Dinh, B. C. Ooi, K.-L.Tan, J. Gao, and S. Wang. `SINGA: putting deep learning in the hands of multimedia users <http://www.comp.nus.edu.sg/~ooibc/singa-mm15.pdf>`_. ACM Multimedia 2015 (`BibTex <http://www.comp.nus.edu.sg/~dbsystem/singa//assets/file/bib-singa.txt>`_, `Slides <files/mm2015.ppt>`_).
 
 .. toctree::
    :hidden:
-   
+
    downloads
    docs
 
@@ -85,25 +85,25 @@ Please cite the following two papers if you use SINGA in your research:
    develop/how-contribute
    develop/contribute-code
    develop/contribute-docs
-   
+
 .. toctree::
    :hidden:
    :maxdepth: 2
    :caption: Community
-   
+
    community/source-repository
    community/mail-lists
    community/issue-tracking
    community/team-list
-   
+
 
 
 License
-=======
+----------
 SINGA is released under `Apache License Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0>`_.
 
 Disclaimers
-===========
+-----------
 
 Apache SINGA is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/examples/index.rst
----------------------------------------------------------------------
diff --git a/examples/index.rst b/examples/index.rst
new file mode 100644
index 0000000..d6faf5d
--- /dev/null
+++ b/examples/index.rst
@@ -0,0 +1,6 @@
+.. toctree::
+
+   char-rnn/README
+   imagenet/README
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/src/python/singa/device.py
----------------------------------------------------------------------
diff --git a/src/python/singa/device.py b/src/python/singa/device.py
index aff3587..eff6783 100644
--- a/src/python/singa/device.py
+++ b/src/python/singa/device.py
@@ -68,21 +68,52 @@ def device_query(id, verbose=False):
 
 
 def create_cuda_gpus(num):
+    '''Create a list of CudaGPU devices.
+
+    Args:
+        num (int): number of devices to create.
+    Returns:
+        a list of swig converted CudaGPU devices.
+    '''
+
     return singa.Platform.CreateCudaGPUs(num)
 
 
 def create_cuda_gpu():
+    '''Create a single CudaGPU device.
+
+    Returns:
+        a swig converted CudaGPU device.
+    '''
+
     return singa.Platform.CreateCudaGPUs(1)[0]
 
 
 def create_cuda_gpus_on(device_ids):
+    '''Create a list of CudaGPU devices.
+
+    Args:
+        device_ids (list): a list of GPU card IDs.
+
+    Returns:
+        a list of swig converted CudaGPU devices.
+    '''
     return singa.Platform.CreateCudaGPUsOn(device_ids)
 
 
 def create_cuda_gpu_on(device_id):
+    '''Create a CudaGPU device on the given device ID.
+
+    Args:
+        device_id (int): GPU card ID.
+
+    Returns:
+        a swig converted CudaGPU device.
+    '''
     devices = create_cuda_gpus_on([device_id])
     return devices[0]
 
 
 def get_default_device():
+    '''Get the default host device, which is a CppCPU device.'''
     return singa.Platform.GetDefaultDevice()

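A small sketch combining the device helpers above with the Tensor constructor used in the layer.py example later in this commit; the card ID is illustrative::

    from singa import device
    from singa import tensor

    host = device.get_default_device()    # the default CppCPU host device
    dev = device.create_cuda_gpu_on(0)    # swig converted CudaGPU on card 0
    x = tensor.Tensor((3, 32, 32), dev)   # allocate the tensor on that device
    x.uniform(-1, 1)
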
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/src/python/singa/initializer.py
----------------------------------------------------------------------
diff --git a/src/python/singa/initializer.py b/src/python/singa/initializer.py
index 15caed3..277fd2f 100644
--- a/src/python/singa/initializer.py
+++ b/src/python/singa/initializer.py
@@ -15,29 +15,113 @@
 # specific language governing permissions and limitations
 # under the License.
 # =============================================================================
-"""Popular initialization methods for parameter values (Tensor ojects)"""
+'''Popular initialization methods for parameter values (Tensor objects).
+
+Example usages::
+
+    from singa import tensor
+    from singa import initializer
+
+    x = tensor.Tensor((3, 5))
+    initializer.xavier(x)
+'''
 
 import math
 
 
+'''
+TODO(wangwei) update the uniform and gaussian initializers
+
+def uniform(t, fan_in=0, fan_out=0):
+    typically, for conv layer weight: fan_in = nb_filter * kh * kw,
+    fan_out = nb_channel * kh * kw
+    for dense layer weight, fan_in = input_feature_length,
+    fan_out = output_feature_length
+    # Ref: [Bengio and Glorot 2010]: Understanding the difficulty of
+    training deep feedforward neuralnetworks.
+
+    assert fan_in >0 or fan_out > 0, \
+        'fan_in and fan_out cannot be 0 at the same time'
+    avg = 1
+    if fan_in * fan_out == 0:
+      avg = 2
+    x = math.sqrt(3.0f * avg / (fan_in + fan_out))
+    t.uniform(-x, x)
+
+
+def gaussian(t, fan_in=0, fan_out=0):
+    typically, for conv layer weight: fan_in = nb_filter * kh * kw,
+    fan_out = nb_channel * kh * kw
+    for dense layer weight, fan_in = input_feature_length,
+    fan_out = output_feature_length
+
+    Ref Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Delving Deep into
+    Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
+
+    assert fan_in >0 or fan_out > 0, \
+        'fan_in and fan_out cannot be 0 at the same time'
+    avg = 1
+    if fan_in * fan_out == 0:
+      avg = 2
+    std = math.sqrt(2.0f * avg / (fan_in + fan_out))
+    t.gaussian(0, std)
+'''
+
+
 def uniform(t, low=0, high=1):
+    '''Initialize the parameter values following a uniform distribution.
+
+    Args:
+        t (Tensor): the parameter tensor
+        low (float): lower bound
+        high (float): upper bound
+    '''
     t.uniform(low, high)
 
 
 def gaussian(t, mean=0, std=0.01):
+    '''Initialize the parameter values following a Gaussian distribution.
+
+    Args:
+        t (Tensor): the parameter tensor
+        mean (float): mean of the distribution
+        std (float): standard deviation
+    '''
     t.gaussian(mean, std)
 
 
 def xavier(t):
+    '''Initialize the matrix parameter following a uniform distribution over
+    [-sqrt(6/(fan_in + fan_out)), sqrt(6/(fan_in + fan_out))].
+
+    Args:
+        t (Tensor): the parameter tensor
+    '''
+
     scale = math.sqrt(6.0 / (t.shape[0] + t.shape[1]))
     t.uniform(-scale, scale)
 
 
 def glorot(t):
+    '''Initialize the matrix parameter following a Gaussian distribution with
+    mean = 0 and std = sqrt(2.0 / (nb_row + nb_col))
+
+    Args:
+        t (Tensor): the parameter tensor
+    '''
     scale = math.sqrt(2.0 / (t.shape[0] + t.shape[1]))
     t.gaussian(0, 1)
     t *= scale
 
 
 def msra(t):
+    '''Initialize the matrix parameter following a Gaussian distribution with
+    mean = 0, std = math.sqrt(2.0 / nb_row).
+
+    Ref [He, Zhang, Ren and Sun 2015]: Specifically accounts for ReLU
+    nonlinearities.
+
+    Args:
+        t (Tensor): the parameter tensor
+    '''
     t.gaussian(0, math.sqrt(2.0 / t.shape[0]))

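A short sketch extending the module example above with the other initializers documented in this file; the shape and constants are illustrative only::

    from singa import tensor
    from singa import initializer

    w = tensor.Tensor((3, 5))
    initializer.uniform(w, low=-0.05, high=0.05)  # explicit bounds
    initializer.gaussian(w, mean=0, std=0.01)     # overwrite with Gaussian values
    initializer.msra(w)                           # std = sqrt(2 / nb_row), for ReLU nets
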
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/src/python/singa/layer.py
----------------------------------------------------------------------
diff --git a/src/python/singa/layer.py b/src/python/singa/layer.py
index c8c8c05..0759716 100644
--- a/src/python/singa/layer.py
+++ b/src/python/singa/layer.py
@@ -14,7 +14,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-""" Python layers which wraps the C++ layers by providing easy to construct APIs
+""" Python layers wrap the C++ layers to provide simpler construction APIs.
+
+Example usages::
+
+    from singa import layer
+    from singa import tensor
+    from singa import device
+    from singa.proto.model_pb2 import kTrain
+
+    layer.engine = 'cudnn'  # to use cudnn layers
+    dev = device.create_cuda_gpu()
+
+    # create a convolution layer
+    conv = layer.Conv2D('conv', 32, 3, 1, pad=1, input_sample_shape=(3, 32, 32))
+    conv.to_device(dev)  # move the layer data onto a CudaGPU device
+    x = tensor.Tensor((3, 32, 32), dev)
+    x.uniform(-1, 1)
+    y = conv.forward(kTrain, x)
+
+    dy = tensor.Tensor()
+    dy.reset_like(y)
+    dy.set_value(0.1)
+    # dp is a list of tensors for parameter gradients
+    dx, dp = conv.backward(kTrain, dy)
 """
 
 from sets import Set
@@ -22,23 +45,37 @@ from . import singa_wrap
 from .proto import model_pb2
 import tensor
 
-# engine could be 'cudnn', 'singa', which is used to create layers.
-# e.g., CudnnConvolution layer is identified by 'cudnn_convolution'
-# Convolution layer is identified by 'singa_convolution'
-# engine is case insensitive
+
 engine = 'cudnn'
+'''engine is the prefix of the layer identifier.
+
+The value could be one of [**'cudnn', 'singacpp', 'singacuda', 'singacl'**], for
+layers implemented using the cudnn library, Cpp, Cuda and OpenCL respectively.
+For example, the CudnnConvolution layer is identified by 'cudnn_convolution',
+and 'singacpp_convolution' identifies the Cpp Convolution layer.
+Some layers are implemented using only Tensor functions and are therefore
+transparent to the underlying devices. These layers have multiple identifiers,
+e.g., singacpp_dropout, singacuda_dropout and singacl_dropout all refer to the
+Dropout layer.
+
+engine is case insensitive. Each Python layer creates the specific layer
+implementation according to the engine attribute.
+'''
 
 
 class Layer(object):
-    """Base Python layer class.
+    '''Base Python layer class.
 
-    Usages:
-        1.  construct layer without input_sample_shapes, goto 2;
-            construct layer with input_sample_shapes, goto 3;
+    Typically, the life cycle of a layer instance includes:
+        1. construct layer without input_sample_shapes, goto 2;
+           construct layer with input_sample_shapes, goto 3;
         2. call setup to create the parameters and setup other meta fields
         3. call forward or access layer members
         4. call backward and get parameters for update
-    """
+
+    Args:
+        name (str): layer name
+    '''
 
     def __init__(self, name, **kwargs):
         self.layer = None  # layer converted by swig
@@ -49,20 +86,24 @@ class Layer(object):
         self.has_setup = False
 
     def param_names(self):
+        '''
+        Returns:
+            a list of strings, one for the name of one parameter Tensor
+        '''
         names = []
         for x in self.param_specs:
             names.append(x['name'])
         return names
 
     def setup(self, in_shapes):
-        """Call the C++ setup function to create params and set some meta data.
+        '''Call the C++ setup function to create params and set some meta data.
 
         Args:
             in_shapes: if the layer accepts a single input Tensor, in_shapes is
                 a single tuple specifying the inpute Tensor shape; if the layer
                 accepts multiple input Tensor (e.g., the concatenation layer),
-                in_shapes is a tuple of tuples, each for one input Tensor shape
-        """
+                in_shapes is a tuple of tuples, each for one input Tensor shape
+        '''
         if self.has_setup:
             return
         self.layer.Setup(list(in_shapes),
@@ -70,54 +111,92 @@ class Layer(object):
         self.has_setup = True
 
     def get_output_sample_shape(self):
+        '''Called after setup to get the shape of the output sample(s).
+
+        Returns:
+            a tuple for a single output Tensor or a list of tuples if this layer
+            has multiple outputs
+        '''
         assert self.has_setup, \
             'Must call setup() before get_output_sample_shape()'
         return self.layer.GetOutputSampleShape()
 
     def param_values(self):
-        """Return param value tensors.
+        '''Return param value tensors.
 
-        Do not store these tensors as layer members because cpp Tensor could be
-        moved onto diff devices due to the change of layer device. However, the
-        py tensors would not update its internal cpp tensor automatically.
-        """
+        Parameter tensors are not stored as layer members because the cpp Tensor
+        could be moved onto different devices when the layer device changes,
+        which would result in inconsistency.
+
+        Returns:
+            a list of tensors, one for each parameter
+        '''
         return tensor.from_raw_tensors(self.layer.param_values())
 
-    def forward(self, flag, input):
+    def forward(self, flag, x):
         '''Forward propagate through this layer.
 
         Args:
-            flag, kTrain or kEval
-            input, an input tensor
+            flag (int): kTrain or kEval
+            x (Tensor or list<Tensor>): an input tensor if the layer is
+                connected from a single layer; a list of tensors if the layer
+                is connected from multiple layers.
 
         Return:
-            a tensor for the transformed feature
+            a tensor if the layer is connected to a single layer; a list of
+            tensors if the layer is connected to multiple layers.
         '''
         assert self.has_setup, 'Must call setup() before forward()'
-        assert isinstance(input, tensor.Tensor), 'input must be py Tensor'
-        y = self.layer.Forward(flag, input.singa_tensor)
-        return tensor.from_raw_tensor(y)
+        if type(x) == list:
+            xs = []
+            for t in x:
+                xs.append(t.singa_tensor)
+        else:
+            assert isinstance(x, tensor.Tensor), \
+                'input must be a Tensor or a list of Tensor'
+            xs = x.singa_tensor
+        y = self.layer.Forward(flag, xs)
+        if type(y) == list:
+            return tensor.from_raw_tensors(y)
+        else:
+            return tensor.from_raw_tensor(y)
 
-    def backward(self, flag, grad):
-        '''Backward propagate through this layer.
+    def backward(self, flag, dy):
+        '''Backward propagate gradients through this layer.
 
         Args:
-            flag, for future use.
-            grad, gradient of the returned values of the forward function.
-
+            flag (int): for future use.
+            dy (Tensor or list<Tensor>): the gradient tensor(s) of y w.r.t. the
+                objective loss
         Return:
-            <dx, <dp1, dp2..>>, dx is the gradient of the input of the
-            forward function, dpi is the gradient of the i-th parameter
+            <dx, <dp1, dp2..>>, where dx is a (list of) tensor(s) for the
+            gradient of x, and dpi is the gradient of the i-th parameter
         '''
-        assert isinstance(grad, tensor.Tensor), 'grad must be py Tensor'
-        ret = self.layer.Backward(flag, grad.singa_tensor)
-        return tensor.from_raw_tensor(ret[0]), tensor.from_raw_tensors(ret[1])
+        if type(dy) == list:
+            dys = []
+            for t in dy:
+                dys.append(t.singa_tensor)
+        else:
+            assert isinstance(dy, tensor.Tensor), \
+                'the input must be a Tensor or a set of Tensor'
+            dys = dy.singa_tensor
+        ret = self.layer.Backward(flag, dys)
+        if type(ret[0]) == list:
+            dxs = tensor.from_raw_tensors(ret[0])
+        else:
+            dxs = tensor.from_raw_tensor(ret[0])
+        return dxs, tensor.from_raw_tensors(ret[1])
 
     def to_device(self, device):
+        '''Move layer state tensors onto the given device.
+
+        Args:
+            device: swig converted device, created using singa.device
+        '''
         self.layer.ToDevice(device)
 
     def as_type(self, dtype):
-        self.layer.AsType(dtype)
+        pass
 
     def __copy__(self):
         pass
@@ -127,43 +206,42 @@ class Layer(object):
 
 
 class Conv2D(Layer):
+    """Construct a layer for 2D convolution.
 
+    Args:
+        nb_kernels (int): num of kernels, i.e., the number of channels of the
+            output Tensor
+        kernel: an integer or a pair of integers for kernel height and width
+        stride: an integer or a pair of integers for stride height and width
+        border_mode (string): padding mode, case in-sensitive,
+            'valid' -> padding is 0 for height and width
+            'same' -> padding is half of the kernel (floor); the kernel size
+            must be an odd number.
+        cudnn_prefer (string): the preferred algorithm for cudnn convolution
+            which could be 'fatest', 'autotune', 'limited_workspace' and
+            'no_workspace'
+        data_format (string): either 'NCHW' or 'NHWC'
+        use_bias (bool): True or False
+        pad: an integer or a pair of integers for padding height and width
+        W_specs (dict): used to specify the weight matrix specs, fields
+            include,
+            'name' for parameter name
+            'lr_mult' for learning rate multiplier
+            'decay_mult' for weight decay multiplier
+            'init' for init method, which could be 'gaussian', 'uniform',
+            'xavier' and ''
+            'std', 'mean', 'high', 'low' for corresponding init methods
+            TODO(wangwei) 'clamp' for gradient constraint, value is scalar
+            'regularizer' for regularization, currently support 'l2'
+        b_specs (dict): hyper-parameters for bias vector, similar as W_specs
+        name (string): layer name.
+        input_sample_shape: 3d tuple for the shape of the input Tensor
+            without the batchsize, e.g., (channel, height, width) or
+            (height, width, channel)
+    """
     def __init__(self, name, nb_kernels, kernel=3, stride=1, border_mode='same',
                  cudnn_prefer='fatest', data_format='NCHW',
                  use_bias=True, W_specs=None, b_specs=None,
                  pad=None, input_sample_shape=None):
-        """Construct a layer for 2D convolution.
-
-        Args:
-            nb_kernels (int): num of the channels (kernels) of the input Tensor
-            kernel: an integer or a pair of integers for kernel height and width
-            stride: an integer or a pair of integers for stride height and width
-            border_mode (string): padding mode, case in-sensitive,
-                'valid' -> padding is 0 for height and width
-                'same' -> padding is half of the kernel (floor),
-                    the kernel must be odd number.
-            cudnn_prefer (string): the preferred algorithm for cudnn convolution
-                which could be 'fatest', 'autotune', 'limited_workspace' and
-                'no_workspace'
-            data_format (string): either 'NCHW' or 'NHWC'
-            use_bias (bool): True or False
-            pad: an integer or a pair of integers for padding height and width
-            W_specs (dict): used to specify the weight matrix specs, fields
-                include,
-                'name' for parameter name
-                'lr_mult' for learning rate multiplier
-                'decay_mult' for weight decay multiplier
-                'init' for init method, which could be 'gaussian', 'uniform',
-                'xavier' and ''
-                'std', 'mean', 'high', 'low' for corresponding init methods
-                TODO(wangwei) 'clamp' for gradient constraint, value is scalar
-                'regularizer' for regularization, currently support 'l2'
-            b_specs (dict): hyper-parameters for bias vector, similar as W_specs
-            name (string): layer name.
-            input_sample_shape: 3d tuple for the shape of the input Tensor
-                without the batchsize, e.g., (channel, height, width) or
-                (height, width, channel)
-        """
         super(Conv2D, self).__init__(name)
         assert data_format == 'NCHW', 'Not supported data format: %s ' \
             'only "NCHW" is enabled currently' % (data_format)
@@ -195,19 +273,19 @@ class Conv2D(Layer):
 
 
 class Conv1D(Conv2D):
+    """Construct a layer for 1D convolution.
+
+    Most of the args are the same as those for Conv2D except that kernel,
+    stride and pad are scalars instead of tuples.
+    input_sample_shape is a tuple with a single value for the input feature
+    length.
+    """
 
     def __init__(self, name, nb_kernels, kernel=3, stride=1,
                  border_mode='same', cudnn_prefer='fatest',
                  use_bias=True, W_specs={'init': 'Xavier'},
                  b_specs={'init': 'Constant', 'value': 0}, pad=None,
                  input_sample_shape=None):
-        """Construct a layer for 1D convolution.
-
-        Most of the args are the same as those for Conv2D except the kernel,
-        stride, pad, which is a scalar instead of a tuple.
-        input_sample_shape is a tuple with a single value for the input feature
-        length
-        """
         pad = None
         if pad is not None:
             pad = (0, pad)
@@ -227,7 +305,15 @@ class Conv1D(Conv2D):
 
 
 class Pooling2D(Layer):
+    '''2D pooling layer providing max/avg pooling.
+
+    All args are the same as those for Conv2D, except the following one
 
+    Args:
+        mode: pooling type, model_pb2.PoolingConf.MAX or
+            model_pb2.PoolingConf.AVE
+
+    '''
     def __init__(self, name, mode, kernel=3, stride=2, border_mode='same',
                  pad=None, data_format='NCHW', input_sample_shape=None):
         super(Pooling2D, self).__init__(name)
@@ -312,28 +398,26 @@ class AvgPooling1D(AvgPooling2D):
 
 
 class BatchNormalization(Layer):
-    # TODO(wangwei) add mode and epsilon arguments
+    """Batch-normalization.
 
+    Args:
+        momentum (float): for running average mean and variance.
+        beta_specs (dict): dictionary includes the fields for the beta
+            param:
+            'name' for parameter name
+            'lr_mult' for learning rate multiplier
+            'decay_mult' for weight decay multiplier
+            'init' for init method, which could be 'gaussian', 'uniform',
+            'xavier' and ''
+            'std', 'mean', 'high', 'low' for corresponding init methods
+            'clamp' for gradient constraint, value is scalar
+            'regularizer' for regularization, currently support 'l2'
+        gamma_specs (dict): similar to beta_specs, but for the gamma param.
+        name (string): layer name
+        input_sample_shape (tuple): with at least one integer
+    """
     def __init__(self, name, momentum=0.9,
                  beta_specs=None, gamma_specs=None, input_sample_shape=None):
-        """Batch-normalization.
-
-        Args:
-            momentum (float): for running average mean and variance.
-            beta_specs (dict): dictionary includes the fields for the beta
-                param:
-                'name' for parameter name
-                'lr_mult' for learning rate multiplier
-                'decay_mult' for weight decay multiplier
-                'init' for init method, which could be 'gaussian', 'uniform',
-                'xavier' and ''
-                'std', 'mean', 'high', 'low' for corresponding init methods
-                'clamp' for gradient constraint, value is scalar
-                'regularizer' for regularization, currently support 'l2'
-            gamma_specs (dict): similar to beta_specs, but for the gamma param.
-            name (string): layer name
-            input_sample_shape (tuple): with at least one integer
-        """
         super(BatchNormalization, self).__init__(name)
         conf = self.conf.batchnorm_conf
         conf.factor = momentum
@@ -362,16 +446,17 @@ class BatchNormalization(Layer):
 
 
 class LRN(Layer):
+    """Local response normalization.
+
+    Args:
+        size (int): # of channels to be crossed for
+            normalization.
+        mode (string): 'cross_channel'
+        input_sample_shape (tuple): 3d tuple, (channel, height, width)
+    """
+
     def __init__(self, name, size=5, alpha=1, beta=0.75, mode='cross_channel',
                  k=1, input_sample_shape=None):
-        """Local response normalization.
-
-        Args:
-            size (int): # of channels to be crossed
-                normalization.
-            mode (string): 'cross_channel'
-            input_sample_shape (tuple): 3d tuple, (channel, height, width)
-        """
         super(LRN, self).__init__(name)
         conf = self.conf.lrn_conf
         conf.local_size = size
@@ -388,29 +473,28 @@ class LRN(Layer):
 
 
 class Dense(Layer):
+    """Apply linear/affine transformation, also called inner-product or
+    fully connected layer.
 
+    Args:
+        num_output (int): output feature length.
+        use_bias (bool): add a bias vector or not to the transformed feature
+        W_specs (dict): specs for the weight matrix
+            'name' for parameter name
+            'lr_mult' for learning rate multiplier
+            'decay_mult' for weight decay multiplier
+            'init' for init method, which could be 'gaussian', 'uniform',
+            'xavier' and ''
+            'std', 'mean', 'high', 'low' for corresponding init methods
+            'clamp' for gradient constraint, value is scalar
+            'regularizer' for regularization, currently support 'l2'
+        b_specs (dict): specs for the bias vector, same fields as W_specs.
+        W_transpose (bool): if true, output=x*W.T+b;
+        input_sample_shape (tuple): input feature length
+    """
     def __init__(self, name, num_output, use_bias=True,
                  W_specs=None, b_specs=None,
                  W_transpose=True, input_sample_shape=None):
-        """Apply linear/affine transformation, also called inner-product or
-        fully connected layer.
-
-        Args:
-            num_output (int): output feature length.
-            use_bias (bool): add a bias vector or not to the transformed feature
-            W_specs (dict): specs for the weight matrix
-                'name' for parameter name
-                'lr_mult' for learning rate multiplier
-                'decay_mult' for weight decay multiplier
-                'init' for init method, which could be 'gaussian', 'uniform',
-                'xavier' and ''
-                'std', 'mean', 'high', 'low' for corresponding init methods
-                'clamp' for gradient constraint, value is scalar
-                'regularizer' for regularization, currently support 'l2'
-            b_specs (dict): specs for the bias vector, same fields as W_specs.
-            W_transpose (bool): if true, output=x*W.T+b;
-            input_sample_shape (tuple): input feature length
-        """
         super(Dense, self).__init__(name)
         conf = self.conf.dense_conf
         conf.num_output = num_output
@@ -435,14 +519,14 @@ class Dense(Layer):
 
 
 class Dropout(Layer):
+    """Dropout layer.
 
-    def __init__(self, name, p=0.5, input_sample_shape=None):
-        """Droput layer.
+    Args:
+        p (float): probability for dropping out the element, i.e., set to 0
+        name (string): layer name
+    """
 
-        Args:
-            p (float): probability for dropping out the element, i.e., set to 0
-            name (string): layer name
-        """
+    def __init__(self, name, p=0.5, input_sample_shape=None):
         super(Dropout, self).__init__(name)
         conf = self.conf.dropout_conf
         conf.dropout_ratio = p
@@ -456,15 +540,14 @@ class Dropout(Layer):
 
 
 class Activation(Layer):
+    """Activation layers.
 
+    Args:
+        name (string): layer name
+        mode (string): 'relu', 'sigmoid', or 'tanh'
+        input_sample_shape (tuple): shape of a single sample
+    """
     def __init__(self, name, mode='relu', input_sample_shape=None):
-        """Activation layers.
-
-        Args:
-            name (string): layer name
-            mode (string): 'relu', 'sigmoid', or 'tanh'
-            input_sample_shape (tuple): shape of a single sample
-        """
         super(Activation, self).__init__(name)
         self.conf.type = (engine + '_' + mode).lower()
         _check_engine(engine, ['cudnn', 'singa'])
@@ -474,15 +557,14 @@ class Activation(Layer):
 
 
 class Softmax(Layer):
+    """Apply softmax.
 
+    Args:
+        axis (int): reshape the input as a matrix with the dimension
+            [0,axis) as the row, the [axis, -1) as the column.
+        input_sample_shape (tuple): shape of a single sample
+    """
     def __init__(self, name, axis=1, input_sample_shape=None):
-        """Apply softmax.
-
-        Args:
-            axis (int): reshape the input as a matrix with the dimension
-                [0,axis) as the row, the [axis, -1) as the column.
-            input_sample_shape (tuple): shape of a single sample
-        """
         super(Softmax, self).__init__(name)
         # conf = self.conf.softmax_conf
         # conf.axis = axis
@@ -493,14 +575,14 @@ class Softmax(Layer):
 
 
 class Flatten(Layer):
+    """Reshape the input tensor into a matrix.
 
+    Args:
+        axis (int): reshape the input as a matrix with the dimension
+            [0,axis) as the row, the [axis, -1) as the column.
+        input_sample_shape (tuple): shape for a single sample
+    """
     def __init__(self, name, axis=1, input_sample_shape=None):
-        """Reshape the input tensor into a matrix.
-        Args:
-            axis (int): reshape the input as a matrix with the dimension
-                [0,axis) as the row, the [axis, -1) as the column.
-            input_sample_shape (tuple): shape for a single sample
-        """
         super(Flatten, self).__init__(name)
         conf = self.conf.flatten_conf
         conf.axis = axis
@@ -511,26 +593,27 @@ class Flatten(Layer):
 
 
 class RNN(Layer):
+    '''Recurrent layer with 4 types of units, namely lstm, gru, tanh and relu.
+
+    Args:
+        hidden_size: hidden feature size, the same for all stacks of layers.
+        rnn_mode: decides the rnn unit, which could be one of 'lstm', 'gru',
+            'tanh' and 'relu', refer to cudnn manual for each mode.
+        num_stacks: num of stacked rnn layers. It is different from the
+            unrolling sequence length.
+        input_mode: 'linear' converts the input feature x by a linear
+            transformation to get a feature vector of size hidden_size;
+            'skip' does nothing but requires the input feature size to equal
+            hidden_size
+        bidirectional: True for a bidirectional RNN
+        param_specs: config for initializing the RNN parameters.
+        input_sample_shape: includes a single integer for the input sample
+            feature size.
+    '''
+
     def __init__(self, name, hidden_size, rnn_mode='lstm', dropout=0.0,
                  num_stacks=1, input_mode='linear', bidirectional=False,
                  param_specs=None, input_sample_shape=None):
-        '''Wrapper for singa::RNN class.
-
-        Args:
-            hidden_size, hidden feature size, the same for all stacks of layers.
-            rnn_mode, decides the rnn unit, which could be one of 'lstm', 'gru',
-                'tanh' and 'relu', refer to cudnn manual for each mode.
-            num_stacks, num of stacks of rnn layers. It is different to the
-                unrolling seqence length.
-            input_mode, 'linear' convert the input feature x by by a linear
-                transformation to get a feature vector of size hidden_size;
-                'skip' does nothing but requires the input feature size equals
-                hidden_size
-            bidirection, True for bidirectional RNN
-            param_specs, config for initializing the RNN parameters.
-            input_sample_shape, includes a single integer for the input sample
-                feature size.
-        '''
         super(RNN, self).__init__(name)
         conf = self.conf.rnn_conf
         assert hidden_size > 0, 'Hidden feature size must > 0'
@@ -605,7 +688,7 @@ class RNN(Layer):
 
         Returns:
             <dx1, dx2, ... dxn, dhx, dcx>, where dxi is the gradient tensor for
-            the i-th input, its shape is (batch_size,
+                the i-th input, its shape is (batch_size,
                 input_feature_length). dhx is the gradient for the initial
                 hidden state. dcx is the gradient for the initial cell state,
                 which is valid only for lstm.
@@ -741,5 +824,7 @@ def _construct_param_specs_from_dict(specs):
 
 
 def get_layer_list():
-    """ Return a list of strings reprensenting the all supported layers"""
+    """ Return a list of strings which include the identifiers (tags) of all
+    supported layers
+    """
     return singa_wrap.GetRegisteredLayers()

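A minimal sketch of the Dense layer documented above, in the same spirit as the Conv2D example in the module docstring; the layer name, shapes and the 0.1 gradient value are illustrative only::

    from singa import layer
    from singa import tensor
    from singa.proto import model_pb2

    dense = layer.Dense('fc', 10, input_sample_shape=(784,))
    params = dense.param_values()              # weight and bias tensors
    x = tensor.Tensor((2, 784))
    x.uniform(-1, 1)
    y = dense.forward(model_pb2.kTrain, x)     # y has shape (2, 10)

    dy = tensor.Tensor()
    dy.reset_like(y)
    dy.set_value(0.1)
    dx, dparams = dense.backward(model_pb2.kTrain, dy)
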
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/src/python/singa/loss.py
----------------------------------------------------------------------
diff --git a/src/python/singa/loss.py b/src/python/singa/loss.py
index acfb813..c88290b 100644
--- a/src/python/singa/loss.py
+++ b/src/python/singa/loss.py
@@ -15,32 +15,127 @@
 # specific language governing permissions and limitations
 # under the License.
 # =============================================================================
-""" Python wrappers for optimizers implemented by C++."""
+
+'''
+Loss module includes a set of training loss implementations. Some are converted
+from the C++ implementations, and the rest are implemented directly using python
+Tensor.
+
+Example usage::
+
+    import numpy as np
+    from singa import tensor
+    from singa import loss
+    from singa.proto import model_pb2
+
+    x = tensor.Tensor((3, 5))
+    x.uniform(0, 1)  # randomly generate the prediction activation
+    y = tensor.from_numpy(np.array([0, 1, 3], dtype=np.int))  # set the truth
+
+    f = loss.SoftmaxCrossEntropy()
+    l = f.forward(model_pb2.kTrain, x, y)  # l is tensor with 3 loss values
+    g = f.backward()  # g is a tensor containing all gradients of x w.r.t l
+'''
+
 
 from . import singa_wrap as singa
 import tensor
 
 
 class Loss(object):
+    '''Base loss class.
+
+    Subclasses that wrap the C++ loss classes can use the inherited forward,
+    backward, and evaluate functions of this base class. Other subclasses need
+    to override these functions.
+    '''
 
     def __init__(self):
         self.swig_loss = None
 
     def forward(self, flag, x, y):
-        """Return a tensor of floats, one per sample"""
+        '''Compute the loss values.
+
+        Args:
+            flag (int): kTrain or kEval. If it is kTrain, then the backward
+                function must be called before calling forward again.
+            x (Tensor): the prediction Tensor
+            y (Tensor): the ground truth Tensor; x.shape[0] must equal y.shape[0]
+
+        Returns:
+            a tensor of floats for the loss values, one per sample
+        '''
         return tensor.from_raw_tensor(
             self.swig_loss.Forward(flag, x.singa_tensor, y.singa_tensor))
 
     def backward(self):
-        """Return the grad of x w.r.t. the loss obj"""
+        '''
+        Returns:
+            the grad of x w.r.t. the loss
+        '''
         return tensor.from_raw_tensor(self.swig_loss.Backward())
 
-    def evaluate(self, flag, x, y):
-        """Return the averaged loss for all samples in x"""
+    def evaluate(self, flag, x, y):  # TODO(wangwei) remove flag
+        '''
+        Args:
+            flag (int): must be kEval, to be removed
+            x (Tensor): the prediction Tensor
+            y (Tensor): the ground truth Tensor
+
+        Returns:
+            the averaged loss for all samples in x.
+        '''
         return self.swig_loss.Evaluate(flag, x.singa_tensor, y.singa_tensor)
 
 
 class SoftmaxCrossEntropy(Loss):
+    '''This loss function is a combination of SoftMax and Cross-Entropy loss.
+
+    It converts the inputs via SoftMax function and then
+    computes the cross-entropy loss against the ground truth values.
+    '''
 
     def __init__(self):
         self.swig_loss = singa.SoftmaxCrossEntropy()
+
+
+class SquaredError(Loss):
+    '''This loss evaluates the squared error between the prediction and the
+    truth values.
+
+    It is implemented using Python Tensor operations.
+    '''
+    def __init__(self):
+        super(SquaredError, self).__init__()
+        self.err = None
+
+    def forward(self, flag, x, y):
+        '''Compute the error as 0.5 * ||x-y||^2.
+
+        Args:
+            flag (int): kTrain or kEval; if kTrain, then the backward must be
+                called before calling forward again.
+            x (Tensor): the prediction Tensor
+            y (Tensor): the ground truth Tensor, with the same shape as x
+
+        Returns:
+            a Tensor with one error value per sample
+        '''
+        self.err = x - y
+        return 0.5 * tensor.squared(self.err)
+
+    def backward(self):
+        '''Compute the gradient of x w.r.t the error.
+
+        Returns:
+            x - y
+        '''
+        return self.err
+
+    def evaluate(self, flag, x, y):
+        '''Compute the averaged error.
+
+        Returns:
+            a float value as the averaged error
+        '''
+        return tensor.sum(0.5 * tensor.squared(x - y)) / x.size()

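A short sketch of the new SquaredError loss added above; unlike SoftmaxCrossEntropy, the ground truth here is assumed to have the same shape as the prediction, and the shapes are illustrative::

    from singa import tensor
    from singa import loss
    from singa.proto import model_pb2

    x = tensor.Tensor((3, 5))
    x.uniform(0, 1)
    y = tensor.Tensor((3, 5))
    y.uniform(0, 1)

    f = loss.SquaredError()
    l = f.forward(model_pb2.kTrain, x, y)   # 0.5 * (x - y)^2
    g = f.backward()                        # gradient of x, i.e., x - y
    e = f.evaluate(model_pb2.kEval, x, y)   # averaged error over all samples
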
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/src/python/singa/metric.py
----------------------------------------------------------------------
diff --git a/src/python/singa/metric.py b/src/python/singa/metric.py
index 31b6892..3a5750d 100644
--- a/src/python/singa/metric.py
+++ b/src/python/singa/metric.py
@@ -15,28 +15,71 @@
 # specific language governing permissions and limitations
 # under the License.
 # =============================================================================
-""" Python wrappers for optimizers implemented by C++."""
+'''This module includes a set of metric classes for evaluating the model's
+performance. The specific metric classes could be converted from the C++
+implementation or implemented directly using Python.
+
+
+Example usage::
+
+    import numpy as np
+    from singa import tensor
+    from singa import metric
+
+    x = tensor.Tensor((3, 5))
+    x.uniform(0, 1)  # randomly generate the prediction activation
+    x = tensor.SoftMax(x)  # normalize the prediction into probabilities
+    y = tensor.from_numpy(np.array([0, 1, 3], dtype=np.int))  # set the truth
+
+    f = metric.Accuracy()
+    acc = f.evaluate(x, y)  # averaged accuracy over all 3 samples in x
+
+'''
 
 from . import singa_wrap as singa
 import tensor
 
 
 class Metric(object):
+    '''Base metric class.
+
+    Subclasses that wrap the C++ metric classes can use the inherited forward
+    and evaluate functions of this base class. Other subclasses need
+    to override these functions. Users need to feed in the **predictions** and
+    ground truth to get the metric values.
+    '''
 
     def __init__(self):
         self.swig_metric = None
 
     def forward(self, x, y):
-        """Return a tensor of floats, one per sample"""
+        '''Compute the metric for each sample.
+
+        Args:
+            x (Tensor): predictions, one row per sample
+            y (Tensor): ground truth values, one row per sample
+
+        Returns:
+            a tensor of floats, one per sample
+        '''
         return tensor.from_raw_tensor(
             self.swig_metric.Forward(x.singa_tensor, y.singa_tensor))
 
     def evaluate(self, x, y):
-        """Return the averaged metric for all samples in x"""
+        '''Compute the averaged metric over all samples.
+
+        Args:
+            x (Tensor): predictions, one row per sample
+            y (Tensor): ground truth values, one row per sample
+        Returns:
+            a float value for the averaged metric
+        '''
         return self.swig_metric.Evaluate(x.singa_tensor, y.singa_tensor)
 
 
 class Accuracy(Metric):
+    '''Compute the top one accuracy for single-label prediction tasks.
 
+    It calls the C++ functions to do the calculation.
+    '''
     def __init__(self):
         self.swig_metric = singa.Accuracy()

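Complementing the module example above, a small sketch contrasting forward (per-sample values) with evaluate (batch average); the values are illustrative::

    import numpy as np
    from singa import metric
    from singa import tensor

    x = tensor.Tensor((3, 5))
    x.uniform(0, 1)
    x = tensor.SoftMax(x)
    y = tensor.from_numpy(np.array([0, 1, 3], dtype=np.int))

    f = metric.Accuracy()
    per_sample = f.forward(x, y)   # a tensor with one accuracy value per sample
    averaged = f.evaluate(x, y)    # a single float averaged over the batch
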
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/33992c90/src/python/singa/optimizer.py
----------------------------------------------------------------------
diff --git a/src/python/singa/optimizer.py b/src/python/singa/optimizer.py
index aa6bdd1..a964f16 100644
--- a/src/python/singa/optimizer.py
+++ b/src/python/singa/optimizer.py
@@ -15,7 +15,22 @@
 # specific language governing permissions and limitations
 # under the License.
 # =============================================================================
-""" Python wrappers for optimizers implemented by C++."""
+'''This module includes a set of optimizers for updating model parameters.
+
+Example usage::
+
+  from singa import optimizer
+  from singa import tensor
+
+  sgd = optimizer.SGD(lr=0.01, momentum=0.9, weight_decay=1e-4)
+  p = tensor.Tensor((3,5))
+  p.uniform(-1, 1)
+  g = tensor.Tensor((3,5))
+  g.gaussian(0, 0.01)
+
+  sgd.apply(1, g, p, 'param')  # use the global lr=0.01 for epoch 1
+  sgd.apply_with_lr(2, 0.03, g, p, 'param')  # use lr=0.03 for epoch 2
+'''
 
 from . import singa_wrap as singa
 import tensor
@@ -23,53 +38,44 @@ from proto import model_pb2
 
 
 class Optimizer(object):
-    """Base python optimizer.
-
-    Usages:
-        1. construct the optimizer
-        2. (optional) register each parameter with its specs.
-        3. use the optimizer to update parameter values given parameter
-            gradients and other optional info
-    """
-
+    '''The base python optimizer class.
+
+    Typically, an optimizer is used as follows:
+
+    1. construct the optimizer
+    2. (optional) register each parameter with its specs.
+    3. use the optimizer to update parameter values given parameter
+        gradients and other optional info
+
+    The subclasses should override the apply_with_lr function to do the real
+    parameter update.
+
+    Args:
+        lr (float): a constant for the learning rate, mutually exclusive with
+            'lr_gen'.
+        momentum (float): a constant for the momentum value
+        decay (float): the coefficient for the L2 regularizer, which is mutually
+            exclusive with 'regularizer'.
+        lr_gen (function): a function returns the learning rate given
+            the current training step/epoch. It is mutually exclusive with lr.
+            If both are not set, the apply_with_lr function should be used for
+            param updating.
+        regularizer: an instance of Regularizer or RegularizerConf; If set,
+            regularization would be applied in apply_with_lr().
+            Users can also do regularization outside.
+        constraint: an instance of Constraint or ConstraintConf; If set,
+            constraint would be applied inside apply_with_lr(). Users can
+            also do regularization outside.
+    '''
     def __init__(self, lr=None, momentum=None, decay=None, lr_gen=None,
-                 momentum_gen=None, regularizer=None, constraint=None):
-        """Constructor.
-
-        Args:
-            lr: a constant or a function that generates learning rate given a
-                step, which is mutually exclusive with 'lr_gen'.
-            momentum: a constant or a function that generates the momentum value
-                given a step.
-            decay (float): the coefficent for L2 regularizer, which is mutually
-                exclusive with 'regularizer'.
-            lr_gen (function): a function returns the learning rate given
-                the current training step. It is mutually exclusive with lr. If
-                both are not set, the apply_with_lr function should be used for
-                param updating.
-            momentum_gen (function): a function returns the momentum value given
-                the current training step. It is mutually exclusive with
-                momentum.
-            regularizer: an instance of Regularizer or RegularizerConf; If set,
-                regularization would be applied in apply_with_lr().
-                Users can also do regularization outside.
-            constraint: an instance of Constraint or ConstraintConf; If set,
-                constraint would be applied inside apply_with_lr(). Users can
-                also do regularization outside.
-        """
+                 regularizer=None, constraint=None):
         if lr is not None:
             assert lr_gen is None, 'Cannot set lr and lr_gen at the same time'
 
-            def lr_gen(step):
+            def lr_gen(epoch):
                 return lr
         self.lr_gen = lr_gen
-        if momentum is not None:
-            assert momentum_gen is None, 'Cannot set momentum and momentum_gen'\
-                ' at the same time'
-
-            def momentum_gen(step):
-                return momentum
-        self.momentum_gen = momentum_gen
+        self.momentum = momentum
         if decay is not None:
             assert regularizer is None, \
                 'Cannot set decay and regularizer at the same time'
@@ -94,14 +100,16 @@ class Optimizer(object):
         self.learning_rate_multiplier = {}
 
     def register(self, name, specs):
-        """Register the param specs, including creating regularizer and
+        '''Register the param specs, including creating regularizer and
         constraint per param object. Param specific regularizer and constraint
         have higher priority than the global ones.
 
         Args:
             name (str): parameter name
-            specs (ParamSpec): protobuf obj
-        """
+            specs (ParamSpec): protobuf obj, including the regularizer,
+                constraint, and multipliers for learning rate and weight decay.
+
+        '''
 	assert type(specs) == model_pb2.ParamSpec, \
 		'specs should be model_pb2.ParamSpec instance'
         if specs.HasField('regularizer'):
@@ -113,8 +121,8 @@ class Optimizer(object):
         if specs.decay_mult != 1:
             self.decay_multiplier[name] = specs.decay_mult
 
-    def apply_regularizer_constraint(self, value, grad, name=None, step=None):
-        """Apply regularization and constraint if available.
+    def apply_regularizer_constraint(self, value, grad, name=None, epoch=None):
+        '''Apply regularization and constraint if available.
 
         If there are both global regularizer (constraint) and param specific
         regularizer (constraint), it would use the param specific one.
@@ -123,46 +131,48 @@ class Optimizer(object):
             value (Tensor): parameter value Tensor
             grad (Tensor): parameter gradient Tensor
             name (string): to get parameter specific regularizer or constraint
-            step (int): some regularizer or constraint would use step
+            epoch (int): some regularizer or constraint would use epoch
 
-        Return:
+        Returns:
             the updated gradient Tensor
-        """
+        '''
         if name is not None and name in self.constraints:
-            self.constraints[name].apply(value, grad, step)
+            self.constraints[name].apply(epoch, value, grad)
         elif self.constraint is not None:
-            self.constraint.apply(step, value, grad)
+            self.constraint.apply(epoch, value, grad)
 
         if name is not None and name in self.regularizers:
-            self.regularizers[name].apply(value, grad, step)
+            self.regularizers[name].apply(epoch, value, grad)
         elif self.regularizer is not None:
-            self.regularizer.apply(step, value, grad)
+            self.regularizer.apply(epoch, value, grad)
         return grad
 
-    def apply_with_lr(self, step, lr, grad, value, name=None):
-        """Do update with given learning rate.
+    def apply_with_lr(self, epoch, lr, grad, value, name=None):
+        '''Do update with given learning rate.
 
         The subclass optimizer must override this function.
+
         Args:
-            step (int): training step (could be iteration or epoch)
+            epoch (int): the training epoch or iteration index
             lr (float): learning rate
             grad (Tensor): parameter gradient
             value (Tensor): parameter value
             name (string): parameter name, used to retrieve parameter-specific
                 updating rules (including regularizer and constraint)
 
-        Return:
+        Returns:
             updated parameter value
-        """
+        '''
         assert False, 'This is the base function, pls call the subclass func'
         return value
 
-    def apply(self, step, grad, value, name=None):
-        """Do update assume the learning rate generator is set.
+    def apply(self, epoch, grad, value, name=None):
+        '''Do update assuming the learning rate generator is set.
 
         The subclass optimizer does not need to override this function.
+
         Args:
-            step (int): training step (could be iteration or epoch)
+            epoch (int): the training epoch or iteration index
             grad (Tensor): parameter gradient
             value (Tensor): parameter value
             name (string): parameter name, used to retrieve parameter-specific
@@ -170,98 +180,109 @@ class Optimizer(object):
 
         Return:
             updated parameter value
-        """
-
+        '''
         assert self.lr_gen is not None, 'Learning rate generator is not set.'\
             'Either set the lr_gen in constructor or call apply_with_lr'
-        lr = self.lr_gen(step)
-        return self.apply_with_lr(step, lr, grad, value, name)
+        lr = self.lr_gen(epoch)
+        return self.apply_with_lr(epoch, lr, grad, value, name)
 
 
 class SGD(Optimizer):
+    '''The vanilla Stochastic Gradient Descent algorithm with momentum.
 
-    def __init__(self, lr=None, momentum=None, decay=None, **kwargs):
-        """The vallina Stochasitc Gradient Descent algorithm.
+    See the base Optimizer for all arguments.
+    '''
 
-        See the base Optimizer for all arguments.
-        """
-        super(SGD, self).__init__(lr, momentum, decay)
+    def __init__(self, lr=None, momentum=None, decay=None, lr_gen=None,
+                 regularizer=None, constraint=None):
+        super(SGD, self).__init__(lr, momentum, decay, lr_gen, regularizer,
+                                  constraint)
         conf = model_pb2.OptimizerConf()
-        conf.momentum = momentum
+        conf.momentum = self.momentum if self.momentum is not None else 0.0
+        conf.type = 'sgd'
         self.opt = singa.CreateOptimizer('SGD')
         self.opt.Setup(conf.SerializeToString())
 
-    def apply_with_lr(self, step, lr, grad, value, name):
-        self.apply_regularizer_constraint(step, value, grad, name)
-        self.opt.Apply(step, lr, name, grad.singa_tensor, value.singa_tensor)
+    def apply_with_lr(self, epoch, lr, grad, value, name):
+        self.apply_regularizer_constraint(value, grad, name, epoch)
+        self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor)
         return value
 
 
 class Nesterov(Optimizer):
+    '''The SGD with Nesterov momentum.
 
-    def __init__(self, lr=None, momentum=0.9, decay=None, **kwargs):
-        """The SGD with Nesterov momentum
+    See the base Optimizer for all arguments.
+    '''
 
-        See the base Optimizer for all arguments.
-        """
-        super(Nesterov, self).__init__(lr, momentum, decay, kwargs)
+    def __init__(self, lr=None, momentum=0.9, decay=None, lr_gen=None,
+                 regularizer=None, constraint=None):
+        super(Nesterov, self).__init__(lr, momentum, decay, lr_gen, regularizer,
+                                       constraint)
         conf = model_pb2.OptimizerConf()
+        conf.momentum = momentum
+        conf.type = 'nesterov'
         self.opt = singa.CreateOptimizer('Nesterov')
         self.opt.Setup(conf.SerializeToString())
 
-    def apply_with_lr(self, step, lr, grad, value, name):
-        self.apply_regularizer_constraint(step, value, grad, name)
-        self.opt.Apply(step, lr, name, grad.singa_tensor, value.singa_tensor)
+    def apply_with_lr(self, epoch, lr, grad, value, name):
+        self.apply_regularizer_constraint(value, grad, name, epoch)
+        self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor)
         return value
 
 
 class AdaGrad(Optimizer):
+    '''AdaGrad optimizer.
 
-    def __init__(self, epsilon=1e-8, lr=None, decay=None, **kwargs):
-        """AdaGrad optimizer.
+    See the base Optimizer for all constructor args.
 
-        See the base Optimizer for all constructor args.
-        Args:
-            epsilon (float): small number for preventing numeric error.
-        """
-        super(RMSProp, self).__init__(lr, decay, **kwargs)
+    Args:
+        epsilon (float): small number for preventing numeric error.
+    '''
+    def __init__(self, epsilon=1e-8, lr=None, decay=None, lr_gen=None,
+                 regularizer=None, constraint=None):
+        super(AdaGrad, self).__init__(lr=lr, decay=decay, lr_gen=lr_gen,
+                regularizer=regularizer, constraint=constraint)
         conf = model_pb2.OptimizerConf()
         conf.delta = epsilon
+        conf.type = 'adagrad'
         self.opt = singa.CreateOptimizer('AdaGrad')
         self.opt.Setup(conf.SerializeToString())
 
-    def apply_with_lr(self, step, lr, grad, value, name):
-        grad = self.apply_regularizer_constraint(step, value, grad, name)
-        self.opt.Apply(step, lr,  name, grad.singa_tensor, value.singa_tensor)
+    def apply_with_lr(self, epoch, lr, grad, value, name):
+        grad = self.apply_regularizer_constraint(value, grad, name, epoch)
+        self.opt.Apply(epoch, lr,  name, grad.singa_tensor, value.singa_tensor)
         return value
 
 
 class RMSProp(Optimizer):
+    '''RMSProp optimizer.
 
-    def __init__(self, rho=0.9, epsilon=1e-8, lr=None, decay=None, **kwargs):
-        """RMSProp optimizer.
+    See the base Optimizer for all constructor args.
 
-        See the base Optimizer for all constructor args.
-        Args:
-            rho (float): float within [0, 1]
-            epsilon (float): small value for preventing numeric error
-        """
-        super(RMSProp, self).__init__(lr, decay, kwargs)
+    Args:
+        rho (float): float within [0, 1]
+        epsilon (float): small value for preventing numeric error
+    '''
+
+    def __init__(self, rho=0.9, epsilon=1e-8, lr=None, decay=None, lr_gen=None,
+                 regularizer=None, constraint=None):
+        super(RMSProp, self).__init__(lr=lr, decay=decay, lr_gen=lr_gen,
+                regularizer=regularizer, constraint=constraint)
         conf = model_pb2.OptimizerConf()
         conf.rho = rho
         conf.delta = epsilon
         self.opt = singa.CreateOptimizer('RMSProp')
         self.opt.Setup(conf.SerializeToString())
 
-    def apply_with_lr(self, step, lr, grad, value, name):
-        grad = self.apply_regularizer_constraint(step, value, grad, name)
-        self.opt.Apply(step, lr,  name, grad.singa_tensor, value.singa_tensor)
+    def apply_with_lr(self, epoch, lr, grad, value, name):
+        grad = self.apply_regularizer_constraint(epoch, value, grad, name)
+        self.opt.Apply(epoch, lr,  name, grad.singa_tensor, value.singa_tensor)
         return value
 
 
 class Regularizer(object):
-    """Base Python regularizer for parameter gradients.
-    """
+    '''Base Python regularizer for parameter gradients.'''
 
     def apply(self, value, grad):
         assert False, 'Not Implemented. Call the subclass function.'
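
Putting register() and the per-parameter multipliers together, a short sketch; the
import path for model_pb2 and the lr_mult field name follow the conventions of
model.proto but are assumptions here rather than something this diff shows:

    from singa import optimizer
    from singa.proto import model_pb2

    opt = optimizer.SGD(lr=0.01, momentum=0.9, decay=1e-4)

    # bias parameters often use a larger learning rate and no weight decay
    spec = model_pb2.ParamSpec()
    spec.name = 'conv1/bias'
    spec.lr_mult = 2.0
    spec.decay_mult = 0
    opt.register('conv1/bias', spec)

    # inside the training loop, per-parameter rules are looked up by name:
    # opt.apply(epoch, grad, value, 'conv1/bias')
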
@@ -269,34 +290,32 @@ class Regularizer(object):
 
 
 class CppRegularizer(Regularizer):
-    """Wrapper for regularizer implemented using C++.
-    """
+    '''Wrapper for regularizer implemented using C++.
 
-    def __init__(self, conf):
-        """Constructor.
+    Args:
+        conf (RegularizerConf): protobuf message for the configuration.
+    '''
 
-        Args:
-            conf (RegularizerConf): protobuf message for the configuration.
-        """
+    def __init__(self, conf):
         self.reg = singa.CreateRegularizer(conf.type)
         self.reg.Setup(conf.SerializeToString())
 
-    def apply(self, step, value, grad):
-        self.reg.Apply(step, value.singa_tensor, grad.singa_tensor)
+    def apply(self, epoch, value, grad):
+        self.reg.Apply(epoch, value.singa_tensor, grad.singa_tensor)
         return grad
 
 
 class L2Regularizer(Regularizer):
-    """L2 regularization"""
+    '''L2 regularization
+
+    Args:
+        coefficient (float): regularization coefficient.
+    '''
 
     def __init__(self, coefficient):
-        """
-        Args:
-            coefficient (float): regularization coefficient.
-        """
         self.coefficient = coefficient
 
-    def apply(self, step, value, grad, coefficient=None):
+    def apply(self, epoch, value, grad, coefficient=None):
         if coefficient is None:
             assert self.coefficient is not None, 'Must set the coefficient'
             coefficient = self.coefficient
@@ -305,39 +324,34 @@ class L2Regularizer(Regularizer):
 
 
 class Constraint(object):
-    """Base Python constraint class for paramter gradients.
-    """
+    '''Base Python constraint class for parameter gradients.'''
 
-    def apply(self, step, value, grad):
+    def apply(self, epoch, value, grad):
         return grad
 
 
 class CppConstraint(Constraint):
-    """Wrapper for constraints implemented using C++.
-    """
+    '''Wrapper for constraints implemented using C++.
 
+    Args:
+        conf (ConstraintConf): protobuf message for the configuration.
+    '''
     def __init__(self, conf):
-        """Constructor.
-
-        Args:
-            conf (ConstraintConf): protobuf message for the configuration.
-        """
         self.constraint = singa.CreateConstraint(conf.type)
         self.constraint.Setup(conf.SerializeToString())
 
-    def apply(self, step, value, grad):
-        self.constraint.Apply(step, value.singa_tensor, grad.singa_tensor)
+    def apply(self, epoch, value, grad):
+        self.constraint.Apply(epoch, value.singa_tensor, grad.singa_tensor)
         return grad
 
 
 class L2Constraint(Constraint):
-    """Rescale the gradient to make the L2 norm <= a given threshold.
-    """
+    '''Rescale the gradient to make the L2 norm <= a given threshold'''
 
     def __init__(self, threshold=None):
         self.threshold = threshold
 
-    def apply(self, step, value, grad, threshold=None):
+    def apply(self, epoch, value, grad, threshold=None):
         if threshold is None:
             assert self.threshold is not None, 'Must set the threshold'
             threshold = self.threshold