Posted to commits@singa.apache.org by wa...@apache.org on 2016/08/15 16:15:27 UTC

[14/22] incubator-singa git commit: Fixed the bug leading to weird accuracy (NaN), which was caused by forgetting to average the gradient over the whole mini-batch. That is why we needed a lower learning rate and could not use momentum. Update the lr in opt

Fixed the bug leading to weird accuracy (NaN), which was caused by forgetting
to average the gradient over the whole mini-batch. That is why we needed a
lower learning rate and could not use momentum.
Update the lr in optimizer.py to apply the per-parameter multiplier (lr_mult).
Fix the bug of mis-setting the pooling type in alexnet.py (max --> avg).
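
To see why the missing average produced NaN: summing per-example gradients
over a mini-batch of size B scales every update by B, so the effective
learning rate is B times larger than intended, and momentum compounds the
oversized steps. A minimal sketch of the effect, in plain numpy rather than
SINGA code:

    import numpy as np

    B = 100                              # mini-batch size used in train.py
    w = np.zeros(3)
    x = np.random.randn(B, 3)
    y = x.dot(np.array([1.0, -2.0, 0.5]))

    grad_sum = x.T.dot(x.dot(w) - y)     # summed over the batch
    grad_avg = grad_sum / B              # averaged, as net.py now does

    lr = 0.001
    # a step with the summed gradient is effectively lr * B = 0.1
    print np.linalg.norm(lr * grad_sum), np.linalg.norm(lr * grad_avg)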


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6d4539ee
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6d4539ee
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6d4539ee

Branch: refs/heads/dev
Commit: 6d4539eed2ae200a3a904a70cb789fc1b39d0f38
Parents: 1db2784
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Aug 15 13:13:19 2016 +0800
Committer: Wei Wang <wa...@gmail.com>
Committed: Mon Aug 15 20:16:30 2016 +0800

----------------------------------------------------------------------
 examples/cifar10/alexnet.cc   |  11 +-
 examples/cifar10/alexnet.py   |  13 +-
 examples/cifar10/train.py     |  19 ++-
 src/model/feed_forward_net.cc |   6 +-
 src/model/optimizer/sgd.cc    |   4 +-
 src/python/singa/__init__.py  | 240 -------------------------------------
 src/python/singa/layer.py     |  15 +--
 src/python/singa/net.py       |   8 +-
 src/python/singa/optimizer.py |  36 ++++--
 src/python/singa/tensor.py    |   8 +-
 10 files changed, 68 insertions(+), 292 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/examples/cifar10/alexnet.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/alexnet.cc b/examples/cifar10/alexnet.cc
index e1363e4..8051d1b 100644
--- a/examples/cifar10/alexnet.cc
+++ b/examples/cifar10/alexnet.cc
@@ -134,7 +134,7 @@ FeedForwardNet CreateNet() {
   return net;
 }
 
-void Train(float lr, int num_epoch, string data_dir) {
+void Train(int num_epoch, string data_dir) {
   Cifar10 data(data_dir);
   Tensor train_x, train_y, test_x, test_y;
   {
@@ -161,11 +161,11 @@ void Train(float lr, int num_epoch, string data_dir) {
   auto net = CreateNet();
   SGD sgd;
   OptimizerConf opt_conf;
-  opt_conf.set_momentum(0.9);
+  // opt_conf.set_momentum(0.9);
   auto reg = opt_conf.mutable_regularizer();
   reg->set_coefficient(0.004);
   sgd.Setup(opt_conf);
-  sgd.SetLearningRateGenerator([lr](int step) {
+  sgd.SetLearningRateGenerator([](int step) {
     if (step <= 120)
       return 0.001;
     else if (step <= 130)
@@ -193,14 +193,11 @@ int main(int argc, char **argv) {
   int pos = singa::ArgPos(argc, argv, "-epoch");
   int nEpoch = 1;
   if (pos != -1) nEpoch = atoi(argv[pos + 1]);
-  pos = singa::ArgPos(argc, argv, "-lr");
-  float lr = 0.001;
-  if (pos != -1) lr = atof(argv[pos + 1]);
   pos = singa::ArgPos(argc, argv, "-data");
   string data = "cifar-10-batches-bin";
   if (pos != -1) data = argv[pos + 1];
 
   LOG(INFO) << "Start training";
-  singa::Train(lr, nEpoch, data);
+  singa::Train(nEpoch, data);
   LOG(INFO) << "End training";
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/examples/cifar10/alexnet.py
----------------------------------------------------------------------
diff --git a/examples/cifar10/alexnet.py b/examples/cifar10/alexnet.py
index ddad1d5..dae129f 100644
--- a/examples/cifar10/alexnet.py
+++ b/examples/cifar10/alexnet.py
@@ -20,9 +20,6 @@ Following the same setting for hyper-parameters and data pre-processing, the fin
 validation accuracy would be about 82%.
 '''
 
-import sys
-import os
-
 # sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
 from singa import layer
 from singa import initializer
@@ -39,18 +36,18 @@ def create_net(use_cpu=False):
     W0_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.0001}
     W1_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.01}
     W2_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.01, 'decay_mult': 250}
-    b_specs = {'init': 'constant', 'value': 0, 'lt_mult': 2}
+    b_specs = {'init': 'constant', 'value': 0, 'lr_mult': 2, 'decay_mult': 0}
     net.add(layer.Conv2D('conv1', 32, 5, 1, W_specs=W0_specs.copy(), b_specs=b_specs.copy(), pad=2, input_sample_shape=(3,32,32,)))
     net.add(layer.MaxPooling2D('pool1', 3, 2, pad=1))
     net.add(layer.Activation('relu1'))
-    net.add(layer.LRN(name='lrn1'))
+    net.add(layer.LRN(name='lrn1', size=3, alpha=5e-5))
     net.add(layer.Conv2D('conv2', 32, 5, 1, W_specs=W1_specs.copy(), b_specs=b_specs.copy(), pad=2))
     net.add(layer.Activation('relu2'))
-    net.add(layer.MaxPooling2D('pool2', 3, 2,  pad=1))
-    net.add(layer.LRN('lrn2'))
+    net.add(layer.AvgPooling2D('pool2', 3, 2,  pad=1))
+    net.add(layer.LRN('lrn2', size=3, alpha=5e-5))
     net.add(layer.Conv2D('conv3', 64, 5, 1, W_specs=W1_specs.copy(), b_specs=b_specs.copy(), pad=2))
     net.add(layer.Activation('relu3'))
-    net.add(layer.MaxPooling2D('pool3', 3, 2, pad=1))
+    net.add(layer.AvgPooling2D('pool3', 3, 2, pad=1))
     net.add(layer.Flatten('flat'))
     net.add(layer.Dense('dense', 10, W_specs=W2_specs.copy(), b_specs=b_specs.copy()))
     for (p, specs) in zip(net.param_values(), net.param_specs()):

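Two fixes above are easy to miss: b_specs previously used the misspelled key
'lt_mult' (so the intended lr_mult=2 for biases was silently ignored) and now
also sets decay_mult=0 to exclude biases from weight decay; pool2 and pool3
switch from max to average pooling, the fix the commit message refers to. The
multipliers compose with the base hyper-parameters roughly as follows (a
sketch using the values from this example, not SINGA API calls):

    base_lr = 0.001        # alexnet_lr(epoch) for epoch < 120, see train.py
    base_decay = 0.004     # weight_decay passed to train()

    lr_mult, decay_mult = 2, 0        # from b_specs
    print base_lr * lr_mult           # 0.002: biases learn twice as fast
    print base_decay * decay_mult     # 0.0: no weight decay on biases
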
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/examples/cifar10/train.py
----------------------------------------------------------------------
diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index de03750..2091ee5 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -22,7 +22,6 @@ includes 1 label & 3072 pixels.  3072 pixels are 3 channels of a 32x32 image
 import cPickle
 import numpy as np
 import os
-import sys
 import argparse
 
 # sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
@@ -84,7 +83,7 @@ def normalize_for_alexnet(train_x, test_x):
 
 
 def vgg_lr(epoch):
-    return 0.01 / float(1 << ((epoch / 30)))
+    return 0.1 / float(1 << ((epoch / 25)))
 
 
 def alexnet_lr(epoch):
@@ -92,7 +91,7 @@ def alexnet_lr(epoch):
         return 0.001
     elif epoch < 130:
         return 0.0001
-    elif epoch < 140:
+    else:
         return 0.00001
 
 
@@ -107,8 +106,8 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         dev = device.create_cuda_gpu()
 
     net.to_device(dev)
-    opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
-    for (p, specs) in zip(net.param_values(), net.param_specs()):
+    opt = optimizer.SGD(momentum=0.9, decay=weight_decay)
+    for (p, specs) in zip(net.param_names(), net.param_specs()):
         opt.register(p, specs)
 
     tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
@@ -129,13 +128,13 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
             grads, (l, a) = net.train(tx, ty)
             loss += l
             acc += a
-            for (s, p, g) in zip(net.param_specs(), net.param_values(), grads):
-                opt.apply_with_lr(epoch, get_lr(epoch), g, p, str(s.name))
+            for (s, p, g) in zip(net.param_names(), net.param_values(), grads):
+                opt.apply_with_lr(epoch, get_lr(epoch), g, p, str(s))
             # update progress bar
             utils.update_progress(b * 1.0 / num_train_batch,
                                   'training loss = %f, accuracy = %f' % (l, a))
-        info = '\ntraining loss = %f, training accuracy = %f' \
-            % (loss / num_train_batch, acc / num_train_batch)
+        info = '\ntraining loss = %f, training accuracy = %f, lr = %f' \
+            % (loss / num_train_batch, acc / num_train_batch, get_lr(epoch))
         print info
 
         loss, acc = 0.0, 0.0
@@ -167,7 +166,7 @@ if __name__ == '__main__':
     if args.model == 'alexnet':
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
         net = alexnet.create_net(args.use_cpu)
-        train((train_x, train_y, test_x, test_y), net, 140, alexnet_lr, 0.004,
+        train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004,
               use_cpu=args.use_cpu)
     else:
         train_x, test_x = normalize_for_vgg(train_x, test_x)

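The schedule changes are worth spelling out: vgg_lr now starts at 0.1 and
halves every 25 epochs (instead of 0.01 every 30), and alexnet_lr replaces
`elif epoch < 140` with a plain `else`, since the old version returned None
for epoch >= 140 and training now runs for 160 epochs. The same formulas,
runnable standalone (Python 2, so `/` is integer division):

    def vgg_lr(epoch):
        return 0.1 / float(1 << (epoch / 25))

    def alexnet_lr(epoch):
        if epoch < 120:
            return 0.001
        elif epoch < 130:
            return 0.0001
        else:
            return 0.00001

    print [vgg_lr(e) for e in (0, 25, 50)]        # 0.1, 0.05, 0.025
    print [alexnet_lr(e) for e in (0, 125, 155)]  # 0.001, 0.0001, 1e-05
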
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/src/model/feed_forward_net.cc
----------------------------------------------------------------------
diff --git a/src/model/feed_forward_net.cc b/src/model/feed_forward_net.cc
index 514d6e2..3875430 100644
--- a/src/model/feed_forward_net.cc
+++ b/src/model/feed_forward_net.cc
@@ -206,8 +206,8 @@ const std::pair<float, float> FeedForwardNet::TrainOnBatch(int epoch,
 
 const Tensor FeedForwardNet::Forward(int flag, const Tensor& data) {
   Tensor input = data, output;
+  // LOG(INFO) << data.L1();
   for (auto layer : layers_) {
-    //    LOG(INFO) << layer->name() << ": " << input.L1();
     output = layer->Forward(flag, input);
     // LOG(INFO) << layer->name() << ": " << output.L2();
     input = output;
@@ -220,13 +220,13 @@ const vector<Tensor> FeedForwardNet::Backward(int flag, const Tensor& grad) {
   std::stack<Tensor> buf;
   Tensor tmp = grad;
   for (int i = layers_.size() - 1; i >= 0; i--) {
-    //   LOG(INFO) << layers_.at(i)->name() << " : " << tmp.L1();
+    // LOG(INFO) << layers_.at(i)->name() << " : " << tmp.L1();
     auto ret = layers_.at(i)->Backward(flag, tmp);
     tmp = ret.first;
     if (ret.second.size()) {
       for (int k = ret.second.size() - 1; k >= 0; k--) {
         buf.push(ret.second[k]);
-        //       LOG(INFO) <<  "      " << buf.top().L1();
+        // LOG(INFO) <<  "      " << buf.top().L1();
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/src/model/optimizer/sgd.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/sgd.cc b/src/model/optimizer/sgd.cc
index d78d5b8..ac453cd 100644
--- a/src/model/optimizer/sgd.cc
+++ b/src/model/optimizer/sgd.cc
@@ -33,6 +33,7 @@ void SGD::Setup(const OptimizerConf& conf) {
 // value = value - history
 void SGD::Apply(int step, float lr, const string& name, const Tensor& grad,
                 Tensor& value) {
+  // LOG(INFO) << "param " << name  << " lr = " << lr << " grad = " << grad.L1() << " value = " << value.L1();
   if (momentum_generator_) {
     float mom = momentum_generator_(step);
     if (mom != 0) {
@@ -46,9 +47,8 @@ void SGD::Apply(int step, float lr, const string& name, const Tensor& grad,
       value -= history;
       return;
     }
-  } else {
-    Axpy(-lr, grad, &value);
   }
+  Axpy(-lr, grad, &value);
 }
 }  // namespace singa
 #endif  // SRC_MODEL_OPTIMIZER_SGD_H_

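The restructuring above fixes a dead path in SGD::Apply: when a momentum
generator was set but returned 0 for the current step, the old code took
neither the momentum branch nor the else branch, so the parameter was never
updated. Moving Axpy out of the else makes the plain update the fall-through
case. In Python-flavoured pseudocode mirroring the fixed control flow
(illustrative, not the actual binding):

    import numpy as np

    def sgd_apply(step, lr, grad, value, history, momentum_generator=None):
        if momentum_generator is not None:
            mom = momentum_generator(step)
            if mom != 0:
                history *= mom           # history = mom * history + lr * grad
                history += lr * grad
                value -= history
                return
        value -= lr * grad               # now also reached when mom == 0
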
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/src/python/singa/__init__.py
----------------------------------------------------------------------
diff --git a/src/python/singa/__init__.py b/src/python/singa/__init__.py
index f14c8c5..e69de29 100644
--- a/src/python/singa/__init__.py
+++ b/src/python/singa/__init__.py
@@ -1,240 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# =============================================================================
-
-'''
-This script is the main entrance for user to run singa inside a model workspace
-
-To use this script, user sudo install these dependencies: flask pillow and protobuf
-'''
-
-import sys, glob, os, random, shutil, time
-from flask import Flask, request, redirect, url_for
-import numpy as np
-import ConfigParser
-import urllib, traceback
-
-
-from argparse import ArgumentParser
-from argparse import RawDescriptionHelpFormatter
-sys.path.append(os.getcwd())
-
-__all__ = []
-__version__ = 0.1
-__date__ = '2016-07-20'
-__updated__ = '2016-07-20'
-__shortdesc__ = '''
-welcome to singa
-'''
-
-app = Flask(__name__)
-config = ConfigParser.RawConfigParser()
-service = {}
-data_path = "data_"
-parameter_path = "parameter_"
-
-debug = False
-
-class CLIError(Exception):
-    '''Generic exception to raise and log different fatal errors.'''
-    def __init__(self, msg):
-        super(CLIError).__init__(type(self))
-        self.msg = "E: %s" % msg
-    def __str__(self):
-        return self.msg
-    def __unicode__(self):
-        return self.msg
-
-def main(argv=None): # IGNORE:C0111
-    '''Command line options.'''
-
-    from . import device
-
-    if argv is None:
-        argv = sys.argv
-    else:
-        sys.argv.extend(argv)
-
-    program_name = os.path.basename(sys.argv[0])
-    program_version = "v%s" % __version__
-    program_build_date = str(__updated__)
-    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
-    program_shortdesc = __shortdesc__
-    program_license = '''%s
-
-  Created by dbsystem group on %s.
-  Copyright 2016 NUS School of Computing. All rights reserved.
-
-  Licensed under the Apache License 2.0
-  http://www.apache.org/licenses/LICENSE-2.0
-
-  Distributed on an "AS IS" basis without warranties
-  or conditions of any kind, either express or implied.
-
-USAGE
-''' % (program_shortdesc, str(__date__))
-
-    global debug
-
-    try:
-        # Setup argument parser
-        parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
-        parser.add_argument("-p", "--port", dest="port", default=5000, help="the port to listen to, default is 5000")
-        parser.add_argument("-param", "--parameter", dest="parameter",  help="the parameter file path to be loaded")
-        parser.add_argument("-D", "--debug", dest="debug", action="store_true", help="whether need to debug")
-        parser.add_argument("-R", "--reload", dest="reload_data", action="store_true", help="whether need to reload data")
-        parser.add_argument("-C", "--cpu", dest="use_cpu", action="store_true", help="Using cpu or not, default is using gpu")
-        parser.add_argument("-m", "--mode", dest="mode", choices=['train','test','serve'], default='serve', help="On Which mode (train,test,serve) to run singa")
-        parser.add_argument('-V', '--version', action='version', version=program_version_message)
-
-        # Process arguments
-        args = parser.parse_args()
-
-        port = args.port
-        parameter_file = args.parameter
-        mode = args.mode
-        need_reload = args.reload_data
-        use_cpu = args.use_cpu
-        debug = args.debug
-
-        #prepare data files
-        config.read('file.cfg')
-        file_prepare(need_reload)
-
-
-        import network as net
-        model = net.create()
-
-        #load parameter
-        parameter_file=get_parameter(parameter_file)
-
-        if parameter_file:
-            print "load parameter file: %s" % parameter_file
-            model.load(parameter_file)
-
-        if use_cpu:
-            raise CLIError("Currently cpu is not support!")
-        else:
-            print "runing with gpu"
-            d = device.create_cuda_gpu()
-
-        model.to_device(d)
-
-        if mode == "serve":
-            print "runing singa in serve mode, listen to  port: %s " % port
-            global service
-            from serve import Service
-            service =Service(model,d)
-
-            app.debug = debug
-            app.run(host='0.0.0.0', port= port)
-        elif mode == "train":
-            print "runing singa in train mode"
-            global trainer
-            from train import Trainer
-            trainer= Trainer(model,d)
-            if not parameter_file:
-                trainer.initialize()
-            trainer.train()
-        else:
-            raise CLIError("Currently only serve mode is surpported!")
-        return 0
-    except KeyboardInterrupt:
-        ### handle keyboard interrupt ###
-        return 0
-    except Exception, e:
-        if debug:
-            traceback.print_exc()
-            raise(e)
-        indent = len(program_name) * " "
-        sys.stderr.write(program_name + ": " + str(e) + "\n")
-        sys.stderr.write(indent + "  for help use --help \n\n")
-        return 2
-
-def file_prepare(reload_data=False):
-    '''
-        download all files and generate data.py
-    '''
-    if not reload_data and os.path.exists("data_.py"):
-        return
-
-    print "download file"
-    #clean data
-    shutil.rmtree("data_.py",ignore_errors=True)
-    shutil.rmtree("data_",ignore_errors=True)
-
-    data_py=open("data_.py",'w')
-    data_py.write("#%s" % "This file is Generated by SINGA, please don't edit\n\n")
-    if config.has_section("data"):
-        file_list = config.items("data")
-        #download files
-        for f in file_list:
-            name,path=download_file(f[0],f[1],data_path)
-            data_py.write("%s=\"%s\"\n" % (name,path))
-
-    data_py.flush()
-    data_py.close()
-
-    if config.has_section("parameter"):
-        parameter_list = config.items("parameter")
-        for p in parameter_list:
-            download_file(p[0],p[1],parameter_path)
-
-def download_file(name,path,dest):
-    '''
-    download one file to dest
-    '''
-    if not os.path.exists(dest):
-        os.makedirs(dest)
-    if (path.startswith('http')):
-        file_name = path.split('/')[-1]
-        target = os.path.join(dest,file_name)
-        urllib.urlretrieve(path,target)
-    return name,target
-
-
-def get_parameter(file_name=None):
-    '''
-    get the paticular file name or get the last parameter file
-    '''
-    if not os.path.exists(parameter_path):
-        os.makedirs(parameter_path)
-        return
-
-    if file_name:
-	return os.path.join(parameter_path,file_name)
-
-    parameter_list = [ os.path.join(parameter_path,f) for f in os.listdir(parameter_path)]
-    if len(parameter_list)==0:
-        return
-    parameter_list.sort()
-
-    return parameter_list[-1]
-
-@app.route("/")
-def index():
-    return "Hello SINGA User!"
-
-@app.route('/predict', methods=['POST'])
-def predict():
-    if request.method == 'POST':
-        try:
-            response=service.serve(request)
-        except Exception as e:
-            return e
-        return response
-    return "error, should be post request"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/src/python/singa/layer.py
----------------------------------------------------------------------
diff --git a/src/python/singa/layer.py b/src/python/singa/layer.py
index c8c8c05..1e9caeb 100644
--- a/src/python/singa/layer.py
+++ b/src/python/singa/layer.py
@@ -362,8 +362,8 @@ class BatchNormalization(Layer):
 
 
 class LRN(Layer):
-    def __init__(self, name, size=5, alpha=1, beta=0.75, mode='cross_channel',
-                 k=1, input_sample_shape=None):
+    def __init__(self, name, size=5, alpha=1e-4, beta=0.75,
+                 mode='cross_channel', k=1, input_sample_shape=None):
         """Local response normalization.
 
         Args:
@@ -391,7 +391,7 @@ class Dense(Layer):
 
     def __init__(self, name, num_output, use_bias=True,
                  W_specs=None, b_specs=None,
-                 W_transpose=True, input_sample_shape=None):
+                 W_transpose=False, input_sample_shape=None):
         """Apply linear/affine transformation, also called inner-product or
         fully connected layer.
 
@@ -424,10 +424,10 @@ class Dense(Layer):
             W_specs['name'] = name + '_weight'
         if 'name' not in b_specs:
             b_specs['name'] = name + '_bias'
-        self.conf.param.extend([_construct_param_specs_from_dict(W_specs)])
-        self.param_specs.append(_construct_param_specs_from_dict(W_specs))
-        self.conf.param.extend([_construct_param_specs_from_dict(b_specs)])
-        self.param_specs.append(_construct_param_specs_from_dict(b_specs))
+        wspecs = _construct_param_specs_from_dict(W_specs)
+        bspecs = _construct_param_specs_from_dict(b_specs)
+        self.conf.param.extend([wspecs, bspecs])
+        self.param_specs.extend([wspecs, bspecs])
         # dense layer is transparent to engine.
         self.layer = _create_layer('singa', 'Dense')
         if input_sample_shape is not None:
@@ -712,6 +712,7 @@ def _construct_param_specs_from_dict(specs):
         a ParamSpec object
     """
     conf = model_pb2.ParamSpec()
+    print 'convert', specs
     if 'name' in specs:
         conf.name = specs['name']
     if 'lr_mult' in specs:

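For reference when reading the LRN changes (the default alpha drops from 1 to
1e-4, and alexnet.py now passes size=3, alpha=5e-5 explicitly): cross-channel
LRN divides each activation by a power of a local sum of squares over
neighbouring channels. A numpy sketch of the usual formulation; note that
implementations differ on whether alpha is pre-divided by size, so treat this
as illustrative:

    import numpy as np

    def lrn_cross_channel(a, size=3, alpha=5e-5, beta=0.75, k=1.0):
        # a has shape (C, H, W); normalize over a window of `size` channels
        C = a.shape[0]
        half = size // 2
        out = np.empty_like(a)
        for c in range(C):
            lo, hi = max(0, c - half), min(C, c + half + 1)
            out[c] = a[c] / (k + alpha * (a[lo:hi] ** 2).sum(axis=0)) ** beta
        return out
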
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/src/python/singa/net.py
----------------------------------------------------------------------
diff --git a/src/python/singa/net.py b/src/python/singa/net.py
index f040378..3a1732c 100644
--- a/src/python/singa/net.py
+++ b/src/python/singa/net.py
@@ -95,16 +95,22 @@ class FeedForwardNet(object):
         # print x.l1()
         for lyr in self.layers:
             x = lyr.forward(flag, x)
-        #    print lyr.name, x.l1()
+            # print lyr.name, x.l1()
         return x
 
     def backward(self):
         grad = self.loss.backward()
+        if len(grad.shape) > 1:
+            grad /= grad.shape[0]  # average across the batch
+        # print 'grad', grad.l1()
         pgrads = []
         for lyr in reversed(self.layers):
             grad, _pgrads = lyr.backward(kTrain, grad)
+            # disp = '%f ' % grad.l1()
             for g in reversed(_pgrads):
                 pgrads.append(g)
+                # disp = disp + ', %f ' % g.l1()
+            # print disp
         return reversed(pgrads)
 
     def save(self, f):

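This is the core fix from the commit message. Dividing the loss gradient by
the batch dimension once, at the top of backward(), is sufficient because
backpropagation is linear in the incoming gradient: the 1/batch_size factor
propagates to every parameter gradient below. A quick numpy check of that
linearity:

    import numpy as np

    B = 4
    x = np.random.randn(B, 5)               # layer input
    gy = np.random.randn(B, 3)              # dL/dy from the loss layer
    gw_sum = x.T.dot(gy)                    # weight grad from unscaled gy
    gw_avg = x.T.dot(gy / B)                # weight grad after grad /= B
    assert np.allclose(gw_avg, gw_sum / B)  # the 1/B factor carries through
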
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/src/python/singa/optimizer.py
----------------------------------------------------------------------
diff --git a/src/python/singa/optimizer.py b/src/python/singa/optimizer.py
index aa6bdd1..32f03d4 100644
--- a/src/python/singa/optimizer.py
+++ b/src/python/singa/optimizer.py
@@ -102,16 +102,19 @@ class Optimizer(object):
             name (str): parameter name
             specs (ParamSpec): protobuf obj
         """
-	assert type(specs) == model_pb2.ParamSpec, \
-		'specs should be model_pb2.ParamSpec instance'
+        assert type(specs) == model_pb2.ParamSpec, \
+            'specs should be model_pb2.ParamSpec instance'
         if specs.HasField('regularizer'):
             self.regularizers[name] = CppRegularizer(specs.regularizer)
+        elif specs.decay_mult != 1:
+            self.regularizers[name] = L2Regularizer(
+                specs.decay_mult * self.regularizer.coefficient)
+
         if specs.HasField('constraint'):
             self.constraints[name] = CppConstraint(specs.constraint)
+
         if specs.lr_mult != 1:
             self.learning_rate_multiplier[name] = specs.lr_mult
-        if specs.decay_mult != 1:
-            self.decay_multiplier[name] = specs.decay_mult
 
     def apply_regularizer_constraint(self, value, grad, name=None, step=None):
         """Apply regularization and constraint if available.
@@ -129,12 +132,12 @@ class Optimizer(object):
             the updated gradient Tensor
         """
         if name is not None and name in self.constraints:
-            self.constraints[name].apply(value, grad, step)
+            self.constraints[name].apply(step, value, grad)
         elif self.constraint is not None:
             self.constraint.apply(step, value, grad)
 
         if name is not None and name in self.regularizers:
-            self.regularizers[name].apply(value, grad, step)
+            self.regularizers[name].apply(step, value, grad)
         elif self.regularizer is not None:
             self.regularizer.apply(step, value, grad)
         return grad
@@ -175,24 +178,29 @@ class Optimizer(object):
         assert self.lr_gen is not None, 'Learning rate generator is not set.'\
             'Either set the lr_gen in constructor or call apply_with_lr'
         lr = self.lr_gen(step)
+        if name is not None and name in self.learning_rate_multiplier:
+            lr = lr * self.learning_rate_multiplier[name]
         return self.apply_with_lr(step, lr, grad, value, name)
 
 
 class SGD(Optimizer):
 
-    def __init__(self, lr=None, momentum=None, decay=None, **kwargs):
+    def __init__(self, lr=None, momentum=None, decay=None):
         """The vallina Stochasitc Gradient Descent algorithm.
 
         See the base Optimizer for all arguments.
         """
         super(SGD, self).__init__(lr, momentum, decay)
         conf = model_pb2.OptimizerConf()
-        conf.momentum = momentum
+        if momentum is not None:
+            conf.momentum = momentum
         self.opt = singa.CreateOptimizer('SGD')
         self.opt.Setup(conf.SerializeToString())
 
     def apply_with_lr(self, step, lr, grad, value, name):
-        self.apply_regularizer_constraint(step, value, grad, name)
+        self.apply_regularizer_constraint(value, grad, name, step)
+        if name is not None and name in self.learning_rate_multiplier:
+            lr = lr * self.learning_rate_multiplier[name]
         self.opt.Apply(step, lr, name, grad.singa_tensor, value.singa_tensor)
         return value
 
@@ -206,6 +214,8 @@ class Nesterov(Optimizer):
         """
         super(Nesterov, self).__init__(lr, momentum, decay, kwargs)
         conf = model_pb2.OptimizerConf()
+        if momentum is not None:
+            conf.momentum = momentum
         self.opt = singa.CreateOptimizer('Nesterov')
         self.opt.Setup(conf.SerializeToString())
 
@@ -232,6 +242,8 @@ class AdaGrad(Optimizer):
 
     def apply_with_lr(self, step, lr, grad, value, name):
         grad = self.apply_regularizer_constraint(step, value, grad, name)
+        if name is not None and name in self.learning_rate_multiplier:
+            lr = lr * self.learning_rate_multiplier[name]
         self.opt.Apply(step, lr,  name, grad.singa_tensor, value.singa_tensor)
         return value
 
@@ -255,6 +267,8 @@ class RMSProp(Optimizer):
 
     def apply_with_lr(self, step, lr, grad, value, name):
         grad = self.apply_regularizer_constraint(step, value, grad, name)
+        if name is not None and name in self.learning_rate_multiplier:
+            lr = lr * self.learning_rate_multiplier[name]
         self.opt.Apply(step, lr,  name, grad.singa_tensor, value.singa_tensor)
         return value
 
@@ -300,7 +314,9 @@ class L2Regularizer(Regularizer):
         if coefficient is None:
             assert self.coefficient is not None, 'Must set the coefficient'
             coefficient = self.coefficient
-        tensor.axpy(coefficient, value, grad)
+        # print coefficient, value.l1(), grad.l1()
+        if coefficient != 0:
+            tensor.axpy(coefficient, value, grad)
         return grad
 
 

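Summarizing the new multiplier handling: register() turns a non-default
decay_mult into a per-parameter L2Regularizer whose coefficient is decay_mult
times the global coefficient, and apply_with_lr scales the learning rate by
lr_mult before calling into the C++ optimizer. A minimal sketch of the
bookkeeping in plain Python (not the SINGA API):

    base_coefficient = 0.004                   # SGD(decay=0.004) in train.py
    lr_mult = {'conv1_bias': 2}                # 'lr_mult': 2 in b_specs
    l2_coeff = {'conv1_bias': 0 * base_coefficient,      # 'decay_mult': 0
                'dense_weight': 250 * base_coefficient}  # 'decay_mult': 250

    def effective_lr(name, lr):
        return lr * lr_mult.get(name, 1)

    print effective_lr('conv1_bias', 0.001)    # 0.002
    print effective_lr('conv1_weight', 0.001)  # 0.001
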
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d4539ee/src/python/singa/tensor.py
----------------------------------------------------------------------
diff --git a/src/python/singa/tensor.py b/src/python/singa/tensor.py
index ed651e9..1d04cdf 100644
--- a/src/python/singa/tensor.py
+++ b/src/python/singa/tensor.py
@@ -177,28 +177,28 @@ class Tensor(object):
         if isinstance(x, Tensor):
             self.singa_tensor += x.singa_tensor
         else:
-            self.singa_tensor += x
+            self.singa_tensor += float(x)
         return self
 
     def __isub__(self, x):
         if isinstance(x, Tensor):
             self.singa_tensor -= x.singa_tensor
         else:
-            self.singa_tensor -= x
+            self.singa_tensor -= float(x)
         return self
 
     def __imul__(self, x):
         if isinstance(x, Tensor):
             self.singa_tensor *= x.singa_tensor
         else:
-            self.singa_tensor *= x
+            self.singa_tensor *= float(x)
         return self
 
     def __idiv__(self, x):
         if isinstance(x, Tensor):
             self.singa_tensor /= x.singa_tensor
         else:
-            self.singa_tensor /= x
+            self.singa_tensor /= float(x)
         return self
 
     '''
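
The tensor.py changes cast Python scalars to float before they reach the
SWIG-wrapped C++ tensor, so that integer scalars (for example grad /=
batch_size with an int batch size) hit the float overload instead of failing
or behaving inconsistently depending on the generated overloads. Assuming a
SINGA build is on the path, the in-place operators can then take ints
directly:

    from singa import tensor

    t = tensor.Tensor((2, 3))
    t += 1      # the int is coerced to 1.0 before reaching the C++ overload
    t /= 2      # likewise; previously this required passing 2.0 explicitly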