Posted to commits@singa.apache.org by wa...@apache.org on 2016/08/17 18:02:42 UTC
[21/51] [abbrv] incubator-singa git commit: SINGA-237 New documentation files for SINGA v1.0
SINGA-237 New documentation files for SINGA v1.0
Added a README file for the cifar-10 examples.
Updated the uniform and gaussian methods in initializer.py to accept the
fan_in and fan_out arguments.
Reformatted some Python files.
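The initializer change replaces the fixed-range uniform(t, low, high) and
gaussian(t, mean, std) helpers with fan-based versions. A minimal sketch of
the new calls, assuming only the signatures visible in the initializer.py
diff below (direct tensor fills such as w.uniform(-0.08, 0.08) remain
available for explicit ranges):

    from singa import tensor
    from singa import initializer

    w = tensor.Tensor((3, 5))      # a hypothetical dense weight matrix
    initializer.uniform(w, 3, 5)   # Glorot-style bound from fan_in and fan_out
    initializer.gaussian(w, 3, 0)  # He-style std from fan_in only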
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/cdd718ed
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/cdd718ed
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/cdd718ed
Branch: refs/heads/master
Commit: cdd718ed946acfd829ccfd6e5b43999f990fd634
Parents: 33992c9
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Aug 14 21:41:16 2016 +0800
Committer: Wei Wang <wa...@gmail.com>
Committed: Sun Aug 14 23:47:24 2016 +0800
----------------------------------------------------------------------
doc/docs/examples.rst | 6 --
doc/docs/index.rst | 2 +-
doc/docs/initializer.rst | 2 +-
examples/char-rnn/README.md | 2 +-
examples/char-rnn/train.py | 103 +++++++++++++++++++++--------------
examples/cifar10/alexnet.py | 48 +++++++++++++---
examples/cifar10/predict.py | 10 ++--
examples/cifar10/vgg.py | 10 +---
examples/index.rst | 4 ++
src/python/singa/initializer.py | 85 ++++++++++++++---------------
src/python/singa/optimizer.py | 4 +-
11 files changed, 156 insertions(+), 120 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/doc/docs/examples.rst
----------------------------------------------------------------------
diff --git a/doc/docs/examples.rst b/doc/docs/examples.rst
deleted file mode 100644
index b0b2af8..0000000
--- a/doc/docs/examples.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Examples
-========
-
-.. toctree::
-
- examples/index
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/doc/docs/index.rst
----------------------------------------------------------------------
diff --git a/doc/docs/index.rst b/doc/docs/index.rst
index 2294054..11f0ebb 100644
--- a/doc/docs/index.rst
+++ b/doc/docs/index.rst
@@ -12,4 +12,4 @@ English
loss
metric
optimizer
- examples
+ examples/index
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/doc/docs/initializer.rst
----------------------------------------------------------------------
diff --git a/doc/docs/initializer.rst b/doc/docs/initializer.rst
index a190702..f334497 100644
--- a/doc/docs/initializer.rst
+++ b/doc/docs/initializer.rst
@@ -5,7 +5,7 @@ Python API
----------
.. automodule:: singa.initializer
- :members:
+ :members: uniform, gaussian
:member-order: bysource
CPP API
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/examples/char-rnn/README.md
----------------------------------------------------------------------
diff --git a/examples/char-rnn/README.md b/examples/char-rnn/README.md
index d4cfa30..f6e5edc 100644
--- a/examples/char-rnn/README.md
+++ b/examples/char-rnn/README.md
@@ -1,4 +1,4 @@
-# Train Char-RNN using SINGA
+# Train Char-RNN over plain text
Recurrent neural networks (RNN) are widely used for modelling sequential data,
e.g., natural language sentences. This example describes how to implement an RNN
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/examples/char-rnn/train.py
----------------------------------------------------------------------
diff --git a/examples/char-rnn/train.py b/examples/char-rnn/train.py
index fb5e71f..1273a57 100644
--- a/examples/char-rnn/train.py
+++ b/examples/char-rnn/train.py
@@ -19,8 +19,6 @@ The model is created following https://github.com/karpathy/char-rnn
The train file could be any text file,
e.g., http://cs.stanford.edu/people/karpathy/char-rnn/
'''
-import sys
-import os
import cPickle as pickle
import numpy as np
import argparse
@@ -32,12 +30,12 @@ from singa import device
from singa import tensor
from singa import optimizer
from singa import initializer
-from singa.proto import core_pb2
from singa.proto import model_pb2
from singa import utils
class Data(object):
+
def __init__(self, fpath, batch_size=32, seq_length=100, train_ratio=0.8):
'''Data object for loading a plain text file.
@@ -49,8 +47,8 @@ class Data(object):
self.raw_data = open(fpath, 'r').read() # read text file
chars = list(set(self.raw_data))
self.vocab_size = len(chars)
- self.char_to_idx = {ch:i for i, ch in enumerate(chars)}
- self.idx_to_char = {i:ch for i, ch in enumerate(chars)}
+ self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
+ self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
data = [self.char_to_idx[c] for c in self.raw_data]
# seq_length + 1 for the data + label
nsamples = len(data) / (1 + seq_length)
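For reference, a minimal sketch of the vocabulary mapping that Data.__init__
builds above, run on a toy string instead of a text file:

    raw_data = 'hello world'
    chars = list(set(raw_data))
    vocab_size = len(chars)                              # distinct characters
    char_to_idx = {ch: i for i, ch in enumerate(chars)}  # encode: char -> id
    idx_to_char = {i: ch for i, ch in enumerate(chars)}  # decode: id -> char
    data = [char_to_idx[c] for c in raw_data]            # text as id sequence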
@@ -69,10 +67,10 @@ class Data(object):
def numpy2tensors(npx, npy, dev):
'''batch, seq, dim -- > seq, batch, dim'''
- tmpx=np.swapaxes(npx, 0, 1)
- tmpy=np.swapaxes(npy, 0, 1)
- inputs=[]
- labels=[]
+ tmpx = np.swapaxes(npx, 0, 1)
+ tmpy = np.swapaxes(npy, 0, 1)
+ inputs = []
+ labels = []
for t in range(tmpx.shape[0]):
x = tensor.from_numpy(tmpx[t])
y = tensor.from_numpy(tmpy[t])
@@ -99,25 +97,36 @@ def get_lr(epoch):
return 0.001 / float(1 << (epoch / 50))
-def train(data, max_epoch, hidden_size =100, seq_length=100, batch_size=16,
- num_stacks=1, lr=0.001, dropout = 0.5, model_path='model.bin'):
+def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16,
+ num_stacks=1, lr=0.001, dropout=0.5, model_path='model.bin'):
# SGD with L2 gradient normalization
opt = optimizer.SGD(constraint=optimizer.L2Constraint(5))
cuda = device.create_cuda_gpu()
- rnn = layer.LSTM(name='lstm', hidden_size=hidden_size, num_stacks=num_stacks,
- dropout=dropout, input_sample_shape=(data.vocab_size,))
+ rnn = layer.LSTM(
+ name='lstm',
+ hidden_size=hidden_size,
+ num_stacks=num_stacks,
+ dropout=dropout,
+ input_sample_shape=(
+ data.vocab_size,
+ ))
rnn.to_device(cuda)
print 'created rnn'
rnn_w = rnn.param_values()[0]
- initializer.uniform(rnn_w, -0.08, 0.08) # init all rnn parameters
+ rnn_w.uniform(-0.08, 0.08) # init all rnn parameters
print 'rnn weight l1 = %f' % (rnn_w.l1())
- dense = layer.Dense('dense', data.vocab_size, input_sample_shape=(hidden_size,))
+ dense = layer.Dense(
+ 'dense',
+ data.vocab_size,
+ input_sample_shape=(
+ hidden_size,
+ ))
dense.to_device(cuda)
dense_w = dense.param_values()[0]
dense_b = dense.param_values()[1]
print 'dense w ', dense_w.shape
print 'dense b ', dense_b.shape
- initializer.xavier(dense_w) # init weight matrix using Xavier
+ initializer.uniform(dense_w, dense_w.shape[0], dense_w.shape[1])
print 'dense weight l1 = %f' % (dense_w.l1())
dense_b.set_value(0.0)
print 'dense b l1 = %f' % (dense_b.l1())
@@ -125,18 +134,18 @@ def train(data, max_epoch, hidden_size =100, seq_length=100, batch_size=16,
g_dense_w = tensor.Tensor(dense_w.shape, cuda)
g_dense_b = tensor.Tensor(dense_b.shape, cuda)
- lossfun = loss.SoftmaxCrossEntropy();
+ lossfun = loss.SoftmaxCrossEntropy()
for epoch in range(max_epoch):
train_loss = 0
for b in range(data.num_train_batch):
batch = data.train_dat[b * batch_size: (b + 1) * batch_size]
inputs, labels = convert(batch, batch_size, seq_length,
- data.vocab_size, cuda)
+ data.vocab_size, cuda)
inputs.append(tensor.Tensor())
inputs.append(tensor.Tensor())
outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
- grads=[]
+ grads = []
batch_loss = 0
g_dense_w.set_value(0.0)
g_dense_b.set_value(0.0)
@@ -149,52 +158,62 @@ def train(data, max_epoch, hidden_size =100, seq_length=100, batch_size=16,
grads.append(grad)
g_dense_w += gwb[0]
g_dense_b += gwb[1]
- #print output.l1(), act.l1()
- utils.update_progress(b * 1.0 / data.num_train_batch,
- 'training loss = %f' % (batch_loss / seq_length))
+ # print output.l1(), act.l1()
+ utils.update_progress(
+ b * 1.0 / data.num_train_batch, 'training loss = %f' %
+ (batch_loss / seq_length))
train_loss += batch_loss
grads.append(tensor.Tensor())
grads.append(tensor.Tensor())
- g_rnn_w=rnn.backward(model_pb2.kTrain, grads)[1][0]
+ g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
dense_w, dense_b = dense.param_values()
opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
- opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w, 'dense_w')
- opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b, 'dense_b')
- print '\nEpoch %d, train loss is %f' % (epoch,
- train_loss / data.num_train_batch / seq_length)
+ opt.apply_with_lr(
+ epoch, get_lr(epoch),
+ g_dense_w, dense_w, 'dense_w')
+ opt.apply_with_lr(
+ epoch, get_lr(epoch),
+ g_dense_b, dense_b, 'dense_b')
+ print '\nEpoch %d, train loss is %f' % \
+ (epoch, train_loss / data.num_train_batch / seq_length)
+
eval_loss = 0
for b in range(data.num_test_batch):
batch = data.val_dat[b * batch_size: (b + 1) * batch_size]
inputs, labels = convert(batch, batch_size, seq_length,
- data.vocab_size, cuda)
+ data.vocab_size, cuda)
inputs.append(tensor.Tensor())
inputs.append(tensor.Tensor())
outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
for output, label in zip(outputs, labels):
output = dense.forward(model_pb2.kEval, output)
- eval_loss += lossfun.forward(model_pb2.kEval, output, label).l1()
- print 'Epoch %d, evaluation loss is %f' % (epoch,
- eval_loss / data.num_test_batch / seq_length)
+ eval_loss += lossfun.forward(model_pb2.kEval,
+ output, label).l1()
+ print 'Epoch %d, evaluation loss is %f' % \
+ (epoch, eval_loss / data.num_test_batch / seq_length)
# checkpoint the file model
with open(model_path, 'wb') as fd:
print 'saving model to %s' % model_path
- d={}
- for name, w in zip(['rnn_w', 'dense_w', 'dense_b'], [rnn_w, dense_w, dense_b]):
+ d = {}
+ for name, w in zip(
+ ['rnn_w', 'dense_w', 'dense_b'],
+ [rnn_w, dense_w, dense_b]):
w.to_host()
- d[name]=tensor.to_numpy(w)
- d['idx_to_char']=data.idx_to_char
- d['char_to_idx']=data.char_to_idx
- d['hidden_size']=hidden_size
- d['num_stacks']=num_stacks
- d['dropout']=dropout
+ d[name] = tensor.to_numpy(w)
+ d['idx_to_char'] = data.idx_to_char
+ d['char_to_idx'] = data.char_to_idx
+ d['hidden_size'] = hidden_size
+ d['num_stacks'] = num_stacks
+ d['dropout'] = dropout
pickle.dump(d, fd)
if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='Train multi-stack LSTM for '\
- 'modeling character sequence from plain text files')
+ parser = argparse.ArgumentParser(
+ description='Train multi-stack LSTM for '
+ 'modeling character sequence from plain text files')
parser.add_argument('data', type=str, help='training file')
parser.add_argument('-b', type=int, default=32, help='batch_size')
parser.add_argument('-l', type=int, default=64, help='sequence length')
@@ -204,4 +223,4 @@ if __name__ == '__main__':
args = parser.parse_args()
data = Data(args.data, batch_size=args.b, seq_length=args.l)
train(data, args.m, hidden_size=args.d, num_stacks=args.s,
- seq_length=args.l, batch_size=args.b)
+ seq_length=args.l, batch_size=args.b)
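The get_lr schedule above halves the learning rate every 50 epochs via a bit
shift. A quick check of the values it produces (// makes the integer division
explicit, matching the Python 2 behaviour of the / used in train.py):

    def get_lr(epoch):
        return 0.001 / float(1 << (epoch // 50))

    assert get_lr(0) == 0.001      # epochs 0-49
    assert get_lr(50) == 0.0005    # epochs 50-99
    assert get_lr(100) == 0.00025  # epochs 100-149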
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/examples/cifar10/alexnet.py
----------------------------------------------------------------------
diff --git a/examples/cifar10/alexnet.py b/examples/cifar10/alexnet.py
index ddad1d5..34da95d 100644
--- a/examples/cifar10/alexnet.py
+++ b/examples/cifar10/alexnet.py
@@ -20,12 +20,8 @@ Following the same setting for hyper-parameters and data pre-processing, the final
validation accuracy would be about 82%.
'''
-import sys
-import os
-
# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
from singa import layer
-from singa import initializer
from singa import metric
from singa import loss
from singa import net as ffnet
@@ -40,23 +36,57 @@ def create_net(use_cpu=False):
W1_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.01}
W2_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.01, 'decay_mult': 250}
b_specs = {'init': 'constant', 'value': 0, 'lt_mult': 2}
- net.add(layer.Conv2D('conv1', 32, 5, 1, W_specs=W0_specs.copy(), b_specs=b_specs.copy(), pad=2, input_sample_shape=(3,32,32,)))
+ net.add(
+ layer.Conv2D(
+ 'conv1',
+ 32,
+ 5,
+ 1,
+ W_specs=W0_specs.copy(),
+ b_specs=b_specs.copy(),
+ pad=2,
+ input_sample_shape=(
+ 3,
+ 32,
+ 32,
+ )))
net.add(layer.MaxPooling2D('pool1', 3, 2, pad=1))
net.add(layer.Activation('relu1'))
net.add(layer.LRN(name='lrn1'))
- net.add(layer.Conv2D('conv2', 32, 5, 1, W_specs=W1_specs.copy(), b_specs=b_specs.copy(), pad=2))
+ net.add(
+ layer.Conv2D(
+ 'conv2',
+ 32,
+ 5,
+ 1,
+ W_specs=W1_specs.copy(),
+ b_specs=b_specs.copy(),
+ pad=2))
net.add(layer.Activation('relu2'))
net.add(layer.MaxPooling2D('pool2', 3, 2, pad=1))
net.add(layer.LRN('lrn2'))
- net.add(layer.Conv2D('conv3', 64, 5, 1, W_specs=W1_specs.copy(), b_specs=b_specs.copy(), pad=2))
+ net.add(
+ layer.Conv2D(
+ 'conv3',
+ 64,
+ 5,
+ 1,
+ W_specs=W1_specs.copy(),
+ b_specs=b_specs.copy(),
+ pad=2))
net.add(layer.Activation('relu3'))
net.add(layer.MaxPooling2D('pool3', 3, 2, pad=1))
net.add(layer.Flatten('flat'))
- net.add(layer.Dense('dense', 10, W_specs=W2_specs.copy(), b_specs=b_specs.copy()))
+ net.add(
+ layer.Dense(
+ 'dense',
+ 10,
+ W_specs=W2_specs.copy(),
+ b_specs=b_specs.copy()))
for (p, specs) in zip(net.param_values(), net.param_specs()):
filler = specs.filler
if filler.type == 'gaussian':
- initializer.gaussian(p, filler.mean, filler.std)
+ p.gaussian(filler.mean, filler.std)
else:
p.set_value(0)
print specs.name, filler.type, p.l1()
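The loop at the end of create_net() dispatches on the filler type taken from
the W_specs/b_specs dictionaries. A toy sketch of that dispatch, assuming
only the tensor methods used elsewhere in this commit (the shapes are made
up):

    from singa import tensor

    w = tensor.Tensor((32, 3, 5, 5))  # hypothetical conv weight
    b = tensor.Tensor((32,))          # hypothetical conv bias
    w.gaussian(0.0, 0.01)             # 'gaussian' filler: mean/std as in W1_specs
    b.set_value(0.0)                  # 'constant' filler: value as in b_specs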
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/examples/cifar10/predict.py
----------------------------------------------------------------------
diff --git a/examples/cifar10/predict.py b/examples/cifar10/predict.py
index 8a9ea4e..307a610 100644
--- a/examples/cifar10/predict.py
+++ b/examples/cifar10/predict.py
@@ -16,28 +16,26 @@
# =============================================================================
import cPickle as pickle
import numpy as np
-import sys
-import os
-#sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
from singa import device
from singa import tensor
import net as ffnet
-def predict(net, images, cuda, topk=5):
+def predict(net, images, dev, topk=5):
'''Predict the label of each image.
Args:
net, a pretrained neural net
images, a batch of images [batch_size, 3, 32, 32], which have been
pre-processed
- cuda, the cuda device
+ dev, the training device
topk, return the topk labels for each image.
'''
x = tensor.from_numpy(images.astype(np.float32))
- x.to_device(cuda)
+ x.to_device(dev)
y = net.predict(x)
y.to_host()
y = tensor.to_numpy(y)
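After the rename, predict() is device-agnostic. A hypothetical call site; the
network and image batch are placeholders, and only the SINGA calls shown
elsewhere in this commit are assumed:

    import numpy as np
    from singa import device

    dev = device.create_cuda_gpu()                 # or any other device
    images = np.zeros((4, 3, 32, 32), np.float32)  # fake pre-processed batch
    # net = ...  load a pretrained model first
    # probs = predict(net, images, dev, topk=5)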
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/examples/cifar10/vgg.py
----------------------------------------------------------------------
diff --git a/examples/cifar10/vgg.py b/examples/cifar10/vgg.py
index 35ef00d..e8e3602 100644
--- a/examples/cifar10/vgg.py
+++ b/examples/cifar10/vgg.py
@@ -20,11 +20,7 @@ The performance could be improved by tuning some hyper-parameters, including
learning rate, weight decay, max_epoch, parameter initialization, etc.
"""
-import sys
-import os
-import math
-
-#sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
from singa import layer
from singa import initializer
@@ -88,9 +84,9 @@ def create_net(use_cpu=False):
initializer.uniform(p, 0, 1)
elif len(p.shape) > 1:
if 'conv' in name:
- initializer.gaussian(p, 0, math.sqrt(2.0/(9.0 * p.shape[0])))
+ initializer.gaussian(p, 0, 3 * 3 * p.shape[0])
else:
- initializer.gaussian(p, 0, 0.02)
+ p.gaussian(0, 0.02)
else:
p.set_value(0)
print name, p.l1()
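The fan-based call keeps the conv initialization numerically identical to the
removed explicit formula: with fan_in=0, the new gaussian() in the
initializer.py diff below uses avg=1, so std = sqrt(2.0/(3*3*p.shape[0])),
i.e. the old sqrt(2.0/(9.0*p.shape[0])). A quick check:

    import math

    n = 64                                          # e.g. p.shape[0] of conv3
    old_std = math.sqrt(2.0 / (9.0 * n))            # removed explicit std
    new_std = math.sqrt(2.0 * 1 / (0 + 3 * 3 * n))  # fan_in=0 -> avg=1
    assert abs(old_std - new_std) < 1e-12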
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/examples/index.rst
----------------------------------------------------------------------
diff --git a/examples/index.rst b/examples/index.rst
index d6faf5d..4bb5b49 100644
--- a/examples/index.rst
+++ b/examples/index.rst
@@ -1,5 +1,9 @@
+Examples
+========
+
.. toctree::
+ cifar10/README
char-rnn/README
imagenet/README
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/src/python/singa/initializer.py
----------------------------------------------------------------------
diff --git a/src/python/singa/initializer.py b/src/python/singa/initializer.py
index 277fd2f..fb99663 100644
--- a/src/python/singa/initializer.py
+++ b/src/python/singa/initializer.py
@@ -23,77 +23,68 @@ Example usages::
from singa import initializer
x = tensor.Tensor((3, 5))
- initializer.xavier(x)
+ initializer.uniform(x, 3, 5) # use both fan_in and fan_out
+ initializer.uniform(x, 3, 0) # use only fan_in
'''
import math
-'''
-TODO(wangwei) update the uniform and gaussian initializers
-
def uniform(t, fan_in=0, fan_out=0):
- typically, for conv layer weight: fan_in = nb_filter * kh * kw,
- fan_out = nb_channel * kh * kw
- for dense layer weight, fan_in = input_feature_length,
- fan_out = output_feature_length
- # Ref: [Bengio and Glorot 2010]: Understanding the difficulty of
+ '''Initialize the values of the input tensor following a uniform
+ distribution with specific bounds.
+
+ Args:
+ fan_in(int): for the weight Tensor of a convolution layer,
+ fan_in = nb_channel * kh * kw; for dense layer,
+ fan_in = input_feature_length
+ fan_out(int): for the convolution layer weight Tensor,
+ fan_out = nb_filter * kh * kw; for the weight Tensor of a dense
+ layer, fan_out = output_feature_length
+
+ Ref: [Bengio and Glorot 2010]: Understanding the difficulty of
training deep feedforward neural networks.
- assert fan_in >0 or fan_out > 0, \
+ '''
+ assert fan_in > 0 or fan_out > 0, \
'fan_in and fan_out cannot be 0 at the same time'
- avg = 1
+ avg = 2
if fan_in * fan_out == 0:
- avg = 2
- x = math.sqrt(3.0f * avg / (fan_in + fan_out))
+ avg = 1
+ x = math.sqrt(3.0 * avg / (fan_in + fan_out))
t.uniform(-x, x)
def gaussian(t, fan_in=0, fan_out=0):
- typically, for conv layer weight: fan_in = nb_filter * kh * kw,
- fan_out = nb_channel * kh * kw
- for dense layer weight, fan_in = input_feature_length,
- fan_out = output_feature_length
+ '''Initialize the values of the input tensor following a Gaussian
+ distribution with specific std.
+
+ Args:
+ fan_in(int): for the weight Tensor of a convolution layer,
+ fan_in = nb_channel * kh * kw; for dense layer,
+ fan_in = input_feature_length
+ fan_out(int): for the convolution layer weight Tensor,
+ fan_out = nb_filter * kh * kw; for the weight Tensor of a dense
+ layer, fan_out = output_feature_length
Ref Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Delving Deep into
Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
-
- assert fan_in >0 or fan_out > 0, \
+ '''
+ assert fan_in > 0 or fan_out > 0, \
'fan_in and fan_out cannot be 0 at the same time'
- avg = 1
+ avg = 2
if fan_in * fan_out == 0:
- avg = 2
- std = math.sqrt(2.0f * avg / (fan_in + fan_out))
+ avg = 1
+ std = math.sqrt(2.0 * avg / (fan_in + fan_out))
t.gaussian(0, std)
-'''
-
-
-def uniform(t, low=0, high=1):
- '''Initialize the parameter values following an Uniform distribution.
-
- Args:
- t (Tensor): the parater tensor
- low (float): lower bound
- high (float): higher bound
- '''
- t.uniform(low, high)
-
-
-def gaussian(t, mean=0, std=0.01):
- '''Initialize the parameter values following an Gaussian distribution.
-
- Args:
- t (Tensor): the parater tensor
- mean (float): mean of the distribution
- std (float): standard variance
- '''
- t.gaussian(mean, std)
def xavier(t):
'''Initialize the matrix parameter following a Uniform distribution from
[-sqrt(6/(fan_in + fan_out)), sqrt(6/(fan_in + fan_out))].
+ Deprecated. Please use uniform()
+
Args:
t (Tensor): the parameter tensor
'''
@@ -106,6 +97,8 @@ def glorot(t):
'''Initialize the matrix parameter following a Gaussian distribution with
mean = 0 and std = sqrt(2.0 / (nb_row + nb_col))
+ Deprecated. Please use gaussian()
+
Args:
t (Tensor): the parameter tensor
'''
@@ -118,6 +111,8 @@ def msra(t):
'''Initialize the matrix parameter following a Gaussian distribution with
mean = 0, std = math.sqrt(2.0 / nb_row).
+ Deprecated. Please use gaussian()
+
Ref [He, Zhang, Ren and Sun 2015]: Specifically accounts for ReLU
nonlinearities.
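The bound computed by the new uniform() can be checked by hand for the
docstring example uniform(x, 3, 5): both fans are non-zero, so avg stays 2
and the bound reduces to the Glorot value sqrt(6/(fan_in+fan_out)):

    import math

    fan_in, fan_out = 3, 5
    x = math.sqrt(3.0 * 2 / (fan_in + fan_out))  # ~0.866
    assert x == math.sqrt(6.0 / (fan_in + fan_out))
    # the tensor is then filled in place with t.uniform(-x, x)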
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cdd718ed/src/python/singa/optimizer.py
----------------------------------------------------------------------
diff --git a/src/python/singa/optimizer.py b/src/python/singa/optimizer.py
index a964f16..338c6b0 100644
--- a/src/python/singa/optimizer.py
+++ b/src/python/singa/optimizer.py
@@ -44,8 +44,8 @@ class Optimizer(object):
1. construct the optimizer
2. (optional) register each parameter with its specs.
- 3. use the optimizer to update parameter values given parameter
- gradients and other optional info
+ 3. use the optimizer to update parameter values given parameter gradients
+ and other optional info
The subclasses should override the apply_with_lr function to do the real
parameter update.
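A sketch of the three-step workflow listed above, using only the optimizer
calls that appear in this commit's train.py; the parameter and gradient
tensors here are placeholders:

    from singa import optimizer
    from singa import tensor

    opt = optimizer.SGD(constraint=optimizer.L2Constraint(5))  # 1. construct
    # 2. (optional) register each parameter with its specs

    w = tensor.Tensor((3, 5))
    g = tensor.Tensor((3, 5))
    w.gaussian(0, 0.01)                     # some parameter values
    g.set_value(0.1)                        # a fake gradient
    opt.apply_with_lr(0, 0.001, g, w, 'w')  # 3. (epoch, lr, grad, value, name)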