Posted to commits@singa.apache.org by wa...@apache.org on 2016/01/01 13:29:17 UTC

[08/10] incubator-singa git commit: SINGA-81 Add Python Helper

SINGA-81 Add Python Helper

Add comments for some functions in model.py.
Remove the rnnlm-related code; it can be added back later, once it runs successfully from Python.
Move datasets/ into examples/ as its modules are used mainly by the examples.
Update .gitignore to ignore the pb2 folder in tool/python/.

TODO: add comments for the other methods in files under the singa/ folder.
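
For reference, the example scripts continue to load data through the same
calls (unchanged context lines in the diffs below); only the dataset module
files move from tool/python/singa/datasets/ to tool/python/examples/datasets/:

  import sys, os
  sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
  from singa.model import *
  from singa.datasets import cifar10

  X_train, X_test, workspace = cifar10.load_data()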


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/8914750e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/8914750e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/8914750e

Branch: refs/heads/master
Commit: 8914750e8c6d6fd0d9d0d8aed53fd775a1367b88
Parents: 3c12730
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Tue Dec 29 11:36:28 2015 +0800
Committer: chonho <le...@comp.nus.edu.sg>
Committed: Fri Jan 1 15:59:15 2016 +0800

----------------------------------------------------------------------
 tool/python/examples/cifar10_cnn.py             |   4 +-
 tool/python/examples/cifar10_cnn_cudnn.py       |   4 +-
 .../python/examples/cifar10_cnn_cudnn_hybrid.py |  34 -----
 tool/python/examples/cifar10_cnn_parameter.py   |   4 +-
 tool/python/examples/datasets/__init__.py       |   0
 tool/python/examples/datasets/cifar10.py        |  34 +++++
 tool/python/examples/datasets/mnist.py          |  32 +++++
 tool/python/examples/rnnlm_usermodel.py         |  22 ----
 tool/python/singa.py                            |  26 ++--
 tool/python/singa/datasets/__init__.py          |   0
 tool/python/singa/datasets/cifar10.py           |  34 -----
 tool/python/singa/datasets/mnist.py             |  32 -----
 tool/python/singa/datasets/rnnlm.py             |  20 ---
 tool/python/singa/model.py                      | 132 +++++++++++--------
 14 files changed, 164 insertions(+), 214 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/examples/cifar10_cnn.py
----------------------------------------------------------------------
diff --git a/tool/python/examples/cifar10_cnn.py b/tool/python/examples/cifar10_cnn.py
index 9ef552b..859a9a4 100755
--- a/tool/python/examples/cifar10_cnn.py
+++ b/tool/python/examples/cifar10_cnn.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..')) 
+sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
 from singa.model import *
 from singa.datasets import cifar10
 
@@ -24,7 +24,7 @@ m.add(AvgPooling2D(pool_size=(3,3), stride=2))
 
 m.add(Dense(10, w_wd=250, b_lr=2, b_wd=0, activation='softmax'))
 
-sgd = SGD(decay=0.004, lr_type='fixed', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
+sgd = SGD(decay=0.004, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
 topo = Cluster(workspace)
 m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
 m.fit(X_train, nb_epoch=1000, with_test=True)
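
Note: every example in this commit makes the same updater change, replacing
lr_type='fixed' with lr_type='manual'. Per the Updater hunk in model.py
further down, 'fixed' now selects a constant kFixed rate, while 'manual'
selects kFixedStep, which pairs step thresholds with learning-rate values.
A minimal sketch of the new call (values taken from the diff above):

  from singa.model import *   # provides SGD (per this repo's model.py)

  # step_lr[k] is the rate used once training reaches step[k]
  sgd = SGD(decay=0.004, lr_type='manual',
            step=(0, 60000, 65000),             # step thresholds
            step_lr=(0.001, 0.0001, 0.00001))   # per-threshold learning rates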

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/examples/cifar10_cnn_cudnn.py
----------------------------------------------------------------------
diff --git a/tool/python/examples/cifar10_cnn_cudnn.py b/tool/python/examples/cifar10_cnn_cudnn.py
index e3c5c49..d4f4b7c 100755
--- a/tool/python/examples/cifar10_cnn_cudnn.py
+++ b/tool/python/examples/cifar10_cnn_cudnn.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..')) 
+sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
 from singa.model import *
 from singa.datasets import cifar10
 
@@ -24,7 +24,7 @@ m.add(AvgPooling2D(pool_size=(3,3), stride=2))
 
 m.add(Dense(10, w_wd=250, b_lr=2, b_wd=0, activation='softmax'))
 
-sgd = SGD(decay=0.004, lr_type='fixed', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
+sgd = SGD(decay=0.004, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
 topo = Cluster(workspace)
 m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/examples/cifar10_cnn_cudnn_hybrid.py
----------------------------------------------------------------------
diff --git a/tool/python/examples/cifar10_cnn_cudnn_hybrid.py b/tool/python/examples/cifar10_cnn_cudnn_hybrid.py
deleted file mode 100755
index f5e4c27..0000000
--- a/tool/python/examples/cifar10_cnn_cudnn_hybrid.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env python
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..')) 
-from singa.model import *
-from singa.datasets import cifar10
-
-X_train, X_test, workspace = cifar10.load_data()
-
-m = Sequential('cifar10-cnn', sys.argv)
-
-m.add(Convolution2D(32, 5, 1, 2, w_std=0.0001, b_lr=2))
-m.add(MaxPooling2D(pool_size=(3,3), stride=2))
-m.add(Activation('relu'))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(32, 5, 1, 2, b_lr=2))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(64, 5, 1, 2))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-
-m.add(Dense(10, w_wd=250, b_lr=2, b_wd=0, activation='softmax'))
-
-sgd = SGD(decay=0.004, lr_type='fixed', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
-topo = Cluster(workspace, nworkers_per_group=2, nworkers_per_procs=2)
-m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
-
-gpu_id = [0,1]
-m.fit(X_train, nb_epoch=10000, with_test=True, device=gpu_id)
-result = m.evaluate(X_test, test_steps=0, test_freq=200)
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/examples/cifar10_cnn_parameter.py
----------------------------------------------------------------------
diff --git a/tool/python/examples/cifar10_cnn_parameter.py b/tool/python/examples/cifar10_cnn_parameter.py
index dd03f5c..4144fa5 100755
--- a/tool/python/examples/cifar10_cnn_parameter.py
+++ b/tool/python/examples/cifar10_cnn_parameter.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..')) 
+sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
 from singa.model import *
 from singa.datasets import cifar10
 
@@ -27,7 +27,7 @@ m.add(AvgPooling2D(pool_size=(3,3), stride=2))
 
 m.add(Dense(10, w_param=parw, w_wd=250, b_param=parb, b_lr=2, b_wd=0, activation='softmax'))
 
-sgd = SGD(decay=0.004, lr_type='fixed', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
+sgd = SGD(decay=0.004, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
 topo = Cluster(workspace)
 m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
 m.fit(X_train, nb_epoch=100, with_test=True)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/examples/datasets/__init__.py
----------------------------------------------------------------------
diff --git a/tool/python/examples/datasets/__init__.py b/tool/python/examples/datasets/__init__.py
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/examples/datasets/cifar10.py
----------------------------------------------------------------------
diff --git a/tool/python/examples/datasets/cifar10.py b/tool/python/examples/datasets/cifar10.py
new file mode 100644
index 0000000..65bcd60
--- /dev/null
+++ b/tool/python/examples/datasets/cifar10.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+from singa.model import *
+
+def load_data(
+         workspace = None, 
+         backend = 'kvfile',
+         batchsize = 64,
+         random = 5000,
+         shape = (3, 32, 32),
+         std = 127.5,
+         mean = 127.5
+      ):
+
+  # using cifar10 dataset
+  data_dir = 'examples/cifar10'
+  path_train = data_dir + '/train_data.bin'
+  path_test  = data_dir + '/test_data.bin'
+  path_mean  = data_dir + '/image_mean.bin'
+  if workspace == None: workspace = data_dir
+
+  store = Store(path=path_train, mean_file=path_mean, backend=backend,
+              random_skip=random, batchsize=batchsize,
+              shape=shape) 
+
+  data_train = Data(load='recordinput', phase='train', conf=store)
+
+  store = Store(path=path_test, mean_file=path_mean, backend=backend,
+              batchsize=batchsize,
+              shape=shape) 
+
+  data_test = Data(load='recordinput', phase='test', conf=store)
+
+  return data_train, data_test, workspace
+
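
Hypothetical usage of this loader, with keyword overrides taken from the
load_data() signature above (the train store sets random_skip and mean_file;
the test store omits random_skip):

  from singa.datasets import cifar10   # import path used by the example scripts
  X_train, X_test, workspace = cifar10.load_data(batchsize=100, random=0)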

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/examples/datasets/mnist.py
----------------------------------------------------------------------
diff --git a/tool/python/examples/datasets/mnist.py b/tool/python/examples/datasets/mnist.py
new file mode 100644
index 0000000..c8695ec
--- /dev/null
+++ b/tool/python/examples/datasets/mnist.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+from singa.model import * 
+
+def load_data(
+     workspace = None,
+     backend = 'kvfile',
+     nb_rbm = 0,  # the number of layers for RBM and Autoencoder 
+     checkpoint_steps = 0, 
+     **pvalues
+   ):
+
+  # using mnist dataset
+  data_dir = 'examples/mnist'
+  path_train = data_dir + '/train_data.bin'
+  path_test  = data_dir + '/test_data.bin'
+  if workspace == None: workspace = data_dir
+
+  # checkpoint path to load
+  checkpoint_list = None 
+  if checkpoint_steps > 0:
+    workerid = 0
+    checkpoint_list = [] 
+    for i in range(nb_rbm-1, 0, -1):
+      checkpoint_list.append('examples/rbm/rbm{0}/checkpoint/step{1}-worker{2}'.format(str(i),checkpoint_steps,workerid))
+
+  store = Store(path=path_train, backend=backend, **pvalues)
+  data_train = Data(load='recordinput', phase='train', conf=store, checkpoint=checkpoint_list)
+
+  store = Store(path=path_test, backend=backend, **pvalues)
+  data_test = Data(load='recordinput', phase='test', conf=store)
+
+  return data_train, data_test, workspace
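
As a worked example of the checkpoint loop above, the hypothetical values
nb_rbm=4, checkpoint_steps=6000 and workerid=0 give:

  checkpoint_list = []
  for i in range(4 - 1, 0, -1):   # i = 3, 2, 1
    checkpoint_list.append(
        'examples/rbm/rbm{0}/checkpoint/step{1}-worker{2}'.format(str(i), 6000, 0))
  # -> ['examples/rbm/rbm3/checkpoint/step6000-worker0',
  #     'examples/rbm/rbm2/checkpoint/step6000-worker0',
  #     'examples/rbm/rbm1/checkpoint/step6000-worker0']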

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/examples/rnnlm_usermodel.py
----------------------------------------------------------------------
diff --git a/tool/python/examples/rnnlm_usermodel.py b/tool/python/examples/rnnlm_usermodel.py
deleted file mode 100755
index 1b49321..0000000
--- a/tool/python/examples/rnnlm_usermodel.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env python
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..')) 
-from singa.model import *
-from singa.datasets import rnnlm 
-
-vocab_size = 3720
-
-X_train, X_valid, workspace = rnnlm.load_data()
-
-m = Sequential('rnnlm', sys.argv)
-
-parw = Parameter(init='uniform', range=0.3)
-m.add(Embedding(in_dim=vocab_size, out_dim=15, w_param=parw))
-m.add(RNNLM(1, w_param=parw))
-
-sgd = SGD(lr_type='fixed', step=(0,48810,56945,65080,73215), step_lr=(0.1,0.05,0.025,0.0125,0.00625))
-topo = Cluster(workspace)
-m.compile(loss='user_loss_rnnlm', in_dim=vocab_size, nclass=100, optimizer=sgd, cluster=topo)
-
-m.fit(X_train, validate=X_valid, validate_steps=683, nb_epoch=81350, execpath='examples/rnnlm/rnnlm.bin')
-#result = m.evaluate(X_valid, validate_steps=683, validate_freq=8135, execpath='examples/rnnlm/rnnlm.bin')

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/singa.py
----------------------------------------------------------------------
diff --git a/tool/python/singa.py b/tool/python/singa.py
index 6d7fbdf..986a6b8 100755
--- a/tool/python/singa.py
+++ b/tool/python/singa.py
@@ -31,14 +31,18 @@ import singa.driver as driver
 from google.protobuf.text_format import Merge
 
 if __name__ == '__main__':
-    i =  sys.argv.index("-conf")
-    s = open(sys.argv[i+1], 'r').read()
-    s = str(s)
-    j = job_pb2.JobProto()  
-    Merge(s,j)
-    b = j.SerializeToString()
-    d = driver.Driver()
-    d.InitLog(sys.argv[0])
-    d.Init(sys.argv)
-#    d.Train(False,b)
-    d.Test(b)
+  """Invoke the training program via this Python script, e.g.,
+  ./bin/singa-run.sh -exec tool/python/singa.py -conf examples/cifar10/job.conf
+  """
+
+  i =  sys.argv.index("-conf")
+  s = open(sys.argv[i+1], 'r').read()
+  s = str(s)
+  j = job_pb2.JobProto()
+  Merge(s,j)
+  b = j.SerializeToString()
+  d = driver.Driver()
+  d.InitLog(sys.argv[0])
+  d.Init(sys.argv)
+  d.Train(False,b)
+  #d.Test(b)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/singa/datasets/__init__.py
----------------------------------------------------------------------
diff --git a/tool/python/singa/datasets/__init__.py b/tool/python/singa/datasets/__init__.py
deleted file mode 100644
index e69de29..0000000

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/singa/datasets/cifar10.py
----------------------------------------------------------------------
diff --git a/tool/python/singa/datasets/cifar10.py b/tool/python/singa/datasets/cifar10.py
deleted file mode 100644
index 65bcd60..0000000
--- a/tool/python/singa/datasets/cifar10.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env python
-from singa.model import *
-
-def load_data(
-         workspace = None, 
-         backend = 'kvfile',
-         batchsize = 64,
-         random = 5000,
-         shape = (3, 32, 32),
-         std = 127.5,
-         mean = 127.5
-      ):
-
-  # using cifar10 dataset
-  data_dir = 'examples/cifar10'
-  path_train = data_dir + '/train_data.bin'
-  path_test  = data_dir + '/test_data.bin'
-  path_mean  = data_dir + '/image_mean.bin'
-  if workspace == None: workspace = data_dir
-
-  store = Store(path=path_train, mean_file=path_mean, backend=backend,
-              random_skip=random, batchsize=batchsize,
-              shape=shape) 
-
-  data_train = Data(load='recordinput', phase='train', conf=store)
-
-  store = Store(path=path_test, mean_file=path_mean, backend=backend,
-              batchsize=batchsize,
-              shape=shape) 
-
-  data_test = Data(load='recordinput', phase='test', conf=store)
-
-  return data_train, data_test, workspace
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/singa/datasets/mnist.py
----------------------------------------------------------------------
diff --git a/tool/python/singa/datasets/mnist.py b/tool/python/singa/datasets/mnist.py
deleted file mode 100644
index c8695ec..0000000
--- a/tool/python/singa/datasets/mnist.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python
-from singa.model import * 
-
-def load_data(
-     workspace = None,
-     backend = 'kvfile',
-     nb_rbm = 0,  # the number of layers for RBM and Autoencoder 
-     checkpoint_steps = 0, 
-     **pvalues
-   ):
-
-  # using mnist dataset
-  data_dir = 'examples/mnist'
-  path_train = data_dir + '/train_data.bin'
-  path_test  = data_dir + '/test_data.bin'
-  if workspace == None: workspace = data_dir
-
-  # checkpoint path to load
-  checkpoint_list = None 
-  if checkpoint_steps > 0:
-    workerid = 0
-    checkpoint_list = [] 
-    for i in range(nb_rbm-1, 0, -1):
-      checkpoint_list.append('examples/rbm/rbm{0}/checkpoint/step{1}-worker{2}'.format(str(i),checkpoint_steps,workerid))
-
-  store = Store(path=path_train, backend=backend, **pvalues)
-  data_train = Data(load='recordinput', phase='train', conf=store, checkpoint=checkpoint_list)
-
-  store = Store(path=path_test, backend=backend, **pvalues)
-  data_test = Data(load='recordinput', phase='test', conf=store)
-
-  return data_train, data_test, workspace

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/singa/datasets/rnnlm.py
----------------------------------------------------------------------
diff --git a/tool/python/singa/datasets/rnnlm.py b/tool/python/singa/datasets/rnnlm.py
deleted file mode 100644
index ef8142a..0000000
--- a/tool/python/singa/datasets/rnnlm.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env python
-from singa.model import *
-
-def load_data(
-         workspace = 'examples/rnnlm',
-         backend = 'kvfile',
-         max_window = 10
-      ):
-
-  path_train = workspace + '/train_data.bin'
-  path_valid = workspace + '/valid_data.bin'
-  path_test  = workspace + '/test_data.bin'
-
-
-  data_train = Data(load='kData', phase='train', path=path_train, backend=backend, max_window=max_window)
-
-  data_valid = Data(load='kData', phase='val', path=path_valid, max_window=max_window)
-
-  return data_train, data_valid, workspace
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8914750e/tool/python/singa/model.py
----------------------------------------------------------------------
diff --git a/tool/python/singa/model.py b/tool/python/singa/model.py
index d68d143..29db70e 100644
--- a/tool/python/singa/model.py
+++ b/tool/python/singa/model.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 import sys, re, subprocess
 from layer import *
-from utils.utility import * 
-from utils.message import * 
+from utils.utility import *
+from utils.message import *
 from google.protobuf import text_format
 
 class Model(object):
@@ -14,14 +14,14 @@ class Model(object):
       argv             // pass sys.argv to source
       label = (bool)   // exist label layer (deprecated)
     '''
-    self.jobconf = Message('Job', name=name).proto 
+    self.jobconf = Message('Job', name=name).proto
     self.layers = []
     self.label = label
     self.argv = argv
     self.result = None
     self.last_checkpoint_path = None
     self.cudnn = False
-    
+
   def exist_datalayer(self, phase):
     for ly in self.layers:
       if enumPhase(phase) in ly.layer.include:
@@ -38,7 +38,7 @@ class Model(object):
       topk      = (int)     // the number of results considered to compute accuracy
     '''
     assert optimizer != None, 'optimizer (Updater component) should be set'
-    assert cluster != None, 'cluster (Cluster component) should be set'  
+    assert cluster != None, 'cluster (Cluster component) should be set'
     setval(self.jobconf, updater=optimizer.proto)
     setval(self.jobconf, cluster=cluster.proto)
 
@@ -56,7 +56,7 @@ class Model(object):
         # revise the last layer
         if loss == 'categorical_crossentropy':
           setval(ly, type=enumLayerType('softmaxloss'))
-          setval(ly.softmaxloss_conf, topk=topk) 
+          setval(ly.softmaxloss_conf, topk=topk)
         elif loss == 'mean_squared_error':
           setval(ly, type=enumLayerType('euclideanloss'))
       else:
@@ -72,7 +72,7 @@ class Model(object):
     '''
     construct neuralnet proto
     '''
-    net = NetProto() 
+    net = NetProto()
     slyname = self.layers[0].layer.name
     for i in range(len(self.layers)):
       ly = net.layer.add()
@@ -95,7 +95,7 @@ class Model(object):
 
     # deal with label layer (deprecated)
     if self.label == True:
-      label_layer = Layer(name='label', type=kLabel)      
+      label_layer = Layer(name='label', type=kLabel)
       ly = net.layer.add()
       ly.CopyFrom(label_layer.layer)
       getattr(ly, 'srclayers').append(self.layers[0].layer.name)
@@ -108,7 +108,7 @@ class Model(object):
 
     # use of cudnn
     if self.cudnn == True:
-      self.setCudnnLayerType(net) 
+      self.setCudnnLayerType(net)
 
     setval(self.jobconf, neuralnet=net)
 
@@ -127,7 +127,7 @@ class Model(object):
         batch_size       = (int)    // batch size for training data
         train_steps      = (int)    // the number of steps for training, i.e., epoch
         disp_freq        = (int)    // frequency to display training info
-        disp_after       = (int)    // display after this number 
+        disp_after       = (int)    // display after this number of steps
         validate_data    = (Data)   // validation data, specified in load_data()
         validate_freq    = (int)    // frequency of validation
         validate_steps   = (int)    // total number of steps for validation
@@ -143,7 +143,7 @@ class Model(object):
       setval(data.layer.store_conf, batchsize=fields['batch_size'])
 
     # insert layer for training
-    if self.exist_datalayer('train') == False: 
+    if self.exist_datalayer('train') == False:
       self.layers.insert(0, data)
     setval(self.jobconf, train_steps=nb_epoch)
     setval(self.jobconf, disp_freq=nb_epoch/10)
@@ -163,8 +163,8 @@ class Model(object):
     # save model parameter (i.e., checkpoint_path)
     setval(self.jobconf, checkpoint_freq=nb_epoch)
     self.last_checkpoint_path = '{0}/step{1}-worker0'.format(
-                     self.jobconf.cluster.workspace, nb_epoch) 
-    
+                     self.jobconf.cluster.workspace, nb_epoch)
+
     # set Train_one_batch component, using backpropagation at default
     setval(self.jobconf, train_one_batch=Algorithm(type=enumAlgType(alg)).proto)
 
@@ -174,7 +174,7 @@ class Model(object):
       self.cudnn = True
 
     # start to run singa for training
-    if with_test == False: 
+    if with_test == False:
       self.build()  # construct NeuralNet component
       #self.display()
       return SingaRun(jobproto=self.jobconf, argv=self.argv, execpath=execpath)
@@ -191,13 +191,13 @@ class Model(object):
     optional
       alg             = (string)   // algorithm type, (backpropagation at default)
       checkpoint_path = (list)     // checkpoint path is necessary only for testing
-      execpaths       = (string)   // path to user's own executable 
+      execpaths       = (string)   // path to user's own executable
       device          = (int/list) // a list of gpu ids
       **fields (KEY=VALUE)
         batch_size   = (int)  // batch size for testing data
         test_freq    = (int)  // frequency of testing
-        test_steps   = (int)  // total number of steps for testing 
-        test_after   = (int)  // start testing after this number of steps 
+        test_steps   = (int)  // total number of steps for testing
+        test_after   = (int)  // start testing after this number of steps
     '''
     assert data != None, 'Testing data should be set'
     is_testonly = False
@@ -206,11 +206,11 @@ class Model(object):
       setval(data.layer.store_conf, batchsize=fields['batch_size'])
 
     # insert layer for testing
-    if self.exist_datalayer('test') == False: 
+    if self.exist_datalayer('test') == False:
       self.layers.insert(0, data)
 
     # loading checkpoint if singa runs only for testing
-    if self.exist_datalayer('train') == False: 
+    if self.exist_datalayer('train') == False:
       is_testonly = True
       if checkpoint_path == None:
         print 'checkpoint_path has not been specified'
@@ -220,7 +220,7 @@ class Model(object):
     steps = fields['test_steps'] if 'test_steps' in fields else 10
     setval(self.jobconf, test_steps=steps)
     setval(self.jobconf, **fields)
-    
+
     # set Train_one_batch component, using backpropagation at default
     setval(self.jobconf, train_one_batch=Algorithm(type=enumAlgType(alg)).proto)
 
@@ -231,16 +231,16 @@ class Model(object):
 
     self.build()  # construct NeuralNet component
 
-    #--- generate job.conf file for debug purpose 
+    #--- generate job.conf file for debug purpose
     #filename = 'job.conf'
     #with open(filename, 'w') as f:
     #  f.write(text_format.MessageToString(self.jobconf.cluster))
     #self.display()
 
-    #--- run singa --- 
+    #--- run singa ---
     return SingaRun(jobproto=self.jobconf, argv=self.argv, execpath=execpath, testmode=is_testonly)
     #return SingaRun_script(filename=filename, execpath=execpath)
-    
+
 
   def display(self):
     ''' print out job proto
@@ -260,13 +260,13 @@ class Model(object):
       elif ly_type == kSoftmaxLoss: cudnn_ly_type = kCudnnSoftmaxLoss
       elif ly_type == kSTanh:
         cudnn_ly_type = kCudnnActivation
-        net.layer[i].activation_conf.type = STANH 
+        net.layer[i].activation_conf.type = STANH
       elif ly_type == kSigmoid:
         cudnn_ly_type = kCudnnActivation
-        net.layer[i].activation_conf.type = SIGMOID 
+        net.layer[i].activation_conf.type = SIGMOID
       elif ly_type == kReLU:
         cudnn_ly_type = kCudnnActivation
-        net.layer[i].activation_conf.type = RELU 
+        net.layer[i].activation_conf.type = RELU
       net.layer[i].type = cudnn_ly_type
 
 
@@ -277,7 +277,7 @@ class Energy(Model):
   def add(self, layer):
     if hasattr(layer, 'layer_type'):
       if layer.layer_type == kRBMVis:
-        dim = 0 
+        dim = 0
         for i in range(1, len(layer.out_dim)):
           parw = Parameter(name='w', init='none', level=i)
           parb = Parameter(name='b', init='none', level=i)
@@ -293,7 +293,7 @@ class Sequential(Model):
   def add(self, layer):
     if hasattr(layer, 'layer_type'):
       if layer.layer_type == 'AutoEncoder':
-        dim = 0 
+        dim = 0
         if layer.param_share == True:
           # Encoding
           for i in range(1, len(layer.hid_dim)+1):
@@ -331,9 +331,9 @@ class Store(object):
     '''
     **kwargs
         path       = (string)  // path to dataset
-        backend    = (string)  // 
+        backend    = (string)  // storage backend, e.g., 'kvfile'
         batch_size = (int)     // batch size of dataset
-        shape      = (int)     // 
+        shape      = (tuple)   // sample shape, e.g., (3, 32, 32)
 
     '''
     self.proto = Message('Store', **kwargs).proto
@@ -357,23 +357,23 @@ class Updater(object):
       lr_type  = (string) // type of the learning rate (Fixed at default)
     '''
     upd = Message('Updater', type=upd_type, **fields).proto
-    setval(upd.learning_rate, base_lr=lr) 
+    setval(upd.learning_rate, base_lr=lr)
     if decay > 0:
-      setval(upd, weight_decay=decay) 
+      setval(upd, weight_decay=decay)
     if momentum > 0:
-      setval(upd, momentum=momentum) 
+      setval(upd, momentum=momentum)
 
-    if lr_type == None:
-      setval(upd.learning_rate, type=kFixed) 
+    if lr_type == None or lr_type == "fixed":
+      setval(upd.learning_rate, type=kFixed)
     elif lr_type == 'step':
       cp = Message('Step', change_freq=60, gamma=0.997)
-      setval(upd.learning_rate, type=kStep, step_conf=cp.proto) 
-    elif lr_type == 'fixedstep':
+      setval(upd.learning_rate, type=kStep, step_conf=cp.proto)
+    elif lr_type == 'manual':
       cp = Message('FixedStep', step=step, step_lr=step_lr)
-      setval(upd.learning_rate, type=kFixedStep, fixedstep_conf=cp.proto) 
+      setval(upd.learning_rate, type=kFixedStep, fixedstep_conf=cp.proto)
     elif lr_type == 'linear':
       cp = Message('Linear', change_freq=10, final_lr=0.1)
-      setval(upd.learning_rate, type=kLinear, linear_conf=cp.proto) 
+      setval(upd.learning_rate, type=kLinear, linear_conf=cp.proto)
 
     self.proto = upd
 
@@ -422,6 +422,15 @@ class AdaGrad(Updater):
 
 
 class Cluster(object):
+  """ Specify the cluster topology, e.g., number of workers/servers.
+
+  Currently we need to create this object in the .py file and also provide a
+  cluster configuration file on the command line. TODO(wangwei) update the
+  SINGA code to eliminate the need for the cluster configuration file when
+  training on a single node, and for the cluster object in the .py file when
+  training on a cluster.
+  """
+
   def __init__(self, workspace=None,
                nworker_groups=1, nserver_groups=1,
                nworkers_per_group=1, nservers_per_group=1,
@@ -443,65 +452,78 @@ class Cluster(object):
     assert workspace != None, 'need to set workspace'
     self.proto = Message('Cluster', workspace=workspace).proto
     # optional
-    self.proto.nworker_groups = nworker_groups 
-    self.proto.nserver_groups = nserver_groups 
-    self.proto.nworkers_per_group = nworkers_per_group 
-    self.proto.nservers_per_group = nservers_per_group 
-    self.proto.nworkers_per_procs = nworkers_per_procs 
-    self.proto.nservers_per_procs = nservers_per_procs 
+    self.proto.nworker_groups = nworker_groups
+    self.proto.nserver_groups = nserver_groups
+    self.proto.nworkers_per_group = nworkers_per_group
+    self.proto.nservers_per_group = nservers_per_group
+    self.proto.nworkers_per_procs = nworkers_per_procs
+    self.proto.nservers_per_procs = nservers_per_procs
     # other fields
     setval(self.proto, **fields)
 
 
 
 def StoreResults(lines):
+  """ Parse metrics from each line of the log file.
 
-  resultDic = {} 
+  TODO(wangwei) format the log strings to make them uniform for easy parsing.
+  Another approach is to create a protobuf message for metrics, which can be
+  used to dump metrics to a string and to load the perf string back into messages.
+  """
+
+  resultDic = {}
   for line in lines:
     line = re.findall(r'[\w|*.*]+', line)
     if 'Train' in line:
       step = line[line.index('step')+1]
       if 'accuracy' in line:
-        resultDic.setdefault(step,{})['acc'] = line[line.index('accuracy')+1] 
+        resultDic.setdefault(step,{})['acc'] = line[line.index('accuracy')+1]
       if 'loss' in line:
-        resultDic.setdefault(step,{})['loss'] = line[line.index('loss')+1] 
+        resultDic.setdefault(step,{})['loss'] = line[line.index('loss')+1]
       if 'ppl' in line:
-        resultDic.setdefault(step,{})['ppl'] = line[line.index('ppl')+1] 
+        resultDic.setdefault(step,{})['ppl'] = line[line.index('ppl')+1]
       if 'Squared' in line:
-        resultDic.setdefault(step,{})['se'] = line[line.index('Squared')+2] 
+        resultDic.setdefault(step,{})['se'] = line[line.index('Squared')+2]
   return resultDic
 
 def SingaRun(jobproto='', argv=[], execpath='', testmode=False):
 
   import singa.driver as driver
   d = driver.Driver()
-  d.InitLog(argv[0]) 
+  d.InitLog(argv[0])
   d.Init(argv)
   if testmode == True:
     d.Test(jobproto.SerializeToString())
   else:
     d.Train(False, jobproto.SerializeToString())
 
+  # Get the performance from the latest log file.
+  # TODO(wangwei) the log file would be overwritten by other running instances of
+  # the same program, e.g., lt-singa
   logfile = '/tmp/singa-log/{0}.ERROR'.format(argv[0].split('/')[-1])
   fin = open(logfile, 'r')
   result = StoreResults(fin.readlines())
- 
+
   return result
 
 def SingaRun_script(filename='', execpath=''):
+  """
+  Deprecated.
+  Generate the job conf file and run the shell command.
+  """
   SINGAROOT = '../../../'
   conf = 'examples/' + filename
   if execpath=='':
     cmd = SINGAROOT+'bin/singa-run.sh ' \
-        + '-conf %s ' % conf 
+        + '-conf %s ' % conf
   else:
     cmd = SINGAROOT+'bin/singa-run.sh ' \
         + '-conf %s ' % conf \
-        + '-exec %s ' % execpath 
+        + '-exec %s ' % execpath
 
   procs = subprocess.Popen(cmd.strip().split(' '), stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
 
-  resultDic = {} 
+  resultDic = {}
   outputlines = iter(procs.stdout.readline, '')
   resultDic = StoreResults(outputlines)
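
For reference, a quick trace of the command string built in SingaRun_script
above, assuming the hypothetical arguments filename='job.conf' and execpath='':

  SINGAROOT = '../../../'
  conf = 'examples/' + 'job.conf'
  cmd = SINGAROOT + 'bin/singa-run.sh ' + '-conf %s ' % conf
  print(cmd.strip().split(' '))
  # ['../../../bin/singa-run.sh', '-conf', 'examples/job.conf']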