Posted to commits@singa.apache.org by wa...@apache.org on 2015/08/14 16:21:15 UTC
[1/2] incubator-singa git commit: SINGA-54 Refactor job configuration to move fields in ModelProto out
Repository: incubator-singa
Updated Branches:
refs/heads/master 539fcee56 -> 4dee7b9cd
SINGA-54 Refactor job configuration to move fields in ModelProto out
Tested with mnist and cifar examples.
Four components are necessary for submitting a job: neuralnet, alg, updater, and cluster.
The configuration is now consistent with the MM paper.
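For reference, a minimal job.conf under the new layout would look roughly like the sketch below. This is only an illustration assembled from the mnist example in this diff; the concrete layer, updater, and cluster settings are placeholders rather than part of the commit itself.

  # minimal job configuration sketch (illustrative values only)
  name: "example-job"        # job name
  train_steps: 1000          # total number of training iterations
  alg: kBP                   # TrainOneBatch algorithm: kBP (back-propagation) or kCD
  updater {                  # parameter updater, e.g., SGD with a fixed learning rate
    type: kSGD
    base_lr: 0.01
    lr_change: kFixed
  }
  neuralnet {                # neural net: a set of connected layers
    layer {
      name: "data"
      type: kShardData
      sharddata_conf {
        path: "examples/mnist/mnist_train_shard"
        batchsize: 64
      }
      exclude: kTest
    }
    # remaining layers omitted; see examples/mnist/job.conf below
  }
  cluster {                  # cluster topology and workspace
    nworker_groups: 1
    nserver_groups: 1
    workspace: "examples/mnist"
  }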
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/1b574f3c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/1b574f3c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/1b574f3c
Branch: refs/heads/master
Commit: 1b574f3c10f23fa80926471c3efa752d062d4301
Parents: 539fcee
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Aug 14 16:25:10 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Fri Aug 14 16:25:10 2015 +0800
----------------------------------------------------------------------
examples/cifar10/job.conf | 49 +++---
examples/mnist/conv.conf | 295 +++++++++++++++++-------------------
examples/mnist/job.conf | 47 +++---
include/neuralnet/base_layer.h | 2 +-
include/singa.h | 7 +-
include/trainer/trainer.h | 23 ++-
include/trainer/worker.h | 8 +-
src/main.cc | 2 -
src/neuralnet/base_layer.cc | 10 +-
src/neuralnet/layer.cc | 6 +-
src/neuralnet/neuralnet.cc | 1 -
src/proto/common.proto | 7 -
src/proto/job.proto | 181 +++++++++++-----------
src/trainer/trainer.cc | 53 ++++---
src/trainer/worker.cc | 75 +++++----
src/utils/param.cc | 24 +--
16 files changed, 380 insertions(+), 410 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index f7829b8..89afca9 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -1,30 +1,23 @@
-cluster {
- nworker_groups: 1
- nserver_groups: 1
- workspace: "examples/cifar10"
+name: "cifar10-convnet"
+train_steps: 1000
+test_steps: 100
+test_freq:300
+disp_freq:30
+alg: kBP
+updater{
+ weight_decay:0.004
+ lr_change: kFixedStep
+ type: kSGD
+ fixedstep_conf:{
+ step:0
+ step:60000
+ step:65000
+ step_lr:0.001
+ step_lr:0.0001
+ step_lr:0.00001
+ }
}
-
-model {
- name: "cifar10-convnet"
- train_steps: 1000
- test_steps: 100
- test_frequency:300
- display_frequency:30
- alg: kBackPropagation
- updater{
- weight_decay:0.004
- lr_change: kFixedStep
- type: kSGD
- fixedstep_conf:{
- step:0
- step:60000
- step:65000
- step_lr:0.001
- step_lr:0.0001
- step_lr:0.00001
- }
- }
- neuralnet {
+neuralnet {
layer{
name: "data"
type: kShardData
@@ -226,4 +219,8 @@ model {
srclayers: "label"
}
}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ workspace: "examples/cifar10"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/examples/mnist/conv.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf
index d463cd9..fce1418 100644
--- a/examples/mnist/conv.conf
+++ b/examples/mnist/conv.conf
@@ -1,186 +1,177 @@
-cluster {
- nworker_groups: 1
- nserver_groups: 1
- nservers_per_group: 1
- nworkers_per_group: 1
- nservers_per_procs: 1
- nworkers_per_procs: 1
- workspace: "examples/mnist"
-}
-model {
-name: "mnist-conv"
+name: "conv"
train_steps: 10000
test_steps:100
-test_frequency:500
-display_frequency:50
-debug: false
+test_freq:500
+disp_freq:50
+alg: kBP
+debug: true
updater{
- base_learning_rate:0.01
+ base_lr:0.01
momentum:0.9
weight_decay:0.0005
- gamma:0.0001
- pow:0.75
- learning_rate_change_method:kInverse
+ lr_change: kInverse
+ type: kSGD
+ inverse_conf {
+ gamma:0.0001
+ pow:0.75
+ }
}
neuralnet {
-layer {
- name: "data"
- type: "kLMDBData"
- data_param {
- path: "/home/wangwei/program/singa/examples/mnist/mnist_train_lmdb"
- batchsize: 64
+ layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/mnist/mnist_train_shard"
+ batchsize: 64
+ }
+ exclude: kTest
}
- exclude: kTest
-}
-layer {
- name: "data"
- type: "kLMDBData"
- data_param {
- path: "/home/wangwei/program/singa/examples/mnist/mnist_test_lmdb"
- batchsize: 100
+ layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/mnist/mnist_test_shard"
+ batchsize: 100
+ }
+ exclude: kTrain
}
- exclude: kTrain
-}
-layer{
- name:"mnist"
- type: "kMnistImage"
- srclayers: "data"
- mnist_param {
-# sigma: 6
-# alpha: 38
-# gamma: 15
-# kernel: 21
-# elastic_freq:100
-# beta:15
-# resize: 29
- norm_a:255
+ layer{
+ name:"mnist"
+ type: kMnist
+ srclayers: "data"
+ mnist_conf {
+ norm_a:255
+ norm_b:0
+ }
}
-}
-
-layer{
- name: "label"
- type: "kLabel"
- srclayers: "data"
-}
-layer {
- name: "conv1"
- type: "kConvolution"
- srclayers: "mnist"
- convolution_param {
- num_filters: 20
- kernel: 5
- stride: 1
+ layer{
+ name: "label"
+ type: kLabel
+ srclayers: "data"
}
- param{
- name: "weight"
- init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
- }
- param{
- name: "bias"
- init_method: kConstant
- learning_rate_multiplier:2.0
- value:0
+ layer {
+ name: "conv1"
+ type: kConvolution
+ srclayers: "mnist"
+ convolution_conf {
+ num_filters: 20
+ kernel: 5
+ stride: 1
}
-}
-layer {
- name: "pool1"
- type: "kPooling"
- srclayers: "conv1"
- pooling_param {
- pool: MAX
- kernel: 2
- stride: 2
- }
-}
-layer {
- name: "conv2"
- type: "kConvolution"
- srclayers: "pool1"
- convolution_param {
- num_filters: 50
- kernel: 5
- stride: 1
+ param{
+ name: "w1"
+ init_method:kUniformSqrtFanIn
+ learning_rate_multiplier:1.0
+ }
+ param{
+ name: "b1"
+ init_method: kConstant
+ learning_rate_multiplier:2.0
+ value:0
+ }
}
- param{
- name: "weight"
- init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
+ layer {
+ name: "pool1"
+ type: kPooling
+ srclayers: "conv1"
+ pooling_conf {
+ pool: MAX
+ kernel: 2
+ stride: 2
}
- param{
- name: "bias"
- init_method: kConstant
- learning_rate_multiplier:2.0
- value:0
- }
-}
-layer {
- name: "pool2"
- type: "kPooling"
- srclayers: "conv2"
- pooling_param {
- pool: MAX
- kernel: 2
- stride: 2
}
-}
-layer {
- name: "ip1"
- type: "kInnerProduct"
- srclayers:"pool2"
- inner_product_param {
- num_output: 500
+ layer {
+ name: "conv2"
+ type: kConvolution
+ srclayers: "pool1"
+ convolution_conf {
+ num_filters: 50
+ kernel: 5
+ stride: 1
+ }
+ param{
+ name: "w2"
+ init_method:kUniformSqrtFanIn
+ learning_rate_multiplier:1.0
+ }
+ param{
+ name: "b2"
+ init_method: kConstant
+ learning_rate_multiplier:2.0
+ value:0
+ }
}
- param{
- name: "weight"
- init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
+ layer {
+ name: "pool2"
+ type: kPooling
+ srclayers: "conv2"
+ pooling_conf {
+ pool: MAX
+ kernel: 2
+ stride: 2
}
- param{
- name: "bias"
- init_method: kConstant
- learning_rate_multiplier:2.0
- value:0
}
+ layer {
+ name: "ip1"
+ type: kInnerProduct
+ srclayers:"pool2"
+ innerproduct_conf {
+ num_output: 500
+ }
+ param{
+ name: "w3"
+ init_method:kUniformSqrtFanIn
+ learning_rate_multiplier:1.0
+ }
+ param{
+ name: "b3"
+ init_method: kConstant
+ learning_rate_multiplier:2.0
+ value:0
+ }
-}
-
-layer {
- name: "relu1"
- type: "kReLU"
- srclayers:"ip1"
-}
+ }
-layer {
- name: "ip2"
- type: "kInnerProduct"
- srclayers:"relu1"
- inner_product_param {
- num_output: 10
+ layer {
+ name: "relu1"
+ type: kReLU
+ srclayers:"ip1"
}
- param{
- name: "weight"
+
+ layer {
+ name: "ip2"
+ type: kInnerProduct
+ srclayers:"relu1"
+ innerproduct_conf {
+ num_output: 10
+ }
+ param {
+ name: "w4"
init_method:kUniformSqrtFanIn
learning_rate_multiplier:1
}
- param{
- name: "bias"
+ param {
+ name: "b4"
init_method: kConstant
learning_rate_multiplier:2
value:0
}
-}
-layer{
- name: "loss"
- type:"kSoftmaxLoss"
- softmaxloss_param{
- topk:1
}
- srclayers:"ip2"
- srclayers:"label"
-}
+ layer{
+ name: "loss"
+ type: kSoftmaxLoss
+ softmaxloss_conf{
+ topk:1
+ }
+ srclayers:"ip2"
+ srclayers:"label"
+ }
}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ workspace: "examples/mnist"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/examples/mnist/job.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/job.conf b/examples/mnist/job.conf
index 5d1445d..34fbca2 100644
--- a/examples/mnist/job.conf
+++ b/examples/mnist/job.conf
@@ -1,26 +1,20 @@
-cluster {
- nworker_groups: 1
- nserver_groups: 1
- workspace: "examples/mnist"
-}
-model {
- name: "deep-big-simple-mlp"
- train_steps: 1000
- test_steps:10
- test_frequency:60
- display_frequency:30
- alg: kBackPropagation
- updater{
- base_lr: 0.001
- lr_change: kStep
- type: kSGD
- step_conf{
- change_freq: 60
- gamma: 0.997
- }
+name: "mlp"
+train_steps: 1000
+test_steps:10
+test_freq:60
+disp_freq:10
+alg: kBP
+updater{
+ base_lr: 0.001
+ lr_change: kStep
+ type: kSGD
+ step_conf{
+ change_freq: 60
+ gamma: 0.997
}
+}
- neuralnet {
+neuralnet {
layer {
name: "data"
type: kShardData
@@ -46,13 +40,6 @@ model {
type: kMnist
srclayers: "data"
mnist_conf {
-# sigma: 6
-# alpha: 38
-# gamma: 15
-# kernel: 21
-# elastic_freq:100
-# beta:15
-# resize: 29
norm_a: 127.5
norm_b: 1
}
@@ -228,4 +215,8 @@ model {
srclayers:"label"
}
}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ workspace: "examples/mnist"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/include/neuralnet/base_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/base_layer.h b/include/neuralnet/base_layer.h
index ca63da0..25df95f 100644
--- a/include/neuralnet/base_layer.h
+++ b/include/neuralnet/base_layer.h
@@ -133,10 +133,10 @@ class Layer {
* blob in parser layers; The default value is "unknown"; If the
* src layer is the prefetch layer and there are more than one parser layers,
* this value be set.
- */
const std::string &datablob() const {
return layer_proto_.datablob();
}
+ */
/**
* @return a const ref for Blob storing neuron values of this layer for BP
*/
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/include/singa.h
----------------------------------------------------------------------
diff --git a/include/singa.h b/include/singa.h
index c8984e5..82df64b 100644
--- a/include/singa.h
+++ b/include/singa.h
@@ -24,13 +24,16 @@ void SubmitJob(int job, bool resume, const JobProto& jobConf) {
ReadProtoFromTextFile(FLAGS_singa_conf.c_str(), &singaConf);
if (singaConf.has_log_dir())
SetupLog(singaConf.log_dir(),
- std::to_string(job) + "-" + jobConf.model().name());
+ std::to_string(job) + "-" + jobConf.name());
if (jobConf.num_openblas_threads() != 1)
LOG(WARNING) << "openblas is set with " << jobConf.num_openblas_threads()
<< " threads";
openblas_set_num_threads(jobConf.num_openblas_threads());
+ JobProto proto;
+ proto.CopyFrom(jobConf);
+ proto.set_id(job);
Trainer trainer;
- trainer.Start(job, resume, jobConf, singaConf);
+ trainer.Start(resume, singaConf, &proto);
}
} // namespace singa
#endif // SINGA_SINGA_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/include/trainer/trainer.h
----------------------------------------------------------------------
diff --git a/include/trainer/trainer.h b/include/trainer/trainer.h
index 911a4c4..1d28de6 100644
--- a/include/trainer/trainer.h
+++ b/include/trainer/trainer.h
@@ -27,14 +27,11 @@ class Trainer{
* Entrance function which construct the workers and servers, and luanch
* one thread per worker/server.
*
- * @param job job ID
* @param resume if true resume the training from the latest checkpoint files
- * @param jobConf job configuration, including cluster and model configuration
* @param singaConf global singa configuration including zookeeper and
- * log dir setting.
+ * @param jobConf job configuration, including cluster and model configuration
*/
- void Start(int job, bool resume,
- const JobProto& jobConf, const SingaProto& singaConf);
+ void Start(bool resume, const SingaProto& singaConf, JobProto* jobConf);
protected:
/**
@@ -44,27 +41,27 @@ class Trainer{
* checkpoint, which will be added into the checkpoint field. The workers
* would then load the values of params from the checkpoint files.
*
- * @param modelConf model configuration
+ * @param jobConf job configuration
*/
- void Resume(ModelProto* modelConf);
+ void Resume(JobProto* jobConf);
/**
* Create server instances.
* @param nthread total num of threads in current procs which is used to
* assign each thread a local thread ID. The number of workers is extracted
* from Cluster
- * @param modelConf
+ * @param jobConf
* @return server instances
*/
- vector<Server*> CreateServers(int nthread, const ModelProto& modelConf);
+ vector<Server*> CreateServers(int nthread, const JobProto& jobConf);
/**
* Create workers instances.
* @param nthread total num of threads in current procs which is used to
* assign each thread a local thread ID. The number of workers is extracted
* from Cluster
- * @param modelConf
+ * @param jobConf
* @return worker instances
*/
- vector<Worker*> CreateWorkers(int nthread, const ModelProto& modelConf);
+ vector<Worker*> CreateWorkers(int nthread, const JobProto& jobConf);
/**
* Setup workers and servers.
@@ -77,7 +74,7 @@ class Trainer{
* @param servers
*/
void SetupWorkerServer(
- const ModelProto& modelConf,
+ const JobProto& jobConf,
const vector<Worker*>& workers,
const vector<Server*>& servers);
@@ -91,7 +88,7 @@ class Trainer{
* For other base classes, use its base class name (string) as the key and the
* implementation class as the value, e.g., <"Updater" SGDUpdater>.
*/
- void RegisterDefaultClasses(const singa::ModelProto& proto);
+ void RegisterDefaultClasses();
/**
* Generate msg to trigger synchronization with other server groups.
*
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/include/trainer/worker.h
----------------------------------------------------------------------
diff --git a/include/trainer/worker.h b/include/trainer/worker.h
index 025bcc1..0557ee2 100644
--- a/include/trainer/worker.h
+++ b/include/trainer/worker.h
@@ -31,7 +31,7 @@ class Worker {
/**
* Setup members
*/
- void Setup(const ModelProto& model, shared_ptr<NeuralNet> train_net,
+ void Setup(const JobProto& job, shared_ptr<NeuralNet> train_net,
shared_ptr<NeuralNet> valid_net, shared_ptr<NeuralNet> test_net);
/**
* Main function of Worker.
@@ -49,7 +49,7 @@ class Worker {
* If the training starts from scrath, the params are initialzed using random
* distributions, e.g., Gaussian distribution. After that, the worker may
* train for a couple of steps to warmup the params before put
- * them to servers (warmup of ModelProto controls this).
+ * them to servers (warmup of JobProto controls this).
*
* If the owner param is availabel from checkpoint file, then its
* values are parsed from the checkpoint file instead of randomly initialized.
@@ -62,7 +62,7 @@ class Worker {
* The serialization is done using BlobProtos which includes the name, version
* and values of each Param.
* Different worker would generate different checkpoint files. The file path
- * is <workspace>/checkpoint-<modelname>-step<step>-worker<worker_id>.bin
+ * is <workspace>/checkpoint-<jobname>-step<step>-worker<worker_id>.bin
* @param step training step of this worker
* @param net the training net whose params will be dumped.
*/
@@ -173,7 +173,7 @@ class Worker {
protected:
int thread_id_, grp_id_, id_;
int step_;
- ModelProto modelproto_;
+ JobProto job_conf_;
shared_ptr<NeuralNet> train_net_, test_net_, validation_net_;
Dealer* layer_dealer_, *dealer_;
Updater* updater_;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/main.cc
----------------------------------------------------------------------
diff --git a/src/main.cc b/src/main.cc
index d95e405..00b75ff 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -33,8 +33,6 @@ int main(int argc, char **argv) {
singa::JobProto jobConf;
std::string job_file = FLAGS_conf;
singa::ReadProtoFromTextFile(job_file.c_str(), &jobConf);
- CHECK(jobConf.has_cluster());
- CHECK(jobConf.has_model());
RegisterClasses();
singa::SubmitJob(FLAGS_job, FLAGS_resume, jobConf);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/neuralnet/base_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/base_layer.cc b/src/neuralnet/base_layer.cc
index 57163e9..695104e 100644
--- a/src/neuralnet/base_layer.cc
+++ b/src/neuralnet/base_layer.cc
@@ -125,17 +125,19 @@ void PrefetchLayer::Setup(const LayerProto& proto, int npartitions) {
}
const Blob<float>& PrefetchLayer::data(const Layer* from, Phase phase) const {
- if(from!=nullptr){
- return datablobs_.at(from->datablob());
- }else{
+ LOG(FATAL) << " needs update";
+ if(from != nullptr) {
+ return datablobs_.at("");
+ } else {
//CHECK_EQ(datablobs_.size(),1);
return datablobs_.begin()->second;
}
}
Blob<float>* PrefetchLayer::mutable_data(const Layer* from, Phase phase) {
+ LOG(FATAL) << " needs update";
if(from!=nullptr){
- return &(datablobs_.at(from->datablob()));
+ return &(datablobs_.at(""));
}else{
//CHECK_EQ(datablobs_.size(),1);
return &(datablobs_.begin()->second);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index 314bb14..c1fce00 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -439,7 +439,7 @@ void MnistLayer::ParseRecords(Phase phase,
LOG_IF(ERROR, records.size()==0)<<"Empty records to parse";
int ndim=records.at(0).image().shape_size();
int inputsize =records.at(0).image().shape(ndim-1);
- CHECK_EQ(inputsize, blob->shape()[1]);
+ CHECK_EQ(inputsize, blob->shape()[2]);
float* dptr=blob->mutable_cpu_data();
for(const Record& record: records){
@@ -485,11 +485,11 @@ void MnistLayer::Setup(const LayerProto& proto, int npartitions) {
int ndim=sample.image().shape_size();
CHECK_GE(ndim,2);
if(resize_)
- data_.Reshape(vector<int>{batchsize, resize_, resize_});
+ data_.Reshape(vector<int>{batchsize, 1, resize_, resize_});
else{
int s=sample.image().shape(ndim-1);
CHECK_EQ(s,sample.image().shape(ndim-2));
- data_.Reshape(vector<int>{batchsize, s, s });
+ data_.Reshape(vector<int>{batchsize, 1, s, s });
}
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 83f8c36..4732a36 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -88,7 +88,6 @@ shared_ptr<NeuralNet> NeuralNet::Create(
param->set_share_from(from);
}
- for (auto layer : net_conf.layer())
LOG(INFO) << "NeuralNet config is\n" << conf.DebugString();
// TODO(wangwei) create net based on net type, e.g., directed, undirected, etc
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/proto/common.proto
----------------------------------------------------------------------
diff --git a/src/proto/common.proto b/src/proto/common.proto
index d8be479..3b6efb3 100644
--- a/src/proto/common.proto
+++ b/src/proto/common.proto
@@ -24,13 +24,6 @@ enum EntityType {
kRuntime = 4;
};
-enum ShareOption {
- kValueOnly = 0;
- kWhole = 1;
-};
-
-
-
enum ConnectionType {
kOneToOne = 0;
kOneToAll = 1;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 200197f..a67d330 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -1,9 +1,73 @@
package singa;
+enum TrainOneBatchAlg {
+ // Back-propagation algorithm for feed-forward models, e.g., CNN, and RNN
+ kBP = 1;
+ // Contrastive Divergence algorithm for RBM, DBM etc.
+ kCD = 2;
+}
message JobProto {
- required ClusterProto cluster = 1;
- required ModelProto model = 2;
- optional int32 num_openblas_threads = 3 [default = 1];
+ // job name, e.g., "cifar10-dcnn", "mnist-mlp"
+ required string name = 1;
+ // neural net consits of a set of connected layers
+ required NetProto neuralnet = 3;
+ // algorithms calculating gradients for one mini-batch/iteration
+ required TrainOneBatchAlg alg = 5;
+ // configuration of SGD updater, including learning rate, etc.
+ required UpdaterProto updater = 7;
+ // cluster toplogy conf
+ required ClusterProto cluster = 9;
+
+ // for setting CD fields
+ optional CDProto cd_conf = 12;
+
+ // total num of steps for training
+ required int32 train_steps = 16;
+ // frequency of displaying training info
+ optional int32 disp_freq = 17 [default = 0];
+
+ // frequency of test, e.g., do test every 100 training steps
+ optional int32 test_freq = 20 [default = 0];
+ // total num of steps for testing all test data; todo set -1 for test forever
+ optional int32 test_steps = 21 [default = 0];
+ // frequency of validation, e.g., do validation every 100 training steps
+ optional int32 valid_freq = 25 [default = 0];
+ // total num of steps for validating all validation data
+ optional int32 valid_steps = 26 [default = 0];
+ // frequency of checkpoint
+ optional int32 checkpoint_freq = 30 [default = 0];
+
+ // for loading checkpoint files to init parameters
+ repeated string checkpoint_path = 60;
+ // send parameters to servers after training for this num of steps
+ optional int32 warmup_steps = 61 [default = 0];
+ // display debug info
+ optional bool debug = 62 [default = false];
+ // reset the version of params loaded from checkpoint file to step
+ optional bool reset_param_version = 63 [default = true];
+ // set num of threads used by openblas
+ optional int32 num_openblas_threads = 64 [default = 1];
+
+ // start checkpoint after this num steps
+ optional int32 checkpoint_after = 80 [default = 0];
+ // start display after this num steps
+ optional int32 disp_after = 81[default = 0];
+ // start test after this num steps
+ optional int32 test_after = 82 [default = 0];
+ // start validation after this num steps
+ optional int32 valid_after = 83 [default = 0];
+
+ // used by SINGA; uses typically do not touch these fields
+ optional bool resume = 90 [default = false];
+ // last snapshot step
+ optional int32 step = 91 [default = 0];
+ // job id allocated by zookeeper
+ optional int32 id = 92 [default = -1];
+}
+
+message CDProto {
+ //number of steps for gibbs sampling
+ optional int32 pcd_k = 1 [default = 1];
}
message ClusterProto {
@@ -13,24 +77,23 @@ message ClusterProto {
optional int32 nservers_per_group = 4 [default = 1];
optional int32 nworkers_per_procs = 5 [default = 1];
optional int32 nservers_per_procs = 6 [default = 1];
+ // local workspace for checkpoint files and vis files
+ required string workspace = 10;
// servers and workers in different processes?
- optional bool server_worker_separate = 11 [default = false];
+ optional bool server_worker_separate = 20 [default = false];
// port number is used by ZeroMQ
- optional int32 start_port = 13 [default = 6723];
- // local workspace, train/val/test shards, checkpoint files
- required string workspace = 14;
-
- // conduct updates at server side; otherwise do it at worker side
- optional bool server_update = 40 [default = true];
+ optional int32 start_port = 60 [default = 6723];
+ // conduct updates at server side; otherwise do it at worker side
+ optional bool server_update = 61 [default = true];
// share memory space between worker groups in one procs
- optional bool share_memory = 41 [default = true];
+ optional bool share_memory = 62 [default = true];
// bandwidth of ethernet, Bytes per second, default is 1 Gbps
- optional int32 bandwidth=50 [default=134217728];
+ optional int32 bandwidth=80 [default=134217728];
// poll time in milliseconds
- optional int32 poll_time=51 [default =100];
+ optional int32 poll_time=81 [default =100];
}
@@ -47,67 +110,14 @@ enum Phase {
kLoss = 7;
}
-message ModelProto {
- // model name, e.g., "cifar10-dcnn", "mnist-mlp"
- required string name = 1;
- // frequency of displaying training info
- required int32 display_frequency = 3 ;
- // total num of steps for training
- required int32 train_steps = 5;
- // configuration of SGD updater, including learning rate, etc.
- required UpdaterProto updater = 7;
- enum GradCalcAlg {
- // BP algorithm for feed-forward models, e.g., CNN, MLP, RNN
- kBackPropagation = 1;
- // CD algorithm for RBM, DBM etc., models
- kContrastiveDivergence = 2;
- }
- // gradient calculation algorithm
- required GradCalcAlg alg = 8 [default = kBackPropagation];
- required NetProto neuralnet = 9;
-
- // total num of steps for validation
- optional int32 validation_steps = 30 [default = 0];
- // total num of steps for test
- optional int32 test_steps = 31 [default = 0];
- // frequency of validation
- optional int32 validation_frequency = 32;
- // frequency of test
- optional int32 test_frequency = 33 [default = 0];
- // frequency of checkpoint
- optional int32 checkpoint_frequency = 34 [default = 0];
- // send parameters to servers after training for this num of steps
- optional int32 warmup_steps = 35 [default = 0];
- // checkpoint path
- optional bool resume = 36 [default = false];
-
- // start display after this num steps
- optional int32 display_after = 60[default = 0];
- // start checkpoint after this num steps
- optional int32 checkpoint_after = 61 [default = 0];
- // start test after this num steps
- optional int32 test_after = 62 [default = 0];
-// start validation after this num steps
- optional int32 validation_after = 63 [default = 0];
- // last snapshot step
- optional int32 step = 64 [default = 0];
- // display debug info
- optional bool debug = 65 [default = false];
- // checkpoint files
- repeated string checkpoint = 66;
- // reset the version of params loaded from checkpoint file to step
- optional bool reset_param_version = 67 [default = true];
- //number of steps for gibbs sampling
- optional int32 pcd_k=69 [default=15];
-}
-
message NetProto {
repeated LayerProto layer = 1;
// partitioning type for parallelism
- optional int32 partition_dim = 2 [default = 0];
+ optional int32 partition_dim = 20 [default = 0];
}
-// weight matrix should be defined before bias vector
+// weight matrix should be defined before bias vector;
+// todo separate conf for diff init method
message ParamProto {
enum InitMethod {
// fix the values of all parameters a constant in the value field
@@ -131,7 +141,9 @@ message ParamProto {
// <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
kUniformSqrtFanInOut = 6;
}
- optional InitMethod init_method = 1 [default = kGaussian];
+ // used for identifying the same params from diff models and display deug info
+ optional string name = 1 [default = ""];
+ optional InitMethod init_method = 2 [default = kGaussian];
// constant init
optional float value = 5 [default = 1];
// for uniform sampling
@@ -144,20 +156,18 @@ message ParamProto {
optional float learning_rate_multiplier = 15 [default = 1];
// multiplied on the global weight decay.
optional float weight_decay_multiplier = 16 [default = 1];
- // partition dimension, -1 for no partition
- optional int32 partition_dim = 30;
- // usually, the program will infer the param shape
- repeated int32 shape = 31;
- // used for identifying the same params from diff models and display deug info
- optional string name = 61 [default = ""];
- // name of the owner param from which this param shares the values
- optional string share_from = 62;
+
+ // name of the owner param from which this param shares the values
+ optional string share_from = 60;
+
// used interally
- optional int32 id = 63;
- // parameter slice limit (Google Protobuf also has size limit)
- optional int32 split_threshold = 64 [default = 5000000];
+ optional int32 id = 90;
// used internally
- optional int32 owner = 65 [default = -1];
+ optional int32 owner = 91 [default = -1];
+ // partition dimension, -1 for no partition
+ optional int32 partition_dim = 92;
+ // usually, the program will infer the param shape
+ repeated int32 shape = 93;
}
enum PartitionType{
@@ -241,12 +251,9 @@ message LayerProto {
// overrides the partition dimension for neural net
- optional int32 partition_dim =59 [default = -1];
- optional string datablob = 58 [default = "unknow"];
-
+ optional int32 partition_dim =60 [default = -1];
// names of parameters shared from other layers
- repeated string share_param = 60;
- optional int32 partition_id = 62 [default = 0];
+ optional int32 partition_id = 90 [default = 0];
}
message RGBImageProto {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index f348ff6..699fc30 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -28,7 +28,7 @@ Trainer::~Trainer() {
delete router_;
}
-void Trainer::RegisterDefaultClasses(const singa::ModelProto& model_conf) {
+void Trainer::RegisterDefaultClasses() {
// register all implemented layers
singa::NeuralNet::RegisterLayers();
auto param_factory = Singleton<Factory<singa::Param>>::Instance();
@@ -77,12 +77,12 @@ const vector<int> SliceParams(const vector<Param*>& params) {
}
void Trainer::SetupWorkerServer(
- const ModelProto& model_conf,
+ const JobProto& job_conf,
const vector<Worker*>& workers,
const vector<Server*>& servers) {
auto cluster = Cluster::Get();
int grp_size = cluster->nworkers_per_group();
- const auto& net_conf = model_conf.neuralnet();
+ const auto& net_conf = job_conf.neuralnet();
auto net = NeuralNet::Create(net_conf, kTrain, grp_size);
// MUST do SliceParam before share param/net with others
auto slices = SliceParams(net->params());
@@ -96,12 +96,12 @@ void Trainer::SetupWorkerServer(
if (grp_net.find(grp_id) == grp_net.end()) {
if (grp_id == first_grp) {
// test are performed only by the first group now. TODO update.
- if (first_grp == 0 && model_conf.test_steps() && worker_id == 0) {
+ if (first_grp == 0 && job_conf.test_steps() && worker_id == 0) {
test_net = NeuralNet::Create(net_conf, kTest, 1); // hard code for exp
test_net->ShareParamsFrom(net);
}
// validation are performed only by the first group. TODO update.
- if (first_grp == 0 && model_conf.validation_steps() && worker_id == 0) {
+ if (first_grp == 0 && job_conf.valid_steps() && worker_id == 0) {
valid_net = NeuralNet::Create(net_conf, kValidation, 1);
valid_net->ShareParamsFrom(net);
}
@@ -124,18 +124,18 @@ void Trainer::SetupWorkerServer(
}
LOG(INFO) << "grp " << worker->grp_id() << ", worker "
<< worker->id() << " net " << grp_net[grp_id].get();
- worker->Setup(model_conf, grp_net[grp_id], valid_net, test_net);
+ worker->Setup(job_conf, grp_net[grp_id], valid_net, test_net);
}
// partition among server groups, each group maintains one sub-set for sync
auto slice2group = PartitionSlices(cluster->nserver_groups(), slices);
for (auto server : servers)
- server->Setup(model_conf.updater(), &server_shard_, slice2group);
+ server->Setup(job_conf.updater(), &server_shard_, slice2group);
// partition within one server group, each server updates for one sub-set
slice2server_ = PartitionSlices(cluster->nservers_per_group(), slices);
}
-vector<Server*> Trainer::CreateServers(int nthreads, const ModelProto& mconf) {
+vector<Server*> Trainer::CreateServers(int nthreads, const JobProto& job) {
auto cluster = Cluster::Get();
vector<Server*> servers;
if (!cluster->has_server())
@@ -157,7 +157,7 @@ vector<Server*> Trainer::CreateServers(int nthreads, const ModelProto& mconf) {
return servers;
}
-vector<Worker*> Trainer::CreateWorkers(int nthreads, const ModelProto& mconf){
+vector<Worker*> Trainer::CreateWorkers(int nthreads, const JobProto& job) {
auto cluster=Cluster::Get();
vector<Worker*> workers;
if(!cluster->has_worker())
@@ -184,18 +184,19 @@ vector<Worker*> Trainer::CreateWorkers(int nthreads, const ModelProto& mconf){
for (int gid = gstart; gid < gend; gid++) {
for (int wid = wstart; wid < wend; wid++) {
Worker* worker=nullptr;
- if (mconf.alg() == ModelProto_GradCalcAlg_kBackPropagation)
+ if (job.alg() == TrainOneBatchAlg::kBP)
worker = new BPWorker(nthreads++,gid, wid);
- else {
+ else if (job.alg() == TrainOneBatchAlg::kCD)
worker=new CDWorker(nthreads++,gid, wid);
- }
+ else
+ LOG(FATAL) << "unknown alg for trainonebatch func " << job.alg();
workers.push_back(worker);
}
}
return workers;
}
-void Trainer::Resume(ModelProto* modelConf) {
+void Trainer::Resume(JobProto* jobConf) {
tinydir_dir dir;
string folder = Cluster::Get()->checkpoint_folder();
tinydir_open(&dir, folder.c_str());
@@ -226,24 +227,22 @@ void Trainer::Resume(ModelProto* modelConf) {
}
if (latest_step > 0) {
- modelConf->set_step(latest_step);
- if (!modelConf->has_reset_param_version())
- modelConf->set_reset_param_version(false);
- modelConf->clear_checkpoint();
+ jobConf->set_step(latest_step);
+ if (!jobConf->has_reset_param_version())
+ jobConf->set_reset_param_version(false);
+ jobConf->clear_checkpoint_path();
for (auto ck_file : ck_files)
- modelConf->add_checkpoint(folder + "/" + ck_file);
+ jobConf->add_checkpoint_path(folder + "/" + ck_file);
}
tinydir_close(&dir);
}
-void Trainer::Start(int job, bool resume,
- const JobProto& jobConf, const SingaProto& singaConf) {
+void Trainer::Start(bool resume, const SingaProto& singaConf, JobProto* job) {
// register job to zookeeper at the beginning
- auto cluster = Cluster::Get(job, singaConf, jobConf.cluster());
- ModelProto model = jobConf.model();
- RegisterDefaultClasses(model);
+ auto cluster = Cluster::Get(job->id(), singaConf, job->cluster());
+ RegisterDefaultClasses();
if (resume)
- Resume(&model);
+ Resume(job);
router_ = new Router();
router_->Bind(kInprocRouterEndpoint);
@@ -253,10 +252,10 @@ void Trainer::Start(int job, bool resume,
cluster->Register(getpid(), hostip + ":" + std::to_string(port));
int nthreads = 1;
- const vector<Worker*> workers = CreateWorkers(nthreads, model);
+ const vector<Worker*> workers = CreateWorkers(nthreads, *job);
nthreads += workers.size();
- const vector<Server*> servers = CreateServers(nthreads, model);
- SetupWorkerServer(model, workers, servers);
+ const vector<Server*> servers = CreateServers(nthreads, *job);
+ SetupWorkerServer(*job, workers, servers);
#ifdef USE_MPI
for (int i = 0; i < nthreads; i++)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index 36ba8de..b6f9d44 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -16,23 +16,16 @@ Worker::Worker(int thread_id, int grp_id, int id):
}
void Worker::Setup(
- const ModelProto& model, shared_ptr<NeuralNet> train_net,
+ const JobProto& job, shared_ptr<NeuralNet> train_net,
shared_ptr<NeuralNet> valid_net, shared_ptr<NeuralNet> test_net) {
- modelproto_.CopyFrom(model);
+ job_conf_.CopyFrom(job);
train_net_ = train_net;
validation_net_ = valid_net;
test_net_ = test_net;
auto cluster = Cluster::Get();
- // if no server or user requires worker to do param update
- if (!(cluster->nserver_groups() && cluster->server_update())) {
- updater_ = Singleton<Factory<Updater>>::Instance()->Create("Updater");
- updater_->Init(model.updater());
- }
}
Worker::~Worker() {
- if (updater_ != nullptr)
- delete updater_;
if (layer_dealer_)
delete layer_dealer_;
if (dealer_)
@@ -59,7 +52,7 @@ void Worker::InitLocalParams() {
// load from checkpoints. get param blob based on param name.
// the param from previous checkpoint files will be overwritten by
// the param with the same name in later checkpoint files.
- for (const auto checkpoint : modelproto_.checkpoint()) {
+ for (const auto checkpoint : job_conf_.checkpoint_path()) {
LOG(INFO) << "Load from checkpoint file " << checkpoint;
BlobProtos bps;
ReadProtoFromBinaryFile(checkpoint.c_str(), &bps);
@@ -67,8 +60,8 @@ void Worker::InitLocalParams() {
if (name2param.find(bps.name(i)) != name2param.end()) {
name2param.at(bps.name(i))->FromProto(bps.blob(i));
// if load from pre-training params, reset version to start step
- if(modelproto_.reset_param_version())
- name2param.at(bps.name(i))->set_version(modelproto_.step());
+ if(job_conf_.reset_param_version())
+ name2param.at(bps.name(i))->set_version(job_conf_.step());
else // if resume training, use the same version as last checkpoint
name2param.at(bps.name(i))->set_version(bps.version(i));
}
@@ -77,15 +70,15 @@ void Worker::InitLocalParams() {
// init other params who do not have checkpoint version
for (auto entry : name2param)
if (entry.second->version() < 0) {
- entry.second->InitValues(modelproto_.step());
- if (!modelproto_.reset_param_version())
+ entry.second->InitValues(job_conf_.step());
+ if (!job_conf_.reset_param_version())
LOG(ERROR) << "better reset version of params from checkpoints "
<< "to the same as other newly initialized params!";
}
Metric perf;
// warmup training before put params to servers
- for (; step_ < modelproto_.warmup_steps(); step_++)
+ for (; step_ < job_conf_.warmup_steps(); step_++)
TrainOneBatch(step_, &perf);
for (auto layer : train_net_->layers()) {
if (layer->partition_id() == id_)
@@ -99,7 +92,7 @@ void Worker::InitLocalParams() {
for (auto layer : train_net_->layers()) {
if (layer->partition_id() == id_)
for (auto param : layer->GetParams())
- Get(param, modelproto_.warmup_steps());
+ Get(param, job_conf_.warmup_steps());
}
}
@@ -153,25 +146,25 @@ void Worker::Run() {
}
}
- step_ = modelproto_.step();
+ step_ = job_conf_.step();
InitLocalParams();
Metric perf;
while (!StopNow(step_)) {
if (ValidateNow(step_)) {
//LOG(ERROR)<<"Validation at step "<<step;
CollectAll(validation_net_, step_);
- Test(modelproto_.validation_steps(), kValidation, validation_net_);
+ Test(job_conf_.valid_steps(), kValidation, validation_net_);
}
if (TestNow(step_)) {
//LOG(ERROR)<<"Test at step "<<step;
CollectAll(test_net_, step_);
- Test(modelproto_.test_steps(), kTest, test_net_);
+ Test(job_conf_.test_steps(), kTest, test_net_);
}
if (CheckpointNow(step_)) {
CollectAll(train_net_, step_);
Checkpoint(step_, train_net_);
- modelproto_.set_step(step_);
+ job_conf_.set_step(step_);
}
TrainOneBatch(step_, &perf);
// LOG(ERROR) << "Train " << step_;
@@ -296,40 +289,40 @@ void Worker::Test(int nsteps, Phase phase, shared_ptr<NeuralNet> net) {
}
bool Worker::DisplayNow(int step) const {
- return (modelproto_.display_frequency() > 0
- && step >= modelproto_.display_after()
- && ((step - modelproto_.display_after())
- % modelproto_.display_frequency() == 0));
+ return (job_conf_.disp_freq() > 0
+ && step >= job_conf_.disp_after()
+ && ((step - job_conf_.disp_after())
+ % job_conf_.disp_freq() == 0));
}
bool Worker::DisplayDebugInfo(int step) const {
- return DisplayNow(step) && modelproto_.debug() && grp_id_ == 0;
+ return DisplayNow(step) && job_conf_.debug() && grp_id_ == 0;
}
bool Worker::StopNow(int step) const {
- return step >= modelproto_.train_steps();
+ return step >= job_conf_.train_steps();
}
bool Worker::CheckpointNow(int step) const {
return (grp_id_ == 0
- && modelproto_.checkpoint_frequency() > 0
- && step >= modelproto_.checkpoint_after()
- && ((step - modelproto_.checkpoint_after())
- % modelproto_.checkpoint_frequency() == 0));
+ && job_conf_.checkpoint_freq() > 0
+ && step >= job_conf_.checkpoint_after()
+ && ((step - job_conf_.checkpoint_after())
+ % job_conf_.checkpoint_freq() == 0));
}
bool Worker::TestNow(const int step) const {
return (grp_id_ == 0
- && modelproto_.test_frequency() > 0
- && modelproto_.test_steps() > 0
- && step >= modelproto_.test_after()
- && ((step - modelproto_.test_after())
- % modelproto_.test_frequency() == 0));
+ && job_conf_.test_freq() > 0
+ && job_conf_.test_steps() > 0
+ && step >= job_conf_.test_after()
+ && ((step - job_conf_.test_after())
+ % job_conf_.test_freq() == 0));
}
bool Worker::ValidateNow(const int step) const {
return (grp_id_ == 0
- && modelproto_.validation_frequency() > 0
- && modelproto_.validation_steps() > 0
- && step >= modelproto_.validation_after()
- && ((step - modelproto_.validation_after())
- % modelproto_.validation_frequency() == 0));
+ && job_conf_.valid_freq() > 0
+ && job_conf_.valid_steps() > 0
+ && step >= job_conf_.valid_after()
+ && ((step - job_conf_.valid_after())
+ % job_conf_.valid_freq() == 0));
}
@@ -406,7 +399,7 @@ void CDWorker::NegativePhase(int step,
shared_ptr<NeuralNet> net, Metric* perf) {
// for negative phase, gibbs sampling only concerns RBM bottom and top layer
auto& layers = net->layers();
- for (int i = 0; i < modelproto_.pcd_k(); i++) {
+ for (int i = 0; i < job_conf_.cd_conf().pcd_k(); i++) {
for (auto& layer : layers) {
if (layer->is_vislayer() || layer->is_hidlayer())
layer->ComputeFeature(kNegative, perf);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 8c0b440..2f43f66 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -44,35 +44,35 @@ void Param::InitValues(int version){
auto random=TSingleton<Random<cpu>>::Instance();
switch (proto_.init_method()) {
case ParamProto::kConstant:
- data=proto_.value();
+ data = proto_.value();
break;
case ParamProto::kUniform:
random->SampleUniform(data, proto_.low(), proto_.high());
- if(proto_.value())
- data*= proto_.value();
+ if(proto_.value() != 1)
+ data *= proto_.value();
break;
- /*
case ParamProto::kUniformSqrtFanIn:
- CHECK_GT(fan_in_,0);
random->SampleUniform(data, proto_.low(), proto_.high());
- if(proto_.value())
- data*= proto_.value()/ sqrt(fan_in_ / 3.0f);
+ // only valid for param matrix with dim 1 for fan in
+ LOG(ERROR) << "init fan in";
+ CHECK_EQ(data_->shape().size(), 2);
+ data *= proto_.value() / sqrt(data_->shape().at(1) / 3.0f);
+ LOG(ERROR) << "end fan in";
break;
- */
case ParamProto::kUniformSqrtFanInOut:
random->SampleUniform(data, proto_.low(), proto_.high());
if(proto_.value())
- data*= proto_.value()/ sqrt(data_->shape()[0] +data_->shape()[1]);
+ data *= proto_.value()/ sqrt(data_->shape()[0] +data_->shape()[1]);
break;
case ParamProto::kGaussian:
random->SampleGaussian(data, proto_.mean(), proto_.std());
- if(proto_.value())
- data*= proto_.value();
+ if(proto_.value() != 1)
+ data *= proto_.value();
break;
case ParamProto::kGaussainSqrtFanIn:
random->SampleGaussian(data, proto_.mean(), proto_.std());
if(proto_.value())
- data*= proto_.value()/ sqrt(data_->shape()[0]);
+ data *= proto_.value()/ sqrt(data_->shape()[0]);
break;
default:
LOG(ERROR) << "Illegal parameter init method ";
[2/2] incubator-singa git commit: SINGA-54 Refactor job configuration to move fields in ModelProto out
Posted by wa...@apache.org.
SINGA-54 Refactor job configuration to move fields in ModelProto out
Format job.proto and move important enum types to global scope.
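In other words (a rough before/after sketch, not part of the diff itself): enums such as LayerType and UpdaterType that used to be nested inside LayerProto and UpdaterProto now sit at the top level of job.proto, so C++ code can refer to LayerType::kSlice instead of LayerProto_LayerType_kSlice, as the neuralnet.cc hunk below shows.

  // before: enum nested inside the message
  message LayerProto {
    enum LayerType {
      kConvolution = 1;
      // ... other layer types
    }
    required LayerType type = 20;
  }

  // after: enum declared at file scope and shared by all messages
  enum LayerType {
    kConvolution = 1;
    // ... other layer types
  }
  message LayerProto {
    required LayerType type = 20;
  }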
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/4dee7b9c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/4dee7b9c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/4dee7b9c
Branch: refs/heads/master
Commit: 4dee7b9cd0f07eff4906e2398b7ad7f23691a508
Parents: 1b574f3
Author: wang sheng <wa...@gmail.com>
Authored: Fri Aug 14 21:56:35 2015 +0800
Committer: wang sheng <wa...@gmail.com>
Committed: Fri Aug 14 22:17:16 2015 +0800
----------------------------------------------------------------------
include/neuralnet/base_layer.h | 1 -
src/neuralnet/neuralnet.cc | 14 +-
src/proto/job.proto | 450 ++++++++++++++++++++----------------
src/utils/param.cc | 12 +-
src/utils/updater.cc | 14 +-
5 files changed, 272 insertions(+), 219 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/include/neuralnet/base_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/base_layer.h b/include/neuralnet/base_layer.h
index 25df95f..508fe18 100644
--- a/include/neuralnet/base_layer.h
+++ b/include/neuralnet/base_layer.h
@@ -20,7 +20,6 @@ using std::vector;
using std::string;
using std::map;
-
class Layer;
/**
* Base layer class.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 4732a36..e2565e3 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -7,10 +7,10 @@
namespace singa {
// macros to shorten the code
-#define LayerT(x) LayerProto_LayerType_k##x
+#define LayerT(x) LayerType::k##x
#define RegisterLayer(factory, id) \
- factory->Register(LayerProto_LayerType_k##id, \
+ factory->Register(LayerType::k##id, \
CreateInstance(id##Layer, Layer))
void NeuralNet::RegisterLayers() {
@@ -195,7 +195,7 @@ Node* SliceNode(Graph* graph, Node* srcnode,
string name = srcnode->name + "<";
LayerProto *proto = new LayerProto();
proto->set_name(name);
- proto->set_type(LayerProto_LayerType_kSlice);
+ proto->set_type(LayerType::kSlice);
proto->set_partition_id(
static_cast<LayerProto*>(srcnode->proto)->partition_id());
auto conf = proto->mutable_slice_conf();
@@ -215,7 +215,7 @@ Node* ConcateNodes(Graph* graph, const vector<Node*>& srcnodes, Node* dstnode) {
string name = ">" + dstnode->name;
LayerProto *proto = new LayerProto();
proto->set_name(name);
- proto->set_type(LayerProto_LayerType_kConcate);
+ proto->set_type(LayerType::kConcate);
proto->set_partition_id(
static_cast<LayerProto*>(dstnode->proto)->partition_id());
auto conf = proto->mutable_concate_conf();
@@ -234,7 +234,7 @@ Node* SplitNode(Graph* graph, Node* srcnode, const vector<Node*>& dstnodes) {
string name = srcnode->name + "+";
LayerProto *proto = new LayerProto();
proto->set_name(name);
- proto->set_type(LayerProto_LayerType_kSplit);
+ proto->set_type(LayerType::kSplit);
proto->set_partition_id(
static_cast<LayerProto*>(srcnode->proto)->partition_id());
Node* node = new Node(name, "##" + name, proto->partition_id(), proto);
@@ -251,14 +251,14 @@ void BridgeNodes(Graph* graph, Node* srcnode, Node* dstnode) {
string sname = srcnode->name + ":-";
LayerProto *sproto = new LayerProto();
sproto->set_name(sname);
- sproto->set_type(LayerProto_LayerType_kBridgeSrc);
+ sproto->set_type(LayerType::kBridgeSrc);
sproto->set_partition_id(
static_cast<LayerProto*>(srcnode->proto)->partition_id());
auto sbridge = new Node(sname, "##" + sname, sproto->partition_id(), sproto);
string dname = "-:" + dstnode->name;
LayerProto *dproto = new LayerProto();
dproto->set_name(dname);
- dproto->set_type(LayerProto_LayerType_kBridgeDst);
+ dproto->set_type(LayerType::kBridgeDst);
dproto->set_partition_id(
static_cast<LayerProto*>(dstnode->proto)->partition_id());
auto dbridge = new Node(dname, "##" + dname, dproto->partition_id(), dproto);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index a67d330..7c734bf 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -1,11 +1,21 @@
package singa;
-enum TrainOneBatchAlg {
- // Back-propagation algorithm for feed-forward models, e.g., CNN, and RNN
- kBP = 1;
- // Contrastive Divergence algorithm for RBM, DBM etc.
- kCD = 2;
-}
+// To start a training job, all we need is a JobProto object.
+// It should contain following fields
+// - Job Name (name)
+// the name to identify the job
+// - NeuralNet (neuralnet)
+// the neural network structure contains a set of layers
+// - Train One Batch (alg)
+// the training algorithm
+// - Updater (updater)
+// the protocol for updating parameters at server side
+// - Cluster Topology (cluster)
+// the distributed topology of workers/servers
+// - Training Steps (train_steps)
+// the number of training iteration
+// All other fields/functions are optional, e.g., test, checkpoint
+//
message JobProto {
// job name, e.g., "cifar10-dcnn", "mnist-mlp"
required string name = 1;
@@ -28,7 +38,8 @@ message JobProto {
// frequency of test, e.g., do test every 100 training steps
optional int32 test_freq = 20 [default = 0];
- // total num of steps for testing all test data; todo set -1 for test forever
+ // total num of steps for testing all test data;
+ // TODO(wangwei): set -1 for test forever
optional int32 test_steps = 21 [default = 0];
// frequency of validation, e.g., do validation every 100 training steps
optional int32 valid_freq = 25 [default = 0];
@@ -57,7 +68,10 @@ message JobProto {
// start validation after this num steps
optional int32 valid_after = 83 [default = 0];
- // used by SINGA; uses typically do not touch these fields
+ // for internal use
+ // users typically do not touch following fields
+
+ // resume flag
optional bool resume = 90 [default = false];
// last snapshot step
optional int32 step = 91 [default = 0];
@@ -65,9 +79,41 @@ message JobProto {
optional int32 id = 92 [default = -1];
}
-message CDProto {
- //number of steps for gibbs sampling
- optional int32 pcd_k = 1 [default = 1];
+// -----------------------
+// Protos used by JobProto
+// -----------------------
+
+message NetProto {
+ repeated LayerProto layer = 1;
+ // partitioning type for parallelism
+ optional int32 partition_dim = 20 [default = 0];
+}
+
+message UpdaterProto {
+ // updater type
+ required UpdaterType type = 1 [default = kSGD];
+ // configuration for RMSProp algorithm
+ optional RMSPropProto rmsprop_conf = 50;
+
+ // change method for learning rate
+ required ChangeMethod lr_change = 2 [default = kFixed];
+
+ // proto of change method
+ oneof change_conf {
+ FixedStepProto fixedstep_conf = 40;
+ StepProto step_conf = 41;
+ LinearProto linear_conf = 42;
+ ExponentialProto exponential_conf = 43;
+ InverseProto inverse_conf = 44;
+ InverseTProto inverset_conf = 45;
+ }
+
+ optional float momentum = 31 [default = 0];
+ optional float weight_decay = 32 [default = 0];
+ // base learning rate
+ optional float base_lr = 34 [default = 0];
+ // used to avoid divide by 0, i.e. x/(y+delta)
+ optional float delta = 35 [default = 0.00000001];
}
message ClusterProto {
@@ -83,64 +129,86 @@ message ClusterProto {
// servers and workers in different processes?
optional bool server_worker_separate = 20 [default = false];
- // port number is used by ZeroMQ
+ // port number used by ZeroMQ
optional int32 start_port = 60 [default = 6723];
- // conduct updates at server side; otherwise do it at worker side
+ // conduct updates at server side; otherwise do it at worker side
optional bool server_update = 61 [default = true];
// share memory space between worker groups in one procs
optional bool share_memory = 62 [default = true];
// bandwidth of ethernet, Bytes per second, default is 1 Gbps
- optional int32 bandwidth=80 [default=134217728];
+ optional int32 bandwidth = 80 [default = 134217728];
// poll time in milliseconds
- optional int32 poll_time=81 [default =100];
+ optional int32 poll_time = 81 [default = 100];
}
-
-enum Phase {
- kTrain = 0;
- kValidation = 1;
- kTest= 2;
- // postivie phase for contrastive divergence algorithm
- kPositive = 3;
- // negative phase for contrastive divergence algorithm
- kNegative = 4;
- kForward = 5;
- kBackward = 6;
- kLoss = 7;
+message CDProto {
+ //number of steps for gibbs sampling
+ optional int32 pcd_k = 1 [default = 1];
}
-message NetProto {
- repeated LayerProto layer = 1;
- // partitioning type for parallelism
- optional int32 partition_dim = 20 [default = 0];
+message LayerProto {
+ // the layer name used for identification
+ required string name = 1;
+ // source layer names
+ repeated string srclayers = 3;
+ // parameters, e.g., weight matrix or bias vector
+ repeated ParamProto param = 12;
+ // all layers are included in the net structure for training phase by default.
+ // some layers like data layer for loading test data are not used by training
+ // phase should be removed by setting the exclude field.
+ repeated Phase exclude = 15;
+ // the layer type
+ required LayerType type = 20;
+ // proto for the specific layer
+ oneof layer_conf {
+ // configuration for convolution layer
+ ConvolutionProto convolution_conf = 30;
+ // configuration for concatenation layer
+ ConcateProto concate_conf = 31;
+ // configuration for dropout layer
+ DropoutProto dropout_conf = 33;
+ // configuration for inner product layer
+ InnerProductProto innerproduct_conf = 34;
+ // configuration for local response normalization layer
+ DataProto lmdbdata_conf = 35;
+ // configuration for local response normalization layer
+ LRNProto lrn_conf = 45;
+ // configuration for mnist parser layer
+ MnistProto mnist_conf = 36;
+ // configuration for pooling layer
+ PoolingProto pooling_conf = 37;
+ // configuration for prefetch layer
+ PrefetchProto prefetch_conf = 44;
+ // configuration for rectified linear unit layer
+ ReLUProto relu_conf = 38;
+ // configuration for rgb image parser layer
+ RGBImageProto rgbimage_conf = 39;
+ // configuration for data layer
+ DataProto sharddata_conf = 32;
+ // configuration for slice layer
+ SliceProto slice_conf = 41;
+ // configuration for softmax loss layer
+ SoftmaxLossProto softmaxloss_conf = 40;
+ // configuration for split layer
+ SplitProto split_conf = 42;
+ // configuration for tanh layer
+ TanhProto tanh_conf = 43;
+ // configuration for rbmvis layer
+ RBMVisProto rbmvis_conf = 48;
+ // configuration for rbmhid layer
+ RBMHidProto rbmhid_conf = 49;
+ }
+
+ // overrides the partition dimension for neural net
+ optional int32 partition_dim = 60 [default = -1];
+ // names of parameters shared from other layers
+ optional int32 partition_id = 90 [default = 0];
}
-// weight matrix should be defined before bias vector;
-// todo separate conf for diff init method
+// weight matrix should be defined before bias vector
+// TODO(wangwei): separate conf for diff init method
message ParamProto {
- enum InitMethod {
- // fix the values of all parameters a constant in the value field
- kConstant = 0;
- // sample gaussian with std and mean
- kGaussian = 1;
- // uniform sampling between low and high
- kUniform = 2;
- // copy the content and history which are from previous training
- kPretrained = 3;
- // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
- // Gaussian distribution
- kGaussainSqrtFanIn = 4;
- // from Toronto Convnet, rectified linear activation, let
- // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
- // the program will multiply it.
- kUniformSqrtFanIn = 5;
- // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)). for tanh
- // activation, range is [-a, +a], for sigmoid activation, range is
- // [-4a, +4a], put the scale factor to value field.
- // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
- kUniformSqrtFanInOut = 6;
- }
// used for identifying the same params from diff models and display deug info
optional string name = 1 [default = ""];
optional InitMethod init_method = 2 [default = kGaussian];
@@ -157,7 +225,7 @@ message ParamProto {
// multiplied on the global weight decay.
optional float weight_decay_multiplier = 16 [default = 1];
- // name of the owner param from which this param shares the values
+ // name of the owner param from which this param shares the values
optional string share_from = 60;
// used interally
@@ -170,91 +238,9 @@ message ParamProto {
repeated int32 shape = 93;
}
-enum PartitionType{
- kDataPartition=0;
- kLayerPartition=1;
- kNone=2;
-}
-
-message LayerProto {
- // the layer name used for identification
- required string name = 1;
- enum LayerType{
- kBridgeSrc = 15;
- kBridgeDst = 16;
- kConvolution = 1;
- kConcate = 2;
- kShardData = 3;
- kDropout = 4;
- kInnerProduct = 5;
- kLabel = 18;
- kLMDBData = 17;
- kLRN = 6;
- kMnist = 7;
- kPooling = 8;
- kPrefetch = 19;
- kReLU = 9;
- kRGBImage = 10;
- kSoftmaxLoss = 11;
- kSlice = 12;
- kSplit = 13;
- kTanh = 14;
- kRBMVis = 23;
- kRBMHid = 24;
- }
- // source layer names
- repeated string srclayers = 3;
- // parameters, e.g., weight matrix or bias vector
- repeated ParamProto param = 12;
- // all layers are included in the net structure for training phase by default.
- // some layers like data layer for loading test data are not used by training
- // phase should be removed by setting the exclude field.
- repeated Phase exclude = 15;
- // the layer type from the enum above
- required LayerType type = 20;
- // configuration for convolution layer
- optional ConvolutionProto convolution_conf = 30;
- // configuration for concatenation layer
- optional ConcateProto concate_conf = 31;
- // configuration for dropout layer
- optional DropoutProto dropout_conf = 33;
- // configuration for inner product layer
- optional InnerProductProto innerproduct_conf = 34;
- // configuration for local response normalization layer
- optional DataProto lmdbdata_conf = 35;
- // configuration for local response normalization layer
- optional LRNProto lrn_conf = 45;
- // configuration for mnist parser layer
- optional MnistProto mnist_conf= 36;
- // configuration for pooling layer
- optional PoolingProto pooling_conf = 37;
- // configuration for prefetch layer
- optional PrefetchProto prefetch_conf = 44;
- // configuration for rectified linear unit layer
- optional ReLUProto relu_conf = 38;
- // configuration for rgb image parser layer
- optional RGBImageProto rgbimage_conf = 39;
- // configuration for data layer
- optional DataProto sharddata_conf = 32;
- // configuration for slice layer
- optional SliceProto slice_conf = 41;
- // configuration for softmax loss layer
- optional SoftmaxLossProto softmaxloss_conf = 40;
- // configuration for split layer
- optional SplitProto split_conf = 42;
- // configuration for tanh layer
- optional TanhProto tanh_conf = 43;
- // configuration for rbmvis layer
- optional RBMVisProto rbmvis_conf = 48;
- // configuration for rbmhid layer
- optional RBMHidProto rbmhid_conf = 49;
-
-
- // overrides the partition dimension for neural net
- optional int32 partition_dim =60 [default = -1];
- // names of parameters shared from other layers
- optional int32 partition_id = 90 [default = 0];
-}
+// ---------------------------
+// protos for different layers
+// ---------------------------
message RGBImageProto {
// scale factor for each pixel
@@ -272,7 +258,7 @@ message PrefetchProto {
}
message SplitProto {
- optional int32 num_splits = 1 [default =1];
+ optional int32 num_splits = 1 [default = 1];
}
// scaled tanh: A*tanh(B*x)
@@ -287,14 +273,14 @@ message SoftmaxLossProto {
// computing accuracy against topk results
optional int32 topk = 1 [default = 1];
// loss scale factor
- optional float scale= 30 [default = 1];
+ optional float scale = 30 [default = 1];
}
message ConvolutionProto {
// The number of outputs for the layer
required int32 num_filters = 1;
// the kernel height/width
- required int32 kernel= 2;
+ required int32 kernel = 2;
// The padding height/width
optional int32 pad = 30 [default = 0];
@@ -377,7 +363,7 @@ message LRNProto {
// normalization objective
optional NormRegion norm_region = 33 [default = ACROSS_CHANNELS];
// offset
- optional float knorm =34 [default = 1.0];
+ optional float knorm = 34 [default = 1.0];
}
message PoolingProto {
@@ -395,7 +381,7 @@ message PoolingProto {
optional uint32 stride = 32 [default = 1];
}
-message SliceProto{
+message SliceProto {
required int32 slice_dim = 1;
}
@@ -406,83 +392,151 @@ message ReLUProto {
optional float negative_slope = 1 [default = 0];
}
-message UpdaterProto {
- enum UpdaterType{
- // noraml SGD with momentum and weight decay
- kSGD = 1;
- // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
- kAdaGrad = 2;
- // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
- kRMSProp = 3;
- // Nesterov first optimal gradient method
- kNesterov = 4;
- }
- // updater type
- required UpdaterType type = 1 [default=kSGD];
- // configuration for RMSProp algorithm
- optional RMSPropProto rmsprop_conf = 50;
-
- enum ChangeMethod {
- kFixed = 0;
- kInverseT = 1;
- kInverse = 2;
- kExponential = 3;
- kLinear = 4;
- kStep = 5;
- kFixedStep = 6;
- }
- // change method for learning rate
- required ChangeMethod lr_change= 2 [default = kFixed];
-
- optional FixedStepProto fixedstep_conf=40;
- optional StepProto step_conf=41;
- optional LinearProto linear_conf=42;
- optional ExponentialProto exponential_conf=43;
- optional InverseProto inverse_conf=44;
- optional InverseTProto inverset_conf=45;
-
- optional float momentum = 31 [default = 0];
- optional float weight_decay = 32 [default = 0];
- // base learning rate
- optional float base_lr = 34 [default = 0];
- // used to avoid divide by 0, i.e. x/(y+delta)
- optional float delta = 35 [default = 0.00000001];
-}
-
-message RMSPropProto{
+message RMSPropProto {
// history=history*rho_+(1-rho_)*(grad*grad_scale);
required float rho = 1;
}
-message FixedStepProto{
+message FixedStepProto {
repeated int32 step = 28;
// lr = step_lr[i] if current step >= step[i]
repeated float step_lr = 29;
}
-message StepProto{
+message StepProto {
// lr = base_lr * gamma^(step/change_freq)
required float gamma = 35 [default = 1];
// lr = base_lr * gamma^(step/change_freq)
- required int32 change_freq= 40;
+ required int32 change_freq = 40;
}
-message LinearProto{
+
+message LinearProto {
// lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
required int32 change_freq= 40;
// lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
required float final_lr = 39;
}
-message ExponentialProto{
+
+message ExponentialProto {
// lr = base / 2^(step/change_freq)
- required int32 change_freq= 40;
+ required int32 change_freq = 40;
}
-message InverseTProto{
+
+message InverseTProto {
// lr = base_lr / (1+step/final_lr)
required float final_lr = 39;
}
-message InverseProto{
+message InverseProto {
// lr = base_lr*(1+gamma*step)^(-pow)
required float gamma = 1 [default = 1];
// lr = base_lr*(1+gamma*step)^(-pow)
required float pow = 2 [default = 0];
}
+
+// --------------
+// All Enum Types
+// --------------
+
+enum ChangeMethod {
+ kFixed = 0;
+ kInverseT = 1;
+ kInverse = 2;
+ kExponential = 3;
+ kLinear = 4;
+ kStep = 5;
+ kFixedStep = 6;
+}
+
+enum InitMethod {
+ // fix the values of all parameters to the constant given in the value field
+ kConstant = 0;
+ // sample gaussian with std and mean
+ kGaussian = 1;
+ // uniform sampling between low and high
+ kUniform = 2;
+ // copy the content and history which are from previous training
+ kPretrained = 3;
+ // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
+ // Gaussian distribution
+ kGaussainSqrtFanIn = 4;
+ // from Toronto Convnet, rectified linear activation, let
+ // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
+ // the program will multiply it.
+ kUniformSqrtFanIn = 5;
+ // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)). for tanh
+ // activation, range is [-a, +a], for sigmoid activation, range is
+ // [-4a, +4a], put the scale factor to value field.
+ // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
+ kUniformSqrtFanInOut = 6;
+}
+
+enum LayerType {
+ // Data layers
+ // - Load records from file, database
+ kLMDBData = 17;
+ kPrefetch = 19;
+ kShardData = 3;
+ // Parser layers
+ // - Parse features from records, e.g., pixels
+ kLabel = 18;
+ kMnist = 7;
+ kRGBImage = 10;
+ // Neuron layers
+ // - Feature transformation
+ kConcate = 2;
+ kConvolution = 1;
+ kDropout = 4;
+ kInnerProduct = 5;
+ kLRN = 6;
+ kPooling = 8;
+ kReLU = 9;
+ kRBMHid = 24;
+ kRBMVis = 23;
+ kTanh = 14;
+ // Loss layers
+ // - Compute objective loss
+ kSoftmaxLoss = 11;
+ // Other layers
+ // - Connect layers when neural net is partitioned
+ kBridgeDst = 16;
+ kBridgeSrc = 15;
+ kSlice = 12;
+ kSplit = 13;
+}
+
+enum PartitionType {
+ kDataPartition = 0;
+ kLayerPartition = 1;
+ kNone = 2;
+}
+
+enum Phase {
+ kTrain = 0;
+ kValidation = 1;
+ kTest = 2;
+ // positive phase for contrastive divergence algorithm
+ kPositive = 3;
+ // negative phase for contrastive divergence algorithm
+ kNegative = 4;
+ kForward = 5;
+ kBackward = 6;
+ kLoss = 7;
+}
+
+enum TrainOneBatchAlg {
+ // Back-propagation algorithm for feed-forward models, e.g., CNN and RNN
+ kBP = 1;
+ // Contrastive Divergence algorithm for RBM, DBM, etc.
+ kCD = 2;
+}
+
+enum UpdaterType {
+ // normal SGD with momentum and weight decay
+ kSGD = 1;
+ // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
+ kAdaGrad = 2;
+ // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+ kRMSProp = 3;
+ // Nesterov's optimal first-order gradient method
+ kNesterov = 4;
+}
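Each learning-rate schedule above pairs a ChangeMethod value with its corresponding proto message (StepProto, LinearProto, FixedStepProto, etc.). As a rough sketch of how they fit together, the following updater block uses kStep; the base_lr, lr_change and step_conf field names are taken from the UpdaterProto accessors read in src/utils/updater.cc, while the exact text-format nesting in job.conf is an assumption:

updater {
  type: kSGD
  base_lr: 0.1
  lr_change: kStep
  step_conf {
    gamma: 0.5
    change_freq: 10000
  }
}

With these values GetLearningRate returns base_lr * gamma^(step/change_freq): 0.1 for steps 0-9999, 0.05 for steps 10000-19999, 0.025 for steps 20000-29999, and so on (step/change_freq is integer division, as the updater.cc hunk below notes).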
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 2f43f66..b470ea2 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -43,15 +43,15 @@ void Param::InitValues(int version){
Tensor<cpu, 1> data(mutable_cpu_data(), Shape1(size()));
auto random=TSingleton<Random<cpu>>::Instance();
switch (proto_.init_method()) {
- case ParamProto::kConstant:
+ case InitMethod::kConstant:
data = proto_.value();
break;
- case ParamProto::kUniform:
+ case InitMethod::kUniform:
random->SampleUniform(data, proto_.low(), proto_.high());
if(proto_.value() != 1)
data *= proto_.value();
break;
- case ParamProto::kUniformSqrtFanIn:
+ case InitMethod::kUniformSqrtFanIn:
random->SampleUniform(data, proto_.low(), proto_.high());
// only valid for param matrix with dim 1 for fan in
LOG(ERROR) << "init fan in";
@@ -59,17 +59,17 @@ void Param::InitValues(int version){
data *= proto_.value() / sqrt(data_->shape().at(1) / 3.0f);
LOG(ERROR) << "end fan in";
break;
- case ParamProto::kUniformSqrtFanInOut:
+ case InitMethod::kUniformSqrtFanInOut:
random->SampleUniform(data, proto_.low(), proto_.high());
if(proto_.value())
data *= proto_.value()/ sqrt(data_->shape()[0] +data_->shape()[1]);
break;
- case ParamProto::kGaussian:
+ case InitMethod::kGaussian:
random->SampleGaussian(data, proto_.mean(), proto_.std());
if(proto_.value() != 1)
data *= proto_.value();
break;
- case ParamProto::kGaussainSqrtFanIn:
+ case InitMethod::kGaussainSqrtFanIn:
random->SampleGaussian(data, proto_.mean(), proto_.std());
if(proto_.value())
data *= proto_.value()/ sqrt(data_->shape()[0]);
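For reference, a minimal param entry (nested inside a layer block, as declared by the repeated param field of LayerProto) that would exercise the kGaussian branch above might look as follows; the mean, std and value names come from the proto_ accessors in InitValues, and the concrete numbers are only an illustration:

param {
  name: "w1"
  init_method: kGaussian
  mean: 0
  std: 0.01
  value: 1
}

Because value is 1, the extra scaling (data *= proto_.value()) is skipped; any other value rescales the sampled weights.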
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index b85982e..c038ca7 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -13,38 +13,38 @@ float Updater::GetLearningRate(int step) {
float ret = 0., r = 0., base = proto_.base_lr();
int freq = 0;
switch (proto_.lr_change()) {
- case UpdaterProto_ChangeMethod_kFixed:
+ case ChangeMethod::kFixed:
ret = base;
break;
- case UpdaterProto_ChangeMethod_kLinear:
+ case ChangeMethod::kLinear:
// a is init, b is the final
freq = proto_.linear_conf().change_freq();
r = step * 1.0 / freq;
ret = (1.0 - r) * base + r * proto_.linear_conf().final_lr();
break;
- case UpdaterProto_ChangeMethod_kExponential:
+ case ChangeMethod::kExponential:
// a is init, b is the final, from convnet
freq = proto_.exponential_conf().change_freq();
ret = base / pow(2, step * 1. / freq);
break;
- case UpdaterProto_ChangeMethod_kInverseT:
+ case ChangeMethod::kInverseT:
// a is init, b is the final, from convnet
CHECK_EQ(base, 2 * proto_.inverset_conf().final_lr())
<< "final value should be the half";
ret = base / (1. + step * 1. / proto_.inverset_conf().final_lr());
break;
- case UpdaterProto_ChangeMethod_kInverse:
+ case ChangeMethod::kInverse:
// a is init, b is gamma, c is pow
ret = base * pow(1.f + proto_.inverse_conf().gamma() * step,
- proto_.inverse_conf().pow());
break;
- case UpdaterProto_ChangeMethod_kStep:
+ case ChangeMethod::kStep:
// a is the base learning rate, b is gamma, from caffe
      // notice it is step/change_freq (integer division), not step*1.0/change_freq
freq = proto_.step_conf().change_freq();
ret = base * pow(proto_.step_conf().gamma(), step / freq);
break;
- case UpdaterProto_ChangeMethod_kFixedStep:
+ case ChangeMethod::kFixedStep:
for (int i = 0; i < proto_.fixedstep_conf().step_size(); i++) {
if (step > proto_.fixedstep_conf().step(i))
ret = proto_.fixedstep_conf().step_lr(i);
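To make the kFixedStep branch concrete, consider a hypothetical schedule (values invented for illustration, following the repeated step / step_lr pairing in FixedStepProto):

fixedstep_conf {
  step: 0
  step: 4000
  step: 8000
  step_lr: 0.01
  step_lr: 0.001
  step_lr: 0.0005
}

The loop (as shown, with no early break) keeps overwriting ret, so the last step_lr whose threshold has been passed wins: at step 5000 the rate is 0.001 and at step 9000 it is 0.0005. Since the comparison is strict (step > step(i)), at exactly step 4000 the rate is still 0.01.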