Posted to commits@singa.apache.org by wa...@apache.org on 2015/08/14 16:21:15 UTC
[1/2] incubator-singa git commit: SINGA-54 Refactor job configuration to move fields in ModelProto out
Repository: incubator-singa
Updated Branches:
refs/heads/master 539fcee56 -> 4dee7b9cd
SINGA-54 Refactor job configuration to move fields in ModelProto out
Tested with mnist and cifar examples.
Four components are necessary for submitting a job: neuralnet, alg, updater, and cluster.
The configuration is now consistent with the MM paper.
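For reference, a minimal job.conf under the new layout would look roughly like the sketch below. This is only an illustration assembled from the mnist example in this diff; the concrete layer, updater, and cluster settings are placeholders rather than part of the commit itself.

  # minimal job configuration sketch (illustrative values only)
  name: "example-job"        # job name
  train_steps: 1000          # total number of training iterations
  alg: kBP                   # TrainOneBatch algorithm: kBP (back-propagation) or kCD
  updater {                  # parameter updater, e.g., SGD with a fixed learning rate
    type: kSGD
    base_lr: 0.01
    lr_change: kFixed
  }
  neuralnet {                # neural net: a set of connected layers
    layer {
      name: "data"
      type: kShardData
      sharddata_conf {
        path: "examples/mnist/mnist_train_shard"
        batchsize: 64
      }
      exclude: kTest
    }
    # remaining layers omitted; see examples/mnist/job.conf below
  }
  cluster {                  # cluster topology and workspace
    nworker_groups: 1
    nserver_groups: 1
    workspace: "examples/mnist"
  }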
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/1b574f3c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/1b574f3c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/1b574f3c
Branch: refs/heads/master
Commit: 1b574f3c10f23fa80926471c3efa752d062d4301
Parents: 539fcee
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Aug 14 16:25:10 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Fri Aug 14 16:25:10 2015 +0800
----------------------------------------------------------------------
examples/cifar10/job.conf | 49 +++---
examples/mnist/conv.conf | 295 +++++++++++++++++-------------------
examples/mnist/job.conf | 47 +++---
include/neuralnet/base_layer.h | 2 +-
include/singa.h | 7 +-
include/trainer/trainer.h | 23 ++-
include/trainer/worker.h | 8 +-
src/main.cc | 2 -
src/neuralnet/base_layer.cc | 10 +-
src/neuralnet/layer.cc | 6 +-
src/neuralnet/neuralnet.cc | 1 -
src/proto/common.proto | 7 -
src/proto/job.proto | 181 +++++++++++-----------
src/trainer/trainer.cc | 53 ++++---
src/trainer/worker.cc | 75 +++++----
src/utils/param.cc | 24 +--
16 files changed, 380 insertions(+), 410 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index f7829b8..89afca9 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -1,30 +1,23 @@
-cluster {
- nworker_groups: 1
- nserver_groups: 1
- workspace: "examples/cifar10"
+name: "cifar10-convnet"
+train_steps: 1000
+test_steps: 100
+test_freq:300
+disp_freq:30
+alg: kBP
+updater{
+ weight_decay:0.004
+ lr_change: kFixedStep
+ type: kSGD
+ fixedstep_conf:{
+ step:0
+ step:60000
+ step:65000
+ step_lr:0.001
+ step_lr:0.0001
+ step_lr:0.00001
+ }
}
-
-model {
- name: "cifar10-convnet"
- train_steps: 1000
- test_steps: 100
- test_frequency:300
- display_frequency:30
- alg: kBackPropagation
- updater{
- weight_decay:0.004
- lr_change: kFixedStep
- type: kSGD
- fixedstep_conf:{
- step:0
- step:60000
- step:65000
- step_lr:0.001
- step_lr:0.0001
- step_lr:0.00001
- }
- }
- neuralnet {
+neuralnet {
layer{
name: "data"
type: kShardData
@@ -226,4 +219,8 @@ model {
srclayers: "label"
}
}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ workspace: "examples/cifar10"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/examples/mnist/conv.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf
index d463cd9..fce1418 100644
--- a/examples/mnist/conv.conf
+++ b/examples/mnist/conv.conf
@@ -1,186 +1,177 @@
-cluster {
- nworker_groups: 1
- nserver_groups: 1
- nservers_per_group: 1
- nworkers_per_group: 1
- nservers_per_procs: 1
- nworkers_per_procs: 1
- workspace: "examples/mnist"
-}
-model {
-name: "mnist-conv"
+name: "conv"
train_steps: 10000
test_steps:100
-test_frequency:500
-display_frequency:50
-debug: false
+test_freq:500
+disp_freq:50
+alg: kBP
+debug: true
updater{
- base_learning_rate:0.01
+ base_lr:0.01
momentum:0.9
weight_decay:0.0005
- gamma:0.0001
- pow:0.75
- learning_rate_change_method:kInverse
+ lr_change: kInverse
+ type: kSGD
+ inverse_conf {
+ gamma:0.0001
+ pow:0.75
+ }
}
neuralnet {
-layer {
- name: "data"
- type: "kLMDBData"
- data_param {
- path: "/home/wangwei/program/singa/examples/mnist/mnist_train_lmdb"
- batchsize: 64
+ layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/mnist/mnist_train_shard"
+ batchsize: 64
+ }
+ exclude: kTest
}
- exclude: kTest
-}
-layer {
- name: "data"
- type: "kLMDBData"
- data_param {
- path: "/home/wangwei/program/singa/examples/mnist/mnist_test_lmdb"
- batchsize: 100
+ layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/mnist/mnist_test_shard"
+ batchsize: 100
+ }
+ exclude: kTrain
}
- exclude: kTrain
-}
-layer{
- name:"mnist"
- type: "kMnistImage"
- srclayers: "data"
- mnist_param {
-# sigma: 6
-# alpha: 38
-# gamma: 15
-# kernel: 21
-# elastic_freq:100
-# beta:15
-# resize: 29
- norm_a:255
+ layer{
+ name:"mnist"
+ type: kMnist
+ srclayers: "data"
+ mnist_conf {
+ norm_a:255
+ norm_b:0
+ }
}
-}
-
-layer{
- name: "label"
- type: "kLabel"
- srclayers: "data"
-}
-layer {
- name: "conv1"
- type: "kConvolution"
- srclayers: "mnist"
- convolution_param {
- num_filters: 20
- kernel: 5
- stride: 1
+ layer{
+ name: "label"
+ type: kLabel
+ srclayers: "data"
}
- param{
- name: "weight"
- init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
- }
- param{
- name: "bias"
- init_method: kConstant
- learning_rate_multiplier:2.0
- value:0
+ layer {
+ name: "conv1"
+ type: kConvolution
+ srclayers: "mnist"
+ convolution_conf {
+ num_filters: 20
+ kernel: 5
+ stride: 1
}
-}
-layer {
- name: "pool1"
- type: "kPooling"
- srclayers: "conv1"
- pooling_param {
- pool: MAX
- kernel: 2
- stride: 2
- }
-}
-layer {
- name: "conv2"
- type: "kConvolution"
- srclayers: "pool1"
- convolution_param {
- num_filters: 50
- kernel: 5
- stride: 1
+ param{
+ name: "w1"
+ init_method:kUniformSqrtFanIn
+ learning_rate_multiplier:1.0
+ }
+ param{
+ name: "b1"
+ init_method: kConstant
+ learning_rate_multiplier:2.0
+ value:0
+ }
}
- param{
- name: "weight"
- init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
+ layer {
+ name: "pool1"
+ type: kPooling
+ srclayers: "conv1"
+ pooling_conf {
+ pool: MAX
+ kernel: 2
+ stride: 2
}
- param{
- name: "bias"
- init_method: kConstant
- learning_rate_multiplier:2.0
- value:0
- }
-}
-layer {
- name: "pool2"
- type: "kPooling"
- srclayers: "conv2"
- pooling_param {
- pool: MAX
- kernel: 2
- stride: 2
}
-}
-layer {
- name: "ip1"
- type: "kInnerProduct"
- srclayers:"pool2"
- inner_product_param {
- num_output: 500
+ layer {
+ name: "conv2"
+ type: kConvolution
+ srclayers: "pool1"
+ convolution_conf {
+ num_filters: 50
+ kernel: 5
+ stride: 1
+ }
+ param{
+ name: "w2"
+ init_method:kUniformSqrtFanIn
+ learning_rate_multiplier:1.0
+ }
+ param{
+ name: "b2"
+ init_method: kConstant
+ learning_rate_multiplier:2.0
+ value:0
+ }
}
- param{
- name: "weight"
- init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
+ layer {
+ name: "pool2"
+ type: kPooling
+ srclayers: "conv2"
+ pooling_conf {
+ pool: MAX
+ kernel: 2
+ stride: 2
}
- param{
- name: "bias"
- init_method: kConstant
- learning_rate_multiplier:2.0
- value:0
}
+ layer {
+ name: "ip1"
+ type: kInnerProduct
+ srclayers:"pool2"
+ innerproduct_conf {
+ num_output: 500
+ }
+ param{
+ name: "w3"
+ init_method:kUniformSqrtFanIn
+ learning_rate_multiplier:1.0
+ }
+ param{
+ name: "b3"
+ init_method: kConstant
+ learning_rate_multiplier:2.0
+ value:0
+ }
-}
-
-layer {
- name: "relu1"
- type: "kReLU"
- srclayers:"ip1"
-}
+ }
-layer {
- name: "ip2"
- type: "kInnerProduct"
- srclayers:"relu1"
- inner_product_param {
- num_output: 10
+ layer {
+ name: "relu1"
+ type: kReLU
+ srclayers:"ip1"
}
- param{
- name: "weight"
+
+ layer {
+ name: "ip2"
+ type: kInnerProduct
+ srclayers:"relu1"
+ innerproduct_conf {
+ num_output: 10
+ }
+ param {
+ name: "w4"
init_method:kUniformSqrtFanIn
learning_rate_multiplier:1
}
- param{
- name: "bias"
+ param {
+ name: "b4"
init_method: kConstant
learning_rate_multiplier:2
value:0
}
-}
-layer{
- name: "loss"
- type:"kSoftmaxLoss"
- softmaxloss_param{
- topk:1
}
- srclayers:"ip2"
- srclayers:"label"
-}
+ layer{
+ name: "loss"
+ type: kSoftmaxLoss
+ softmaxloss_conf{
+ topk:1
+ }
+ srclayers:"ip2"
+ srclayers:"label"
+ }
}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ workspace: "examples/mnist"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/examples/mnist/job.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/job.conf b/examples/mnist/job.conf
index 5d1445d..34fbca2 100644
--- a/examples/mnist/job.conf
+++ b/examples/mnist/job.conf
@@ -1,26 +1,20 @@
-cluster {
- nworker_groups: 1
- nserver_groups: 1
- workspace: "examples/mnist"
-}
-model {
- name: "deep-big-simple-mlp"
- train_steps: 1000
- test_steps:10
- test_frequency:60
- display_frequency:30
- alg: kBackPropagation
- updater{
- base_lr: 0.001
- lr_change: kStep
- type: kSGD
- step_conf{
- change_freq: 60
- gamma: 0.997
- }
+name: "mlp"
+train_steps: 1000
+test_steps:10
+test_freq:60
+disp_freq:10
+alg: kBP
+updater{
+ base_lr: 0.001
+ lr_change: kStep
+ type: kSGD
+ step_conf{
+ change_freq: 60
+ gamma: 0.997
}
+}
- neuralnet {
+neuralnet {
layer {
name: "data"
type: kShardData
@@ -46,13 +40,6 @@ model {
type: kMnist
srclayers: "data"
mnist_conf {
-# sigma: 6
-# alpha: 38
-# gamma: 15
-# kernel: 21
-# elastic_freq:100
-# beta:15
-# resize: 29
norm_a: 127.5
norm_b: 1
}
@@ -228,4 +215,8 @@ model {
srclayers:"label"
}
}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ workspace: "examples/mnist"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/include/neuralnet/base_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/base_layer.h b/include/neuralnet/base_layer.h
index ca63da0..25df95f 100644
--- a/include/neuralnet/base_layer.h
+++ b/include/neuralnet/base_layer.h
@@ -133,10 +133,10 @@ class Layer {
* blob in parser layers; The default value is "unknown"; If the
* src layer is the prefetch layer and there are more than one parser layers,
* this value be set.
- */
const std::string &datablob() const {
return layer_proto_.datablob();
}
+ */
/**
* @return a const ref for Blob storing neuron values of this layer for BP
*/
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/include/singa.h
----------------------------------------------------------------------
diff --git a/include/singa.h b/include/singa.h
index c8984e5..82df64b 100644
--- a/include/singa.h
+++ b/include/singa.h
@@ -24,13 +24,16 @@ void SubmitJob(int job, bool resume, const JobProto& jobConf) {
ReadProtoFromTextFile(FLAGS_singa_conf.c_str(), &singaConf);
if (singaConf.has_log_dir())
SetupLog(singaConf.log_dir(),
- std::to_string(job) + "-" + jobConf.model().name());
+ std::to_string(job) + "-" + jobConf.name());
if (jobConf.num_openblas_threads() != 1)
LOG(WARNING) << "openblas is set with " << jobConf.num_openblas_threads()
<< " threads";
openblas_set_num_threads(jobConf.num_openblas_threads());
+ JobProto proto;
+ proto.CopyFrom(jobConf);
+ proto.set_id(job);
Trainer trainer;
- trainer.Start(job, resume, jobConf, singaConf);
+ trainer.Start(resume, singaConf, &proto);
}
} // namespace singa
#endif // SINGA_SINGA_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/include/trainer/trainer.h
----------------------------------------------------------------------
diff --git a/include/trainer/trainer.h b/include/trainer/trainer.h
index 911a4c4..1d28de6 100644
--- a/include/trainer/trainer.h
+++ b/include/trainer/trainer.h
@@ -27,14 +27,11 @@ class Trainer{
* Entrance function which construct the workers and servers, and luanch
* one thread per worker/server.
*
- * @param job job ID
* @param resume if true resume the training from the latest checkpoint files
- * @param jobConf job configuration, including cluster and model configuration
* @param singaConf global singa configuration including zookeeper and
- * log dir setting.
+ * @param jobConf job configuration, including cluster and model configuration
*/
- void Start(int job, bool resume,
- const JobProto& jobConf, const SingaProto& singaConf);
+ void Start(bool resume, const SingaProto& singaConf, JobProto* jobConf);
protected:
/**
@@ -44,27 +41,27 @@ class Trainer{
* checkpoint, which will be added into the checkpoint field. The workers
* would then load the values of params from the checkpoint files.
*
- * @param modelConf model configuration
+ * @param jobConf job configuration
*/
- void Resume(ModelProto* modelConf);
+ void Resume(JobProto* jobConf);
/**
* Create server instances.
* @param nthread total num of threads in current procs which is used to
* assign each thread a local thread ID. The number of workers is extracted
* from Cluster
- * @param modelConf
+ * @param jobConf
* @return server instances
*/
- vector<Server*> CreateServers(int nthread, const ModelProto& modelConf);
+ vector<Server*> CreateServers(int nthread, const JobProto& jobConf);
/**
* Create workers instances.
* @param nthread total num of threads in current procs which is used to
* assign each thread a local thread ID. The number of workers is extracted
* from Cluster
- * @param modelConf
+ * @param jobConf
* @return worker instances
*/
- vector<Worker*> CreateWorkers(int nthread, const ModelProto& modelConf);
+ vector<Worker*> CreateWorkers(int nthread, const JobProto& jobConf);
/**
* Setup workers and servers.
@@ -77,7 +74,7 @@ class Trainer{
* @param servers
*/
void SetupWorkerServer(
- const ModelProto& modelConf,
+ const JobProto& jobConf,
const vector<Worker*>& workers,
const vector<Server*>& servers);
@@ -91,7 +88,7 @@ class Trainer{
* For other base classes, use its base class name (string) as the key and the
* implementation class as the value, e.g., <"Updater" SGDUpdater>.
*/
- void RegisterDefaultClasses(const singa::ModelProto& proto);
+ void RegisterDefaultClasses();
/**
* Generate msg to trigger synchronization with other server groups.
*
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/include/trainer/worker.h
----------------------------------------------------------------------
diff --git a/include/trainer/worker.h b/include/trainer/worker.h
index 025bcc1..0557ee2 100644
--- a/include/trainer/worker.h
+++ b/include/trainer/worker.h
@@ -31,7 +31,7 @@ class Worker {
/**
* Setup members
*/
- void Setup(const ModelProto& model, shared_ptr<NeuralNet> train_net,
+ void Setup(const JobProto& job, shared_ptr<NeuralNet> train_net,
shared_ptr<NeuralNet> valid_net, shared_ptr<NeuralNet> test_net);
/**
* Main function of Worker.
@@ -49,7 +49,7 @@ class Worker {
* If the training starts from scrath, the params are initialzed using random
* distributions, e.g., Gaussian distribution. After that, the worker may
* train for a couple of steps to warmup the params before put
- * them to servers (warmup of ModelProto controls this).
+ * them to servers (warmup of JobProto controls this).
*
* If the owner param is availabel from checkpoint file, then its
* values are parsed from the checkpoint file instead of randomly initialized.
@@ -62,7 +62,7 @@ class Worker {
* The serialization is done using BlobProtos which includes the name, version
* and values of each Param.
* Different worker would generate different checkpoint files. The file path
- * is <workspace>/checkpoint-<modelname>-step<step>-worker<worker_id>.bin
+ * is <workspace>/checkpoint-<jobname>-step<step>-worker<worker_id>.bin
* @param step training step of this worker
* @param net the training net whose params will be dumped.
*/
@@ -173,7 +173,7 @@ class Worker {
protected:
int thread_id_, grp_id_, id_;
int step_;
- ModelProto modelproto_;
+ JobProto job_conf_;
shared_ptr<NeuralNet> train_net_, test_net_, validation_net_;
Dealer* layer_dealer_, *dealer_;
Updater* updater_;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/main.cc
----------------------------------------------------------------------
diff --git a/src/main.cc b/src/main.cc
index d95e405..00b75ff 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -33,8 +33,6 @@ int main(int argc, char **argv) {
singa::JobProto jobConf;
std::string job_file = FLAGS_conf;
singa::ReadProtoFromTextFile(job_file.c_str(), &jobConf);
- CHECK(jobConf.has_cluster());
- CHECK(jobConf.has_model());
RegisterClasses();
singa::SubmitJob(FLAGS_job, FLAGS_resume, jobConf);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/neuralnet/base_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/base_layer.cc b/src/neuralnet/base_layer.cc
index 57163e9..695104e 100644
--- a/src/neuralnet/base_layer.cc
+++ b/src/neuralnet/base_layer.cc
@@ -125,17 +125,19 @@ void PrefetchLayer::Setup(const LayerProto& proto, int npartitions) {
}
const Blob<float>& PrefetchLayer::data(const Layer* from, Phase phase) const {
- if(from!=nullptr){
- return datablobs_.at(from->datablob());
- }else{
+ LOG(FATAL) << " needs update";
+ if(from != nullptr) {
+ return datablobs_.at("");
+ } else {
//CHECK_EQ(datablobs_.size(),1);
return datablobs_.begin()->second;
}
}
Blob<float>* PrefetchLayer::mutable_data(const Layer* from, Phase phase) {
+ LOG(FATAL) << " needs update";
if(from!=nullptr){
- return &(datablobs_.at(from->datablob()));
+ return &(datablobs_.at(""));
}else{
//CHECK_EQ(datablobs_.size(),1);
return &(datablobs_.begin()->second);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index 314bb14..c1fce00 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -439,7 +439,7 @@ void MnistLayer::ParseRecords(Phase phase,
LOG_IF(ERROR, records.size()==0)<<"Empty records to parse";
int ndim=records.at(0).image().shape_size();
int inputsize =records.at(0).image().shape(ndim-1);
- CHECK_EQ(inputsize, blob->shape()[1]);
+ CHECK_EQ(inputsize, blob->shape()[2]);
float* dptr=blob->mutable_cpu_data();
for(const Record& record: records){
@@ -485,11 +485,11 @@ void MnistLayer::Setup(const LayerProto& proto, int npartitions) {
int ndim=sample.image().shape_size();
CHECK_GE(ndim,2);
if(resize_)
- data_.Reshape(vector<int>{batchsize, resize_, resize_});
+ data_.Reshape(vector<int>{batchsize, 1, resize_, resize_});
else{
int s=sample.image().shape(ndim-1);
CHECK_EQ(s,sample.image().shape(ndim-2));
- data_.Reshape(vector<int>{batchsize, s, s });
+ data_.Reshape(vector<int>{batchsize, 1, s, s });
}
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 83f8c36..4732a36 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -88,7 +88,6 @@ shared_ptr<NeuralNet> NeuralNet::Create(
param->set_share_from(from);
}
- for (auto layer : net_conf.layer())
LOG(INFO) << "NeuralNet config is\n" << conf.DebugString();
// TODO(wangwei) create net based on net type, e.g., directed, undirected, etc
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/proto/common.proto
----------------------------------------------------------------------
diff --git a/src/proto/common.proto b/src/proto/common.proto
index d8be479..3b6efb3 100644
--- a/src/proto/common.proto
+++ b/src/proto/common.proto
@@ -24,13 +24,6 @@ enum EntityType {
kRuntime = 4;
};
-enum ShareOption {
- kValueOnly = 0;
- kWhole = 1;
-};
-
-
-
enum ConnectionType {
kOneToOne = 0;
kOneToAll = 1;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 200197f..a67d330 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -1,9 +1,73 @@
package singa;
+enum TrainOneBatchAlg {
+ // Back-propagation algorithm for feed-forward models, e.g., CNN, and RNN
+ kBP = 1;
+ // Contrastive Divergence algorithm for RBM, DBM etc.
+ kCD = 2;
+}
message JobProto {
- required ClusterProto cluster = 1;
- required ModelProto model = 2;
- optional int32 num_openblas_threads = 3 [default = 1];
+ // job name, e.g., "cifar10-dcnn", "mnist-mlp"
+ required string name = 1;
+ // neural net consits of a set of connected layers
+ required NetProto neuralnet = 3;
+ // algorithms calculating gradients for one mini-batch/iteration
+ required TrainOneBatchAlg alg = 5;
+ // configuration of SGD updater, including learning rate, etc.
+ required UpdaterProto updater = 7;
+ // cluster toplogy conf
+ required ClusterProto cluster = 9;
+
+ // for setting CD fields
+ optional CDProto cd_conf = 12;
+
+ // total num of steps for training
+ required int32 train_steps = 16;
+ // frequency of displaying training info
+ optional int32 disp_freq = 17 [default = 0];
+
+ // frequency of test, e.g., do test every 100 training steps
+ optional int32 test_freq = 20 [default = 0];
+ // total num of steps for testing all test data; todo set -1 for test forever
+ optional int32 test_steps = 21 [default = 0];
+ // frequency of validation, e.g., do validation every 100 training steps
+ optional int32 valid_freq = 25 [default = 0];
+ // total num of steps for validating all validation data
+ optional int32 valid_steps = 26 [default = 0];
+ // frequency of checkpoint
+ optional int32 checkpoint_freq = 30 [default = 0];
+
+ // for loading checkpoint files to init parameters
+ repeated string checkpoint_path = 60;
+ // send parameters to servers after training for this num of steps
+ optional int32 warmup_steps = 61 [default = 0];
+ // display debug info
+ optional bool debug = 62 [default = false];
+ // reset the version of params loaded from checkpoint file to step
+ optional bool reset_param_version = 63 [default = true];
+ // set num of threads used by openblas
+ optional int32 num_openblas_threads = 64 [default = 1];
+
+ // start checkpoint after this num steps
+ optional int32 checkpoint_after = 80 [default = 0];
+ // start display after this num steps
+ optional int32 disp_after = 81[default = 0];
+ // start test after this num steps
+ optional int32 test_after = 82 [default = 0];
+ // start validation after this num steps
+ optional int32 valid_after = 83 [default = 0];
+
+ // used by SINGA; uses typically do not touch these fields
+ optional bool resume = 90 [default = false];
+ // last snapshot step
+ optional int32 step = 91 [default = 0];
+ // job id allocated by zookeeper
+ optional int32 id = 92 [default = -1];
+}
+
+message CDProto {
+ //number of steps for gibbs sampling
+ optional int32 pcd_k = 1 [default = 1];
}
message ClusterProto {
@@ -13,24 +77,23 @@ message ClusterProto {
optional int32 nservers_per_group = 4 [default = 1];
optional int32 nworkers_per_procs = 5 [default = 1];
optional int32 nservers_per_procs = 6 [default = 1];
+ // local workspace for checkpoint files and vis files
+ required string workspace = 10;
// servers and workers in different processes?
- optional bool server_worker_separate = 11 [default = false];
+ optional bool server_worker_separate = 20 [default = false];
// port number is used by ZeroMQ
- optional int32 start_port = 13 [default = 6723];
- // local workspace, train/val/test shards, checkpoint files
- required string workspace = 14;
-
- // conduct updates at server side; otherwise do it at worker side
- optional bool server_update = 40 [default = true];
+ optional int32 start_port = 60 [default = 6723];
+ // conduct updates at server side; otherwise do it at worker side
+ optional bool server_update = 61 [default = true];
// share memory space between worker groups in one procs
- optional bool share_memory = 41 [default = true];
+ optional bool share_memory = 62 [default = true];
// bandwidth of ethernet, Bytes per second, default is 1 Gbps
- optional int32 bandwidth=50 [default=134217728];
+ optional int32 bandwidth=80 [default=134217728];
// poll time in milliseconds
- optional int32 poll_time=51 [default =100];
+ optional int32 poll_time=81 [default =100];
}
@@ -47,67 +110,14 @@ enum Phase {
kLoss = 7;
}
-message ModelProto {
- // model name, e.g., "cifar10-dcnn", "mnist-mlp"
- required string name = 1;
- // frequency of displaying training info
- required int32 display_frequency = 3 ;
- // total num of steps for training
- required int32 train_steps = 5;
- // configuration of SGD updater, including learning rate, etc.
- required UpdaterProto updater = 7;
- enum GradCalcAlg {
- // BP algorithm for feed-forward models, e.g., CNN, MLP, RNN
- kBackPropagation = 1;
- // CD algorithm for RBM, DBM etc., models
- kContrastiveDivergence = 2;
- }
- // gradient calculation algorithm
- required GradCalcAlg alg = 8 [default = kBackPropagation];
- required NetProto neuralnet = 9;
-
- // total num of steps for validation
- optional int32 validation_steps = 30 [default = 0];
- // total num of steps for test
- optional int32 test_steps = 31 [default = 0];
- // frequency of validation
- optional int32 validation_frequency = 32;
- // frequency of test
- optional int32 test_frequency = 33 [default = 0];
- // frequency of checkpoint
- optional int32 checkpoint_frequency = 34 [default = 0];
- // send parameters to servers after training for this num of steps
- optional int32 warmup_steps = 35 [default = 0];
- // checkpoint path
- optional bool resume = 36 [default = false];
-
- // start display after this num steps
- optional int32 display_after = 60[default = 0];
- // start checkpoint after this num steps
- optional int32 checkpoint_after = 61 [default = 0];
- // start test after this num steps
- optional int32 test_after = 62 [default = 0];
-// start validation after this num steps
- optional int32 validation_after = 63 [default = 0];
- // last snapshot step
- optional int32 step = 64 [default = 0];
- // display debug info
- optional bool debug = 65 [default = false];
- // checkpoint files
- repeated string checkpoint = 66;
- // reset the version of params loaded from checkpoint file to step
- optional bool reset_param_version = 67 [default = true];
- //number of steps for gibbs sampling
- optional int32 pcd_k=69 [default=15];
-}
-
message NetProto {
repeated LayerProto layer = 1;
// partitioning type for parallelism
- optional int32 partition_dim = 2 [default = 0];
+ optional int32 partition_dim = 20 [default = 0];
}
-// weight matrix should be defined before bias vector
+// weight matrix should be defined before bias vector;
+// todo separate conf for diff init method
message ParamProto {
enum InitMethod {
// fix the values of all parameters a constant in the value field
@@ -131,7 +141,9 @@ message ParamProto {
// <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
kUniformSqrtFanInOut = 6;
}
- optional InitMethod init_method = 1 [default = kGaussian];
+ // used for identifying the same params from diff models and display deug info
+ optional string name = 1 [default = ""];
+ optional InitMethod init_method = 2 [default = kGaussian];
// constant init
optional float value = 5 [default = 1];
// for uniform sampling
@@ -144,20 +156,18 @@ message ParamProto {
optional float learning_rate_multiplier = 15 [default = 1];
// multiplied on the global weight decay.
optional float weight_decay_multiplier = 16 [default = 1];
- // partition dimension, -1 for no partition
- optional int32 partition_dim = 30;
- // usually, the program will infer the param shape
- repeated int32 shape = 31;
- // used for identifying the same params from diff models and display deug info
- optional string name = 61 [default = ""];
- // name of the owner param from which this param shares the values
- optional string share_from = 62;
+
+ // name of the owner param from which this param shares the values
+ optional string share_from = 60;
+
// used interally
- optional int32 id = 63;
- // parameter slice limit (Google Protobuf also has size limit)
- optional int32 split_threshold = 64 [default = 5000000];
+ optional int32 id = 90;
// used internally
- optional int32 owner = 65 [default = -1];
+ optional int32 owner = 91 [default = -1];
+ // partition dimension, -1 for no partition
+ optional int32 partition_dim = 92;
+ // usually, the program will infer the param shape
+ repeated int32 shape = 93;
}
enum PartitionType{
@@ -241,12 +251,9 @@ message LayerProto {
// overrides the partition dimension for neural net
- optional int32 partition_dim =59 [default = -1];
- optional string datablob = 58 [default = "unknow"];
-
+ optional int32 partition_dim =60 [default = -1];
// names of parameters shared from other layers
- repeated string share_param = 60;
- optional int32 partition_id = 62 [default = 0];
+ optional int32 partition_id = 90 [default = 0];
}
message RGBImageProto {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index f348ff6..699fc30 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -28,7 +28,7 @@ Trainer::~Trainer() {
delete router_;
}
-void Trainer::RegisterDefaultClasses(const singa::ModelProto& model_conf) {
+void Trainer::RegisterDefaultClasses() {
// register all implemented layers
singa::NeuralNet::RegisterLayers();
auto param_factory = Singleton<Factory<singa::Param>>::Instance();
@@ -77,12 +77,12 @@ const vector<int> SliceParams(const vector<Param*>& params) {
}
void Trainer::SetupWorkerServer(
- const ModelProto& model_conf,
+ const JobProto& job_conf,
const vector<Worker*>& workers,
const vector<Server*>& servers) {
auto cluster = Cluster::Get();
int grp_size = cluster->nworkers_per_group();
- const auto& net_conf = model_conf.neuralnet();
+ const auto& net_conf = job_conf.neuralnet();
auto net = NeuralNet::Create(net_conf, kTrain, grp_size);
// MUST do SliceParam before share param/net with others
auto slices = SliceParams(net->params());
@@ -96,12 +96,12 @@ void Trainer::SetupWorkerServer(
if (grp_net.find(grp_id) == grp_net.end()) {
if (grp_id == first_grp) {
// test are performed only by the first group now. TODO update.
- if (first_grp == 0 && model_conf.test_steps() && worker_id == 0) {
+ if (first_grp == 0 && job_conf.test_steps() && worker_id == 0) {
test_net = NeuralNet::Create(net_conf, kTest, 1); // hard code for exp
test_net->ShareParamsFrom(net);
}
// validation are performed only by the first group. TODO update.
- if (first_grp == 0 && model_conf.validation_steps() && worker_id == 0) {
+ if (first_grp == 0 && job_conf.valid_steps() && worker_id == 0) {
valid_net = NeuralNet::Create(net_conf, kValidation, 1);
valid_net->ShareParamsFrom(net);
}
@@ -124,18 +124,18 @@ void Trainer::SetupWorkerServer(
}
LOG(INFO) << "grp " << worker->grp_id() << ", worker "
<< worker->id() << " net " << grp_net[grp_id].get();
- worker->Setup(model_conf, grp_net[grp_id], valid_net, test_net);
+ worker->Setup(job_conf, grp_net[grp_id], valid_net, test_net);
}
// partition among server groups, each group maintains one sub-set for sync
auto slice2group = PartitionSlices(cluster->nserver_groups(), slices);
for (auto server : servers)
- server->Setup(model_conf.updater(), &server_shard_, slice2group);
+ server->Setup(job_conf.updater(), &server_shard_, slice2group);
// partition within one server group, each server updates for one sub-set
slice2server_ = PartitionSlices(cluster->nservers_per_group(), slices);
}
-vector<Server*> Trainer::CreateServers(int nthreads, const ModelProto& mconf) {
+vector<Server*> Trainer::CreateServers(int nthreads, const JobProto& job) {
auto cluster = Cluster::Get();
vector<Server*> servers;
if (!cluster->has_server())
@@ -157,7 +157,7 @@ vector<Server*> Trainer::CreateServers(int nthreads, const ModelProto& mconf) {
return servers;
}
-vector<Worker*> Trainer::CreateWorkers(int nthreads, const ModelProto& mconf){
+vector<Worker*> Trainer::CreateWorkers(int nthreads, const JobProto& job) {
auto cluster=Cluster::Get();
vector<Worker*> workers;
if(!cluster->has_worker())
@@ -184,18 +184,19 @@ vector<Worker*> Trainer::CreateWorkers(int nthreads, const ModelProto& mconf){
for (int gid = gstart; gid < gend; gid++) {
for (int wid = wstart; wid < wend; wid++) {
Worker* worker=nullptr;
- if (mconf.alg() == ModelProto_GradCalcAlg_kBackPropagation)
+ if (job.alg() == TrainOneBatchAlg::kBP)
worker = new BPWorker(nthreads++,gid, wid);
- else {
+ else if (job.alg() == TrainOneBatchAlg::kCD)
worker=new CDWorker(nthreads++,gid, wid);
- }
+ else
+ LOG(FATAL) << "unknown alg for trainonebatch func " << job.alg();
workers.push_back(worker);
}
}
return workers;
}
-void Trainer::Resume(ModelProto* modelConf) {
+void Trainer::Resume(JobProto* jobConf) {
tinydir_dir dir;
string folder = Cluster::Get()->checkpoint_folder();
tinydir_open(&dir, folder.c_str());
@@ -226,24 +227,22 @@ void Trainer::Resume(ModelProto* modelConf) {
}
if (latest_step > 0) {
- modelConf->set_step(latest_step);
- if (!modelConf->has_reset_param_version())
- modelConf->set_reset_param_version(false);
- modelConf->clear_checkpoint();
+ jobConf->set_step(latest_step);
+ if (!jobConf->has_reset_param_version())
+ jobConf->set_reset_param_version(false);
+ jobConf->clear_checkpoint_path();
for (auto ck_file : ck_files)
- modelConf->add_checkpoint(folder + "/" + ck_file);
+ jobConf->add_checkpoint_path(folder + "/" + ck_file);
}
tinydir_close(&dir);
}
-void Trainer::Start(int job, bool resume,
- const JobProto& jobConf, const SingaProto& singaConf) {
+void Trainer::Start(bool resume, const SingaProto& singaConf, JobProto* job) {
// register job to zookeeper at the beginning
- auto cluster = Cluster::Get(job, singaConf, jobConf.cluster());
- ModelProto model = jobConf.model();
- RegisterDefaultClasses(model);
+ auto cluster = Cluster::Get(job->id(), singaConf, job->cluster());
+ RegisterDefaultClasses();
if (resume)
- Resume(&model);
+ Resume(job);
router_ = new Router();
router_->Bind(kInprocRouterEndpoint);
@@ -253,10 +252,10 @@ void Trainer::Start(int job, bool resume,
cluster->Register(getpid(), hostip + ":" + std::to_string(port));
int nthreads = 1;
- const vector<Worker*> workers = CreateWorkers(nthreads, model);
+ const vector<Worker*> workers = CreateWorkers(nthreads, *job);
nthreads += workers.size();
- const vector<Server*> servers = CreateServers(nthreads, model);
- SetupWorkerServer(model, workers, servers);
+ const vector<Server*> servers = CreateServers(nthreads, *job);
+ SetupWorkerServer(*job, workers, servers);
#ifdef USE_MPI
for (int i = 0; i < nthreads; i++)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index 36ba8de..b6f9d44 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -16,23 +16,16 @@ Worker::Worker(int thread_id, int grp_id, int id):
}
void Worker::Setup(
- const ModelProto& model, shared_ptr<NeuralNet> train_net,
+ const JobProto& job, shared_ptr<NeuralNet> train_net,
shared_ptr<NeuralNet> valid_net, shared_ptr<NeuralNet> test_net) {
- modelproto_.CopyFrom(model);
+ job_conf_.CopyFrom(job);
train_net_ = train_net;
validation_net_ = valid_net;
test_net_ = test_net;
auto cluster = Cluster::Get();
- // if no server or user requires worker to do param update
- if (!(cluster->nserver_groups() && cluster->server_update())) {
- updater_ = Singleton<Factory<Updater>>::Instance()->Create("Updater");
- updater_->Init(model.updater());
- }
}
Worker::~Worker() {
- if (updater_ != nullptr)
- delete updater_;
if (layer_dealer_)
delete layer_dealer_;
if (dealer_)
@@ -59,7 +52,7 @@ void Worker::InitLocalParams() {
// load from checkpoints. get param blob based on param name.
// the param from previous checkpoint files will be overwritten by
// the param with the same name in later checkpoint files.
- for (const auto checkpoint : modelproto_.checkpoint()) {
+ for (const auto checkpoint : job_conf_.checkpoint_path()) {
LOG(INFO) << "Load from checkpoint file " << checkpoint;
BlobProtos bps;
ReadProtoFromBinaryFile(checkpoint.c_str(), &bps);
@@ -67,8 +60,8 @@ void Worker::InitLocalParams() {
if (name2param.find(bps.name(i)) != name2param.end()) {
name2param.at(bps.name(i))->FromProto(bps.blob(i));
// if load from pre-training params, reset version to start step
- if(modelproto_.reset_param_version())
- name2param.at(bps.name(i))->set_version(modelproto_.step());
+ if(job_conf_.reset_param_version())
+ name2param.at(bps.name(i))->set_version(job_conf_.step());
else // if resume training, use the same version as last checkpoint
name2param.at(bps.name(i))->set_version(bps.version(i));
}
@@ -77,15 +70,15 @@ void Worker::InitLocalParams() {
// init other params who do not have checkpoint version
for (auto entry : name2param)
if (entry.second->version() < 0) {
- entry.second->InitValues(modelproto_.step());
- if (!modelproto_.reset_param_version())
+ entry.second->InitValues(job_conf_.step());
+ if (!job_conf_.reset_param_version())
LOG(ERROR) << "better reset version of params from checkpoints "
<< "to the same as other newly initialized params!";
}
Metric perf;
// warmup training before put params to servers
- for (; step_ < modelproto_.warmup_steps(); step_++)
+ for (; step_ < job_conf_.warmup_steps(); step_++)
TrainOneBatch(step_, &perf);
for (auto layer : train_net_->layers()) {
if (layer->partition_id() == id_)
@@ -99,7 +92,7 @@ void Worker::InitLocalParams() {
for (auto layer : train_net_->layers()) {
if (layer->partition_id() == id_)
for (auto param : layer->GetParams())
- Get(param, modelproto_.warmup_steps());
+ Get(param, job_conf_.warmup_steps());
}
}
@@ -153,25 +146,25 @@ void Worker::Run() {
}
}
- step_ = modelproto_.step();
+ step_ = job_conf_.step();
InitLocalParams();
Metric perf;
while (!StopNow(step_)) {
if (ValidateNow(step_)) {
//LOG(ERROR)<<"Validation at step "<<step;
CollectAll(validation_net_, step_);
- Test(modelproto_.validation_steps(), kValidation, validation_net_);
+ Test(job_conf_.valid_steps(), kValidation, validation_net_);
}
if (TestNow(step_)) {
//LOG(ERROR)<<"Test at step "<<step;
CollectAll(test_net_, step_);
- Test(modelproto_.test_steps(), kTest, test_net_);
+ Test(job_conf_.test_steps(), kTest, test_net_);
}
if (CheckpointNow(step_)) {
CollectAll(train_net_, step_);
Checkpoint(step_, train_net_);
- modelproto_.set_step(step_);
+ job_conf_.set_step(step_);
}
TrainOneBatch(step_, &perf);
// LOG(ERROR) << "Train " << step_;
@@ -296,40 +289,40 @@ void Worker::Test(int nsteps, Phase phase, shared_ptr<NeuralNet> net) {
}
bool Worker::DisplayNow(int step) const {
- return (modelproto_.display_frequency() > 0
- && step >= modelproto_.display_after()
- && ((step - modelproto_.display_after())
- % modelproto_.display_frequency() == 0));
+ return (job_conf_.disp_freq() > 0
+ && step >= job_conf_.disp_after()
+ && ((step - job_conf_.disp_after())
+ % job_conf_.disp_freq() == 0));
}
bool Worker::DisplayDebugInfo(int step) const {
- return DisplayNow(step) && modelproto_.debug() && grp_id_ == 0;
+ return DisplayNow(step) && job_conf_.debug() && grp_id_ == 0;
}
bool Worker::StopNow(int step) const {
- return step >= modelproto_.train_steps();
+ return step >= job_conf_.train_steps();
}
bool Worker::CheckpointNow(int step) const {
return (grp_id_ == 0
- && modelproto_.checkpoint_frequency() > 0
- && step >= modelproto_.checkpoint_after()
- && ((step - modelproto_.checkpoint_after())
- % modelproto_.checkpoint_frequency() == 0));
+ && job_conf_.checkpoint_freq() > 0
+ && step >= job_conf_.checkpoint_after()
+ && ((step - job_conf_.checkpoint_after())
+ % job_conf_.checkpoint_freq() == 0));
}
bool Worker::TestNow(const int step) const {
return (grp_id_ == 0
- && modelproto_.test_frequency() > 0
- && modelproto_.test_steps() > 0
- && step >= modelproto_.test_after()
- && ((step - modelproto_.test_after())
- % modelproto_.test_frequency() == 0));
+ && job_conf_.test_freq() > 0
+ && job_conf_.test_steps() > 0
+ && step >= job_conf_.test_after()
+ && ((step - job_conf_.test_after())
+ % job_conf_.test_freq() == 0));
}
bool Worker::ValidateNow(const int step) const {
return (grp_id_ == 0
- && modelproto_.validation_frequency() > 0
- && modelproto_.validation_steps() > 0
- && step >= modelproto_.validation_after()
- && ((step - modelproto_.validation_after())
- % modelproto_.validation_frequency() == 0));
+ && job_conf_.valid_freq() > 0
+ && job_conf_.valid_steps() > 0
+ && step >= job_conf_.valid_after()
+ && ((step - job_conf_.valid_after())
+ % job_conf_.valid_freq() == 0));
}
@@ -406,7 +399,7 @@ void CDWorker::NegativePhase(int step,
shared_ptr<NeuralNet> net, Metric* perf) {
// for negative phase, gibbs sampling only concerns RBM bottom and top layer
auto& layers = net->layers();
- for (int i = 0; i < modelproto_.pcd_k(); i++) {
+ for (int i = 0; i < job_conf_.cd_conf().pcd_k(); i++) {
for (auto& layer : layers) {
if (layer->is_vislayer() || layer->is_hidlayer())
layer->ComputeFeature(kNegative, perf);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1b574f3c/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 8c0b440..2f43f66 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -44,35 +44,35 @@ void Param::InitValues(int version){
auto random=TSingleton<Random<cpu>>::Instance();
switch (proto_.init_method()) {
case ParamProto::kConstant:
- data=proto_.value();
+ data = proto_.value();
break;
case ParamProto::kUniform:
random->SampleUniform(data, proto_.low(), proto_.high());
- if(proto_.value())
- data*= proto_.value();
+ if(proto_.value() != 1)
+ data *= proto_.value();
break;
- /*
case ParamProto::kUniformSqrtFanIn:
- CHECK_GT(fan_in_,0);
random->SampleUniform(data, proto_.low(), proto_.high());
- if(proto_.value())
- data*= proto_.value()/ sqrt(fan_in_ / 3.0f);
+ // only valid for param matrix with dim 1 for fan in
+ LOG(ERROR) << "init fan in";
+ CHECK_EQ(data_->shape().size(), 2);
+ data *= proto_.value() / sqrt(data_->shape().at(1) / 3.0f);
+ LOG(ERROR) << "end fan in";
break;
- */
case ParamProto::kUniformSqrtFanInOut:
random->SampleUniform(data, proto_.low(), proto_.high());
if(proto_.value())
- data*= proto_.value()/ sqrt(data_->shape()[0] +data_->shape()[1]);
+ data *= proto_.value()/ sqrt(data_->shape()[0] +data_->shape()[1]);
break;
case ParamProto::kGaussian:
random->SampleGaussian(data, proto_.mean(), proto_.std());
- if(proto_.value())
- data*= proto_.value();
+ if(proto_.value() != 1)
+ data *= proto_.value();
break;
case ParamProto::kGaussainSqrtFanIn:
random->SampleGaussian(data, proto_.mean(), proto_.std());
if(proto_.value())
- data*= proto_.value()/ sqrt(data_->shape()[0]);
+ data *= proto_.value()/ sqrt(data_->shape()[0]);
break;
default:
LOG(ERROR) << "Illegal parameter init method ";
[2/2] incubator-singa git commit: SINGA-54 Refactor job configuration to move fields in ModelProto out
Posted by wa...@apache.org.
SINGA-54 Refactor job configuration to move fields in ModelProto out
Format job.proto and move important enum types to global scope.
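In other words (a rough before/after sketch, not part of the diff itself): enums such as LayerType and UpdaterType that used to be nested inside LayerProto and UpdaterProto now sit at the top level of job.proto, so C++ code can refer to LayerType::kSlice instead of LayerProto_LayerType_kSlice, as the neuralnet.cc hunk below shows.

  // before: enum nested inside the message
  message LayerProto {
    enum LayerType {
      kConvolution = 1;
      // ... other layer types
    }
    required LayerType type = 20;
  }

  // after: enum declared at file scope and shared by all messages
  enum LayerType {
    kConvolution = 1;
    // ... other layer types
  }
  message LayerProto {
    required LayerType type = 20;
  }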
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/4dee7b9c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/4dee7b9c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/4dee7b9c
Branch: refs/heads/master
Commit: 4dee7b9cd0f07eff4906e2398b7ad7f23691a508
Parents: 1b574f3
Author: wang sheng <wa...@gmail.com>
Authored: Fri Aug 14 21:56:35 2015 +0800
Committer: wang sheng <wa...@gmail.com>
Committed: Fri Aug 14 22:17:16 2015 +0800
----------------------------------------------------------------------
include/neuralnet/base_layer.h | 1 -
src/neuralnet/neuralnet.cc | 14 +-
src/proto/job.proto | 450 ++++++++++++++++++++----------------
src/utils/param.cc | 12 +-
src/utils/updater.cc | 14 +-
5 files changed, 272 insertions(+), 219 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/include/neuralnet/base_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/base_layer.h b/include/neuralnet/base_layer.h
index 25df95f..508fe18 100644
--- a/include/neuralnet/base_layer.h
+++ b/include/neuralnet/base_layer.h
@@ -20,7 +20,6 @@ using std::vector;
using std::string;
using std::map;
-
class Layer;
/**
* Base layer class.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 4732a36..e2565e3 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -7,10 +7,10 @@
namespace singa {
// macros to shorten the code
-#define LayerT(x) LayerProto_LayerType_k##x
+#define LayerT(x) LayerType::k##x
#define RegisterLayer(factory, id) \
- factory->Register(LayerProto_LayerType_k##id, \
+ factory->Register(LayerType::k##id, \
CreateInstance(id##Layer, Layer))
void NeuralNet::RegisterLayers() {
@@ -195,7 +195,7 @@ Node* SliceNode(Graph* graph, Node* srcnode,
string name = srcnode->name + "<";
LayerProto *proto = new LayerProto();
proto->set_name(name);
- proto->set_type(LayerProto_LayerType_kSlice);
+ proto->set_type(LayerType::kSlice);
proto->set_partition_id(
static_cast<LayerProto*>(srcnode->proto)->partition_id());
auto conf = proto->mutable_slice_conf();
@@ -215,7 +215,7 @@ Node* ConcateNodes(Graph* graph, const vector<Node*>& srcnodes, Node* dstnode) {
string name = ">" + dstnode->name;
LayerProto *proto = new LayerProto();
proto->set_name(name);
- proto->set_type(LayerProto_LayerType_kConcate);
+ proto->set_type(LayerType::kConcate);
proto->set_partition_id(
static_cast<LayerProto*>(dstnode->proto)->partition_id());
auto conf = proto->mutable_concate_conf();
@@ -234,7 +234,7 @@ Node* SplitNode(Graph* graph, Node* srcnode, const vector<Node*>& dstnodes) {
string name = srcnode->name + "+";
LayerProto *proto = new LayerProto();
proto->set_name(name);
- proto->set_type(LayerProto_LayerType_kSplit);
+ proto->set_type(LayerType::kSplit);
proto->set_partition_id(
static_cast<LayerProto*>(srcnode->proto)->partition_id());
Node* node = new Node(name, "##" + name, proto->partition_id(), proto);
@@ -251,14 +251,14 @@ void BridgeNodes(Graph* graph, Node* srcnode, Node* dstnode) {
string sname = srcnode->name + ":-";
LayerProto *sproto = new LayerProto();
sproto->set_name(sname);
- sproto->set_type(LayerProto_LayerType_kBridgeSrc);
+ sproto->set_type(LayerType::kBridgeSrc);
sproto->set_partition_id(
static_cast<LayerProto*>(srcnode->proto)->partition_id());
auto sbridge = new Node(sname, "##" + sname, sproto->partition_id(), sproto);
string dname = "-:" + dstnode->name;
LayerProto *dproto = new LayerProto();
dproto->set_name(dname);
- dproto->set_type(LayerProto_LayerType_kBridgeDst);
+ dproto->set_type(LayerType::kBridgeDst);
dproto->set_partition_id(
static_cast<LayerProto*>(dstnode->proto)->partition_id());
auto dbridge = new Node(dname, "##" + dname, dproto->partition_id(), dproto);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index a67d330..7c734bf 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -1,11 +1,21 @@
package singa;
-enum TrainOneBatchAlg {
- // Back-propagation algorithm for feed-forward models, e.g., CNN, and RNN
- kBP = 1;
- // Contrastive Divergence algorithm for RBM, DBM etc.
- kCD = 2;
-}
+// To start a training job, all we need is a JobProto object.
+// It should contain following fields
+// - Job Name (name)
+// the name to identify the job
+// - NeuralNet (neuralnet)
+// the neural network structure contains a set of layers
+// - Train One Batch (alg)
+// the training algorithm
+// - Updater (updater)
+// the protocol for updating parameters at server side
+// - Cluster Topology (cluster)
+// the distributed topology of workers/servers
+// - Training Steps (train_steps)
+// the number of training iteration
+// All other fields/functions are optional, e.g., test, checkpoint
+//
message JobProto {
// job name, e.g., "cifar10-dcnn", "mnist-mlp"
required string name = 1;
@@ -28,7 +38,8 @@ message JobProto {
// frequency of test, e.g., do test every 100 training steps
optional int32 test_freq = 20 [default = 0];
- // total num of steps for testing all test data; todo set -1 for test forever
+ // total num of steps for testing all test data;
+ // TODO(wangwei): set -1 for test forever
optional int32 test_steps = 21 [default = 0];
// frequency of validation, e.g., do validation every 100 training steps
optional int32 valid_freq = 25 [default = 0];
@@ -57,7 +68,10 @@ message JobProto {
// start validation after this num steps
optional int32 valid_after = 83 [default = 0];
- // used by SINGA; uses typically do not touch these fields
+ // for internal use
+ // users typically do not touch following fields
+
+ // resume flag
optional bool resume = 90 [default = false];
// last snapshot step
optional int32 step = 91 [default = 0];
@@ -65,9 +79,41 @@ message JobProto {
optional int32 id = 92 [default = -1];
}
-message CDProto {
- //number of steps for gibbs sampling
- optional int32 pcd_k = 1 [default = 1];
+// -----------------------
+// Protos used by JobProto
+// -----------------------
+
+message NetProto {
+ repeated LayerProto layer = 1;
+ // partitioning type for parallelism
+ optional int32 partition_dim = 20 [default = 0];
+}
+
+message UpdaterProto {
+ // updater type
+ required UpdaterType type = 1 [default = kSGD];
+ // configuration for RMSProp algorithm
+ optional RMSPropProto rmsprop_conf = 50;
+
+ // change method for learning rate
+ required ChangeMethod lr_change = 2 [default = kFixed];
+
+ // proto of change method
+ oneof change_conf {
+ FixedStepProto fixedstep_conf = 40;
+ StepProto step_conf = 41;
+ LinearProto linear_conf = 42;
+ ExponentialProto exponential_conf = 43;
+ InverseProto inverse_conf = 44;
+ InverseTProto inverset_conf = 45;
+ }
+
+ optional float momentum = 31 [default = 0];
+ optional float weight_decay = 32 [default = 0];
+ // base learning rate
+ optional float base_lr = 34 [default = 0];
+ // used to avoid divide by 0, i.e. x/(y+delta)
+ optional float delta = 35 [default = 0.00000001];
}
message ClusterProto {
@@ -83,64 +129,86 @@ message ClusterProto {
// servers and workers in different processes?
optional bool server_worker_separate = 20 [default = false];
- // port number is used by ZeroMQ
+ // port number used by ZeroMQ
optional int32 start_port = 60 [default = 6723];
- // conduct updates at server side; otherwise do it at worker side
+ // conduct updates at server side; otherwise do it at worker side
optional bool server_update = 61 [default = true];
// share memory space between worker groups in one procs
optional bool share_memory = 62 [default = true];
// bandwidth of ethernet, Bytes per second, default is 1 Gbps
- optional int32 bandwidth=80 [default=134217728];
+ optional int32 bandwidth = 80 [default = 134217728];
// poll time in milliseconds
- optional int32 poll_time=81 [default =100];
+ optional int32 poll_time = 81 [default = 100];
}
-
-enum Phase {
- kTrain = 0;
- kValidation = 1;
- kTest= 2;
- // postivie phase for contrastive divergence algorithm
- kPositive = 3;
- // negative phase for contrastive divergence algorithm
- kNegative = 4;
- kForward = 5;
- kBackward = 6;
- kLoss = 7;
+message CDProto {
+ //number of steps for gibbs sampling
+ optional int32 pcd_k = 1 [default = 1];
}
-message NetProto {
- repeated LayerProto layer = 1;
- // partitioning type for parallelism
- optional int32 partition_dim = 20 [default = 0];
+message LayerProto {
+ // the layer name used for identification
+ required string name = 1;
+ // source layer names
+ repeated string srclayers = 3;
+ // parameters, e.g., weight matrix or bias vector
+ repeated ParamProto param = 12;
+ // all layers are included in the net structure for training phase by default.
+ // some layers like data layer for loading test data are not used by training
+ // phase should be removed by setting the exclude field.
+ repeated Phase exclude = 15;
+ // the layer type
+ required LayerType type = 20;
+ // proto for the specific layer
+ oneof layer_conf {
+ // configuration for convolution layer
+ ConvolutionProto convolution_conf = 30;
+ // configuration for concatenation layer
+ ConcateProto concate_conf = 31;
+ // configuration for dropout layer
+ DropoutProto dropout_conf = 33;
+ // configuration for inner product layer
+ InnerProductProto innerproduct_conf = 34;
+ // configuration for local response normalization layer
+ DataProto lmdbdata_conf = 35;
+ // configuration for local response normalization layer
+ LRNProto lrn_conf = 45;
+ // configuration for mnist parser layer
+ MnistProto mnist_conf = 36;
+ // configuration for pooling layer
+ PoolingProto pooling_conf = 37;
+ // configuration for prefetch layer
+ PrefetchProto prefetch_conf = 44;
+ // configuration for rectified linear unit layer
+ ReLUProto relu_conf = 38;
+ // configuration for rgb image parser layer
+ RGBImageProto rgbimage_conf = 39;
+ // configuration for data layer
+ DataProto sharddata_conf = 32;
+ // configuration for slice layer
+ SliceProto slice_conf = 41;
+ // configuration for softmax loss layer
+ SoftmaxLossProto softmaxloss_conf = 40;
+ // configuration for split layer
+ SplitProto split_conf = 42;
+ // configuration for tanh layer
+ TanhProto tanh_conf = 43;
+ // configuration for rbmvis layer
+ RBMVisProto rbmvis_conf = 48;
+ // configuration for rbmhid layer
+ RBMHidProto rbmhid_conf = 49;
+ }
+
+ // overrides the partition dimension for neural net
+ optional int32 partition_dim = 60 [default = -1];
+ // names of parameters shared from other layers
+ optional int32 partition_id = 90 [default = 0];
}
-// weight matrix should be defined before bias vector;
-// todo separate conf for diff init method
+// weight matrix should be defined before bias vector
+// TODO(wangwei): separate conf for diff init method
message ParamProto {
- enum InitMethod {
- // fix the values of all parameters a constant in the value field
- kConstant = 0;
- // sample gaussian with std and mean
- kGaussian = 1;
- // uniform sampling between low and high
- kUniform = 2;
- // copy the content and history which are from previous training
- kPretrained = 3;
- // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
- // Gaussian distribution
- kGaussainSqrtFanIn = 4;
- // from Toronto Convnet, rectified linear activation, let
- // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
- // the program will multiply it.
- kUniformSqrtFanIn = 5;
- // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)). for tanh
- // activation, range is [-a, +a], for sigmoid activation, range is
- // [-4a, +4a], put the scale factor to value field.
- // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
- kUniformSqrtFanInOut = 6;
- }
// used for identifying the same params from diff models and display deug info
optional string name = 1 [default = ""];
optional InitMethod init_method = 2 [default = kGaussian];
@@ -157,7 +225,7 @@ message ParamProto {
// multiplied on the global weight decay.
optional float weight_decay_multiplier = 16 [default = 1];
- // name of the owner param from which this param shares the values
+ // name of the owner param from which this param shares the values
optional string share_from = 60;
// used interally
@@ -170,91 +238,9 @@ message ParamProto {
repeated int32 shape = 93;
}
-enum PartitionType{
- kDataPartition=0;
- kLayerPartition=1;
- kNone=2;
-}
-
-message LayerProto {
- // the layer name used for identification
- required string name = 1;
- enum LayerType{
- kBridgeSrc = 15;
- kBridgeDst = 16;
- kConvolution = 1;
- kConcate = 2;
- kShardData = 3;
- kDropout = 4;
- kInnerProduct = 5;
- kLabel = 18;
- kLMDBData = 17;
- kLRN = 6;
- kMnist = 7;
- kPooling = 8;
- kPrefetch = 19;
- kReLU = 9;
- kRGBImage = 10;
- kSoftmaxLoss = 11;
- kSlice = 12;
- kSplit = 13;
- kTanh = 14;
- kRBMVis = 23;
- kRBMHid = 24;
- }
- // source layer names
- repeated string srclayers = 3;
- // parameters, e.g., weight matrix or bias vector
- repeated ParamProto param = 12;
- // all layers are included in the net structure for training phase by default.
- // some layers like data layer for loading test data are not used by training
- // phase should be removed by setting the exclude field.
- repeated Phase exclude = 15;
- // the layer type from the enum above
- required LayerType type = 20;
- // configuration for convolution layer
- optional ConvolutionProto convolution_conf = 30;
- // configuration for concatenation layer
- optional ConcateProto concate_conf = 31;
- // configuration for dropout layer
- optional DropoutProto dropout_conf = 33;
- // configuration for inner product layer
- optional InnerProductProto innerproduct_conf = 34;
- // configuration for local response normalization layer
- optional DataProto lmdbdata_conf = 35;
- // configuration for local response normalization layer
- optional LRNProto lrn_conf = 45;
- // configuration for mnist parser layer
- optional MnistProto mnist_conf= 36;
- // configuration for pooling layer
- optional PoolingProto pooling_conf = 37;
- // configuration for prefetch layer
- optional PrefetchProto prefetch_conf = 44;
- // configuration for rectified linear unit layer
- optional ReLUProto relu_conf = 38;
- // configuration for rgb image parser layer
- optional RGBImageProto rgbimage_conf = 39;
- // configuration for data layer
- optional DataProto sharddata_conf = 32;
- // configuration for slice layer
- optional SliceProto slice_conf = 41;
- // configuration for softmax loss layer
- optional SoftmaxLossProto softmaxloss_conf = 40;
- // configuration for split layer
- optional SplitProto split_conf = 42;
- // configuration for tanh layer
- optional TanhProto tanh_conf = 43;
- // configuration for rbmvis layer
- optional RBMVisProto rbmvis_conf = 48;
- // configuration for rbmhid layer
- optional RBMHidProto rbmhid_conf = 49;
-
-
- // overrides the partition dimension for neural net
- optional int32 partition_dim =60 [default = -1];
- // names of parameters shared from other layers
- optional int32 partition_id = 90 [default = 0];
-}
+// ---------------------------
+// protos for different layers
+// ---------------------------
message RGBImageProto {
// scale factor for each pixel
@@ -272,7 +258,7 @@ message PrefetchProto {
}
message SplitProto {
- optional int32 num_splits = 1 [default =1];
+ optional int32 num_splits = 1 [default = 1];
}
// scaled tanh: A*tanh(B*x)
@@ -287,14 +273,14 @@ message SoftmaxLossProto {
// computing accuracy against topk results
optional int32 topk = 1 [default = 1];
// loss scale factor
- optional float scale= 30 [default = 1];
+ optional float scale = 30 [default = 1];
}
message ConvolutionProto {
// The number of outputs for the layer
required int32 num_filters = 1;
// the kernel height/width
- required int32 kernel= 2;
+ required int32 kernel = 2;
// The padding height/width
optional int32 pad = 30 [default = 0];
@@ -377,7 +363,7 @@ message LRNProto {
// normalization objective
optional NormRegion norm_region = 33 [default = ACROSS_CHANNELS];
// offset
- optional float knorm =34 [default = 1.0];
+ optional float knorm = 34 [default = 1.0];
}
message PoolingProto {
@@ -395,7 +381,7 @@ message PoolingProto {
optional uint32 stride = 32 [default = 1];
}
-message SliceProto{
+message SliceProto {
required int32 slice_dim = 1;
}
@@ -406,83 +392,151 @@ message ReLUProto {
optional float negative_slope = 1 [default = 0];
}
-message UpdaterProto {
- enum UpdaterType{
- // noraml SGD with momentum and weight decay
- kSGD = 1;
- // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
- kAdaGrad = 2;
- // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
- kRMSProp = 3;
- // Nesterov first optimal gradient method
- kNesterov = 4;
- }
- // updater type
- required UpdaterType type = 1 [default=kSGD];
- // configuration for RMSProp algorithm
- optional RMSPropProto rmsprop_conf = 50;
-
- enum ChangeMethod {
- kFixed = 0;
- kInverseT = 1;
- kInverse = 2;
- kExponential = 3;
- kLinear = 4;
- kStep = 5;
- kFixedStep = 6;
- }
- // change method for learning rate
- required ChangeMethod lr_change= 2 [default = kFixed];
-
- optional FixedStepProto fixedstep_conf=40;
- optional StepProto step_conf=41;
- optional LinearProto linear_conf=42;
- optional ExponentialProto exponential_conf=43;
- optional InverseProto inverse_conf=44;
- optional InverseTProto inverset_conf=45;
-
- optional float momentum = 31 [default = 0];
- optional float weight_decay = 32 [default = 0];
- // base learning rate
- optional float base_lr = 34 [default = 0];
- // used to avoid divide by 0, i.e. x/(y+delta)
- optional float delta = 35 [default = 0.00000001];
-}
-
-message RMSPropProto{
+message RMSPropProto {
// history=history*rho_+(1-rho_)*(grad*grad_scale);
required float rho = 1;
}
-message FixedStepProto{
+message FixedStepProto {
repeated int32 step = 28;
// lr = step_lr[i] if current step >= step[i]
repeated float step_lr = 29;
}
-message StepProto{
+message StepProto {
// lr = base_lr * gamma^(step/change_freq)
required float gamma = 35 [default = 1];
// lr = base_lr * gamma^(step/change_freq)
- required int32 change_freq= 40;
+ required int32 change_freq = 40;
}
-message LinearProto{
+
+message LinearProto {
// lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
required int32 change_freq= 40;
// lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
required float final_lr = 39;
}
-message ExponentialProto{
+
+message ExponentialProto {
// lr = base / 2^(step/change_freq)
- required int32 change_freq= 40;
+ required int32 change_freq = 40;
}
-message InverseTProto{
+
+message InverseTProto {
// lr = base_lr / (1+step/final_lr)
required float final_lr = 39;
}
-message InverseProto{
+message InverseProto {
// lr = base_lr*(1+gamma*step)^(-pow)
required float gamma = 1 [default = 1];
// lr = base_lr*(1+gamma*step)^(-pow)
required float pow = 2 [default = 0];
}
+
+// --------------
+// All Enum Types
+// --------------
+
+enum ChangeMethod {
+ kFixed = 0;
+ kInverseT = 1;
+ kInverse = 2;
+ kExponential = 3;
+ kLinear = 4;
+ kStep = 5;
+ kFixedStep = 6;
+}
+
+enum InitMethod {
+ // fix the values of all parameters to the constant given in the value field
+ kConstant = 0;
+ // sample gaussian with std and mean
+ kGaussian = 1;
+ // uniform sampling between low and high
+ kUniform = 2;
+ // copy the content and history which are from previous training
+ kPretrained = 3;
+ // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
+ // Gaussian distribution
+ kGaussainSqrtFanIn = 4;
+ // from Toronto Convnet, rectified linear activation, let
+ // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
+ // the program will multiply it.
+ kUniformSqrtFanIn = 5;
+ // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)). for tanh
+ // activation, range is [-a, +a], for sigmoid activation, range is
+ // [-4a, +4a], put the scale factor to value field.
+ // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
+ kUniformSqrtFanInOut = 6;
+}
+
+enum LayerType {
+ // Data layers
+ // - Load records from file, database
+ kLMDBData = 17;
+ kPrefetch = 19;
+ kShardData = 3;
+ // Parser layers
+ // - Parse features from records, e.g., pixels
+ kLabel = 18;
+ kMnist = 7;
+ kRGBImage = 10;
+ // Neuron layers
+ // - Feature transformation
+ kConcate = 2;
+ kConvolution = 1;
+ kDropout = 4;
+ kInnerProduct = 5;
+ kLRN = 6;
+ kPooling = 8;
+ kReLU = 9;
+ kRBMHid = 24;
+ kRBMVis = 23;
+ kTanh = 14;
+ // Loss layers
+ // - Compute objective loss
+ kSoftmaxLoss = 11;
+ // Other layers
+ // - Connect layers when neural net is partitioned
+ kBridgeDst = 16;
+ kBridgeSrc = 15;
+ kSlice = 12;
+ kSplit = 13;
+}
+
+enum PartitionType {
+ kDataPartition = 0;
+ kLayerPartition = 1;
+ kNone = 2;
+}
+
+enum Phase {
+ kTrain = 0;
+ kValidation = 1;
+ kTest = 2;
+ // positive phase for contrastive divergence algorithm
+ kPositive = 3;
+ // negative phase for contrastive divergence algorithm
+ kNegative = 4;
+ kForward = 5;
+ kBackward = 6;
+ kLoss = 7;
+}
+
+enum TrainOneBatchAlg {
+ // Back-propagation algorithm for feed-forward models, e.g., CNN and RNN
+ kBP = 1;
+ // Contrastive Divergence algorithm for RBM, DBM, etc.
+ kCD = 2;
+}
+
+enum UpdaterType {
+ // normal SGD with momentum and weight decay
+ kSGD = 1;
+ // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
+ kAdaGrad = 2;
+ // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+ kRMSProp = 3;
+ // Nesterov's optimal first-order gradient method
+ kNesterov = 4;
+}
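Each learning-rate schedule above pairs a ChangeMethod value with its corresponding proto message (StepProto, LinearProto, FixedStepProto, etc.). As a rough sketch of how they fit together, the following updater block uses kStep; the base_lr, lr_change and step_conf field names are taken from the UpdaterProto accessors read in src/utils/updater.cc, while the exact text-format nesting in job.conf is an assumption:

updater {
  type: kSGD
  base_lr: 0.1
  lr_change: kStep
  step_conf {
    gamma: 0.5
    change_freq: 10000
  }
}

With these values GetLearningRate returns base_lr * gamma^(step/change_freq): 0.1 for steps 0-9999, 0.05 for steps 10000-19999, 0.025 for steps 20000-29999, and so on (step/change_freq is integer division, as the updater.cc hunk below notes).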
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 2f43f66..b470ea2 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -43,15 +43,15 @@ void Param::InitValues(int version){
Tensor<cpu, 1> data(mutable_cpu_data(), Shape1(size()));
auto random=TSingleton<Random<cpu>>::Instance();
switch (proto_.init_method()) {
- case ParamProto::kConstant:
+ case InitMethod::kConstant:
data = proto_.value();
break;
- case ParamProto::kUniform:
+ case InitMethod::kUniform:
random->SampleUniform(data, proto_.low(), proto_.high());
if(proto_.value() != 1)
data *= proto_.value();
break;
- case ParamProto::kUniformSqrtFanIn:
+ case InitMethod::kUniformSqrtFanIn:
random->SampleUniform(data, proto_.low(), proto_.high());
// only valid for param matrix with dim 1 for fan in
LOG(ERROR) << "init fan in";
@@ -59,17 +59,17 @@ void Param::InitValues(int version){
data *= proto_.value() / sqrt(data_->shape().at(1) / 3.0f);
LOG(ERROR) << "end fan in";
break;
- case ParamProto::kUniformSqrtFanInOut:
+ case InitMethod::kUniformSqrtFanInOut:
random->SampleUniform(data, proto_.low(), proto_.high());
if(proto_.value())
data *= proto_.value()/ sqrt(data_->shape()[0] +data_->shape()[1]);
break;
- case ParamProto::kGaussian:
+ case InitMethod::kGaussian:
random->SampleGaussian(data, proto_.mean(), proto_.std());
if(proto_.value() != 1)
data *= proto_.value();
break;
- case ParamProto::kGaussainSqrtFanIn:
+ case InitMethod::kGaussainSqrtFanIn:
random->SampleGaussian(data, proto_.mean(), proto_.std());
if(proto_.value())
data *= proto_.value()/ sqrt(data_->shape()[0]);
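For reference, a minimal param entry (nested inside a layer block, as declared by the repeated param field of LayerProto) that would exercise the kGaussian branch above might look as follows; the mean, std and value names come from the proto_ accessors in InitValues, and the concrete numbers are only an illustration:

param {
  name: "w1"
  init_method: kGaussian
  mean: 0
  std: 0.01
  value: 1
}

Because value is 1, the extra scaling (data *= proto_.value()) is skipped; any other value rescales the sampled weights.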
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4dee7b9c/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index b85982e..c038ca7 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -13,38 +13,38 @@ float Updater::GetLearningRate(int step) {
float ret = 0., r = 0., base = proto_.base_lr();
int freq = 0;
switch (proto_.lr_change()) {
- case UpdaterProto_ChangeMethod_kFixed:
+ case ChangeMethod::kFixed:
ret = base;
break;
- case UpdaterProto_ChangeMethod_kLinear:
+ case ChangeMethod::kLinear:
// a is init, b is the final
freq = proto_.linear_conf().change_freq();
r = step * 1.0 / freq;
ret = (1.0 - r) * base + r * proto_.linear_conf().final_lr();
break;
- case UpdaterProto_ChangeMethod_kExponential:
+ case ChangeMethod::kExponential:
// a is init, b is the final, from convnet
freq = proto_.exponential_conf().change_freq();
ret = base / pow(2, step * 1. / freq);
break;
- case UpdaterProto_ChangeMethod_kInverseT:
+ case ChangeMethod::kInverseT:
// a is init, b is the final, from convnet
CHECK_EQ(base, 2 * proto_.inverset_conf().final_lr())
<< "final value should be the half";
ret = base / (1. + step * 1. / proto_.inverset_conf().final_lr());
break;
- case UpdaterProto_ChangeMethod_kInverse:
+ case ChangeMethod::kInverse:
// a is init, b is gamma, c is pow
ret = base * pow(1.f + proto_.inverse_conf().gamma() * step,
- proto_.inverse_conf().pow());
break;
- case UpdaterProto_ChangeMethod_kStep:
+ case ChangeMethod::kStep:
// a is the base learning rate, b is gamma, from caffe
      // notice it is step/change_freq (integer division), not step*1.0/change_freq
freq = proto_.step_conf().change_freq();
ret = base * pow(proto_.step_conf().gamma(), step / freq);
break;
- case UpdaterProto_ChangeMethod_kFixedStep:
+ case ChangeMethod::kFixedStep:
for (int i = 0; i < proto_.fixedstep_conf().step_size(); i++) {
if (step > proto_.fixedstep_conf().step(i))
ret = proto_.fixedstep_conf().step_lr(i);
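To make the kFixedStep branch concrete, consider a hypothetical schedule (values invented for illustration, following the repeated step / step_lr pairing in FixedStepProto):

fixedstep_conf {
  step: 0
  step: 4000
  step: 8000
  step_lr: 0.01
  step_lr: 0.001
  step_lr: 0.0005
}

The loop (as shown, with no early break) keeps overwriting ret, so the last step_lr whose threshold has been passed wins: at step 5000 the rate is 0.001 and at step 9000 it is 0.0005. Since the comparison is strict (step > step(i)), at exactly step 4000 the rate is still 0.01.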