Posted to commits@singa.apache.org by wa...@apache.org on 2015/08/03 04:38:59 UTC

incubator-singa git commit: SINGA-42 Issue when loading checkpoints

Repository: incubator-singa
Updated Branches:
  refs/heads/master 06163950b -> a92a1c778


SINGA-42 Issue when loading checkpoints

Update the default value of ModelProto::reset_param_version to true.
When it is true, the versions of all parameters loaded from checkpoints are reset to ModelProto::step.

For resuming training from checkpoints, if users do not set this field, the Trainer::Resume() function
will set it to false. Hence the parameter versions will continue from the last checkpoint.
If users set it to true, then all parameter versions will be reset to ModelProto::step.

If using the checkpoint as pre-training to initialize the parameters of a new model,
users should keep the default value (i.e., true);
otherwise some parameters' versions would be much larger than others'.
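
For illustration, a job configuration for pre-training-style initialization could contain a fragment
like the one below (prototxt text format; the checkpoint path and step value are hypothetical, only
the field names follow ModelProto in src/proto/job.proto):

    # illustrative configuration fragment; the path below is hypothetical
    checkpoint: "examples/mnist/checkpoint/step6000-worker0.bin"
    reset_param_version: true   # default; loaded params start at version `step`
    step: 0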

Fix a bug from calling Worker::Put() in Worker::InitLocalParams().

Previously, the version/step passed to Put() was step_, which starts from ModelProto::step.
Hence, the params at the servers were put with version step_.
When the params are loaded from checkpoint files and their versions are not reset,
the training may get stuck after one iteration.
If the start step is small (usually 0), the param version at the server
side is small (i.e., 0), while local_version() (assigned from version()) is the one from the last checkpoint,
which is large. Hence the Worker::Collect() function will get stuck.
To fix this bug, just pass the current param->version() to Worker::Put().
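
The stall can be pictured with the toy program below (an illustrative sketch only, not the actual
SINGA code; it just assumes that Collect() blocks until the server-side copy of a param catches up
with the worker's local_version()):

    // Illustrative sketch (not SINGA code) of why Collect() stalls when versions diverge.
    #include <iostream>

    int main() {
      int checkpoint_version = 10000;          // version() loaded from the checkpoint file
      int local_version = checkpoint_version;  // local_version() is assigned from version()
      int server_version = 0;                  // old code: Put(param, step_) with step_ == 0

      // Collect() effectively waits until the server copy reaches local_version.
      // With server_version == 0 and local_version == 10000 the wait never ends,
      // so training hangs after the first iteration.
      if (server_version < local_version)
        std::cout << "stuck: server=" << server_version
                  << " < local=" << local_version << std::endl;

      // The fix puts the param with its real version, i.e. Put(param, param->version()),
      // so the server starts at 10000 and Collect() returns immediately.
      server_version = checkpoint_version;
      if (server_version >= local_version)
        std::cout << "ok: Collect() would return" << std::endl;
      return 0;
    }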

Remove the hard-coded check of the label value in layer.cc.

Fix a bug in setting the checkpoint file path in Trainer::Resume().
Now Trainer::Resume() will clear the checkpoint field in ModelProto and add
the checkpoint files it finds under WORKSPACE/checkpoint/.
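
After Trainer::Resume() runs, the effective model configuration would look roughly like the fragment
below (illustrative; the step value and file names are hypothetical, the fields are the ones set in
the trainer.cc diff further down):

    # illustrative ModelProto state after Trainer::Resume(); step/paths are hypothetical
    step: 6000
    reset_param_version: false    # set by Resume() because the user did not set it
    checkpoint: "WORKSPACE/checkpoint/step6000-worker0.bin"
    checkpoint: "WORKSPACE/checkpoint/step6000-worker1.bin"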


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a92a1c77
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a92a1c77
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a92a1c77

Branch: refs/heads/master
Commit: a92a1c7786c2f65344f8f3ff1cfc4aa545724b09
Parents: 0616395
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu Jul 30 12:24:58 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Aug 1 12:10:56 2015 +0800

----------------------------------------------------------------------
 src/neuralnet/layer.cc |  4 ++--
 src/proto/job.proto    |  2 +-
 src/trainer/trainer.cc | 13 ++++++++-----
 src/trainer/worker.cc  | 20 +++++++++++++++-----
 4 files changed, 26 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a92a1c77/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index 926bd17..1fa92fb 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -385,7 +385,7 @@ void LabelLayer::ParseRecords(Phase phase, const vector<Record>& records,
   float *label= blob->mutable_cpu_data() ;
   for(const Record& record: records){
     label[rid++]=record.image().label();
-    CHECK_LT(record.image().label(),10);
+    //  CHECK_LT(record.image().label(),10);
   }
   CHECK_EQ(rid, blob->shape()[0]);
 }
@@ -738,7 +738,7 @@ void SoftmaxLossLayer::ComputeFeature(Phase phase, Metric* perf) {
   float loss=0, precision=0;
   for(int n=0;n<batchsize_;n++){
     int ilabel=static_cast<int>(label[n]);
-    CHECK_LT(ilabel,10);
+    //  CHECK_LT(ilabel,10);
     CHECK_GE(ilabel,0);
     float prob_of_truth=probptr[ilabel];
     loss-=log(std::max(prob_of_truth, FLT_MIN));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a92a1c77/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 025f256..3b22470 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -95,7 +95,7 @@ message ModelProto {
   // checkpoint files
   repeated string checkpoint = 66;
   // reset the version of params loaded from checkpoint file to step
-  optional bool reset_param_version = 67 [default = false];
+  optional bool reset_param_version = 67 [default = true];
   //number of steps for gibbs sampling
   optional int32 pcd_k=69 [default=15];
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a92a1c77/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index 9f245a2..7a20e53 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -200,7 +200,7 @@ void Trainer::Resume(ModelProto* modelConf) {
   tinydir_open(&dir, folder.c_str());
   int latest_step = 0;
   // there would be multi checkpoint files (from diff workers) for one step
-  vector<char *> ck_files;
+  vector<string> ck_files;
   // iterate all files to get the files for the last checkpoint
   while (dir.has_next) {
     tinydir_file file;
@@ -213,21 +213,24 @@ void Trainer::Resume(ModelProto* modelConf) {
       continue;
     }
 
-    LOG(ERROR) << ch;
+    LOG(INFO) << "Add checkpoint file for resume: " << ch;
     int step = atoi(ch+4);
     if (step == latest_step) {
       ck_files.push_back(file.name);
     } else if(step > latest_step) {
       latest_step = step;
       ck_files.clear();
-      ck_files.push_back(file.name);
+      ck_files.push_back(string(file.name));
     }
   }
 
   if (latest_step > 0) {
     modelConf->set_step(latest_step);
+    if (!modelConf->has_reset_param_version())
+      modelConf->set_reset_param_version(false);
+    modelConf->clear_checkpoint();
     for (auto ck_file : ck_files)
-      modelConf->add_checkpoint(folder + "/" +string(ck_file));
+      modelConf->add_checkpoint(folder + "/" + ck_file);
   }
   tinydir_close(&dir);
 }
@@ -310,7 +313,7 @@ void Trainer::Run(
       } else if (sock == nullptr) {
         if (nserver_grps > 1 && bandwidth(trans_size, start) < max_bandwidth) {
           Msg* msg = GenSyncReminderMsg(sync_server_id, servers);
-          router_->Send(&msg);
+          router_->Send(&msg) ;
           sync_server_id = (sync_server_id + 1) % nservers;
         }
         continue;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a92a1c77/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index d9c6a59..36ba8de 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -43,7 +43,7 @@ void Worker::InitLocalParams() {
   // for each server grp, its first subscriber worker grp does the param init
   if (grp_id_ % Cluster::Get()->nworker_groups_per_server_group() == 0) {
     // extract params that should be initialized by this worker
-    // Must gen a name for each param if the user doesn't config it
+    // must gen a name for each param if the user doesn't config it
     std::unordered_map<string, Param*> name2param;
     for (auto layer: train_net_->layers()){
       if (layer->partition_id() == id_) {
@@ -56,7 +56,9 @@ void Worker::InitLocalParams() {
         }
       }
     }
-    // load from checkpoint. Get param blob based on param name
+    // load from checkpoints. get param blob based on param name.
+    // the param from previous checkpoint files will be overwritten by
+    // the param with the same name in later checkpoint files.
     for (const auto checkpoint : modelproto_.checkpoint()) {
       LOG(INFO) << "Load from checkpoint file " << checkpoint;
       BlobProtos bps;
@@ -64,14 +66,22 @@ void Worker::InitLocalParams() {
       for (int i = 0; i < bps.name_size(); i++) {
         if (name2param.find(bps.name(i)) != name2param.end()) {
           name2param.at(bps.name(i))->FromProto(bps.blob(i));
-          name2param.at(bps.name(i))->set_version(bps.version(i));
+          //  if load from pre-training params, reset version to start step
+          if(modelproto_.reset_param_version())
+            name2param.at(bps.name(i))->set_version(modelproto_.step());
+          else  // if resume training, use the same version as last checkpoint
+            name2param.at(bps.name(i))->set_version(bps.version(i));
         }
       }
     }
     // init other params who do not have checkpoint version
     for (auto entry : name2param)
-      if (entry.second->version() < 0 || modelproto_.reset_param_version())
+      if (entry.second->version() < 0) {
         entry.second->InitValues(modelproto_.step());
+        if (!modelproto_.reset_param_version())
+          LOG(ERROR) << "better reset version of params from checkpoints "
+            << "to the same as other newly initialized params!";
+      }
 
     Metric perf;
     // warmup training before put params to servers
@@ -81,7 +91,7 @@ void Worker::InitLocalParams() {
       if (layer->partition_id() == id_)
         for (auto param : layer->GetParams())
           if (param->owner() == param->id())
-            Put(param, step_);
+            Put(param, param->version());
     }
   }
   // wait owners in the same procs init params, then no get requests sent