You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2015/07/18 10:38:41 UTC

[3/3] incubator-singa git commit: SINGA-32 Implement synchronous training framework

SINGA-32 Implement synchronous training framework

Fix a bug in InitLocalParams() of the Worker class.
A worker owns a Param if the Param's data blob is not shared from other workers.
Previously, a Worker would not send a Get request for a Param that it owns.
However, it may not have initialized that Param locally, because its group may not
be the first group in the group set that subscribes to the same server group.

To fix the bug, all workers now send Get requests for the Params in their local layers.
There is no extra cost for getting Params owned by the worker (from the first group),
because the Get request would not be sent (the Param version is already the latest).


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/96bedb22
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/96bedb22
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/96bedb22

Branch: refs/heads/master
Commit: 96bedb2264f7d4ebd8a2a0cad67dc9a91f5419c9
Parents: 585e275
Author: wang wei <wa...@comp.nus.edu.sg>
Authored: Fri Jul 17 16:04:21 2015 +0800
Committer: wang wei <wa...@comp.nus.edu.sg>
Committed: Fri Jul 17 16:04:21 2015 +0800

----------------------------------------------------------------------
 src/trainer/trainer.cc | 2 +-
 src/trainer/worker.cc  | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96bedb22/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index 3ecaad0..44c37ea 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -345,7 +345,7 @@ void Trainer::DisplayMetric(Msg** msg) {
     char prefix[128];
     msgg->ParseFormatFrame("s", prefix);
     CHECK(msgg->NextFrame());
-    const string perf(static_cast<char*>(msgg->FrameData()), msgg->FrameSize());;
+    const string perf(static_cast<char*>(msgg->FrameData()), msgg->FrameSize());
     Metric cur(perf);
     LOG(ERROR) << prefix << " step-" << step <<", " << cur.ToLogString();
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96bedb22/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index bf98f0b..e1f2a41 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -68,8 +68,7 @@ void Worker::InitLocalParams() {
   for (auto layer : train_net_->layers()) {
     if (layer->partition_id() == id_)
       for (auto param : layer->GetParams())
-        if (param->owner() != param->id())
-          Get(param, modelproto_.warmup_steps());
+        Get(param, modelproto_.warmup_steps());
   }
 }
 
@@ -114,7 +113,7 @@ void Worker::Run() {
       Test(modelproto_.test_steps(), kTest, test_net_);
     }
     TrainOneBatch(step_, &perf);
-    //LOG(ERROR)<<"Train "<<step;
+    // LOG(ERROR) << "Train " << step_;
     if (DisplayNow(step_)) {
       Report("Train", perf);
       perf.Reset();