You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2015/05/27 16:39:13 UTC

[12/22] incubator-singa git commit: add retry for worker connecting to zk, when its subscribed server group is not up yet

Add a retry loop for a worker connecting to ZooKeeper (zk) when its subscribed server group is not up yet


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/96121bae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/96121bae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/96121bae

Branch: refs/heads/master
Commit: 96121bae15b4fe2dbd8541f5daf47ab91d080bb7
Parents: cd9fc79
Author: wangsheng <wa...@gmail.com>
Authored: Tue May 26 16:52:39 2015 +0800
Committer: wangsheng <wa...@gmail.com>
Committed: Tue May 26 16:52:39 2015 +0800

----------------------------------------------------------------------
 Makefile.example           |  2 +-
 include/utils/cluster.h    | 13 -------------
 include/utils/cluster_rt.h |  2 ++
 src/utils/cluster_rt.cc    | 33 +++++++++++++++++++++------------
 4 files changed, 24 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/Makefile.example
----------------------------------------------------------------------
diff --git a/Makefile.example b/Makefile.example
index 582e8d7..6d8d83a 100644
--- a/Makefile.example
+++ b/Makefile.example
@@ -51,7 +51,7 @@ OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) )
 .PHONY: singa test
 
 singa: $(PROTO_OBJS) $(SINGA_OBJS)
-	$(CXX) $(SINGA_OBJS) src/main.cc -o $(BUILD_DIR)/singa $(CXXFLAGS) $(LDFLAGS)
+	$(CXX) $(SINGA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS)
 	@echo
 
 loader: proto $(LOADER_OBJS)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/include/utils/cluster.h
----------------------------------------------------------------------
diff --git a/include/utils/cluster.h b/include/utils/cluster.h
index 563045d..d7ac365 100644
--- a/include/utils/cluster.h
+++ b/include/utils/cluster.h
@@ -109,19 +109,6 @@ class Cluster {
   }
    */
 
-  //ClusterRuntime functions
-  bool server_watch(int gid, int sid) const {
-    return false;
-  }
-
-  bool worker_join_sgroup(int gid, int wid, int server_group) const {
-    return false;
-  }
-
-  bool worker_leave_sgroup(int gid, int wid, int s_group) const {
-    return false;
-  }
-
   shared_ptr<ClusterRuntime> runtime() const {
     return cluster_rt_;
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/include/utils/cluster_rt.h
----------------------------------------------------------------------
diff --git a/include/utils/cluster_rt.h b/include/utils/cluster_rt.h
index 1430119..54a13c5 100644
--- a/include/utils/cluster_rt.h
+++ b/include/utils/cluster_rt.h
@@ -74,6 +74,8 @@ class ZKClusterRT : public ClusterRuntime{
   vector<RTCallback *> cb_vec_;
     
   const int MAX_BUF_LEN = 50;
+  const int RETRY_NUM = 10;
+  const int SLEEP_SEC = 1;
 };
 
 } // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/src/utils/cluster_rt.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc
index 6a12ca9..b97fadc 100644
--- a/src/utils/cluster_rt.cc
+++ b/src/utils/cluster_rt.cc
@@ -81,21 +81,30 @@ bool ZKClusterRT::wJoinSGroup(int gid, int wid, int s_group){
   string path = getSGroupPath(s_group) + getWorkerPath(gid, wid);
   char buf[MAX_BUF_LEN];
 
-  int ret = zoo_create(zkhandle_, path.c_str(), NULL, -1, &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, buf, MAX_BUF_LEN);
-  if (ret == ZOK){
-    LOG(INFO) << "zookeeper node " << buf << " created";
-    return true;
-  }
-  else if (ret == ZNODEEXISTS){
-    LOG(WARNING) << "zookeeper node " << path << " already exist";
-    return true;
-  }
-  else if (ret == ZNONODE){
-    LOG(ERROR) << "zookeeper parent node " << getSGroupPath(s_group) << " not exist";
+  //try to create a file under the server group path
+  for (int i = 0; i < RETRY_NUM; ++i){
+    //send the zk request
+    int ret = zoo_create(zkhandle_, path.c_str(), NULL, -1, &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, buf, MAX_BUF_LEN);
+
+    if (ret == ZOK){
+      LOG(INFO) << "zookeeper node " << buf << " created";
+      return true;
+    }
+    else if (ret == ZNODEEXISTS){
+      LOG(WARNING) << "zookeeper node " << path << " already exist";
+      return true;
+    }
+    //the parent node is not on, need to wait
+    else if (ret == ZNONODE){
+      LOG(WARNING) << "zookeeper parent node " << getSGroupPath(s_group) << " not exist, retry later";
+      sleep(SLEEP_SEC);
+    }
+    
+    LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_create)";
     return false;
   }
 
-  LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_create)";
+  LOG(ERROR) << "zookeeper parent node " << getSGroupPath(s_group) << "still not exist after " << RETRY_NUM << " tries";
   return false;
 }