You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2015/05/28 10:26:22 UTC

incubator-singa git commit: fix bugs for early server termination

Repository: incubator-singa
Updated Branches:
  refs/heads/master 921f9277f -> 6bcaaaa4d


fix bugs for early server termination

-- clean SINGA metadata in ZooKeeper before running a SINGA job


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6bcaaaa4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6bcaaaa4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6bcaaaa4

Branch: refs/heads/master
Commit: 6bcaaaa4dc873bd54284e69720f358b540b2d370
Parents: 921f927
Author: wang sheng <wa...@gmail.com>
Authored: Thu May 28 15:04:08 2015 +0800
Committer: wang sheng <wa...@gmail.com>
Committed: Thu May 28 15:04:08 2015 +0800

----------------------------------------------------------------------
 README.md               |  2 +-
 bin/singa-cleanup.sh    |  8 +++++++-
 bin/singa-run.sh        | 21 +++++++++++----------
 src/utils/cluster_rt.cc | 33 +++++++++++++++++++++++----------
 4 files changed, 42 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 66a2964..5effdb8 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ SINGA is developed and tested on Linux platforms with the following external lib
 
   * czmq version >= 3
 
-  * zookeeper version >= 3.4.6
+  * zookeeper version 3.4.6
 
 Tips:
 For libraries like openblas, opencv, older versions may also work, because we do not use any newly added features.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/bin/singa-cleanup.sh
----------------------------------------------------------------------
diff --git a/bin/singa-cleanup.sh b/bin/singa-cleanup.sh
index c38d1e6..f94c9db 100755
--- a/bin/singa-cleanup.sh
+++ b/bin/singa-cleanup.sh
@@ -33,5 +33,11 @@ usage="Usage: singa-cleanup.sh"
 BIN=`dirname "${BASH_SOURCE-$0}"`
 BIN=`cd "$BIN">/dev/null; pwd`
 BASE=`cd "$BIN/..">/dev/null; pwd`
+ZKDATADIR="/tmp/zookeeper"
+
+. $BIN/zk-service.sh stop 2>/dev/null
+
+echo cleanning data in zookeeper...
+#remove zk data
+rm -r $ZKDATADIR
 
-$BIN/zk-service.sh stop

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/bin/singa-run.sh
----------------------------------------------------------------------
diff --git a/bin/singa-run.sh b/bin/singa-run.sh
index 3ee50a3..c911ec3 100755
--- a/bin/singa-run.sh
+++ b/bin/singa-run.sh
@@ -34,22 +34,23 @@ BIN=`dirname "${BASH_SOURCE-$0}"`
 BIN=`cd "$BIN">/dev/null; pwd`
 BASE=`cd "$BIN/..">/dev/null; pwd`
 
-#get argument
-cmd=$1
-
 cd $BASE
 
-$BIN/zk-service.sh start
+#cleanup singa data
+. $BIN/singa-cleanup.sh
+
+#start zookeeper
+. $BIN/zk-service.sh start 2>/dev/null
 
 #wait for zk service to be up
 sleep 3
 
+#run singa
+cmd="./singa "$@
 echo starting singa ...
+echo executing: $cmd
+exec $cmd
 
-echo "./singa" $@
-#. ./singa $@
-. ./singa $@
-
+#stop zookeeper
 echo stopping singa ...
-
-$BIN/zk-service.sh stop
+. $BIN/zk-service.sh stop 2>/dev/null

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/src/utils/cluster_rt.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc
index fe9850f..b60b334 100644
--- a/src/utils/cluster_rt.cc
+++ b/src/utils/cluster_rt.cc
@@ -157,17 +157,30 @@ void ZKClusterRT::childChanges(zhandle_t *zh, int type, int state, const char *p
   RTCallback *cb = (RTCallback *)watcherCtx;
   if (cb->fn == nullptr) return;
 
-  struct String_vector child;
-  //check the child list and put another watcher
-  int ret = zoo_wget_children(zh, path, childChanges, watcherCtx, &child);
-  LOG(INFO) << "ret = " << ret;
-  if (ret == ZOK){
-    LOG(INFO) << "child.count = " << child.count;
-    if (child.count == 0){
-      //all workers leave, we do callback now
-      (*cb->fn)(cb->ctx);
-      cb->fn = nullptr;
+  if (type == ZOO_CHILD_EVENT){
+    struct String_vector child;
+    //check the child list and put another watcher
+    int ret = zoo_wget_children(zh, path, childChanges, watcherCtx, &child);
+    LOG(INFO) << "ret = " << ret;
+    if (ret == ZOK){
+      LOG(INFO) << "child.count = " << child.count;
+      if (child.count == 0){
+        //LOG(ERROR) << "do call back";
+        //LOG(ERROR) << "type = " << type;
+        //LOG(ERROR) << "state = " << state;
+        //LOG(ERROR) << "path = " << path;
+        
+        //all workers leave, we do callback now
+        (*cb->fn)(cb->ctx);
+        cb->fn = nullptr;
+      }
     }
+    else{
+      LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_wget_children)";
+    }
+  }
+  else{
+    LOG(ERROR) << "Unhandled callback type code: "<< type;
   }
 }