You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2015/05/28 10:26:22 UTC
incubator-singa git commit: fix bugs for early server termination
Repository: incubator-singa
Updated Branches:
refs/heads/master 921f9277f -> 6bcaaaa4d
fix bugs for early server termination
-- clean SINGA metadata in ZooKeeper before running a SINGA job
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6bcaaaa4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6bcaaaa4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6bcaaaa4
Branch: refs/heads/master
Commit: 6bcaaaa4dc873bd54284e69720f358b540b2d370
Parents: 921f927
Author: wang sheng <wa...@gmail.com>
Authored: Thu May 28 15:04:08 2015 +0800
Committer: wang sheng <wa...@gmail.com>
Committed: Thu May 28 15:04:08 2015 +0800
----------------------------------------------------------------------
README.md | 2 +-
bin/singa-cleanup.sh | 8 +++++++-
bin/singa-run.sh | 21 +++++++++++----------
src/utils/cluster_rt.cc | 33 +++++++++++++++++++++++----------
4 files changed, 42 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 66a2964..5effdb8 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ SINGA is developed and tested on Linux platforms with the following external lib
* czmq version >= 3
- * zookeeper version >= 3.4.6
+ * zookeeper version 3.4.6
Tips:
For libraries like openblas, opencv, older versions may also work, because we do not use any newly added features.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/bin/singa-cleanup.sh
----------------------------------------------------------------------
diff --git a/bin/singa-cleanup.sh b/bin/singa-cleanup.sh
index c38d1e6..f94c9db 100755
--- a/bin/singa-cleanup.sh
+++ b/bin/singa-cleanup.sh
@@ -33,5 +33,11 @@ usage="Usage: singa-cleanup.sh"
BIN=`dirname "${BASH_SOURCE-$0}"`
BIN=`cd "$BIN">/dev/null; pwd`
BASE=`cd "$BIN/..">/dev/null; pwd`
+ZKDATADIR="/tmp/zookeeper"
+
+. $BIN/zk-service.sh stop 2>/dev/null
+
+echo cleanning data in zookeeper...
+#remove zk data
+rm -r $ZKDATADIR
-$BIN/zk-service.sh stop
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/bin/singa-run.sh
----------------------------------------------------------------------
diff --git a/bin/singa-run.sh b/bin/singa-run.sh
index 3ee50a3..c911ec3 100755
--- a/bin/singa-run.sh
+++ b/bin/singa-run.sh
@@ -34,22 +34,23 @@ BIN=`dirname "${BASH_SOURCE-$0}"`
BIN=`cd "$BIN">/dev/null; pwd`
BASE=`cd "$BIN/..">/dev/null; pwd`
-#get argument
-cmd=$1
-
cd $BASE
-$BIN/zk-service.sh start
+#cleanup singa data
+. $BIN/singa-cleanup.sh
+
+#start zookeeper
+. $BIN/zk-service.sh start 2>/dev/null
#wait for zk service to be up
sleep 3
+#run singa
+cmd="./singa "$@
echo starting singa ...
+echo executing: $cmd
+exec $cmd
-echo "./singa" $@
-#. ./singa $@
-. ./singa $@
-
+#stop zookeeper
echo stopping singa ...
-
-$BIN/zk-service.sh stop
+. $BIN/zk-service.sh stop 2>/dev/null
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/src/utils/cluster_rt.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc
index fe9850f..b60b334 100644
--- a/src/utils/cluster_rt.cc
+++ b/src/utils/cluster_rt.cc
@@ -157,17 +157,30 @@ void ZKClusterRT::childChanges(zhandle_t *zh, int type, int state, const char *p
RTCallback *cb = (RTCallback *)watcherCtx;
if (cb->fn == nullptr) return;
- struct String_vector child;
- //check the child list and put another watcher
- int ret = zoo_wget_children(zh, path, childChanges, watcherCtx, &child);
- LOG(INFO) << "ret = " << ret;
- if (ret == ZOK){
- LOG(INFO) << "child.count = " << child.count;
- if (child.count == 0){
- //all workers leave, we do callback now
- (*cb->fn)(cb->ctx);
- cb->fn = nullptr;
+ if (type == ZOO_CHILD_EVENT){
+ struct String_vector child;
+ //check the child list and put another watcher
+ int ret = zoo_wget_children(zh, path, childChanges, watcherCtx, &child);
+ LOG(INFO) << "ret = " << ret;
+ if (ret == ZOK){
+ LOG(INFO) << "child.count = " << child.count;
+ if (child.count == 0){
+ //LOG(ERROR) << "do call back";
+ //LOG(ERROR) << "type = " << type;
+ //LOG(ERROR) << "state = " << state;
+ //LOG(ERROR) << "path = " << path;
+
+ //all workers leave, we do callback now
+ (*cb->fn)(cb->ctx);
+ cb->fn = nullptr;
+ }
}
+ else{
+ LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_wget_children)";
+ }
+ }
+ else{
+ LOG(ERROR) << "Unhandled callback type code: "<< type;
}
}