You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2016/06/23 08:32:16 UTC

incubator-singa git commit: SINGA-201 Error when running Mesos

Repository: incubator-singa
Updated Branches:
  refs/heads/master d547a8610 -> 1ca8c638b


SINGA-201 Error when running Mesos

A bug was reported (https://issues.apache.org/jira/browse/SINGA-201) when
launching SINGA on Mesos in fully distributed mode.

The main cause was determined to be ZeroMQ binding to localhost. In fully
distributed mode, SINGA on each node should be passed a `-host` flag specifying
the public IP address of the local host.

The Mesos scheduler is modified accordingly:

1. When a Mesos slave starts connecting to the master, it passes the `--hostname` flag specifying its public IP address

2. The scheduler now sends each executor a command of the form:

          `singa -conf ./job.conf -singa_conf ./singa.conf -singa_job XX -host XX`


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/1ca8c638
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/1ca8c638
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/1ca8c638

Branch: refs/heads/master
Commit: 1ca8c638b132009e213fda8e02e77cc2d09fb824
Parents: d547a86
Author: Anh Dinh <ug...@gmail.com>
Authored: Thu Jun 23 03:53:45 2016 +0000
Committer: Anh Dinh <ug...@gmail.com>
Committed: Thu Jun 23 12:26:35 2016 +0800

----------------------------------------------------------------------
 tool/mesos/singa_scheduler.cc | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/1ca8c638/tool/mesos/singa_scheduler.cc
----------------------------------------------------------------------
diff --git a/tool/mesos/singa_scheduler.cc b/tool/mesos/singa_scheduler.cc
index c9b72d4..dc48ae2 100644
--- a/tool/mesos/singa_scheduler.cc
+++ b/tool/mesos/singa_scheduler.cc
@@ -180,7 +180,7 @@ class SingaScheduler: public mesos::Scheduler {
         // store in temporary map
         new_tasks->push_back(task);
         tasks_[offer.id().value()] = new_tasks;
-
+        hostnames_[offer.id().value()] = offer.hostname();
         nhosts_++;
       }
 
@@ -197,7 +197,7 @@ class SingaScheduler: public mesos::Scheduler {
         // launch tasks
         for (map<string, vector<mesos::TaskInfo>*>::iterator it =
             tasks_.begin(); it != tasks_.end(); ++it) {
-          prepare_tasks(it->second, job_counter_, path);
+          prepare_tasks(it->second, hostnames_[it->first], job_counter_, path);
           mesos::OfferID newId;
           newId.set_value(it->first);
           LOG(INFO) << "Launching task with offer ID = " << newId.value();
@@ -219,12 +219,11 @@ class SingaScheduler: public mesos::Scheduler {
     virtual void statusUpdate(SchedulerDriver* driver,
         const mesos::TaskStatus& status) {
       if (status.state() == mesos::TASK_FINISHED)
-        task_counter_--; 
+        task_counter_--;
 
-      if (task_counter_ == 0)
+      if (task_counter_ == 0) {
         driver->stop();
-
-      else if (status.state() == mesos::TASK_FAILED) {
+      } else if (status.state() == mesos::TASK_FAILED) {
         LOG(ERROR) << "TASK FAILED !!!!";
         driver->abort();
       }
@@ -252,14 +251,14 @@ class SingaScheduler: public mesos::Scheduler {
     /**
      * Helper function that initialize TaskInfo with the correct URI and command
      */
-    void prepare_tasks(vector<mesos::TaskInfo> *tasks, int job_id, string job_conf) {
+    void prepare_tasks(vector<mesos::TaskInfo> *tasks, string hostname, int job_id, string job_conf) {
       char path_sys_config[512], path_job_config[512];
       // path to singa.conf
       snprintf(path_sys_config, 512, "hdfs://%s%s", namenode_.c_str(), SINGA_CONFIG);
       snprintf(path_job_config, 512, "hdfs://%s%s", namenode_.c_str(), job_conf.c_str());
 
       char command[512];
-      snprintf(command, 512, "singa -conf ./job.conf -singa_conf ./singa.conf -singa_job %d", job_id);
+      snprintf(command, 512, "singa -conf ./job.conf -singa_conf ./singa.conf -singa_job %d -host %s", job_id, hostname.c_str());
 
       for (int i=0; i < tasks->size(); i++) {
         mesos::CommandInfo *comm = (tasks->at(i)).mutable_command();
@@ -366,6 +365,8 @@ class SingaScheduler: public mesos::Scheduler {
     int nhosts_;
     // temporary map of tasks: <offerID, TaskInfo>
     map<string, vector<mesos::TaskInfo>*> tasks_;
+    // temporary map of offerID to slave IP addresses
+    map<string, string> hostnames_; 
     // SINGA job config file
     string job_conf_file_;
     // HDFS namenode