You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2014/05/02 02:47:10 UTC

git commit: Improved log messages in master.

Repository: mesos
Updated Branches:
  refs/heads/master 757827764 -> 09d3900d0


Improved log messages in master.

Review: https://reviews.apache.org/r/20932


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/09d3900d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/09d3900d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/09d3900d

Branch: refs/heads/master
Commit: 09d3900d02210180bd05c547a8d2d61df7169314
Parents: 7578277
Author: Vinod Kone <vi...@twitter.com>
Authored: Wed Apr 30 15:54:09 2014 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Thu May 1 17:46:31 2014 -0700

----------------------------------------------------------------------
 src/master/master.cpp | 139 ++++++++++++++++++++-------------------------
 src/master/master.hpp |   7 +++
 2 files changed, 69 insertions(+), 77 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/09d3900d/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index f205dca..8efa2c9 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -686,14 +686,12 @@ void Master::exited(const UPID& pid)
   //         its tasks transitioned to LOST and resources recovered.
   foreachvalue (Slave* slave, slaves.activated) {
     if (slave->pid == pid) {
-      LOG(INFO) << "Slave " << slave->id << " (" << slave->info.hostname()
-                << ") disconnected";
+      LOG(INFO) << "Slave " << *slave << " disconnected";
 
       if (!slave->info.checkpoint()) {
         // Remove the slave, if it is not checkpointing.
-        LOG(INFO) << "Removing disconnected slave " << slave->id
-                  << "(" << slave->info.hostname() << ") "
-                  << "because it is not checkpointing!";
+        LOG(INFO) << "Removing disconnected slave " << *slave
+                  << " because it is not checkpointing!";
         removeSlave(slave);
         return;
       } else if (!slave->disconnected) {
@@ -701,8 +699,7 @@ void Master::exited(const UPID& pid)
         disconnect(slave);
       } else {
         LOG(WARNING) << "Ignoring duplicate exited() notification for "
-                     << "checkpointing slave " << slave->id
-                     << " (" << slave->info.hostname() << ")";
+                     << "checkpointing slave " << *slave;
       }
     }
   }
@@ -1281,8 +1278,7 @@ void Master::disconnect(Slave* slave)
     Framework* framework = getFramework(frameworkId);
     if (framework != NULL && !framework->info.checkpoint()) {
       LOG(INFO) << "Removing non-checkpointing framework " << frameworkId
-                << " from disconnected slave " << slave->id
-                << "(" << slave->info.hostname() << ")";
+                << " from disconnected slave " << *slave;
 
       removeFramework(slave, framework);
     }
@@ -1805,8 +1801,7 @@ void Master::launchTasks(
 
   LOG(INFO) << "Processing reply for offers: "
             << stringify(offerIds)
-            << " on slave " << slave->id
-            << " (" << slave->info.hostname() << ")"
+            << " on slave " << *slave
             << " for framework " << framework->id;
 
   Resources usedResources; // Accumulated resources used.
@@ -1992,8 +1987,7 @@ void Master::killTask(
   // NOTE: This task will be properly reconciled when the
   // disconnected slave re-registers with the master.
   if (!slave->disconnected) {
-    LOG(INFO) << "Telling slave " << slave->id << " ("
-              << slave->info.hostname() << ")"
+    LOG(INFO) << "Telling slave " << *slave
               << " to kill task " << taskId
               << " of framework " << frameworkId;
 
@@ -2044,16 +2038,14 @@ void Master::schedulerMessage(
 
   if (slave->disconnected) {
     LOG(WARNING) << "Cannot send framework message for framework "
-                 << frameworkId << " to slave " << slaveId
-                 << " (" << slave->info.hostname() << ")"
+                 << frameworkId << " to slave " << *slave
                  << " because slave is disconnected";
     stats.invalidFrameworkMessages++;
     return;
   }
 
   LOG(INFO) << "Sending framework message for framework "
-            << frameworkId << " to slave " << slaveId
-            << " (" << slave->info.hostname() << ")";
+            << frameworkId << " to slave " << *slave;
 
   FrameworkToExecutorMessage message;
   message.mutable_slave_id()->MergeFrom(slaveId);
@@ -2099,14 +2091,14 @@ void Master::registerSlave(const UPID& from, const SlaveInfo& slaveInfo)
         // to register as a new slave. This could happen if the slave
         // failed recovery and hence registering as a new slave before
         // the master removed the old slave from its map.
-        LOG(INFO) << "Removing old disconnected slave " << slave->id << " ("
-                  << slave->info.hostname() << ") because a registration"
-                  << " attempt is being made from " << from;
+        LOG(INFO)
+          << "Removing old disconnected slave " << *slave
+          << " because a registration attempt is being made from " << from;
         removeSlave(slave);
         break;
       } else {
-        LOG(INFO) << "Slave " << slave->id << " (" << slave->info.hostname()
-                  << ") already registered, resending acknowledgement";
+        LOG(INFO) << "Slave " << *slave << " already registered,"
+                  << " resending acknowledgement";
         SlaveRegisteredMessage message;
         message.mutable_slave_id()->MergeFrom(slave->id);
         reply(message);
@@ -2129,6 +2121,9 @@ void Master::registerSlave(const UPID& from, const SlaveInfo& slaveInfo)
   SlaveInfo slaveInfo_ = slaveInfo;
   slaveInfo_.mutable_id()->CopyFrom(newSlaveId());
 
+  LOG(INFO) << "Registering slave at " << from << " ("
+            << slaveInfo.hostname() << ") with id " << slaveInfo.id();
+
   registrar->apply(Owned<Operation>(new AdmitSlave(slaveInfo_)))
     .onAny(defer(self(),
                  &Self::_registerSlave,
@@ -2166,8 +2161,7 @@ void Master::_registerSlave(
   } else {
     Slave* slave = new Slave(slaveInfo, slaveInfo.id(), pid, Clock::now());
 
-    LOG(INFO) << "Admitted slave on " << slave->info.hostname()
-              << " at " << slave->pid;
+    LOG(INFO) << "Registered slave " << *slave;
 
     addSlave(slave);
   }
@@ -2294,12 +2288,16 @@ void Master::reregisterSlave(
   // If we're already re-registering this slave, then no need to ask
   // the registrar again.
   if (slaves.reregistering.contains(slaveInfo.id())) {
-    LOG(INFO) << "Ignoring re-register slave message from slave "
-              << slaveInfo.id() << " (" << slaveInfo.hostname() << ") "
-              << "as readmission is already in progress";
+    LOG(INFO)
+      << "Ignoring re-register slave message from slave "
+      << slaveInfo.id() << " at " << from << " ("
+      << slaveInfo.hostname() << ") as readmission is already in progress";
     return;
   }
 
+  LOG(INFO) << "Re-registering slave " << slaveInfo.id() << " at " << from
+            << " (" << slaveInfo.hostname() << ")";
+
   slaves.reregistering.insert(slaveInfo.id());
 
   // This handles the case when the slave tries to re-register with
@@ -2348,8 +2346,7 @@ void Master::_reregisterSlave(
     Slave* slave = new Slave(slaveInfo, slaveInfo.id(), pid, Clock::now());
     slave->reregisteredTime = Clock::now();
 
-    LOG(INFO) << "Readmitted slave " << slave->id << " at "
-              << slave->pid << " (" << slave->info.hostname() << ")";
+    LOG(INFO) << "Re-registered slave " << *slave;
 
     readdSlave(slave, executorInfos, tasks, completedFrameworks);
 
@@ -2424,8 +2421,8 @@ void Master::statusUpdate(const StatusUpdate& update, const UPID& pid)
   // Forward the update to the framework.
   Try<Nothing> _forward = forward(update, pid);
   if (_forward.isError()) {
-    LOG(WARNING) << "Ignoring status update " << update << " from " << pid
-                 << " (" << slave->info.hostname() << "): " << _forward.error();
+    LOG(WARNING) << "Ignoring status update " << update
+                 << " from slave " << *slave << ": " << _forward.error();
     stats.invalidStatusUpdates++;
     return;
   }
@@ -2434,14 +2431,13 @@ void Master::statusUpdate(const StatusUpdate& update, const UPID& pid)
   const TaskStatus& status = update.status();
   Task* task = slave->getTask(update.framework_id(), status.task_id());
   if (task == NULL) {
-    LOG(WARNING) << "Status update " << update
-                 << " from " << pid << " ("
-                 << slave->info.hostname() << "): error, couldn't lookup task";
+    LOG(WARNING) << "Could not lookup task for status update " << update
+                 << " from slave " << *slave;
     stats.invalidStatusUpdates++;
     return;
   }
 
-  LOG(INFO) << "Status update " << update << " from " << pid;
+  LOG(INFO) << "Status update " << update << " from slave " << *slave;
 
   // TODO(brenden) Consider wiping the `data` and `message` fields?
   if (task->statuses_size() > 0 &&
@@ -2516,23 +2512,20 @@ void Master::exitedExecutor(
 
     LOG(INFO) << "Executor " << executorId
               << " of framework " << frameworkId
-              << " on slave " << slaveId
-              << " (" << slave->info.hostname() << ")"
+              << " on slave " << *slave
               << (WIFEXITED(status) ? " has exited with status "
                                      : " has terminated with signal ")
               << (WIFEXITED(status) ? stringify(WEXITSTATUS(status))
                                      : strsignal(WTERMSIG(status)));
 
-    allocator->resourcesRecovered(frameworkId,
-        slaveId,
-        Resources(executor.resources()));
+    allocator->resourcesRecovered(
+        frameworkId, slaveId, Resources(executor.resources()));
 
     // Remove executor from slave and framework.
     slave->removeExecutor(frameworkId, executorId);
   } else {
     LOG(WARNING) << "Ignoring unknown exited executor "
-                 << executorId << " on slave " << slaveId
-                 << " (" << slave->info.hostname() << ")";
+                 << executorId << " on slave " << *slave;
   }
 
   Framework* framework = getFramework(frameworkId);
@@ -2558,7 +2551,8 @@ void Master::shutdownSlave(const SlaveID& slaveId, const string& message)
   Slave* slave = slaves.activated[slaveId];
   CHECK_NOTNULL(slave);
 
-  LOG(WARNING) << "Shutting down slave " << slaveId << " at " << slave->pid;
+  LOG(WARNING) << "Shutting down slave " << *slave << " with message '"
+               << message << "'";
 
   ShutdownMessage message_;
   message_.set_message(message);
@@ -2696,15 +2690,14 @@ void Master::offer(const FrameworkID& frameworkId,
     Slave* slave = slaves.activated[slaveId];
 
     CHECK(slave->info.checkpoint() || !framework->info.checkpoint())
-        << "Resources of non checkpointing slave " << slaveId
-        << " (" << slave->info.hostname() << ") are being offered to"
-        << " checkpointing framework " << frameworkId;
+        << "Resources of non checkpointing slave " << *slave
+        << " are being offered to checkpointing framework " << frameworkId;
 
     // This could happen if the allocator dispatched 'Master::offer' before
     // it received 'Allocator::slaveRemoved' from the master.
     if (slave->disconnected) {
       LOG(WARNING) << "Master returning resources offered because slave "
-                   << slaveId << " is disconnected";
+                   << *slave << " is disconnected";
 
       allocator->resourcesRecovered(frameworkId, slaveId, offered);
       continue;
@@ -2886,7 +2879,7 @@ Resources Master::launchTask(const TaskInfo& task,
       CHECK(!framework->hasExecutor(slave->id, task.executor().executor_id()))
         << "Executor " << task.executor().executor_id()
         << " known to the framework " << framework->id
-        << " but unknown to the slave " << slave->id;
+        << " but unknown to the slave " << *slave;
 
       slave->addExecutor(framework->id, task.executor());
       framework->addExecutor(slave->id, task.executor());
@@ -2918,8 +2911,8 @@ Resources Master::launchTask(const TaskInfo& task,
   // Tell the slave to launch the task!
   LOG(INFO) << "Launching task " << task.task_id()
             << " of framework " << framework->id
-            << " with resources " << task.resources() << " on slave "
-            << slave->id << " (" << slave->info.hostname() << ")";
+            << " with resources " << task.resources()
+            << " on slave " << *slave;
 
   RunTaskMessage message;
   message.mutable_framework()->MergeFrom(framework->info);
@@ -2962,8 +2955,7 @@ void Master::reconcile(
       if (!slaveTasks.contains(task->framework_id(), task->task_id())) {
         LOG(WARNING) << "Sending TASK_LOST for task " << task->task_id()
                      << " of framework " << task->framework_id()
-                     << " unknown to the slave " << slave->id
-                     << " (" << slave->info.hostname() << ")";
+                     << " unknown to the slave " << *slave;
 
         const StatusUpdate& update = protobuf::createStatusUpdate(
             task->framework_id(),
@@ -2987,9 +2979,8 @@ void Master::reconcile(
     // as we've added code in 0.14.0 to ensure the framework id is set
     // in the scheduler driver.
     if (!executor.has_framework_id()) {
-      LOG(ERROR) << "Slave " << slave->id
-                 << " (" << slave->info.hostname() << ") "
-                 << "re-registered with executor " << executor.executor_id()
+      LOG(ERROR) << "Slave " << *slave
+                 << " re-registered with executor " << executor.executor_id()
                  << " without setting the framework id";
       continue;
     }
@@ -3004,7 +2995,7 @@ void Master::reconcile(
       if (!slaveExecutors.contains(frameworkId, executorId)) {
         LOG(WARNING) << "Removing executor " << executorId << " of framework "
                      << frameworkId << " as it is unknown to the slave "
-                     << slave->id << " (" << slave->info.hostname() << ")";
+                     << *slave;
 
         // TODO(bmahler): This is duplicated in several locations, we
         // may benefit from a method for removing an executor from
@@ -3032,8 +3023,8 @@ void Master::reconcile(
   foreach (const Task& task, tasks) {
     if (!protobuf::isTerminalState(task.state()) &&
         slave->killedTasks.contains(task.framework_id(), task.task_id())) {
-      LOG(WARNING) << " Slave " << slave->id << " (" << slave->info.hostname()
-                   << ") has non-terminal task " << task.task_id()
+      LOG(WARNING) << " Slave " << *slave
+                   << " has non-terminal task " << task.task_id()
                    << " that is supposed to be killed. Killing it now!";
 
       KillTaskMessage message;
@@ -3054,8 +3045,8 @@ void Master::reconcile(
   foreach (const shared_ptr<Framework>& framework, frameworks.completed) {
     if (slaveTasks.contains(framework->id)) {
       LOG(WARNING)
-        << "Slave " << slave->id << " (" << slave->info.hostname()
-        << ") re-registered with completed framework " << framework->id
+        << "Slave " << *slave
+        << " re-registered with completed framework " << framework->id
         << ". Shutting down the framework on the slave";
 
       ShutdownFrameworkMessage message;
@@ -3233,8 +3224,7 @@ void Master::removeFramework(Slave* slave, Framework* framework)
   CHECK_NOTNULL(framework);
 
   LOG(INFO) << "Removing framework " << framework->id
-            << " from slave " << slave->id
-            << " (" << slave->info.hostname() << ")";
+            << " from slave " << *slave;
 
   // Remove pointers to framework's tasks in slaves, and send status
   // updates.
@@ -3280,8 +3270,7 @@ void Master::addSlave(Slave* slave, bool reregister)
 {
   CHECK_NOTNULL(slave);
 
-  LOG(INFO) << "Adding slave " << slave->id
-            << " at " << slave->info.hostname()
+  LOG(INFO) << "Adding slave " << *slave
             << " with " << slave->info.resources();
 
   slaves.deactivated.erase(slave->id);
@@ -3300,14 +3289,14 @@ void Master::addSlave(Slave* slave, bool reregister)
   }
 
   // Set up an observer for the slave.
-  slave->observer = new SlaveObserver(slave->pid, slave->info,
-                                      slave->id, self());
+  slave->observer = new SlaveObserver(
+      slave->pid, slave->info, slave->id, self());
+
   spawn(slave->observer);
 
   if (!reregister) {
-    allocator->slaveAdded(slave->id,
-                          slave->info,
-                          hashmap<FrameworkID, Resources>());
+    allocator->slaveAdded(
+        slave->id, slave->info, hashmap<FrameworkID, Resources>());
   }
 }
 
@@ -3374,8 +3363,7 @@ void Master::readdSlave(
       // reregister and claim them.
       LOG(WARNING) << "Possibly orphaned task " << task.task_id()
                    << " of framework " << task.framework_id()
-                   << " running on slave " << slave->id << " ("
-                   << slave->info.hostname() << ")";
+                   << " running on slave " << *slave;
     }
 
     resources[task.framework_id()] += task.resources();
@@ -3394,8 +3382,7 @@ void Master::readdSlave(
       if (framework != NULL) {
         VLOG(2) << "Re-adding completed task " << task.task_id()
                 << " of framework " << task.framework_id()
-                << " that ran on slave " << slave->id << " ("
-                << slave->info.hostname() << ")";
+                << " that ran on slave " << *slave;
         framework->addCompletedTask(task);
       } else {
         // We could be here if the framework hasn't registered yet.
@@ -3403,8 +3390,7 @@ void Master::readdSlave(
         // information in the registrar.
         LOG(WARNING) << "Possibly orphaned completed task " << task.task_id()
                      << " of framework " << task.framework_id()
-                     << " that ran on slave " << slave->id << " ("
-                     << slave->info.hostname() << ")";
+                     << " that ran on slave " << *slave;
       }
     }
   }
@@ -3417,8 +3403,7 @@ void Master::removeSlave(Slave* slave)
 {
   CHECK_NOTNULL(slave);
 
-  LOG(INFO) << "Removing slave " << slave->id
-            << " (" << slave->info.hostname() << ")";
+  LOG(INFO) << "Removing slave " << *slave;
 
   // We do this first, to make sure any of the resources recovered
   // below (e.g., removeTask()) are ignored by the allocator.

http://git-wip-us.apache.org/repos/asf/mesos/blob/09d3900d/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 333e37c..a6737d5 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -672,6 +672,13 @@ private:
 };
 
 
+inline std::ostream& operator << (std::ostream& stream, const Slave& slave)
+{
+  return stream << slave.id << " at " << slave.pid
+                << " (" << slave.info.hostname() << ")";
+}
+
+
 // Information about a connected or completed framework.
 struct Framework
 {