You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/09/16 22:52:42 UTC
git commit: Cleaned up some log and CHECK messages.
Updated Branches:
refs/heads/master 8602dd28f -> f2571b55f
Cleaned up some log and CHECK messages.
Review: https://reviews.apache.org/r/14135
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f2571b55
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f2571b55
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f2571b55
Branch: refs/heads/master
Commit: f2571b55f2d8bbe1d8ef9e9a914f07531ac73c0e
Parents: 8602dd2
Author: Vinod Kone <vi...@twitter.com>
Authored: Fri Sep 13 14:01:11 2013 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Mon Sep 16 12:46:55 2013 -0700
----------------------------------------------------------------------
src/master/master.cpp | 107 +++++++++++++++++++++++++++++++--------------
src/master/master.hpp | 39 ++++++++++++-----
src/slave/gc.cpp | 4 +-
src/slave/paths.hpp | 4 --
src/slave/slave.cpp | 41 ++++++++++-------
5 files changed, 128 insertions(+), 67 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/f2571b55/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 30abe9d..a49b17e 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -77,7 +77,7 @@ protected:
// Get the list of white listed slaves.
Option<hashset<string> > whitelist;
if (path == "*") { // Accept all slaves.
- LOG(WARNING) << "No whitelist given. Advertising offers for all slaves";
+ VLOG(1) << "No whitelist given. Advertising offers for all slaves";
} else {
// Read from local file.
// TODO(vinod): Add support for reading from ZooKeeper.
@@ -215,7 +215,10 @@ Master::~Master()
Slave* slave = getSlave(task->slave_id());
// Since we only find out about tasks when the slave re-registers,
// it must be the case that the slave exists!
- CHECK(slave != NULL);
+ CHECK(slave != NULL)
+ << "Unknown slave " << task->slave_id()
+ << " in the task " << task->task_id();
+
removeTask(task);
}
@@ -228,7 +231,7 @@ Master::~Master()
}
frameworks.clear();
- CHECK(offers.size() == 0);
+ CHECK_EQ(offers.size(), 0UL);
foreachvalue (Slave* slave, slaves) {
LOG(INFO) << "Removing slave " << slave->id
@@ -609,12 +612,11 @@ void Master::exited(const UPID& pid)
void Master::fileAttached(const Future<Nothing>& result, const string& path)
{
- CHECK(!result.isDiscarded());
if (result.isReady()) {
LOG(INFO) << "Successfully attached file '" << path << "'";
} else {
LOG(ERROR) << "Failed to attach file '" << path << "': "
- << result.failure();
+ << result.isFailed() ? result.failure() : "discarded";
}
}
@@ -791,7 +793,11 @@ void Master::reregisterFramework(const FrameworkInfo& frameworkInfo,
// Also add the task's executor for resource accounting.
if (task->has_executor_id()) {
if (!framework->hasExecutor(slave->id, task->executor_id())) {
- CHECK(slave->hasExecutor(framework->id, task->executor_id()));
+ CHECK(slave->hasExecutor(framework->id, task->executor_id()))
+ << "Unknown executor " << task->executor_id()
+ << " of framework " << framework->id
+ << " for the task " << task->task_id();
+
const ExecutorInfo& executorInfo =
slave->executors[framework->id][task->executor_id()];
framework->addExecutor(slave->id, executorInfo);
@@ -807,7 +813,8 @@ void Master::reregisterFramework(const FrameworkInfo& frameworkInfo,
addFramework(framework);
}
- CHECK(frameworks.count(frameworkInfo.id()) > 0);
+ CHECK(frameworks.contains(frameworkInfo.id()))
+ << "Unknown framework " << frameworkInfo.id();
// Broadcast the new framework pid to all the slaves. We have to
// broadcast because an executor might be running on a slave but
@@ -876,7 +883,10 @@ void Master::launchTasks(const FrameworkID& frameworkId,
// Master::processTasks.
Offer* offer = getOffer(offerId);
if (offer != NULL) {
- CHECK(offer->framework_id() == frameworkId);
+ CHECK_EQ(offer->framework_id(), frameworkId)
+ << "Offer " << offerId
+ << " has invalid frameworkId " << offer->framework_id();
+
Slave* slave = getSlave(offer->slave_id());
CHECK(slave != NULL)
<< "Offer " << offerId << " outlived slave "
@@ -935,7 +945,7 @@ void Master::killTask(const FrameworkID& frameworkId,
Task* task = framework->getTask(taskId);
if (task != NULL) {
Slave* slave = getSlave(task->slave_id());
- CHECK(slave != NULL);
+ CHECK(slave != NULL) << "Unknown slave " << task->slave_id();
// We add the task to 'killedTasks' here because the slave
// might be partitioned or disconnected but the master
@@ -1066,7 +1076,7 @@ void Master::registerSlave(const SlaveInfo& slaveInfo)
<< " at " << slave->pid;
// TODO(benh): We assume all slaves can register for now.
- CHECK(flags.slaves == "*");
+ CHECK_EQ(flags.slaves, "*");
addSlave(slave);
// // Checks if this slave, or if all slaves, can be accepted.
@@ -1166,7 +1176,7 @@ void Master::reregisterSlave(const SlaveID& slaveId,
<< slave->pid << " (" << slave->info.hostname() << ")";
// TODO(benh): We assume all slaves can register for now.
- CHECK(flags.slaves == "*");
+ CHECK_EQ(flags.slaves, "*");
readdSlave(slave, executorInfos, tasks);
}
@@ -1201,15 +1211,15 @@ void Master::unregisterSlave(const SlaveID& slaveId)
}
+
+// NOTE: We cannot use 'from' here to identify the slave as this is
+// now sent by the StatusUpdateManagerProcess and master itself when
+// it generates TASK_LOST messages. Only 'pid' can be used to identify
+// the slave.
void Master::statusUpdate(const StatusUpdate& update, const UPID& pid)
{
const TaskStatus& status = update.status();
- // NOTE: We cannot use 'from' here to identify the slave as this is
- // now sent by the StatusUpdateManagerProcess. Only 'pid' can
- // be used to identify the slave.
- LOG(INFO) << "Status update " << update << " from " << pid;
-
Slave* slave = getSlave(update.slave_id());
if (slave == NULL) {
if (deactivatedSlaves.contains(pid)) {
@@ -1230,7 +1240,10 @@ void Master::statusUpdate(const StatusUpdate& update, const UPID& pid)
return;
}
- CHECK(!deactivatedSlaves.contains(pid));
+ CHECK(!deactivatedSlaves.contains(pid))
+ << "Received status update " << update << " from " << pid
+ << " which is deactivated slave " << update.slave_id()
+ << "(" << slave->info.hostname() << ")";
Framework* framework = getFramework(update.framework_id());
if (framework == NULL) {
@@ -1253,12 +1266,13 @@ void Master::statusUpdate(const StatusUpdate& update, const UPID& pid)
if (task == NULL) {
LOG(WARNING) << "Status update " << update
<< " from " << pid << " ("
- << slave->info.hostname() << "): error, couldn't lookup "
- << "task " << status.task_id();
+ << slave->info.hostname() << "): error, couldn't lookup task";
stats.invalidStatusUpdates++;
return;
}
+ LOG(INFO) << "Status update " << update << " from " << pid;
+
task->set_state(status.state());
// Handle the task appropriately if it's terminated.
@@ -1298,7 +1312,10 @@ void Master::exitedExecutor(
return;
}
- CHECK(!deactivatedSlaves.contains(from));
+ CHECK(!deactivatedSlaves.contains(from))
+ << "Received exited message for executor " << executorId << " from " << from
+ << " which is deactivated slave " << slaveId
+ << "(" << slave->info.hostname() << ")";
// Tell the allocator about the recovered resources.
if (slave->hasExecutor(frameworkId, executorId)) {
@@ -1737,7 +1754,7 @@ void Master::processTasks(Offer* offer,
} while (!visitors.empty());
// All used resources should be allocatable, enforced by our validators.
- CHECK(usedResources == usedResources.allocatable());
+ CHECK_EQ(usedResources, usedResources.allocatable());
// Calculate unused resources.
Resources unusedResources = offer->resources() - usedResources;
@@ -1758,8 +1775,8 @@ Resources Master::launchTask(const TaskInfo& task,
Framework* framework,
Slave* slave)
{
- CHECK(framework != NULL);
- CHECK(slave != NULL);
+ CHECK_NOTNULL(framework);
+ CHECK_NOTNULL(slave);
Resources resources; // Total resources used on slave by launching this task.
@@ -1770,7 +1787,11 @@ Resources Master::launchTask(const TaskInfo& task,
if (task.has_executor()) {
// TODO(benh): Refactor this code into Slave::addTask.
if (!slave->hasExecutor(framework->id, task.executor().executor_id())) {
- CHECK(!framework->hasExecutor(slave->id, task.executor().executor_id()));
+ CHECK(!framework->hasExecutor(slave->id, task.executor().executor_id()))
+ << "Executor " << task.executor().executor_id()
+ << " known to the framework " << framework->id
+ << " but unknown to the slave " << slave->id;
+
slave->addExecutor(framework->id, task.executor());
framework->addExecutor(slave->id, task.executor());
resources += task.executor().resources();
@@ -1946,14 +1967,18 @@ void Master::reconcile(
void Master::addFramework(Framework* framework)
{
- CHECK(frameworks.count(framework->id) == 0);
+ CHECK(!frameworks.contains(framework->id))
+ << "Framework " << framework->id << "already exists!";
frameworks[framework->id] = framework;
link(framework->pid);
// Enforced by Master::registerFramework.
- CHECK(roles.contains(framework->info.role()));
+ CHECK(roles.contains(framework->info.role()))
+ << "Unknown role " << framework->info.role()
+ << " of framework " << framework->id ;
+
roles[framework->info.role()]->addFramework(framework);
FrameworkRegisteredMessage message;
@@ -2035,7 +2060,10 @@ void Master::removeFramework(Framework* framework)
Slave* slave = getSlave(task->slave_id());
// Since we only find out about tasks when the slave re-registers,
// it must be the case that the slave exists!
- CHECK(slave != NULL);
+ CHECK(slave != NULL)
+ << "Unknown slave " << task->slave_id()
+ << " for task " << task->task_id();
+
removeTask(task);
}
@@ -2072,7 +2100,10 @@ void Master::removeFramework(Framework* framework)
// The completedFramework buffer now owns the framework pointer.
completedFrameworks.push_back(std::tr1::shared_ptr<Framework>(framework));
- CHECK(roles.contains(framework->info.role()));
+ CHECK(roles.contains(framework->info.role()))
+ << "Unknown role " << framework->info.role()
+ << " of framework " << framework->id;
+
roles[framework->info.role()]->removeFramework(framework);
// Remove it.
@@ -2115,7 +2146,6 @@ void Master::removeFramework(Slave* slave, Framework* framework)
if (slave->executors.contains(framework->id)) {
foreachkey (const ExecutorID& executorId,
utils::copy(slave->executors[framework->id])) {
-
allocator->resourcesRecovered(
framework->id,
slave->id,
@@ -2130,7 +2160,7 @@ void Master::removeFramework(Slave* slave, Framework* framework)
void Master::addSlave(Slave* slave, bool reregister)
{
- CHECK(slave != NULL);
+ CHECK_NOTNULL(slave);
LOG(INFO) << "Adding slave " << slave->id
<< " at " << slave->info.hostname()
@@ -2173,7 +2203,7 @@ void Master::readdSlave(Slave* slave,
const vector<ExecutorInfo>& executorInfos,
const vector<Task>& tasks)
{
- CHECK(slave != NULL);
+ CHECK_NOTNULL(slave);
addSlave(slave, true);
@@ -2185,7 +2215,10 @@ void Master::readdSlave(Slave* slave,
// TODO(bmahler): ExecutorInfo.framework_id is set by the Scheduler
// Driver in 0.14.0. Therefore, in 0.15.0, the slave no longer needs
// to set it, and we could remove this CHECK if desired.
- CHECK(executorInfo.has_framework_id());
+ CHECK(executorInfo.has_framework_id())
+ << "Executor " << executorInfo.executor_id()
+ << " doesn't have frameworkId set";
+
if (!slave->hasExecutor(executorInfo.framework_id(),
executorInfo.executor_id())) {
slave->addExecutor(executorInfo.framework_id(), executorInfo);
@@ -2348,12 +2381,18 @@ void Master::removeOffer(Offer* offer, bool rescind)
{
// Remove from framework.
Framework* framework = getFramework(offer->framework_id());
- CHECK(framework != NULL);
+ CHECK(framework != NULL)
+ << "Unknown framework " << offer->framework_id()
+ << " in the offer " << offer->id();
+
framework->removeOffer(offer);
// Remove from slave.
Slave* slave = getSlave(offer->slave_id());
- CHECK(slave != NULL);
+ CHECK(slave != NULL)
+ << "Unknown slave " << offer->slave_id()
+ << " in the offer " << offer->id();
+
slave->removeOffer(offer);
if (rescind) {
http://git-wip-us.apache.org/repos/asf/mesos/blob/f2571b55/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 19be184..bd5cb1f 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -313,7 +313,10 @@ struct Slave
{
std::pair<FrameworkID, TaskID> key =
std::make_pair(task->framework_id(), task->task_id());
- CHECK(tasks.count(key) == 0);
+ CHECK(!tasks.contains(key))
+ << "Duplicate task " << task->task_id()
+ << " of framework " << task->framework_id();
+
tasks[key] = task;
LOG(INFO) << "Adding task " << task->task_id()
<< " with resources " << task->resources()
@@ -325,7 +328,10 @@ struct Slave
{
std::pair<FrameworkID, TaskID> key =
std::make_pair(task->framework_id(), task->task_id());
- CHECK(tasks.count(key) > 0);
+ CHECK(tasks.contains(key))
+ << "Unknown task " << task->task_id()
+ << " of framework " << task->framework_id();
+
tasks.erase(key);
killedTasks.remove(task->framework_id(), task->task_id());
LOG(INFO) << "Removing task " << task->task_id()
@@ -336,7 +342,7 @@ struct Slave
void addOffer(Offer* offer)
{
- CHECK(!offers.contains(offer));
+ CHECK(!offers.contains(offer)) << "Duplicate offer " << offer->id();
offers.insert(offer);
VLOG(1) << "Adding offer " << offer->id()
<< " with resources " << offer->resources()
@@ -346,7 +352,7 @@ struct Slave
void removeOffer(Offer* offer)
{
- CHECK(offers.contains(offer));
+ CHECK(offers.contains(offer)) << "Unknown offer " << offer->id();
offers.erase(offer);
VLOG(1) << "Removing offer " << offer->id()
<< " with resources " << offer->resources()
@@ -364,7 +370,10 @@ struct Slave
void addExecutor(const FrameworkID& frameworkId,
const ExecutorInfo& executorInfo)
{
- CHECK(!hasExecutor(frameworkId, executorInfo.executor_id()));
+ CHECK(!hasExecutor(frameworkId, executorInfo.executor_id()))
+ << "Duplicate executor " << executorInfo.executor_id()
+ << " of framework " << frameworkId;
+
executors[frameworkId][executorInfo.executor_id()] = executorInfo;
// Update the resources in use to reflect running this executor.
@@ -453,14 +462,19 @@ struct Framework
void addTask(Task* task)
{
- CHECK(!tasks.contains(task->task_id()));
+ CHECK(!tasks.contains(task->task_id()))
+ << "Duplicate task " << task->task_id()
+ << " of framework " << task->framework_id();
+
tasks[task->task_id()] = task;
resources += task->resources();
}
void removeTask(Task* task)
{
- CHECK(tasks.contains(task->task_id()));
+ CHECK(tasks.contains(task->task_id()))
+ << "Unknown task " << task->task_id()
+ << " of framework " << task->framework_id();
completedTasks.push_back(*task);
tasks.erase(task->task_id());
@@ -469,14 +483,16 @@ struct Framework
void addOffer(Offer* offer)
{
- CHECK(!offers.contains(offer));
+ CHECK(!offers.contains(offer)) << "Duplicate offer " << offer->id();
offers.insert(offer);
resources += offer->resources();
}
void removeOffer(Offer* offer)
{
- CHECK(offers.find(offer) != offers.end());
+ CHECK(offers.find(offer) != offers.end())
+ << "Unknown offer " << offer->id();
+
offers.erase(offer);
resources -= offer->resources();
}
@@ -491,7 +507,10 @@ struct Framework
void addExecutor(const SlaveID& slaveId,
const ExecutorInfo& executorInfo)
{
- CHECK(!hasExecutor(slaveId, executorInfo.executor_id()));
+ CHECK(!hasExecutor(slaveId, executorInfo.executor_id()))
+ << "Duplicate executor " << executorInfo.executor_id()
+ << " on slave " << slaveId;
+
executors[slaveId][executorInfo.executor_id()] = executorInfo;
// Update our resources to reflect running this executor.
http://git-wip-us.apache.org/repos/asf/mesos/blob/f2571b55/src/slave/gc.cpp
----------------------------------------------------------------------
diff --git a/src/slave/gc.cpp b/src/slave/gc.cpp
index 827534f..711bbcb 100644
--- a/src/slave/gc.cpp
+++ b/src/slave/gc.cpp
@@ -53,7 +53,7 @@ Future<Nothing> GarbageCollectorProcess::schedule(
const Duration& d,
const string& path)
{
- LOG(INFO) << "Scheduling '" << path << "' for removal";
+ LOG(INFO) << "Scheduling '" << path << "' for gc";
// If there's an existing schedule for this path, we must remove
// it here in order to reschedule.
@@ -81,7 +81,7 @@ Future<Nothing> GarbageCollectorProcess::schedule(
bool GarbageCollectorProcess::unschedule(const string& path)
{
- LOG(INFO) << "Unscheduling '" << path << "' for removal";
+ LOG(INFO) << "Unscheduling '" << path << "' from gc";
if (!timeouts.contains(path)) {
return false;
http://git-wip-us.apache.org/repos/asf/mesos/blob/f2571b55/src/slave/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/paths.hpp b/src/slave/paths.hpp
index 3a02a0d..8f80b93 100644
--- a/src/slave/paths.hpp
+++ b/src/slave/paths.hpp
@@ -333,8 +333,6 @@ inline std::string createExecutorDirectory(
CHECK_SOME(mkdir)
<< "Failed to create executor directory '" << directory << "'";
- LOG(INFO) << "Created executor directory '" << directory << "'";
-
// Remove the previous "latest" symlink.
std::string latest =
getExecutorLatestRunPath(rootDir, slaveId, frameworkId, executorId);
@@ -366,8 +364,6 @@ inline std::string createSlaveDirectory(
CHECK_SOME(mkdir)
<< "Failed to create slave directory '" << directory << "'";
- LOG(INFO) << "Created slave directory '" << directory << "'";
-
// Remove the previous "latest" symlink.
std::string latest = getLatestSlavePath(rootDir);
http://git-wip-us.apache.org/repos/asf/mesos/blob/f2571b55/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index cefb420..21b57f9 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -521,12 +521,11 @@ void Slave::shutdown()
void Slave::fileAttached(const Future<Nothing>& result, const string& path)
{
- CHECK(!result.isDiscarded());
if (result.isReady()) {
- LOG(INFO) << "Successfully attached file '" << path << "'";
+ VLOG(1) << "Successfully attached file '" << path << "'";
} else {
LOG(ERROR) << "Failed to attach file '" << path << "': "
- << result.failure();
+ << result.isFailed() ? result.failure() : "discarded";
}
}
@@ -1606,7 +1605,9 @@ void Slave::reregisterExecutor(
LOG(INFO) << "Re-registering executor " << executorId
<< " of framework " << frameworkId;
- CHECK(frameworks.contains(frameworkId));
+ CHECK(frameworks.contains(frameworkId))
+ << "Unknown framework " << frameworkId;
+
Framework* framework = frameworks[frameworkId];
CHECK(framework->state == Framework::RUNNING ||
@@ -1735,7 +1736,7 @@ void Slave::reregisterExecutorTimeout()
// exited! This is because if the executor properly exited,
// it should have already been identified by the isolator
// (via the reaper) and cleaned up!
- LOG(INFO) << "Killing an un-reregistered executor '" << executor->id
+ LOG(INFO) << "Killing un-reregistered executor '" << executor->id
<< "' of framework " << framework->id;
executor->state = Executor::TERMINATING;
@@ -1993,7 +1994,9 @@ ExecutorInfo Slave::getExecutorInfo(
const FrameworkID& frameworkId,
const TaskInfo& task)
{
- CHECK(task.has_executor() != task.has_command());
+ CHECK_NE(task.has_executor(), task.has_command())
+ << "Task " << task.task_id()
+ << " should have either CommandInfo or ExecutorInfo set but not both";
if (task.has_command()) {
ExecutorInfo executor;
@@ -2467,9 +2470,9 @@ void Slave::shutdownExecutorTimeout(
Executor* executor = framework->getExecutor(executorId);
if (executor == NULL) {
- LOG(INFO) << "Executor '" << executorId
- << "' of framework " << frameworkId
- << " seems to have exited. Ignoring its shutdown timeout";
+ VLOG(1) << "Executor '" << executorId
+ << "' of framework " << frameworkId
+ << " seems to have exited. Ignoring its shutdown timeout";
return;
}
@@ -2529,9 +2532,9 @@ void Slave::registerExecutorTimeout(
Executor* executor = framework->getExecutor(executorId);
if (executor == NULL) {
- LOG(INFO) << "Executor '" << executorId
- << "' of framework " << frameworkId
- << " seems to have exited. Ignoring its registration timeout";
+ VLOG(1) << "Executor '" << executorId
+ << "' of framework " << frameworkId
+ << " seems to have exited. Ignoring its registration timeout";
return;
}
@@ -2862,10 +2865,10 @@ Executor* Framework::launchExecutor(const ExecutorInfo& executorInfo)
Executor* executor = new Executor(
slave, id, executorInfo, uuid, directory, info.checkpoint());
- CHECK(!executors.contains(executorInfo.executor_id()));
- executors[executorInfo.executor_id()] = executor;
+ CHECK(!executors.contains(executorInfo.executor_id()))
+ << "Unknown executor " << executorInfo.executor_id();
- CHECK_NOTNULL(executor);
+ executors[executorInfo.executor_id()] = executor;
slave->files->attach(executor->directory, executor->directory)
.onAny(defer(slave,
@@ -2974,7 +2977,10 @@ Executor* Framework::recoverExecutor(const ExecutorState& state)
}
}
- CHECK(state.runs.contains(uuid));
+ CHECK(state.runs.contains(uuid))
+ << "Cannot find latest run " << uuid << " for executor " << state.id
+ << " of framework " << id;
+
const RunState& run = state.runs.get(uuid).get();
// If the latest run of the executor was completed (i.e., terminated
@@ -3094,7 +3100,8 @@ Task* Executor::addTask(const TaskInfo& task)
{
// The master should enforce unique task IDs, but just in case
// maybe we shouldn't make this a fatal error.
- CHECK(!launchedTasks.contains(task.task_id()));
+ CHECK(!launchedTasks.contains(task.task_id()))
+ << "Duplicate task " << task.task_id();
Task* t = new Task(
protobuf::createTask(task, TASK_STAGING, id, frameworkId));