You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@mesos.apache.org by "Dominic Hamon (JIRA)" <ji...@apache.org> on 2014/11/11 22:10:34 UTC
[jira] [Updated] (MESOS-2078) Scheduler driver may ACK status
updates when the scheduler threw an exception
[ https://issues.apache.org/jira/browse/MESOS-2078?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Dominic Hamon updated MESOS-2078:
---------------------------------
Description:
[~vinodkone]] discovered that this can happen if the scheduler calls {{SchedulerDriver#stop}} before or while handling {{Scheduler#statusUpdate}}.
In src/sched/sched.cpp:
The driver invokes {{statusUpdate}} and later checks the {{aborted}} flag to determine whether to send an ACK.
{code}
void statusUpdate(
const UPID& from,
const StatusUpdate& update,
const UPID& pid)
{
...
scheduler->statusUpdate(driver, status);
VLOG(1) << "Scheduler::statusUpdate took " << stopwatch.elapsed();
// Note that we need to look at the volatile 'aborted' here to
// so that we don't acknowledge the update if the driver was
// aborted during the processing of the update.
if (aborted) {
VLOG(1) << "Not sending status update acknowledgment message because "
<< "the driver is aborted!";
return;
}
...
{code}
In src/java/jni/org_apache_mesos_MesosSchedulerDriver.cpp:
The {{statusUpdate}} implementation checks for an exception and invokes {{driver->abort()}].
{code}
void JNIScheduler::statusUpdate(SchedulerDriver* driver,
const TaskStatus& status)
{
jvm->AttachCurrentThread(JNIENV_CAST(&env), NULL);
jclass clazz = env->GetObjectClass(jdriver);
jfieldID scheduler = env->GetFieldID(clazz, "scheduler", "Lorg/apache/mesos/Scheduler;");
jobject jscheduler = env->GetObjectField(jdriver, scheduler);
clazz = env->GetObjectClass(jscheduler);
// scheduler.statusUpdate(driver, status);
jmethodID statusUpdate =
env->GetMethodID(clazz, "statusUpdate",
"(Lorg/apache/mesos/SchedulerDriver;"
"Lorg/apache/mesos/Protos$TaskStatus;)V");
jobject jstatus = convert<TaskStatus>(env, status);
env->ExceptionClear();
env->CallVoidMethod(jscheduler, statusUpdate, jdriver, jstatus);
if (env->ExceptionCheck()) {
env->ExceptionDescribe();
env->ExceptionClear();
jvm->DetachCurrentThread();
driver->abort();
return;
}
jvm->DetachCurrentThread();
}
{code}
In src/sched/sched.cpp:
The {{abort()}} implementation exits early if {{status != DRIVER_RUNNING}}, and *does not set the aborted flag*.
{code}
Status MesosSchedulerDriver::abort()
{
Lock lock(&mutex);
if (status != DRIVER_RUNNING) {
return status;
}
CHECK(process != NULL);
// We set the volatile aborted to true here to prevent any further
// messages from being processed in the SchedulerProcess. However,
// if abort() is called from another thread as the SchedulerProcess,
// there may be at most one additional message processed.
// TODO(bmahler): Use an atomic boolean.
process->aborted = true;
// Dispatching here ensures that we still process the outstanding
// requests *from* the scheduler, since those do proceed when
// aborted is true.
dispatch(process, &SchedulerProcess::abort);
return status = DRIVER_ABORTED;
}
{code}
As a result, the code will ACK despite an exception being thrown.
was:
[~vinod] discovered that this can happen if the scheduler calls {{SchedulerDriver#stop}} before or while handling {{Scheduler#statusUpdate}}.
In src/sched/sched.cpp:
The driver invokes {{statusUpdate}} and later checks the {{aborted}} flag to determine whether to send an ACK.
{code}
void statusUpdate(
const UPID& from,
const StatusUpdate& update,
const UPID& pid)
{
...
scheduler->statusUpdate(driver, status);
VLOG(1) << "Scheduler::statusUpdate took " << stopwatch.elapsed();
// Note that we need to look at the volatile 'aborted' here to
// so that we don't acknowledge the update if the driver was
// aborted during the processing of the update.
if (aborted) {
VLOG(1) << "Not sending status update acknowledgment message because "
<< "the driver is aborted!";
return;
}
...
{code}
In src/java/jni/org_apache_mesos_MesosSchedulerDriver.cpp:
The {{statusUpdate}} implementation checks for an exception and invokes {{driver->abort()}].
{code}
void JNIScheduler::statusUpdate(SchedulerDriver* driver,
const TaskStatus& status)
{
jvm->AttachCurrentThread(JNIENV_CAST(&env), NULL);
jclass clazz = env->GetObjectClass(jdriver);
jfieldID scheduler = env->GetFieldID(clazz, "scheduler", "Lorg/apache/mesos/Scheduler;");
jobject jscheduler = env->GetObjectField(jdriver, scheduler);
clazz = env->GetObjectClass(jscheduler);
// scheduler.statusUpdate(driver, status);
jmethodID statusUpdate =
env->GetMethodID(clazz, "statusUpdate",
"(Lorg/apache/mesos/SchedulerDriver;"
"Lorg/apache/mesos/Protos$TaskStatus;)V");
jobject jstatus = convert<TaskStatus>(env, status);
env->ExceptionClear();
env->CallVoidMethod(jscheduler, statusUpdate, jdriver, jstatus);
if (env->ExceptionCheck()) {
env->ExceptionDescribe();
env->ExceptionClear();
jvm->DetachCurrentThread();
driver->abort();
return;
}
jvm->DetachCurrentThread();
}
{code}
In src/sched/sched.cpp:
The {{abort()}} implementation exits early if {{status != DRIVER_RUNNING}}, and *does not set the aborted flag*.
{code}
Status MesosSchedulerDriver::abort()
{
Lock lock(&mutex);
if (status != DRIVER_RUNNING) {
return status;
}
CHECK(process != NULL);
// We set the volatile aborted to true here to prevent any further
// messages from being processed in the SchedulerProcess. However,
// if abort() is called from another thread as the SchedulerProcess,
// there may be at most one additional message processed.
// TODO(bmahler): Use an atomic boolean.
process->aborted = true;
// Dispatching here ensures that we still process the outstanding
// requests *from* the scheduler, since those do proceed when
// aborted is true.
dispatch(process, &SchedulerProcess::abort);
return status = DRIVER_ABORTED;
}
{code}
As a result, the code will ACK despite an exception being thrown.
> Scheduler driver may ACK status updates when the scheduler threw an exception
> -----------------------------------------------------------------------------
>
> Key: MESOS-2078
> URL: https://issues.apache.org/jira/browse/MESOS-2078
> Project: Mesos
> Issue Type: Bug
> Components: java api
> Reporter: Bill Farner
> Assignee: Vinod Kone
> Priority: Critical
>
> [~vinodkone]] discovered that this can happen if the scheduler calls {{SchedulerDriver#stop}} before or while handling {{Scheduler#statusUpdate}}.
> In src/sched/sched.cpp:
> The driver invokes {{statusUpdate}} and later checks the {{aborted}} flag to determine whether to send an ACK.
> {code}
> void statusUpdate(
> const UPID& from,
> const StatusUpdate& update,
> const UPID& pid)
> {
> ...
> scheduler->statusUpdate(driver, status);
> VLOG(1) << "Scheduler::statusUpdate took " << stopwatch.elapsed();
> // Note that we need to look at the volatile 'aborted' here to
> // so that we don't acknowledge the update if the driver was
> // aborted during the processing of the update.
> if (aborted) {
> VLOG(1) << "Not sending status update acknowledgment message because "
> << "the driver is aborted!";
> return;
> }
> ...
> {code}
> In src/java/jni/org_apache_mesos_MesosSchedulerDriver.cpp:
> The {{statusUpdate}} implementation checks for an exception and invokes {{driver->abort()}].
> {code}
> void JNIScheduler::statusUpdate(SchedulerDriver* driver,
> const TaskStatus& status)
> {
> jvm->AttachCurrentThread(JNIENV_CAST(&env), NULL);
> jclass clazz = env->GetObjectClass(jdriver);
> jfieldID scheduler = env->GetFieldID(clazz, "scheduler", "Lorg/apache/mesos/Scheduler;");
> jobject jscheduler = env->GetObjectField(jdriver, scheduler);
> clazz = env->GetObjectClass(jscheduler);
> // scheduler.statusUpdate(driver, status);
> jmethodID statusUpdate =
> env->GetMethodID(clazz, "statusUpdate",
> "(Lorg/apache/mesos/SchedulerDriver;"
> "Lorg/apache/mesos/Protos$TaskStatus;)V");
> jobject jstatus = convert<TaskStatus>(env, status);
> env->ExceptionClear();
> env->CallVoidMethod(jscheduler, statusUpdate, jdriver, jstatus);
> if (env->ExceptionCheck()) {
> env->ExceptionDescribe();
> env->ExceptionClear();
> jvm->DetachCurrentThread();
> driver->abort();
> return;
> }
> jvm->DetachCurrentThread();
> }
> {code}
> In src/sched/sched.cpp:
> The {{abort()}} implementation exits early if {{status != DRIVER_RUNNING}}, and *does not set the aborted flag*.
> {code}
> Status MesosSchedulerDriver::abort()
> {
> Lock lock(&mutex);
> if (status != DRIVER_RUNNING) {
> return status;
> }
> CHECK(process != NULL);
> // We set the volatile aborted to true here to prevent any further
> // messages from being processed in the SchedulerProcess. However,
> // if abort() is called from another thread as the SchedulerProcess,
> // there may be at most one additional message processed.
> // TODO(bmahler): Use an atomic boolean.
> process->aborted = true;
> // Dispatching here ensures that we still process the outstanding
> // requests *from* the scheduler, since those do proceed when
> // aborted is true.
> dispatch(process, &SchedulerProcess::abort);
> return status = DRIVER_ABORTED;
> }
> {code}
> As a result, the code will ACK despite an exception being thrown.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)