You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@reef.apache.org by ju...@apache.org on 2016/05/17 20:33:36 UTC
reef git commit: [REEF-1374] Task failure heartbeat could be sent
after Evaluator failure heartbeat
Repository: reef
Updated Branches:
refs/heads/master 8ecb3e676 -> 5ed146d2d
[REEF-1374] Task failure heartbeat could be sent after Evaluator failure heartbeat
This addressed the issue by
* Disabling Evaluator heartbeats after the Evaluator declares itself as DONE or KILLED or FAILED.
JIRA:
[REEF-1374](https://issues.apache.org/jira/browse/REEF-1374)
This closes #1000
Project: http://git-wip-us.apache.org/repos/asf/reef/repo
Commit: http://git-wip-us.apache.org/repos/asf/reef/commit/5ed146d2
Tree: http://git-wip-us.apache.org/repos/asf/reef/tree/5ed146d2
Diff: http://git-wip-us.apache.org/repos/asf/reef/diff/5ed146d2
Branch: refs/heads/master
Commit: 5ed146d2d45196f1cf9862570eb5716986417173
Parents: 8ecb3e6
Author: Andrew Chung <af...@gmail.com>
Authored: Thu May 12 11:38:57 2016 -0700
Committer: Julia Wang <ju...@microsoft.com>
Committed: Tue May 17 13:31:13 2016 -0700
----------------------------------------------------------------------
.../Runtime/Evaluator/HeartBeatManager.cs | 22 +++++++++++++++++++-
.../Runtime/Evaluator/Task/TaskRuntime.cs | 3 ---
2 files changed, 21 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/reef/blob/5ed146d2/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
index 2c8064e..0f3fbd6 100644
--- a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
+++ b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
@@ -69,6 +69,8 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
private readonly IInjectionFuture<ContextManager> _contextManager;
+ private bool _isCompletedHeartbeatQueued = false;
+
// the queue can only contain the following:
// 1. all failed heartbeats (regular and event-based) before entering RECOVERY state
// 2. event-based heartbeats generated in RECOVERY state (since there will be no attempt to send regular heartbeat)
@@ -126,6 +128,18 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
{
lock (_queuedHeartbeats)
{
+ // Do not send a heartbeat if Evaluator has already signaled that it was done.
+ if (_isCompletedHeartbeatQueued)
+ {
+ LOGGER.Log(Level.Warning, "Evaluator trying to schedule a heartbeat after a completed heartbeat has already been scheduled or sent.");
+ return;
+ }
+
+ if (IsEvaluatorStateCompleted(evaluatorHeartbeatProto.evaluator_status.state))
+ {
+ _isCompletedHeartbeatQueued = true;
+ }
+
if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
{
LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
@@ -261,7 +275,8 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
{
LOGGER.Log(Level.Verbose, "Ignoring regular heartbeat since Evaluator operation state is [{0}] and runtime state is [{1}]. ", EvaluatorSettings.OperationState, EvaluatorRuntime.State);
- if (EvaluatorRuntime.State == State.DONE || EvaluatorRuntime.State == State.FAILED || EvaluatorRuntime.State == State.KILLED)
+ // Do not try to recover if Evaluator is done.
+ if (IsEvaluatorStateCompleted(EvaluatorRuntime.State))
{
return;
}
@@ -307,6 +322,11 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
throw new NotImplementedException();
}
+ private static bool IsEvaluatorStateCompleted(State state)
+ {
+ return state == State.DONE || state == State.FAILED || state == State.KILLED;
+ }
+
private static long CurrentTimeMilliSeconds()
{
// this is an implementation to get the current time in milliseconds counted from Jan 1st, 1970
http://git-wip-us.apache.org/repos/asf/reef/blob/5ed146d2/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs
index d56731c..cfc8c3b 100644
--- a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs
+++ b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs
@@ -17,7 +17,6 @@
using System;
using System.Globalization;
-using System.Linq;
using System.Threading;
using Org.Apache.REEF.Common.Protobuf.ReefProtocol;
using Org.Apache.REEF.Common.Tasks;
@@ -220,8 +219,6 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator.Task
{
Logger.Log(Level.Info, "TaskRuntime::OnNext(ICloseEvent value)");
_closeHandlerFuture.Get().OnNext(value);
-
- // TODO: send a heartbeat
}
public void OnNext(ISuspendEvent value)