You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@reef.apache.org by ju...@apache.org on 2016/05/17 20:33:36 UTC

reef git commit: [REEF-1374] Task failure heartbeat could be sent after Evaluator failure heartbeat

Repository: reef
Updated Branches:
  refs/heads/master 8ecb3e676 -> 5ed146d2d


[REEF-1374] Task failure heartbeat could be sent after Evaluator failure heartbeat

This addressed the issue by
  * Disabling Evaluator heartbeats after the Evaluator declares itself as DONE or KILLED or FAILED.

JIRA:
  [REEF-1374](https://issues.apache.org/jira/browse/REEF-1374)

This closes #1000


Project: http://git-wip-us.apache.org/repos/asf/reef/repo
Commit: http://git-wip-us.apache.org/repos/asf/reef/commit/5ed146d2
Tree: http://git-wip-us.apache.org/repos/asf/reef/tree/5ed146d2
Diff: http://git-wip-us.apache.org/repos/asf/reef/diff/5ed146d2

Branch: refs/heads/master
Commit: 5ed146d2d45196f1cf9862570eb5716986417173
Parents: 8ecb3e6
Author: Andrew Chung <af...@gmail.com>
Authored: Thu May 12 11:38:57 2016 -0700
Committer: Julia Wang <ju...@microsoft.com>
Committed: Tue May 17 13:31:13 2016 -0700

----------------------------------------------------------------------
 .../Runtime/Evaluator/HeartBeatManager.cs       | 22 +++++++++++++++++++-
 .../Runtime/Evaluator/Task/TaskRuntime.cs       |  3 ---
 2 files changed, 21 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/reef/blob/5ed146d2/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
index 2c8064e..0f3fbd6 100644
--- a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
+++ b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
@@ -69,6 +69,8 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
 
         private readonly IInjectionFuture<ContextManager> _contextManager;
 
+        private bool _isCompletedHeartbeatQueued = false;
+
         // the queue can only contains the following:
         // 1. all failed heartbeats (regular and event-based) before entering RECOVERY state
         // 2. event-based heartbeats generated in RECOVERY state (since there will be no attempt to send regular heartbeat)
@@ -126,6 +128,18 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
         {
             lock (_queuedHeartbeats)
             {
+                // Do not send a heartbeat if Evaluator has already signaled that it was done.
+                if (_isCompletedHeartbeatQueued)
+                {
+                    LOGGER.Log(Level.Warning, "Evaluator trying to schedule a heartbeat after a completed heartbeat has already been scheduled or sent.");
+                    return;
+                }
+
+                if (IsEvaluatorStateCompleted(evaluatorHeartbeatProto.evaluator_status.state))
+                {
+                    _isCompletedHeartbeatQueued = true;
+                }
+
                 if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
                 {
                     LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
@@ -261,7 +275,8 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
                 {
                     LOGGER.Log(Level.Verbose, "Ignoring regular heartbeat since Evaluator operation state is [{0}] and runtime state is [{1}]. ", EvaluatorSettings.OperationState, EvaluatorRuntime.State);
 
-                    if (EvaluatorRuntime.State == State.DONE || EvaluatorRuntime.State == State.FAILED || EvaluatorRuntime.State == State.KILLED)
+                    // Do not try to recover if Evaluator is done.
+                    if (IsEvaluatorStateCompleted(EvaluatorRuntime.State))
                     {
                         return;
                     }
@@ -307,6 +322,11 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
             throw new NotImplementedException();
         }
 
+        private static bool IsEvaluatorStateCompleted(State state)
+        {
+            return state == State.DONE || state == State.FAILED || state == State.KILLED;
+        }
+
         private static long CurrentTimeMilliSeconds()
         {
             // this is an implmenation to get current time milli second counted from Jan 1st, 1970

http://git-wip-us.apache.org/repos/asf/reef/blob/5ed146d2/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs
index d56731c..cfc8c3b 100644
--- a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs
+++ b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/Task/TaskRuntime.cs
@@ -17,7 +17,6 @@
 
 using System;
 using System.Globalization;
-using System.Linq;
 using System.Threading;
 using Org.Apache.REEF.Common.Protobuf.ReefProtocol;
 using Org.Apache.REEF.Common.Tasks;
@@ -220,8 +219,6 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator.Task
         {
             Logger.Log(Level.Info, "TaskRuntime::OnNext(ICloseEvent value)");
             _closeHandlerFuture.Get().OnNext(value);
-
-            // TODO: send a heartbeat
         }
 
         public void OnNext(ISuspendEvent value)