You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@reef.apache.org by we...@apache.org on 2016/05/05 17:34:33 UTC

reef git commit: [REEF-1380] Avoid a NullReferenceException in HeartbeatManager

Repository: reef
Updated Branches:
  refs/heads/master 80d6a1f6a -> f92218572


[REEF-1380] Avoid a NullReferenceException in HeartbeatManager

This addressed the issue by
  * Checking for DONE states before sending heartbeat.
  * Only accessing _driverConnection when operation state is RECOVERY.
  * Using InjectionFuture to instantiate _driverConnection.

JIRA:
  [REEF-1380](https://issues.apache.org/jira/browse/REEF-1380)

Pull Reques:
  This closes #989


Project: http://git-wip-us.apache.org/repos/asf/reef/repo
Commit: http://git-wip-us.apache.org/repos/asf/reef/commit/f9221857
Tree: http://git-wip-us.apache.org/repos/asf/reef/tree/f9221857
Diff: http://git-wip-us.apache.org/repos/asf/reef/diff/f9221857

Branch: refs/heads/master
Commit: f922185724dcde9c9a40e80b45687e32097ffc5f
Parents: 80d6a1f
Author: Andrew Chung <af...@gmail.com>
Authored: Wed May 4 16:17:36 2016 -0700
Committer: Markus Weimer <we...@apache.org>
Committed: Thu May 5 10:32:30 2016 -0700

----------------------------------------------------------------------
 .../Runtime/Evaluator/EvaluatorSettings.cs      | 10 +---
 .../Runtime/Evaluator/HeartBeatManager.cs       | 62 +++++++++++---------
 2 files changed, 37 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/reef/blob/f9221857/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/EvaluatorSettings.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/EvaluatorSettings.cs b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/EvaluatorSettings.cs
index d561cd3..62d5372 100644
--- a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/EvaluatorSettings.cs
+++ b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/EvaluatorSettings.cs
@@ -50,7 +50,6 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
         /// <param name="clock"></param>
         /// <param name="remoteManagerFactory"></param>
         /// <param name="reefMessageCodec"></param>
-        /// <param name="injector"></param>
         [Inject]
         private EvaluatorSettings(
             [Parameter(typeof(ApplicationIdentifier))] string applicationId,
@@ -59,10 +58,9 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
             [Parameter(typeof(HeartbeatMaxRetry))] int maxHeartbeatRetries,
             RuntimeClock clock,
             IRemoteManagerFactory remoteManagerFactory,
-            REEFMessageCodec reefMessageCodec,
-            IInjector injector) :
+            REEFMessageCodec reefMessageCodec) :
             this(applicationId, evaluatorId, heartbeatPeriodInMs, maxHeartbeatRetries, 
-            clock, remoteManagerFactory, reefMessageCodec, injector, null)
+            clock, remoteManagerFactory, reefMessageCodec, null)
         {
         }
 
@@ -75,7 +73,6 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
             RuntimeClock clock,
             IRemoteManagerFactory remoteManagerFactory,
             REEFMessageCodec reefMessageCodec,
-            IInjector injector,
             INameClient nameClient)
         {
             _applicationId = applicationId;
@@ -85,7 +82,6 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
             _clock = clock;
 
             _remoteManager = remoteManagerFactory.GetInstance(reefMessageCodec);
-            EvaluatorInjector = injector;
             OperationState = EvaluatorOperationState.OPERATIONAL;
             NameClient = nameClient;
         }
@@ -162,7 +158,5 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
         {
             get { return _remoteManager; }
         }
-
-        public IInjector EvaluatorInjector { get; private set; }
     }
 }

http://git-wip-us.apache.org/repos/asf/reef/blob/f9221857/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
index db422b1..2c8064e 100644
--- a/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
+++ b/lang/cs/Org.Apache.REEF.Common/Runtime/Evaluator/HeartBeatManager.cs
@@ -27,7 +27,9 @@ using Org.Apache.REEF.Common.Runtime.Evaluator.Context;
 using Org.Apache.REEF.Common.Runtime.Evaluator.Utils;
 using Org.Apache.REEF.Common.Tasks;
 using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Tang.Exceptions;
 using Org.Apache.REEF.Tang.Implementations.InjectionPlan;
+using Org.Apache.REEF.Tang.Interface;
 using Org.Apache.REEF.Utilities;
 using Org.Apache.REEF.Utilities.Attributes;
 using Org.Apache.REEF.Utilities.Logging;
@@ -59,7 +61,7 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
 
         private int _heartbeatFailures = 0;
 
-        private IDriverConnection _driverConnection;
+        private readonly IInjectionFuture<IDriverConnection> _driverConnection;
 
         private readonly EvaluatorSettings _evaluatorSettings;
 
@@ -77,7 +79,8 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
             EvaluatorSettings settings,
             IInjectionFuture<EvaluatorRuntime> evaluatorRuntime,
             IInjectionFuture<ContextManager> contextManager,
-            [Parameter(typeof(ErrorHandlerRid))] string errorHandlerRid)
+            [Parameter(typeof(ErrorHandlerRid))] string errorHandlerRid,
+            IInjectionFuture<IDriverConnection> driverConnection)
         {
             using (LOGGER.LogFunction("HeartBeatManager::HeartBeatManager"))
             {
@@ -90,6 +93,7 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
                 _clock = settings.RuntimeClock;
                 _heartBeatPeriodInMillSeconds = settings.HeartBeatPeriodInMs;
                 _maxHeartbeatRetries = settings.MaxHeartbeatRetries;
+                _driverConnection = driverConnection;
                 MachineStatus.ToString(); // kick start the CPU perf counter
             }
         }
@@ -154,14 +158,6 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
                         LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");
                         ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));
 
-                        try
-                        {
-                            _driverConnection = _evaluatorSettings.EvaluatorInjector.GetInstance<IDriverConnection>();
-                        }
-                        catch (Exception ex)
-                        {
-                            Utilities.Diagnostics.Exceptions.CaughtAndThrow(ex, Level.Error, "Failed to inject the driver reconnect implementation", LOGGER);
-                        }
                         LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                         _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;
 
@@ -254,38 +250,50 @@ namespace Org.Apache.REEF.Common.Runtime.Evaluator
             lock (this)
             {
                 LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(Alarm)");
+                
                 if (_evaluatorSettings.OperationState == EvaluatorOperationState.OPERATIONAL && EvaluatorRuntime.State == State.RUNNING)
                 {
                     EvaluatorHeartbeatProto evaluatorHeartbeatProto = GetEvaluatorHeartbeatProto();
                     LOGGER.Log(Level.Verbose, string.Format(CultureInfo.InvariantCulture, "Triggered a heartbeat: {0}. {1}Node Health: {2}", evaluatorHeartbeatProto, Environment.NewLine, MachineStatus.ToString()));
                     Send(evaluatorHeartbeatProto);
-                    _clock.ScheduleAlarm(_heartBeatPeriodInMillSeconds, this);
                 }
                 else
                 {
-                    LOGGER.Log(Level.Verbose, string.Format(CultureInfo.InvariantCulture, "Ignoring regular heartbeat since Evaluator operation state is [{0}] and runtime state is [{1}]. ", EvaluatorSettings.OperationState,  EvaluatorRuntime.State));
-                    try
+                    LOGGER.Log(Level.Verbose, "Ignoring regular heartbeat since Evaluator operation state is [{0}] and runtime state is [{1}]. ", EvaluatorSettings.OperationState, EvaluatorRuntime.State);
+
+                    if (EvaluatorRuntime.State == State.DONE || EvaluatorRuntime.State == State.FAILED || EvaluatorRuntime.State == State.KILLED)
                     {
-                        DriverInformation driverInformation = _driverConnection.GetDriverInformation();
-                        if (driverInformation == null)
+                        return;
+                    }
+
+                    if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
+                    {
+                        var driverConnection = _driverConnection.Get();
+
+                        try
                         {
-                            LOGGER.Log(Level.Verbose, "In RECOVERY mode, cannot retrieve driver information, will try again later.");
+                            var driverInformation = driverConnection.GetDriverInformation();
+                            if (driverInformation == null)
+                            {
+                                LOGGER.Log(Level.Verbose, "In RECOVERY mode, cannot retrieve driver information, will try again later.");
+                            }
+                            else
+                            {
+                                LOGGER.Log(
+                                    Level.Info,
+                                    string.Format(CultureInfo.InvariantCulture, "Detect driver restarted at {0} and is running on endpoint {1} with services {2}. Now trying to re-establish connection", driverInformation.DriverStartTime, driverInformation.DriverRemoteIdentifier, driverInformation.NameServerId));
+                                Recover(driverInformation);
+                            }
                         }
-                        else
+                        catch (Exception e)
                         {
-                            LOGGER.Log(
-                                Level.Info, 
-                                string.Format(CultureInfo.InvariantCulture, "Detect driver restarted at {0} and is running on endpoint {1} with services {2}. Now trying to re-establish connection", driverInformation.DriverStartTime, driverInformation.DriverRemoteIdentifier, driverInformation.NameServerId));
-                            Recover(driverInformation);
+                            // we do not want any exception to stop the query for driver status
+                            Utilities.Diagnostics.Exceptions.Caught(e, Level.Warning, LOGGER);
                         }
                     }
-                    catch (Exception e)
-                    {
-                        // we do not want any exception to stop the query for driver status
-                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Caught(e, Level.Warning, LOGGER);
-                    }
-                    _clock.ScheduleAlarm(_heartBeatPeriodInMillSeconds, this);
                 }
+
+                _clock.ScheduleAlarm(_heartBeatPeriodInMillSeconds, this);
             }
         }