You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@reef.apache.org by ju...@apache.org on 2016/12/01 01:27:52 UTC

reef git commit: [REEF-1677] Count evaluators failed during WaitingForEvaluator phase towards MaximumNumberOfEvaluatorFailures limit

Repository: reef
Updated Branches:
  refs/heads/master 9f98435e0 -> 1e1fff6f2


[REEF-1677] Count evaluators failed during WaitingForEvaluator phase towards MaximumNumberOfEvaluatorFailures limit

Previously, evaluators that failed during the WaitingForEvaluator phase
were not counted towards the MaximumNumberOfEvaluatorFailures limit. This
caused long wait times for IMRU jobs that had many failures and should
have failed sooner.

JIRA:
  [REEF-1677](https://issues.apache.org/jira/browse/REEF-1677)

Pull request:
  This closes #1192


Project: http://git-wip-us.apache.org/repos/asf/reef/repo
Commit: http://git-wip-us.apache.org/repos/asf/reef/commit/1e1fff6f
Tree: http://git-wip-us.apache.org/repos/asf/reef/tree/1e1fff6f
Diff: http://git-wip-us.apache.org/repos/asf/reef/diff/1e1fff6f

Branch: refs/heads/master
Commit: 1e1fff6f2e2873e39a7338bb1370148ed17a2401
Parents: 9f98435
Author: Mariia Mykhailova <ma...@apache.org>
Authored: Mon Nov 28 16:44:22 2016 -0800
Committer: Julia Wang <jw...@yahoo.com>
Committed: Wed Nov 30 17:23:22 2016 -0800

----------------------------------------------------------------------
 .../OnREEF/Driver/EvaluatorManager.cs           | 37 +++++++-------------
 .../OnREEF/Driver/IMRUDriver.cs                 |  1 -
 2 files changed, 12 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/reef/blob/1e1fff6f/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs
index fdbb463..27a0f3e 100644
--- a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs
+++ b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs
@@ -36,13 +36,15 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
         private static readonly Logger Logger = Logger.GetLogger(typeof(EvaluatorManager));
 
         private readonly ISet<string> _allocatedEvaluatorIds = new HashSet<string>();
-        private readonly ISet<string> _failedEvaluatorIds = new HashSet<string>();
 
         private readonly int _totalExpectedEvaluators;
         private readonly int _allowedNumberOfEvaluatorFailures;
         private readonly IEvaluatorRequestor _evaluatorRequestor;
         private string _masterEvaluatorId;
 
+        private int _failedEvaluatorsCount;
+        private bool _masterEvaluatorFailed;
+
         private readonly EvaluatorSpecification _updateEvaluatorSpecification;
         private readonly EvaluatorSpecification _mapperEvaluatorSpecification;
 
@@ -199,27 +201,11 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
         internal void RecordFailedEvaluator(string evaluatorId)
         {
             RemoveAllocatedEvaluator(evaluatorId);
-
-            if (_failedEvaluatorIds.Contains(evaluatorId))
-            {
-                string msg = string.Format("The failed evaluator {0} has been recorded.", evaluatorId);
-                Exceptions.Throw(new IMRUSystemException(msg), Logger);
-            }
-            _failedEvaluatorIds.Add(evaluatorId);
-        }
-
-        /// <summary>
-        /// Remove failed evaluator from the collection
-        /// </summary>
-        /// <param name="evaluatorId"></param>
-        internal void RemoveFailedEvaluator(string evaluatorId)
-        {
-            if (!_failedEvaluatorIds.Contains(evaluatorId))
+            if (_masterEvaluatorId != null && _masterEvaluatorId.Equals(evaluatorId))
             {
-                string msg = string.Format("The failed evaluator {0} is not recorded in list of failed evaluators.", evaluatorId);
-                Exceptions.Throw(new IMRUSystemException(msg), Logger);
+                _masterEvaluatorFailed = true;
             }
-            _failedEvaluatorIds.Remove(evaluatorId);
+            _failedEvaluatorsCount++;
         }
 
         /// <summary>
@@ -227,7 +213,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
         /// </summary>
         internal bool ExceededMaximumNumberOfEvaluatorFailures()
         {
-            return _failedEvaluatorIds.Count > AllowedNumberOfEvaluatorFailures;
+            return _failedEvaluatorsCount > AllowedNumberOfEvaluatorFailures;
         }
 
         /// <summary>
@@ -247,7 +233,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
             {
                 ResetMasterEvaluatorId();
             }
-            _failedEvaluatorIds.Clear();
+            _failedEvaluatorsCount = 0;
         }
 
         /// <summary>
@@ -281,6 +267,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
                 Exceptions.Throw(new IMRUSystemException("Master evaluator is already null"), Logger);
             }
             _masterEvaluatorId = null;
+            _masterEvaluatorFailed = false;
         }
 
         /// <summary>
@@ -312,7 +299,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
         /// <returns></returns>
         internal bool IsMasterEvaluatorFailed()
         {
-            return _masterEvaluatorId != null && _failedEvaluatorIds.Contains(_masterEvaluatorId);
+            return _masterEvaluatorFailed;
         }
 
         /// <summary>
@@ -323,9 +310,9 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
         {
             if (IsMasterEvaluatorFailed())
             {
-                return _failedEvaluatorIds.Count - 1;
+                return _failedEvaluatorsCount - 1;
             }
-            return _failedEvaluatorIds.Count;
+            return _failedEvaluatorsCount;
         }
 
         /// <summary>

http://git-wip-us.apache.org/repos/asf/reef/blob/1e1fff6f/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs
index c82dd2d..52e7c6a 100644
--- a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs
+++ b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs
@@ -528,7 +528,6 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
                                 _serviceAndContextConfigurationProvider.RemoveEvaluatorIdFromPartitionIdProvider(
                                     failedEvaluator.Id);
                                 Logger.Log(Level.Info, "Requesting mapper Evaluators.");
-                                _evaluatorManager.RemoveFailedEvaluator(failedEvaluator.Id);
                                 _evaluatorManager.RequestMapEvaluators(1);
                             }
                             else