You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@reef.apache.org by ju...@apache.org on 2016/12/01 01:27:52 UTC
reef git commit: [REEF-1677] Count evaluators failed during
WaitingForEvaluator phase towards MaximumNumberOfEvaluatorFailures limit
Repository: reef
Updated Branches:
refs/heads/master 9f98435e0 -> 1e1fff6f2
[REEF-1677] Count evaluators failed during WaitingForEvaluator phase towards MaximumNumberOfEvaluatorFailures limit
Previously, evaluators that failed during the WaitingForEvaluator phase
were not counted towards the MaximumNumberOfEvaluatorFailures limit. This
caused long wait times for IMRU jobs that had many failures and should
have failed sooner.
JIRA:
[REEF-1677](https://issues.apache.org/jira/browse/REEF-1677)
Pull request:
This closes #1192
Project: http://git-wip-us.apache.org/repos/asf/reef/repo
Commit: http://git-wip-us.apache.org/repos/asf/reef/commit/1e1fff6f
Tree: http://git-wip-us.apache.org/repos/asf/reef/tree/1e1fff6f
Diff: http://git-wip-us.apache.org/repos/asf/reef/diff/1e1fff6f
Branch: refs/heads/master
Commit: 1e1fff6f2e2873e39a7338bb1370148ed17a2401
Parents: 9f98435
Author: Mariia Mykhailova <ma...@apache.org>
Authored: Mon Nov 28 16:44:22 2016 -0800
Committer: Julia Wang <jw...@yahoo.com>
Committed: Wed Nov 30 17:23:22 2016 -0800
----------------------------------------------------------------------
.../OnREEF/Driver/EvaluatorManager.cs | 37 +++++++-------------
.../OnREEF/Driver/IMRUDriver.cs | 1 -
2 files changed, 12 insertions(+), 26 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/reef/blob/1e1fff6f/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs
index fdbb463..27a0f3e 100644
--- a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs
+++ b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs
@@ -36,13 +36,15 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
private static readonly Logger Logger = Logger.GetLogger(typeof(EvaluatorManager));
private readonly ISet<string> _allocatedEvaluatorIds = new HashSet<string>();
- private readonly ISet<string> _failedEvaluatorIds = new HashSet<string>();
private readonly int _totalExpectedEvaluators;
private readonly int _allowedNumberOfEvaluatorFailures;
private readonly IEvaluatorRequestor _evaluatorRequestor;
private string _masterEvaluatorId;
+ private int _failedEvaluatorsCount;
+ private bool _masterEvaluatorFailed;
+
private readonly EvaluatorSpecification _updateEvaluatorSpecification;
private readonly EvaluatorSpecification _mapperEvaluatorSpecification;
@@ -199,27 +201,11 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
internal void RecordFailedEvaluator(string evaluatorId)
{
RemoveAllocatedEvaluator(evaluatorId);
-
- if (_failedEvaluatorIds.Contains(evaluatorId))
- {
- string msg = string.Format("The failed evaluator {0} has been recorded.", evaluatorId);
- Exceptions.Throw(new IMRUSystemException(msg), Logger);
- }
- _failedEvaluatorIds.Add(evaluatorId);
- }
-
- /// <summary>
- /// Remove failed evaluator from the collection
- /// </summary>
- /// <param name="evaluatorId"></param>
- internal void RemoveFailedEvaluator(string evaluatorId)
- {
- if (!_failedEvaluatorIds.Contains(evaluatorId))
+ if (_masterEvaluatorId != null && _masterEvaluatorId.Equals(evaluatorId))
{
- string msg = string.Format("The failed evaluator {0} is not recorded in list of failed evaluators.", evaluatorId);
- Exceptions.Throw(new IMRUSystemException(msg), Logger);
+ _masterEvaluatorFailed = true;
}
- _failedEvaluatorIds.Remove(evaluatorId);
+ _failedEvaluatorsCount++;
}
/// <summary>
@@ -227,7 +213,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
/// </summary>
internal bool ExceededMaximumNumberOfEvaluatorFailures()
{
- return _failedEvaluatorIds.Count > AllowedNumberOfEvaluatorFailures;
+ return _failedEvaluatorsCount > AllowedNumberOfEvaluatorFailures;
}
/// <summary>
@@ -247,7 +233,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
{
ResetMasterEvaluatorId();
}
- _failedEvaluatorIds.Clear();
+ _failedEvaluatorsCount = 0;
}
/// <summary>
@@ -281,6 +267,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
Exceptions.Throw(new IMRUSystemException("Master evaluator is already null"), Logger);
}
_masterEvaluatorId = null;
+ _masterEvaluatorFailed = false;
}
/// <summary>
@@ -312,7 +299,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
/// <returns></returns>
internal bool IsMasterEvaluatorFailed()
{
- return _masterEvaluatorId != null && _failedEvaluatorIds.Contains(_masterEvaluatorId);
+ return _masterEvaluatorFailed;
}
/// <summary>
@@ -323,9 +310,9 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
{
if (IsMasterEvaluatorFailed())
{
- return _failedEvaluatorIds.Count - 1;
+ return _failedEvaluatorsCount - 1;
}
- return _failedEvaluatorIds.Count;
+ return _failedEvaluatorsCount;
}
/// <summary>
http://git-wip-us.apache.org/repos/asf/reef/blob/1e1fff6f/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs
index c82dd2d..52e7c6a 100644
--- a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs
+++ b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs
@@ -528,7 +528,6 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver
_serviceAndContextConfigurationProvider.RemoveEvaluatorIdFromPartitionIdProvider(
failedEvaluator.Id);
Logger.Log(Level.Info, "Requesting mapper Evaluators.");
- _evaluatorManager.RemoveFailedEvaluator(failedEvaluator.Id);
_evaluatorManager.RequestMapEvaluators(1);
}
else