You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sn...@apache.org on 2020/01/20 12:11:01 UTC
[hadoop] branch trunk updated: YARN-7913. Improve error handling
when application recovery fails with exception. Contributed by Wilfred
Spiegelenburg
This is an automated email from the ASF dual-hosted git repository.
snemeth pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/trunk by this push:
new 581072a YARN-7913. Improve error handling when application recovery fails with exception. Contributed by Wilfred Spiegelenburg
581072a is described below
commit 581072a8f04f7568d3560f105fd1988d3acc9e54
Author: Szilard Nemeth <sn...@apache.org>
AuthorDate: Mon Jan 20 13:10:32 2020 +0100
YARN-7913. Improve error handling when application recovery fails with exception. Contributed by Wilfred Spiegelenburg
---
.../scheduler/fair/FairScheduler.java | 54 +++++----
.../scheduler/fair/FairSchedulerTestBase.java | 24 ++++
.../scheduler/fair/TestFairScheduler.java | 124 +++++++++++++++++++++
3 files changed, 183 insertions(+), 19 deletions(-)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
index c493847..3c9dcb1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
@@ -480,27 +480,39 @@ public class FairScheduler extends
writeLock.lock();
try {
// Assign the app to the queue creating and prevent queue delete.
+ // This will re-create the queue on restore, however this could fail if
+ // the config was changed.
FSLeafQueue queue = queueMgr.getLeafQueue(queueName, true,
applicationId);
if (queue == null) {
- rejectApplicationWithMessage(applicationId,
- queueName + " is not a leaf queue");
- return;
- }
-
- // Enforce ACLs: 2nd check, there could be a time laps between the app
- // creation in the RMAppManager and getting here. That means we could
- // have a configuration change (prevent race condition)
- UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(
- user);
-
- if (!queue.hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi) &&
- !queue.hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) {
- String msg = "User " + user + " does not have permission to submit " +
- applicationId + " to queue " + queueName;
- rejectApplicationWithMessage(applicationId, msg);
- queue.removeAssignedApp(applicationId);
- return;
+ if (!isAppRecovering) {
+ rejectApplicationWithMessage(applicationId,
+ queueName + " is not a leaf queue");
+ return;
+ }
+ // app is recovering we do not want to fail the app now as it was there
+ // before we started the recovery. Add it to the recovery queue:
+ // dynamic queue directly under root, no ACL needed (auto clean up)
+ queueName = "root.recovery";
+ queue = queueMgr.getLeafQueue(queueName, true, applicationId);
+ }
+
+ // Skip ACL check for recovering applications: they have been accepted
+ // in the queue already recovery should not break that.
+ if (!isAppRecovering) {
+ // Enforce ACLs: 2nd check, there could be a time laps between the app
+ // creation in the RMAppManager and getting here. That means we could
+ // have a configuration change (prevent race condition)
+ UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(
+ user);
+ if (!queue.hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi) &&
+ !queue.hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) {
+ String msg = "User " + user + " does not have permission to submit "
+ + applicationId + " to queue " + queueName;
+ rejectApplicationWithMessage(applicationId, msg);
+ queue.removeAssignedApp(applicationId);
+ return;
+ }
}
RMApp rmApp = rmContext.getRMApps().get(applicationId);
@@ -511,7 +523,11 @@ public class FairScheduler extends
" to set queue name on");
}
- if (rmApp != null && rmApp.getAMResourceRequests() != null) {
+ // when recovering the NMs might not have registered and we could have
+ // no resources in the queue, the app is already running and has thus
+ // passed all these checks, skip them now.
+ if (!isAppRecovering && rmApp != null &&
+ rmApp.getAMResourceRequests() != null) {
// Resources.fitsIn would always return false when queueMaxShare is 0
// for any resource, but only using Resources.fitsIn is not enough
// is it would return false for such cases when the requested
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java
index b931618..818fcc9 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java
@@ -270,6 +270,30 @@ public class FairSchedulerTestBase {
scheduler.update();
}
+ protected ApplicationAttemptId createRecoveringApplication(
+ Resource amResource, String queueId, String userId) {
+ ApplicationAttemptId id =
+ createAppAttemptId(this.APP_ID++, this.ATTEMPT_ID++);
+
+ // On restore the app is already created but we need to check the AM
+ // resource, make sure it is set for test
+ ResourceRequest amRequest = createResourceRequest(
+ // cast to int as we're not testing large values so it is safe
+ (int)amResource.getMemorySize(), amResource.getVirtualCores(),
+ ResourceRequest.ANY, 1, 1, true);
+ List<ResourceRequest> amReqs = new ArrayList<>();
+ amReqs.add(amRequest);
+ createApplicationWithAMResourceInternal(id, queueId, userId, amResource,
+ amReqs);
+
+ // This fakes the placement which is not part of the scheduler anymore
+ ApplicationPlacementContext placementCtx =
+ new ApplicationPlacementContext(queueId);
+ scheduler.addApplication(id.getApplicationId(), queueId, userId, true,
+ placementCtx);
+ return id;
+ }
+
protected void createApplicationWithAMResource(ApplicationAttemptId attId,
String queue, String user, Resource amResource) {
createApplicationWithAMResourceInternal(attId, queue, user, amResource,
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
index e6fe83a..2e043fb 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
@@ -5525,4 +5525,128 @@ public class TestFairScheduler extends FairSchedulerTestBase {
createSchedulingRequest(memory, vCores, "queueA", "user1", 1, 2);
}
+
+ @Test
+ public void testRestoreToExistingQueue() throws IOException {
+ conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
+ generateAllocationFilePercentageResource();
+
+ scheduler.init(conf);
+ scheduler.start();
+ scheduler.reinitialize(conf, resourceManager.getRMContext());
+
+ // no nodes so the resource total should be zero for all queues
+ // AM is using resources
+ Resource amResource = Resources.createResource(1024, 1);
+ // Add the app and the attempt
+ ApplicationAttemptId appAttemptId = null;
+ String queueId = "root.parent.queueA";
+ try {
+ appAttemptId = createRecoveringApplication(amResource, queueId, "user1");
+ } catch (Exception e) {
+ fail("The exception is not expected. Exception message: "
+ + e.getMessage());
+ }
+ scheduler.addApplicationAttempt(appAttemptId, false, true);
+
+ List<ApplicationAttemptId> appsInQueue =
+ scheduler.getAppsInQueue(queueId);
+ assertEquals("Number of apps in queue 'root.parent.queueA' should be one!",
+ 1, appsInQueue.size());
+
+ appAttemptId = scheduler.getAppsInQueue(queueId).get(0);
+ assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ + " should not be null!", scheduler.getSchedulerApp(appAttemptId));
+
+ FSAppAttempt schedulerApp = scheduler.getSchedulerApp(appAttemptId);
+ assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ + " should not be null!", schedulerApp.getAppSchedulingInfo());
+
+ assertTrue("There should be no requests accepted", schedulerApp
+ .getAppSchedulingInfo().getAllResourceRequests().isEmpty());
+
+ // Restore an applications with a user that has no access to the queue
+ try {
+ appAttemptId = createRecoveringApplication(amResource, queueId,
+ "usernotinacl");
+ } catch (Exception e) {
+ fail("The exception is not expected. Exception message: "
+ + e.getMessage());
+ }
+ scheduler.addApplicationAttempt(appAttemptId, false, true);
+
+ appsInQueue = scheduler.getAppsInQueue(queueId);
+ assertEquals("Number of apps in queue 'root.parent.queueA' should be two!",
+ 2, appsInQueue.size());
+
+ appAttemptId = scheduler.getAppsInQueue(queueId).get(1);
+ assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ + " should not be null!", scheduler.getSchedulerApp(appAttemptId));
+
+ schedulerApp = scheduler.getSchedulerApp(appAttemptId);
+ assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ + " should not be null!", schedulerApp.getAppSchedulingInfo());
+
+ assertTrue("There should be no requests accepted", schedulerApp
+ .getAppSchedulingInfo().getAllResourceRequests().isEmpty());
+ }
+
+ @Test
+ public void testRestoreToParentQueue() throws IOException {
+ conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
+ generateAllocationFilePercentageResource();
+
+ scheduler.init(conf);
+ scheduler.start();
+ scheduler.reinitialize(conf, resourceManager.getRMContext());
+
+ // no nodes so the resource total should be zero for all queues
+ // AM is using resources
+ Resource amResource = Resources.createResource(1024, 1);
+ // Add the app and the attempt, use a queue that is now a parent
+ ApplicationAttemptId appAttemptId = null;
+ String queueId = "root.parent";
+ try {
+ appAttemptId = createRecoveringApplication(amResource, queueId, "user1");
+ } catch (Exception e) {
+ fail("The exception is not expected. Exception message: "
+ + e.getMessage());
+ }
+ scheduler.addApplicationAttempt(appAttemptId, false, true);
+
+ String recoveredQueue = "root.recovery";
+ List<ApplicationAttemptId> appsInQueue =
+ scheduler.getAppsInQueue(recoveredQueue);
+ assertEquals("Number of apps in queue 'root.recovery' should be one!",
+ 1, appsInQueue.size());
+
+ appAttemptId =
+ scheduler.getAppsInQueue(recoveredQueue).get(0);
+ assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ + " should not be null!", scheduler.getSchedulerApp(appAttemptId));
+
+ FSAppAttempt schedulerApp = scheduler.getSchedulerApp(appAttemptId);
+ assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ + " should not be null!", schedulerApp.getAppSchedulingInfo());
+
+ assertTrue("There should be no requests accepted", schedulerApp
+ .getAppSchedulingInfo().getAllResourceRequests().isEmpty());
+ }
+
+ private void generateAllocationFilePercentageResource() {
+ AllocationFileWriter.create()
+ .addQueue(new AllocationFileQueue.Builder("root")
+ .parent(true)
+ .aclSubmitApps(" ")
+ .aclAdministerApps(" ")
+ .subQueue(new AllocationFileQueue.Builder("parent")
+ .parent(true)
+ .maxChildResources("memory-mb=15.0%, vcores=15.0%")
+ .subQueue(new AllocationFileQueue.Builder("queueA")
+ .aclSubmitApps("user1")
+ .build())
+ .build())
+ .build())
+ .writeToFile(ALLOC_FILE);
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org